# Spark DataFrame의 메소드/함수/특징 알아보기

In [20]:
from pyspark.sql import SparkSession
import pandas as pd

# Build (or reuse, via getOrCreate) a SparkSession on the "local" master with
# the application name "spark-dataframe".
spark = SparkSession.builder.master("local").appName("spark-dataframe").getOrCreate()
# Bare expression; presumably left over from displaying the session in the notebook.
spark

# The same CSV file is loaded twice so the Spark and pandas APIs can be compared.
filepath = "/home/ubuntu/working/spark/data/titanic_train.csv"
# Spark DataFrame: header=True uses the first row as column names,
# inferSchema=True lets Spark infer the column types from the data.
titanic_sdf = spark.read.csv(filepath, inferSchema=True, header=True)
# pandas DataFrame read from the same file.
titanic_pdf = pd.read_csv(filepath)

## Select
### Select로 컬럼을 뽑는 방식의 차이 (Spark vs pandas)

In [21]:
# Spark: select() picks columns by name; show() prints the first rows as a text table.
titanic_sdf.select("Name","Fare").show()
# pandas: the same column selection, done by indexing with a list of labels.
titanic_pdf[["Name","Fare"]]

+--------------------+-------+
|                Name|   Fare|
+--------------------+-------+
|Braund, Mr. Owen ...|   7.25|
|Cumings, Mrs. Joh...|71.2833|
|Heikkinen, Miss. ...|  7.925|
|Futrelle, Mrs. Ja...|   53.1|
|Allen, Mr. Willia...|   8.05|
|    Moran, Mr. James| 8.4583|
|McCarthy, Mr. Tim...|51.8625|
|Palsson, Master. ...| 21.075|
|Johnson, Mrs. Osc...|11.1333|
|Nasser, Mrs. Nich...|30.0708|
|Sandstrom, Miss. ...|   16.7|
|Bonnell, Miss. El...|  26.55|
|Saundercock, Mr. ...|   8.05|
|Andersson, Mr. An...| 31.275|
|Vestrom, Miss. Hu...| 7.8542|
|Hewlett, Mrs. (Ma...|   16.0|
|Rice, Master. Eugene| 29.125|
|Williams, Mr. Cha...|   13.0|
|Vander Planke, Mr...|   18.0|
|Masselmani, Mrs. ...|  7.225|
+--------------------+-------+
only showing top 20 rows



Unnamed: 0,Name,Fare
0,"Braund, Mr. Owen Harris",7.2500
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",71.2833
2,"Heikkinen, Miss. Laina",7.9250
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",53.1000
4,"Allen, Mr. William Henry",8.0500
...,...,...
886,"Montvila, Rev. Juozas",13.0000
887,"Graham, Miss. Margaret Edith",30.0000
888,"Johnston, Miss. Catherine Helen ""Carrie""",23.4500
889,"Behr, Mr. Karl Howell",30.0000


In [22]:
# select() also accepts a list of column names
select_columns = ["Name","Age","Pclass","Fare"]
titanic_sdf.select(select_columns).show()

+--------------------+----+------+-------+
|                Name| Age|Pclass|   Fare|
+--------------------+----+------+-------+
|Braund, Mr. Owen ...|22.0|     3|   7.25|
|Cumings, Mrs. Joh...|38.0|     1|71.2833|
|Heikkinen, Miss. ...|26.0|     3|  7.925|
|Futrelle, Mrs. Ja...|35.0|     1|   53.1|
|Allen, Mr. Willia...|35.0|     3|   8.05|
|    Moran, Mr. James|null|     3| 8.4583|
|McCarthy, Mr. Tim...|54.0|     1|51.8625|
|Palsson, Master. ...| 2.0|     3| 21.075|
|Johnson, Mrs. Osc...|27.0|     3|11.1333|
|Nasser, Mrs. Nich...|14.0|     2|30.0708|
|Sandstrom, Miss. ...| 4.0|     3|   16.7|
|Bonnell, Miss. El...|58.0|     1|  26.55|
|Saundercock, Mr. ...|20.0|     3|   8.05|
|Andersson, Mr. An...|39.0|     3| 31.275|
|Vestrom, Miss. Hu...|14.0|     3| 7.8542|
|Hewlett, Mrs. (Ma...|55.0|     2|   16.0|
|Rice, Master. Eugene| 2.0|     3| 29.125|
|Williams, Mr. Cha...|null|     2|   13.0|
|Vander Planke, Mr...|31.0|     3|   18.0|
|Masselmani, Mrs. ...|null|     3|  7.225|
+--------------------+----+------+-------+

### 컬럼 속성 이해하기

In [28]:
# Indexing a Spark DataFrame by column name yields a Column object, not the
# column's data (this prints Column<'Name'>).
print(titanic_sdf["Name"])

print()

# Column objects support arithmetic; the derived column is shown as "(Fare * 100)".
titanic_sdf.select(
    titanic_sdf["Fare"],
    titanic_sdf["Fare"] * 100
).show()


# The columns that appear in a SQL SELECT clause correspond to Column objects in Spark.
# For example, given
# (SELECT Fare, Fare * 100):
# titanic_sdf["Fare"] == Fare
# titanic_sdf["Fare"] * 100 == Fare * 100

Column<'Name'>

+-------+-----------------+
|   Fare|     (Fare * 100)|
+-------+-----------------+
|   7.25|            725.0|
|71.2833|          7128.33|
|  7.925|            792.5|
|   53.1|           5310.0|
|   8.05|805.0000000000001|
| 8.4583|845.8299999999999|
|51.8625|          5186.25|
| 21.075|           2107.5|
|11.1333|          1113.33|
|30.0708|          3007.08|
|   16.7|           1670.0|
|  26.55|           2655.0|
|   8.05|805.0000000000001|
| 31.275|           3127.5|
| 7.8542|           785.42|
|   16.0|           1600.0|
| 29.125|           2912.5|
|   13.0|           1300.0|
|   18.0|           1800.0|
|  7.225|            722.5|
+-------+-----------------+
only showing top 20 rows



## ⭐⭐⭐ 스파크 데이터프레임에서 컬럼을 다룰 때 가장 많이 사용하는 방식, `col`

In [36]:
import pyspark.sql.functions as F

# F.col("Fare") refers to a column by name without indexing the DataFrame object.
titanic_sdf.select(F.col("Fare")).show(1)
# The resulting Column supports arithmetic; the output column is named "(Fare * 100)".
titanic_sdf.select(F.col("Fare") * 100).show(1)

+----+
|Fare|
+----+
|7.25|
+----+
only showing top 1 row

+------------+
|(Fare * 100)|
+------------+
|       725.0|
+------------+
only showing top 1 row



In [38]:
# upper: function that converts a string column to uppercase

titanic_sdf.select(
    F.col("Name"),
    F.upper("Name").alias("Cap Name")  # alias() sets the output column's name
).show(3)

+--------------------+--------------------+
|                Name|            Cap Name|
+--------------------+--------------------+
|Braund, Mr. Owen ...|BRAUND, MR. OWEN ...|
|Cumings, Mrs. Joh...|CUMINGS, MRS. JOH...|
|Heikkinen, Miss. ...|HEIKKINEN, MISS. ...|
+--------------------+--------------------+
only showing top 3 rows



## Filter
- SQL의 WHERE 절과 유사하다.
- 조건식을 SQL과 유사한 형태로 지정할 수 있다.

In [45]:
# Keep only the rows whose Embarked column equals "Q"
titanic_sdf.filter(
    F.col("Embarked")=="Q"
).show(2)

# Replacing filter with where gives the same result
titanic_sdf.where(
    F.col("Embarked") == "S"
).show(2)

# AND, OR -> & , |
# Each comparison is parenthesized because & and | bind tighter than == in Python.
titanic_sdf.filter(
    (F.col("Embarked") == "S") & (F.col("Pclass") == 1)
).show(1)
titanic_sdf.filter(
        (F.col("Embarked") == "S") | (F.col("Pclass") == 2)
).show(1)

+-----------+--------+------+--------------------+----+----+-----+-----+------+------+-----+--------+
|PassengerId|Survived|Pclass|                Name| Sex| Age|SibSp|Parch|Ticket|  Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+------+------+-----+--------+
|          6|       0|     3|    Moran, Mr. James|male|null|    0|    0|330877|8.4583| null|       Q|
|         17|       0|     3|Rice, Master. Eugene|male| 2.0|    4|    1|382652|29.125| null|       Q|
+-----------+--------+------+--------------------+----+----+-----+-----+------+------+-----+--------+
only showing top 2 rows

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-----+-----+--------+
|          1|       0|  

In [49]:
# like() can be used just like LIKE in SQL
titanic_sdf.filter(
    F.col("Name").like("%Miss%")
).show(2)

# The condition can also be written as a SQL expression string
titanic_sdf.filter(
    "Name like '%Miss%'"
).show(1)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-----+-----+--------+
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|7.925| null|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|         PP 9549| 16.7|   G6|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-----+-----+--------+
only showing top 2 rows

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+---

### 예제
- 이름이 h로 시작하는(대소문자 구분 없음) 사람의 이름을 대문자로 변환하여 출력

In [52]:
# Lowercase Name before matching so that 'h%' matches names starting with
# either "h" or "H", then output the matching names in uppercase under an alias.
titanic_sdf.filter(
    F.lower(F.col('Name')).like('h%')
).select(
    F.upper(F.col('Name')).alias("집가고싶다")
).show()

+--------------------+
|          집가고싶다|
+--------------------+
|HEIKKINEN, MISS. ...|
|HEWLETT, MRS. (MA...|
|HOLVERSON, MR. AL...|
|HARPER, MRS. HENR...|
|HARRIS, MR. HENRY...|
|HOOD, MR. AMBROSE JR|
|HICKMAN, MR. STAN...|
|HAKKARAINEN, MRS....|
|  HALE, MR. REGINALD|
|HONKANEN, MISS. E...|
|  HARRIS, MR. WALTER|
|HOYT, MR. FREDERI...|
|HARRIS, MRS. HENR...|
|HARKNETT, MISS. A...|
|   HOLD, MR. STEPHEN|
|HUNT, MR. GEORGE ...|
|HAMALAINEN, MRS. ...|
|HARRISON, MR. WIL...|
|  HENRY, MISS. DELIA|
|HOSONO, MR. MASABUMI|
+--------------------+
only showing top 20 rows



In [53]:
# Stop the SparkSession now that the examples are finished.
spark.stop()