# 기본 라이브러리 etc

In [11]:
from pyspark import SparkConf, SparkContext

# SparkConf : 스파크 실행 환경 설정 클래스
# SparkContext : Driver program 실행 환경 구성을 위한 클래스

conf = SparkConf().setMaster("local").setAppName("country-student-count")

# SparkContext의 변수명은 웬만하면 sc로 만드는 것을 권장
sc = SparkContext(conf=conf)
sc

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=country-student-count, master=local) created by __init__ at /tmp/ipykernel_22340/108837187.py:9 

In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("spark-sql").getOrCreate()
spark

## List로 주어진 데이터로 Spark DataFrame 생성

In [25]:
movies = [
    (1, "어벤져스", "마블", 2012, 4, 26),
    (2, "슈퍼맨", "DC", 2013, 6, 13),
    (3, "배트맨", "DC", 2008, 8, 6),
    (4, "겨울왕국", "디즈니", 2014, 1, 16),
    (5, "아이언맨", "마블", 2008, 4, 30)
]
movie_schema = ["id","name","company","year","month","day"]
movie_sdf = spark.createDataFrame(data=movies, schema = movie_schema)

movie_sdf.show()
# 판다스와 스파크 데이터 프레임의 차이점은, 이렇게 정의한다고 해서 만들어지는 것이 아님
# 앞서 RDD에서 그래왔듯이 action을 해야 만들어짐

# 스키마 정보 확인
print(movie_sdf.dtypes) # 컬럼의 자료형 출력

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+

[('id', 'bigint'), ('name', 'string'), ('company', 'string'), ('year', 'bigint'), ('month', 'bigint'), ('day', 'bigint')]


## Spark SQL 사용하기
- 근데 이거 왜쓰는거지...?

In [26]:
movie_sdf.createOrReplaceTempView("movies")

query = """
select *
from movies
"""
spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



## JOIN 구현

In [27]:
# 자료형 타입 불러오기
from pyspark.sql.types import StringType, FloatType, IntegerType  # << 컬럼의 데이터 타입임

# 구조를 만들기 위한 타입 불러오기(필수), 칼럼 순서 등
from pyspark.sql.types import StructField, StructType # StructField는 칼럼을 의미하고, StructType은 데이터프레임을 의미

In [28]:
attendances = [
    (1, 13934592., "KR"),
    (2, 2182227.,"KR"),
    (3, 4226242., "KR"),
    (4, 10303058., "KR"),
    (5, 4300365., "KR")
]

# 스키마 추가하기, SQL에서 TABLE 만들 때처럼 속성을 넣는 것 같다.
att_schema = StructType([
    StructField("id",IntegerType(),True),
    StructField("attendance",FloatType(),True),
    StructField("country",StringType(),True)
])

att_df = spark.createDataFrame(data=attendances, schema=att_schema)
att_df.show()

+---+-----------+-------+
| id| attendance|country|
+---+-----------+-------+
|  1|1.3934592E7|     KR|
|  2|  2182227.0|     KR|
|  3|  4226242.0|     KR|
|  4|1.0303058E7|     KR|
|  5|  4300365.0|     KR|
+---+-----------+-------+



## csv파일로 Spark DataFrame 생성하기

In [33]:
filepath = "/home/ubuntu/working/spark/data/titanic_train.csv"

# 경로를 잘 모르겠다면 ls or pwd 사용할 것
titanic_sdf = spark.read.csv(filepath, inferSchema=True, header=True)

# inferSchema :  컬럼 타입을 자동으로 추론해줌, (즉 스키마를 적당하게 수정하겠다는 의미)
titanic_sdf.printSchema()
titanic_sdf.dtypes

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

# Pandas DataFrame과 비교해보기

In [39]:
import pandas as pd

titanic_pdf = pd.read_csv(filepath)

print(titanic_pdf.head(3))
print()
print(titanic_sdf)

# 판다스는 데이터프레임이 출력,
# 스파크는 데이터 프레임의 정보가 출력

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]


### Describe
- 판다스 Describe는 문자열에 대한 통계 정보는 나오지 않는다.
- Spark Describe는 문자열에 대한 통계 정보도 집계하여 출력한다.

In [46]:
titanic_sdf.describe().show()
titanic_pdf.describe()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Shape

In [50]:
titanic_pdf.shape, (len(titanic_sdf.columns), titanic_sdf.count())
# 스파크는 행/열의 개수를 다로 파악해야 한다.
# 데이터의 개수 (행의 개수)

((891, 12), (12, 891))

### Spark DF를 Pandas DF로 변환
- 보통, 분산처리 시스템이 더이상 필요 없다고 판단될때 사용
- 그 이유는, Spark는 대용량 데이터를 처리하는데 특화되었는데, 소량의 데이터에서는 pandas보다 처리속도가 <u>훨씬</u> 느리다.

In [43]:
 # titanic_sdf는 스파크 데이터 프레임이었음
titanic_sdf.show()
titanic_sdf.toPandas()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"""Johnston, Miss. Catherine Helen """"Carrie""""""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
