In [1]:
# Notebook display setting: with "all", IPython displays the value of every
# top-level expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# !pip install pandas
# !pip install pyarrow
from datetime import date, datetime
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
from datetime import date, datetime
from pyspark.sql import *


Collecting pandas
  Downloading pandas-1.4.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[K     |████████████████████████████████| 11.7 MB 19.3 MB/s eta 0:00:01
Collecting pytz>=2020.1
  Downloading pytz-2022.2.1-py2.py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 8.1 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.4.4 pytz-2022.2.1
Collecting pyarrow
  Downloading pyarrow-9.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 29.7 MB/s eta 0:00:01     |█████████████████               | 18.6 MB 16.9 MB/s eta 0:00:01
Installing collected packages: pyarrow
Successfully installed pyarrow-9.0.0


# 1. DataFrame 생성

- SparkSession 객체를 사용해 DataFrame을 생성할 수 있다.
- SparkSession 객체는 pyspark shell을 실행할 때 `spark`라는 이름으로 미리 생성된다.



## Row 객체를 사용해 생성하기

- Row: DataFrame의 한 행(row)을 표현하는 객체

In [3]:
# Build a DataFrame from Row objects. No schema is passed, so the column
# names come from the Row field names and the types are inferred.
df = spark.createDataFrame(
    [
        Row(name=person, age=years, birth=born)
        for person, years, born in (
            ('하명도', 21, date(2001, 9, 6)),
            ('이상엽', 22, date(2000, 10, 8)),
            ('강혁준', 19, date(2003, 8, 9)),
        )
    ]
)

df.show()

                                                                                

+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|하명도| 21|2001-09-06|
|이상엽| 22|2000-10-08|
|강혁준| 19|2003-08-09|
+------+---+----------+



In [4]:
# Check the schema of df (column names, types, nullability)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- birth: date (nullable = true)



## schema를 명시하여 DataFrame 생성

In [7]:
# Keep each record as a plain tuple and declare the columns explicitly
# with a DDL-formatted schema string.
df2 = spark.createDataFrame(
    data=[
        ('김경민', 17, date(2005, 10, 11)),
        ('김도은', 18, date(2004, 12, 25)),
        ('김민석', 11, date(2011, 1, 10)),
    ],
    schema='name string, age int, birth date',
)
df2.show()
df2.printSchema()

+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|김경민| 17|2005-10-11|
|김도은| 18|2004-12-25|
|김민석| 11|2011-01-10|
+------+---+----------+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- birth: date (nullable = true)



## StructType 객체를 사용해 Schema 지정

In [9]:
# Describe the columns programmatically. The third argument of add() is
# `nullable`, so every column here is declared as non-nullable.
schema = (
    StructType()
    .add("name", StringType(), False)
    .add("age", IntegerType(), False)
    .add("birth", DateType(), False)
)

df3 = spark.createDataFrame(
    data=[
        ('손지수', 22, date(2011, 6, 8)),
        ('유승종', 21, date(2009, 8, 28)),
        ('윤병우', 23, date(2022, 3, 3)),
    ],
    schema=schema,
)

df3.show()
df3.printSchema()


+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|손지수| 22|2011-06-08|
|유승종| 21|2009-08-28|
|윤병우| 23|2022-03-03|
+------+---+----------+

root
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- birth: date (nullable = false)



## 중첩 스키마 적용

In [13]:
# Each record ends with a nested tuple that maps onto the `phone` struct column.
# NOTE(review): birth year 2244 in the last record looks like a typo — confirm.
data = [
    ('이서정', 21, date(2000, 11, 11), ('010', '1111', '2222')),
    ('이선희', 25, date(1999, 11, 11), ('010', '2222', '2222')),
    ('정주연', 23, date(2244, 6, 23), ('010', '3333', '2222')),
]

# Nested schema: `phone` is itself a StructType of three non-nullable string
# fields, and the `phone` field is given a metadata dict.
schema = StructType([
    StructField(name="name", dataType=StringType(), nullable=False),
    StructField(name="age", dataType=IntegerType(), nullable=False),
    StructField(name="birth", dataType=DateType(), nullable=False),
    StructField(
        name="phone",
        dataType=StructType([
            StructField(name=part, dataType=StringType(), nullable=False)
            for part in ("phone1", "phone2", "phone3")
        ]),
        nullable=False,
        metadata={'desc':'user phone number'},
    ),
])

df4 = spark.createDataFrame(data=data, schema=schema)
df4.show()
df4.printSchema()

# Serialize the DataFrame's schema to a JSON string and print it.
schema_json = df4.schema.json()
print(schema_json)



+------+---+----------+-----------------+
|  name|age|     birth|            phone|
+------+---+----------+-----------------+
|이서정| 21|2000-11-11|{010, 1111, 2222}|
|이선희| 25|1999-11-11|{010, 2222, 2222}|
|정주연| 23|2244-06-23|{010, 3333, 2222}|
+------+---+----------+-----------------+

root
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- birth: date (nullable = false)
 |-- phone: struct (nullable = false)
 |    |-- phone1: string (nullable = false)
 |    |-- phone2: string (nullable = false)
 |    |-- phone3: string (nullable = false)

{"fields":[{"metadata":{},"name":"name","nullable":false,"type":"string"},{"metadata":{},"name":"age","nullable":false,"type":"integer"},{"metadata":{},"name":"birth","nullable":false,"type":"date"},{"metadata":{},"name":"phone","nullable":false,"type":{"fields":[{"metadata":{},"name":"phone1","nullable":false,"type":"string"},{"metadata":{},"name":"phone2","nullable":false,"type":"string"},{"metadata":{},"name":"phone3","nu

## Pandas DataFrame으로 생성

pandas.core.frame.DataFrame

+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|하명도| 20|2022-07-01|
|이제동| 21|2022-07-02|
|김명운| 22|2022-07-03|
+------+---+----------+



## DataFrame -> Pandas

Unnamed: 0,name,age,birth
0,하명도,20,2022-07-01
1,이제동,21,2022-07-02
2,김명운,22,2022-07-03


## DataFrame -> pyspark.pandas


                                                                                

Unnamed: 0,name,age,birth
0,하명도,20,2022-07-01
1,이제동,21,2022-07-02
2,김명운,22,2022-07-03


pyspark.sql.dataframe.DataFrame

## 외부파일을 사용해 DataFrame 생성

                                                                                

+--------+------+-------------+--------+-----------+-------------+
|class_cd|school|class_std_cnt|     loc|school_type|teaching_type|
+--------+------+-------------+--------+-----------+-------------+
|     6OL| ANKYI|           20|   Urban| Non-public|     Standard|
|     ZNS| ANKYI|           21|   Urban| Non-public|     Standard|
|     2B1| CCAAW|           18|Suburban| Non-public| Experimental|
|     EPS| CCAAW|           20|Suburban| Non-public| Experimental|
|     IQN| CCAAW|           15|Suburban| Non-public| Experimental|
+--------+------+-------------+--------+-----------+-------------+
only showing top 5 rows

-RECORD 0-------------------
 class_cd      | 6OL        
 school        | ANKYI      
 class_std_cnt | 20         
 loc           | Urban      
 school_type   | Non-public 
 teaching_type | Standard   
only showing top 1 row



## DataFrame 컬럼

- withColumn

+------+---+----------+-----------------+--------+
|  name|age|     birth|            phone|우승여부|
+------+---+----------+-----------------+--------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|        |
|이제동| 20|2021-07-22|{010, 2222, 3333}|        |
|김명운| 25|2020-07-22|{010, 4444, 5555}|        |
|홍진호| 36|2018-07-22|{010, 3333, 4444}|        |
+------+---+----------+-----------------+--------+

+------+---+----------+-----------------+--------+
|  name|age|     birth|            phone|우승여부|
+------+---+----------+-----------------+--------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|    우승|
|이제동| 20|2021-07-22|{010, 2222, 3333}|    우승|
|김명운| 25|2020-07-22|{010, 4444, 5555}|    우승|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|    우승|
+------+---+----------+-----------------+--------+

+------+---+----------+-----------------+---------+
|  name|age|     birth|            phone|   연령대|
+------+---+----------+-----------------+---------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|     10대|
|이제동| 20|2021-07-22|{

### column 내용 변경

+------+---+----------+-----------------+------+
|  name|age|     birth|            phone|연령대|
+------+---+----------+-----------------+------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|어린이|
|이제동| 20|2021-07-22|{010, 2222, 3333}|  청년|
|김명운| 25|2020-07-22|{010, 4444, 5555}|  청년|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|  성인|
+------+---+----------+-----------------+------+



### column 이름 변경

+------+---+----------+-----------------+------+
|  name|age|     birth|            phone|  분류|
+------+---+----------+-----------------+------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|어린이|
|이제동| 20|2021-07-22|{010, 2222, 3333}|  청년|
|김명운| 25|2020-07-22|{010, 4444, 5555}|  청년|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|  성인|
+------+---+----------+-----------------+------+



### column 삭제

+------+---+----------+-----------------+
|  name|age|     birth|            phone|
+------+---+----------+-----------------+
|하명도| 15|2022-07-22|{010, 1111, 2222}|
|이제동| 20|2021-07-22|{010, 2222, 3333}|
|김명운| 25|2020-07-22|{010, 4444, 5555}|
|홍진호| 36|2018-07-22|{010, 3333, 4444}|
+------+---+----------+-----------------+

