In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# !pip install pandas
# !pip install pyarrow

from datetime import date, datetime
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd



<jemalloc>: MADV_DONTNEED does not work (memset will be used instead)
<jemalloc>: (This is the expected behaviour if you are running under QEMU)


# 1. DataFrame 생성

- SparkSession 객체를 사용해 DataFrame을 생성할 수 있다.
- SparkSession 객체는 pyspark shell을 실행할 때 spark 라는 이름으로 미리 생성된다.



## Row 객체를 사용해 생성하기

- row : DataFrame에서의 한 행

In [3]:
df = spark.createDataFrame([
    Row(name="윤병우", age=21, birth=date(2001, 9,6)), 
    Row(name="김시찬", age=22, birth=date(2000, 10,6)),
    Row(name="강혁준", age=19, birth=date(2003, 7,6))
])
df.show()

                                                                                

+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|윤병우| 21|2001-09-06|
|김시찬| 22|2000-10-06|
|강혁준| 19|2003-07-06|
+------+---+----------+



In [4]:
# 스키마 확인
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- birth: date (nullable = true)



## schema를 명시하여 DataFrame 생성

In [16]:
# 튜플에 데이터를 저장하고 스키마를 직접 지정
df2 = spark.createDataFrame([
    ('김경민', 17, date(2005, 10,1)),
    ('김도은', 18, date(2004, 12,25)),
    ('김민석', 11, date(2011, 1,10))
], schema='name string, age int, birth date')
df2.show()

[Stage 1:>                                                          (0 + 1) / 1]

+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|김경민| 17|2005-10-01|
|김도은| 18|2004-12-25|
|김민석| 11|2011-01-10|
+------+---+----------+



                                                                                

## StructType 객체를 사용해 Schema 지정

In [21]:
schema = StructType([
    StructField('name', StringType(),False),
    StructField('age', IntegerType(),False),
    StructField('birth', DateType(),False)
])


df3 = spark.createDataFrame([
    ('손지수',22, date(2011, 6, 8))
    ,('유승종',21, date(2009, 8, 28))
    ,('윤병우',23, date(2022, 3, 3))
],schema = schema)
df3.show()
df3.printSchema()

+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|손지수| 22|2011-06-08|
|유승종| 21|2009-08-28|
|윤병우| 23|2022-03-03|
+------+---+----------+



## 중첩스키마적용

In [25]:
data = [
    ('이서정', 21, date(2000, 11 ,11), ('010','1111','2222'))
    ,('이선희', 25, date(1999, 11 ,1), ('010','2222','3333'))
    ,('정주연', 23, date(2244, 11 ,11), ('010','3333','4444')) 
]
schema = StructType([
    StructField('name', StringType(),False),
    StructField('age', IntegerType(),False),
    StructField('birth', DateType(),False),
    StructField('birth', StructType([
        StructField('phone1', StringType(),False),
        StructField('phone2', StringType(),False),
        StructField('phone3', StringType(),False)   
    ]),False,metadata={'desc':'user phone number'})
])
df4 = spark.createDataFrame(data=data,schema=schema)
df4.show()
df4.printSchema()

schema_json = df4.schema.json()
schema_json

+------+---+----------+-----------------+
|  name|age|     birth|            birth|
+------+---+----------+-----------------+
|이서정| 21|2000-11-11|{010, 1111, 2222}|
|이선희| 25|1999-11-01|{010, 2222, 3333}|
|정주연| 23|2244-11-11|{010, 3333, 4444}|
+------+---+----------+-----------------+

root
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- birth: date (nullable = false)
 |-- birth: struct (nullable = false)
 |    |-- phone1: string (nullable = false)
 |    |-- phone2: string (nullable = false)
 |    |-- phone3: string (nullable = false)



'{"fields":[{"metadata":{},"name":"name","nullable":false,"type":"string"},{"metadata":{},"name":"age","nullable":false,"type":"integer"},{"metadata":{},"name":"birth","nullable":false,"type":"date"},{"metadata":{"desc":"user phone number"},"name":"birth","nullable":false,"type":{"fields":[{"metadata":{},"name":"phone1","nullable":false,"type":"string"},{"metadata":{},"name":"phone2","nullable":false,"type":"string"},{"metadata":{},"name":"phone3","nullable":false,"type":"string"}],"type":"struct"}}],"type":"struct"}'

## Pandas DataFrame으로 생성

In [27]:
pandas_df = pd.DataFrame({
    'name':['정현진','한병현','홍효정'],
    'age':[20,21,22],
    'birth':[date(2000,1,1),date(2001,2,2),date(2005,5,5)]
})
pandas_df
df5 = spark.createDataFrame(pandas_df)
df5.show()

Unnamed: 0,name,age,birth
0,정현진,20,2000-01-01
1,한병현,21,2001-02-02
2,홍효정,22,2005-05-05


+------+---+----------+
|  name|age|     birth|
+------+---+----------+
|정현진| 20|2000-01-01|
|한병현| 21|2001-02-02|
|홍효정| 22|2005-05-05|
+------+---+----------+



## DataFrame -> Pandas

In [33]:
pandas_df2 = df5.toPandas()
pandas_df2

Unnamed: 0,name,age,birth
0,정현진,20,2000-01-01
1,한병현,21,2001-02-02
2,홍효정,22,2005-05-05


## DataFrame -> pyspark.pandas


In [36]:
pandas_df3=df5.to_pandas_on_spark()
pandas_df3

Unnamed: 0,name,age,birth
0,정현진,20,2000-01-01
1,한병현,21,2001-02-02
2,홍효정,22,2005-05-05


## 외부파일을 사용해 DataFrame 생성

In [40]:
class_df = spark.read.csv('/dataframe/a_class_info.csv',header=True)
class_df.show(3)

+--------+------+-------------+--------+-----------+-------------+
|class_cd|school|class_std_cnt|     loc|school_type|teaching_type|
+--------+------+-------------+--------+-----------+-------------+
|     6OL| ANKYI|           20|   Urban| Non-public|     Standard|
|     ZNS| ANKYI|           21|   Urban| Non-public|     Standard|
|     2B1| CCAAW|           18|Suburban| Non-public| Experimental|
+--------+------+-------------+--------+-----------+-------------+
only showing top 3 rows



## DataFrame 컬럼

- withColumn

In [8]:
data = [
    ('이서정', 21, date(2000, 11 ,11), ('010','1111','2222'))
    ,('이선희', 25, date(1999, 11 ,1), ('010','2222','3333'))
    ,('정주연', 23, date(2244, 11 ,11), ('010','3333','4444')) 
]
schema = StructType([
    StructField('name', StringType(),False),
    StructField('age', IntegerType(),False),
    StructField('birth', DateType(),False),
    StructField('birth', StructType([
        StructField('phone1', StringType(),False),
        StructField('phone2', StringType(),False),
        StructField('phone3', StringType(),False)   
    ]),False,metadata={'desc':'user phone number'})
])
df4 = spark.createDataFrame(data=data,schema=schema)
df4.show()

                                                                                

+------+---+----------+-----------------+
|  name|age|     birth|            birth|
+------+---+----------+-----------------+
|이서정| 21|2000-11-11|{010, 1111, 2222}|
|이선희| 25|1999-11-01|{010, 2222, 3333}|
|정주연| 23|2244-11-11|{010, 3333, 4444}|
+------+---+----------+-----------------+



In [21]:
# 컬럼 추가 
# 컬럼 이름 : '성별'
# 해당 컬럼에 기본 값으로 넣을 것 lit("") : 공백
df4 = df4.withColumn('성별',lit(''))
# df4.show(3)

# 기본 값 지정 : F , withColumns 사용시 기존의 컬럼과 컬럼명이 같으면 덮어쓴다.
df4 = df4.withColumn('성별',lit('F'))
# df4.show(3)

# 조건에 따라 다른 컬럼값을 가지도록 컬럼을 추가
# when - otherwise 함수사용
temp = df4.withColumn('성별', when(df4.age < 23, '여성')
                      .when(df4.age == 23, 'F')
                      .otherwise('female'))
temp.show(3)

[Stage 21:>                                                         (0 + 1) / 1]

+------+---+----------+-----------------+------+
|  name|age|     birth|            birth|  성별|
+------+---+----------+-----------------+------+
|이서정| 21|2000-11-11|{010, 1111, 2222}|  여성|
|이선희| 25|1999-11-01|{010, 2222, 3333}|female|
|정주연| 23|2244-11-11|{010, 3333, 4444}|     F|
+------+---+----------+-----------------+------+



                                                                                

### column  내용  변경

In [15]:
# 컬럼 추가 
# 컬럼 이름 : '성별'
# 해당 컬럼에 기본 값으로 넣을 것 lit("") : 공백
df4 = df4.withColumn('성별',lit(''))
# df4.show(3)

# 기본 값 지정 : F , withColumns 사용시 기존의 컬럼과 컬럼명이 같으면 덮어쓴다.
df4 = df4.withColumn('성별',lit('F'))
# df4.show(3)

### column 이름 변경

In [16]:
temp = df4.withColumnRenamed('성별','gender')
temp.show()

                                                                                

+------+---+----------+-----------------+------+
|  name|age|     birth|            birth|gender|
+------+---+----------+-----------------+------+
|이서정| 21|2000-11-11|{010, 1111, 2222}|     F|
|이선희| 25|1999-11-01|{010, 2222, 3333}|     F|
|정주연| 23|2244-11-11|{010, 3333, 4444}|     F|
+------+---+----------+-----------------+------+



### column  삭제

In [17]:
temp = df4.drop('성별')
temp.show()

                                                                                

+------+---+----------+-----------------+
|  name|age|     birth|            birth|
+------+---+----------+-----------------+
|이서정| 21|2000-11-11|{010, 1111, 2222}|
|이선희| 25|1999-11-01|{010, 2222, 3333}|
|정주연| 23|2244-11-11|{010, 3333, 4444}|
+------+---+----------+-----------------+

