# **pyspark 패키지를 활용한 Spark 프로그래밍**
## SparkSession 객체 생성

In [3]:
import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession that the following cells rely on:
# master "local[2]", application name 'sparkedu'.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName('sparkedu')
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

## 리스트 객체로 RDD 객체 생성하기

In [None]:
# Sample (name, number) pairs held in an ordinary Python list.
dataList = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]
# Distribute the list as an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(dataList)
# Show the RDD's repr and its class, one per line.
print(rdd, type(rdd), sep="\n")
# collect() brings the elements back to the driver as a Python list.
print(rdd.collect())

In [None]:
import numpy as np
# Draw 20 random integers from the half-open range [0, 10).
lst = np.random.randint(0, 10, 20)
# randint returns a NumPy array; hand Spark plain Python ints (via tolist())
# so the RDD elements are built-in ints rather than NumPy scalar objects.
rdd = spark.sparkContext.parallelize(lst.tolist())
print(type(rdd))
print(rdd.collect())
# count() returns the number of elements in the RDD.
print(rdd.count())

## 텍스트 파일 내용 읽어서 RDD 객체 생성하기

In [None]:
# Load the stop-word text file as an RDD.
rdd = spark.sparkContext.textFile("data/korean_stopwords.txt")
# Pull the contents back to the driver before printing them.
stopword_lines = rdd.collect()
print(stopword_lines)

## 생성한 RDD 객체를 Spark DataFrame으로 변환하기

In [None]:
# (department name, department id) records used by the conversion examples.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
# Turn the list into an RDD, then echo its contents from the driver.
rdd = spark.sparkContext.parallelize(dept)
dept_rows = rdd.collect()
print(dept_rows)

In [None]:
# Convert the RDD to a DataFrame without supplying column names, leaving
# Spark to assign default column names and infer the column types.
df = rdd.toDF()
# Print the resulting schema first, then the rows.
df.printSchema()
df.show()

In [None]:
# Convert the RDD again, this time naming the columns explicitly.
deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(schema=deptColumns)
df2.printSchema()
# truncate=False prints cell values in full instead of shortening long ones.
df2.show(truncate=False)

In [None]:
# Column names, in the same order as the fields of each record below.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Sample person records; dob is kept as a plain string.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Build the DataFrame directly from the local list (no intermediate RDD).
df = spark.createDataFrame(data, columns)
# Show the object's class and its repr, one per line.
print(type(df), df, sep="\n")
df.printSchema()
df.show()

## CSV 파일 내용 읽어서 DataFrame 객체 생성하기

In [None]:
# Read the CSV file using the reader's default options.
# NOTE(review): no header/inferSchema options are passed here (compare the
# iris.csv cell further down) — confirm the file really has no header row.
emp_csv_path = "data/emp.csv"
df = spark.read.csv(emp_csv_path)
df.printSchema()
df.show()

In [None]:
# Read the CSV file using the reader's default options.
# NOTE(review): no header/inferSchema options are passed here (compare the
# iris.csv cell further down) — confirm the file really has no header row.
mpg_csv_path = "data/mpgdata.csv"
df = spark.read.csv(mpg_csv_path)
df.printSchema()
df.show()

In [None]:
# Reader options: comma separator, infer column types, first row is the header.
iris_csv_options = {"sep": ",", "inferSchema": "true", "header": "true"}
# Generic load() entry point with the CSV format selected explicitly.
df = spark.read.load("data/iris.csv", format="csv", **iris_csv_options)
df.printSchema()
df.show()

## JSON 파일 내용 읽어서 DataFrame 객체 생성하기

In [None]:
# Read the JSON file into a DataFrame.
# NOTE(review): spark.read.json expects one JSON object per line by default;
# if this file is pretty-printed across lines, confirm it loads as expected.
seoul_geo_path = "data/seoul_geo.json"
df = spark.read.json(seoul_geo_path)
df.show()
# Last expression of the cell: the notebook displays the projected DataFrame.
df.select('type')

In [None]:
# Load the same JSON file through the generic load() entry point,
# selecting the data source with format="json".
df = spark.read.load(path="data/seoul_geo.json", format="json")
df.show()

## 파케이 파일 내용 읽어서 DataFrame 객체 생성하기

In [None]:
# No format is passed, so load() falls back to the session's default data
# source; only the three listed columns are kept.
df = (
    spark.read.load("data/userdata1.parquet")
    .select("first_name", "last_name", "email")
)
df.show()

## DataFrame 객체를 직접 생성하여 정보 출력하기

In [None]:
# Column names, in the same order as the fields of each record below.
columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]

# Sample person records; some fields are intentionally empty strings.
data = [
    ("James", "", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "", "40288", "M", 70000),
    ("Robert", "", "Williams", "42114", "", 400000),
    ("Maria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0),
]

# Build the Spark DataFrame from the local list and inspect it.
pysparkDF = spark.createDataFrame(data, columns)
pysparkDF.printSchema()
# truncate=False prints cell values in full instead of shortening long ones.
pysparkDF.show(truncate=False)

## Spark의 DataFrame 객체를 Pandas의 DataFrame 객체로 변환하기

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame; toPandas() gathers
# the data on the driver, so it suits small results like this sample.
pandasDF = pysparkDF.toPandas()
# Printing uses pandas' own text rendering rather than Spark's show().
print(pandasDF)