## DataFrame operations

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType
from pyspark.sql import SparkSession

# Entry points used by the rest of the notebook.
# getOrCreate() reuses an already-running context/session, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()
# Legacy entry point, kept because later cells call sqlContext.* directly.
sqlContext = SQLContext(sc)

### create DataFrame

In [2]:
# Build a small RDD of Row objects and let Spark infer the schema from it
person_rows = [
    Row(name = 'John', age = 19),
    Row(name = 'Bob', age = 21),
]
some_rdd = sc.parallelize(person_rows)
some_df = sqlContext.createDataFrame(some_rdd)

In [3]:
# Define an explicit schema: two non-nullable columns, with the name column
# exposed as 'personal_name' instead of the Row field 'name'.
schema = StructType([StructField('personal_name', StringType(), False),
                    StructField('age', IntegerType(), False)])
# Convert each Row to a plain (name, age) tuple so values are bound to the
# schema by position. Passing the keyword-built Rows directly makes some Spark
# versions (2.x) look fields up by name, and 'personal_name' does not exist
# on those Rows, so the first action on the DataFrame would fail.
another_df = sqlContext.createDataFrame(
    some_rdd.map(lambda row: (row['name'], row['age'])),
    schema)
# another_df.printSchema()

In [5]:
# Create a Spark DataFrame from other sources
# Source 1: an in-memory pandas DataFrame (one column holding 0..4)
import pandas as pd
pandas_source = pd.DataFrame(range(5))
# via the legacy SQLContext entry point ...
pd_df = sqlContext.createDataFrame(pandas_source)
# ... or via SparkSession (rebinds pd_df to an equivalent DataFrame)
pd_df = spark.createDataFrame(pandas_source)
# pd_df.printSchema()

In [7]:
# Source 2: a JSON file. First write a one-record sample file, then load it.
import json
data = [ { 'a' : 1, 'b' : 2, 'c' : 3, 'd' : 4, 'e' : 5 } ]
with open('test_data.json', 'w') as json_file:
    # serialise the list to a single-line JSON array and write it out
    json_file.write(json.dumps(data))
# via the legacy SQLContext reader ...
json_df = sqlContext.read.json('test_data.json')
# ... or via the SparkSession reader (rebinds json_df)
json_df = spark.read.json('test_data.json')
# json_df.printSchema()

In [12]:
# Source 3: a CSV file whose first line holds the column names
csv_df = spark.read.option('header', True).csv('titanic.csv')
# csv_df.printSchema()

In [14]:
# read from parquet
# Use the explicit parquet reader: read.load() without a format falls back to
# the configured default data source (spark.sql.sources.default), which is
# only parquet when that setting has not been changed.
parquet_df = spark.read.parquet('part-00000-82db5734-5b30-4b12-8c75-3d0197e2f1b0-c000.snappy.parquet')
# parquet_df.printSchema()

### check DataFrame

In [21]:
# preview the data: print the first 5 rows as a text table
csv_df.show(5)

+-----------+---+-------+---+-----+-----+-----+-----+-----+-----+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+--------+------+------+--------+
|Passengerid|Age|   Fare|Sex|sibsp|zero5|zero6|zero7|zero8|zero9|zero10|zero11|Parch|zero13|zero14|zero15|zero16|zero17|zero18|zero19|zero20|Pclass|zero22|zero23|Embarked|zero25|zero26|2urvived|
+-----------+---+-------+---+-----+-----+-----+-----+-----+-----+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+--------+------+------+--------+
|          1| 22|   7.25|  0|    1|    0|    0|    0|    0|    0|     0|     0|    0|     0|     0|     0|     0|     0|     0|     0|     0|     3|     0|     0|       2|     0|     0|       0|
|          2| 38|71.2833|  1|    1|    0|    0|    0|    0|    0|     0|     0|    0|     0|     0|     0|     0|     0|     0|     0|     0|     1|     0|     0|       0|     0|     0|       1|
|          3| 26|  7.925|

In [22]:
# print the schema as a tree: one line per column with its type and nullability
csv_df.printSchema()

root
 |-- Passengerid: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- sibsp: string (nullable = true)
 |-- zero5: string (nullable = true)
 |-- zero6: string (nullable = true)
 |-- zero7: string (nullable = true)
 |-- zero8: string (nullable = true)
 |-- zero9: string (nullable = true)
 |-- zero10: string (nullable = true)
 |-- zero11: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- zero13: string (nullable = true)
 |-- zero14: string (nullable = true)
 |-- zero15: string (nullable = true)
 |-- zero16: string (nullable = true)
 |-- zero17: string (nullable = true)
 |-- zero18: string (nullable = true)
 |-- zero19: string (nullable = true)
 |-- zero20: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- zero22: string (nullable = true)
 |-- zero23: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- zero25: string (nullable = true)
 |-- zero26: st

In [28]:
# take: fetch the first 5 rows to the driver; the result is a plain Python list
aaa = csv_df.take(5)
type(aaa)

list

In [32]:
# to rdd: access the RDD that backs the DataFrame
df_rdd = csv_df.rdd

In [33]:
# count: number of rows in the DataFrame
csv_df.count()

1309

In [36]:
# is null: keep only the rows whose Age column is null
from pyspark.sql.functions import isnull
csv_df.where(isnull(csv_df["Age"]))

+-----------+---+----+---+-----+-----+-----+-----+-----+-----+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+--------+------+------+--------+
|Passengerid|Age|Fare|Sex|sibsp|zero5|zero6|zero7|zero8|zero9|zero10|zero11|Parch|zero13|zero14|zero15|zero16|zero17|zero18|zero19|zero20|Pclass|zero22|zero23|Embarked|zero25|zero26|2urvived|
+-----------+---+----+---+-----+-----+-----+-----+-----+-----+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+--------+------+------+--------+
+-----------+---+----+---+-----+-----+-----+-----+-----+-----+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+--------+------+------+--------+

