Initialize the Spark session

In [1]:

from pyspark.sql import SparkSession
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# With eager evaluation on, a DataFrame that is the last expression of a cell
# is rendered as a table instead of its plain repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Display the session summary as the cell output.
spark

22/04/12 14:37:48 WARN Utils: Your hostname, ZEPHYRUS-G14 resolves to a loopback address: 127.0.1.1; using 172.25.173.37 instead (on interface eth0)
22/04/12 14:37:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/12 14:37:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the CSV into a DataFrame: the first row supplies the column names and
# the column types are inferred from the data.
titanic_dtfrm = spark.read.csv(
    path="train.csv",
    header=True,
    inferSchema=True,
)

# Display the loaded DataFrame as the cell output.
titanic_dtfrm

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Tim...",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nich...",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Print the DataFrame's schema as a tree: column name, data type and nullability.
titanic_dtfrm.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
# Preview five rows; as the cell's last expression the result is displayed.
titanic_dtfrm.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S


In [5]:
# Project only the passenger id and the survival flag, keeping ten rows.
(
    titanic_dtfrm
    .select('PassengerId', 'Survived')
    .limit(10)
)

PassengerId,Survived
1,0
2,1
3,1
4,1
5,0
6,0
7,0
8,0
9,1
10,1


In [6]:
# Keep passengers younger than 10 who survived, and show five of them.
# Each comparison is parenthesized because `&` binds tighter than `<` and `==`.
(
    titanic_dtfrm
    .filter((titanic_dtfrm['Age'] < 10) & (titanic_dtfrm['Survived'] == 1))
    .limit(5)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
11,1,3,"Sandstrom, Miss. ...",female,4.0,1,1,PP 9549,16.7,G6,S
44,1,2,"Laroche, Miss. Si...",female,3.0,1,2,SC/Paris 2123,41.5792,,C
59,1,2,"West, Miss. Const...",female,5.0,1,2,C.A. 34651,27.75,,S
79,1,2,"Caldwell, Master....",male,0.83,0,2,248738,29.0,,S
166,1,3,"""Goldsmith, Maste...",male,9.0,0,2,363291,20.525,,S


In [7]:
# Average fare over the whole DataFrame (a single, group-less aggregation).
(
    titanic_dtfrm
    .groupBy()
    .agg({'Fare': 'avg'})
)

avg(Fare)
32.2042079685746


In [8]:
# Average fare per passenger class, listed in ascending class order.
(
    titanic_dtfrm
    .groupBy('Pclass')
    .agg({'Fare': 'avg'})
    .orderBy('Pclass')
)

Pclass,avg(Fare)
1,84.15468749999992
2,20.66218315217391
3,13.675550101832997


In [9]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

def round_float_down(x):
    """Round a number down to the nearest integer (floor).

    Args:
        x: A numeric value, or None for a null column value.

    Returns:
        The largest int less than or equal to ``x``, or None when ``x`` is None.
    """
    import math  # local import keeps the function self-contained when shipped as a UDF

    # A null column value reaches a Python UDF as None; propagate it as null
    # instead of letting the conversion raise TypeError and fail the job.
    if x is None:
        return None

    # floor() rounds toward negative infinity, whereas int() truncates toward
    # zero (int(-1.5) == -1), which is not "rounding down" for negatives.
    return math.floor(x)

# Wrap the plain Python function as a Spark UDF that returns an integer column.
round_float_down_udf = udf(round_float_down, returnType=IntegerType())

# Build the derived column first, then project it as the cell output.
fare_rounded_down = round_float_down_udf('Fare').alias('Fare rounded down')
titanic_dtfrm.select(fare_rounded_down)


                                                                                

Fare rounded down
7
71
7
53
8
8
51
21
11
30


In [11]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
titanic_dtfrm.createOrReplaceTempView('Titanic')

# Query the view through Spark SQL and show five rows.
(
    spark
    .sql('select * from Titanic')
    .limit(5)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S
