Load required libraries

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

Initialize SparkContext

In [2]:
sc=SparkContext.getOrCreate()
spark=SparkSession(sc)

Load data

In [3]:
data=spark.read.csv("../data/titanic.csv", inferSchema=True, header=True)

Show data schema

In [4]:
data.printSchema()

root
 |-- pclass: integer (nullable = true)
 |-- survived: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sibsp: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- ticket: string (nullable = true)
 |-- fare: double (nullable = true)
 |-- cabin: string (nullable = true)
 |-- embarked: string (nullable = true)
 |-- boat: string (nullable = true)
 |-- body: integer (nullable = true)
 |-- home.dest: string (nullable = true)



Show data types 

In [5]:
data.dtypes

[('pclass', 'int'),
 ('survived', 'int'),
 ('name', 'string'),
 ('sex', 'string'),
 ('age', 'double'),
 ('sibsp', 'int'),
 ('parch', 'int'),
 ('ticket', 'string'),
 ('fare', 'double'),
 ('cabin', 'string'),
 ('embarked', 'string'),
 ('boat', 'string'),
 ('body', 'int'),
 ('home.dest', 'string')]

Show logical and physical structure of the DataFrame

In [6]:
data.explain()

== Physical Plan ==
*(1) FileScan csv [pclass#10,survived#11,name#12,sex#13,age#14,sibsp#15,parch#16,ticket#17,fare#18,cabin#19,embarked#20,boat#21,body#22,home.dest#23] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/C:/Users/songaya/Documents/Data Science/Notebooks/Big-Data-With-Spark/dat..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<pclass:int,survived:int,name:string,sex:string,age:double,sibsp:int,parch:int,ticket:strin...


show columns

In [7]:
data.columns

['pclass',
 'survived',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked',
 'boat',
 'body',
 'home.dest']

Show top 5 records with head function

In [8]:
data.head(5)

[Row(pclass=1, survived=1, name='Allen, Miss. Elisabeth Walton', sex='female', age=29.0, sibsp=0, parch=0, ticket='24160', fare=211.3375, cabin='B5', embarked='S', boat='2', body=None, home.dest='St Louis, MO'),
 Row(pclass=1, survived=1, name='Allison, Master. Hudson Trevor', sex='male', age=0.9167, sibsp=1, parch=2, ticket='113781', fare=151.55, cabin='C22 C26', embarked='S', boat='11', body=None, home.dest='Montreal, PQ / Chesterville, ON'),
 Row(pclass=1, survived=0, name='Allison, Miss. Helen Loraine', sex='female', age=2.0, sibsp=1, parch=2, ticket='113781', fare=151.55, cabin='C22 C26', embarked='S', boat=None, body=None, home.dest='Montreal, PQ / Chesterville, ON'),
 Row(pclass=1, survived=0, name='Allison, Mr. Hudson Joshua Creighton', sex='male', age=30.0, sibsp=1, parch=2, ticket='113781', fare=151.55, cabin='C22 C26', embarked='S', boat=None, body=135, home.dest='Montreal, PQ / Chesterville, ON'),
 Row(pclass=1, survived=0, name='Allison, Mrs. Hudson J C (Bessie Waldo Danie

Show top 5 records with show function

In [9]:
data.show(5)

+------+--------+--------------------+------+------+-----+-----+------+--------+-------+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|ticket|    fare|  cabin|embarked|boat|body|           home.dest|
+------+--------+--------------------+------+------+-----+-----+------+--------+-------+--------+----+----+--------------------+
|     1|       1|Allen, Miss. Elis...|female|  29.0|    0|    0| 24160|211.3375|     B5|       S|   2|null|        St Louis, MO|
|     1|       1|Allison, Master. ...|  male|0.9167|    1|    2|113781|  151.55|C22 C26|       S|  11|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Miss. He...|female|   2.0|    1|    2|113781|  151.55|C22 C26|       S|null|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Mr. Huds...|  male|  30.0|    1|    2|113781|  151.55|C22 C26|       S|null| 135|Montreal, PQ / Ch...|
|     1|       0|Allison, Mrs. Hud...|female|  25.0|    1|    2|113781|  151.55|C22 C26|       S|

Show top 5 records with take function

In [10]:
data.take(5)

[Row(pclass=1, survived=1, name='Allen, Miss. Elisabeth Walton', sex='female', age=29.0, sibsp=0, parch=0, ticket='24160', fare=211.3375, cabin='B5', embarked='S', boat='2', body=None, home.dest='St Louis, MO'),
 Row(pclass=1, survived=1, name='Allison, Master. Hudson Trevor', sex='male', age=0.9167, sibsp=1, parch=2, ticket='113781', fare=151.55, cabin='C22 C26', embarked='S', boat='11', body=None, home.dest='Montreal, PQ / Chesterville, ON'),
 Row(pclass=1, survived=0, name='Allison, Miss. Helen Loraine', sex='female', age=2.0, sibsp=1, parch=2, ticket='113781', fare=151.55, cabin='C22 C26', embarked='S', boat=None, body=None, home.dest='Montreal, PQ / Chesterville, ON'),
 Row(pclass=1, survived=0, name='Allison, Mr. Hudson Joshua Creighton', sex='male', age=30.0, sibsp=1, parch=2, ticket='113781', fare=151.55, cabin='C22 C26', embarked='S', boat=None, body=135, home.dest='Montreal, PQ / Chesterville, ON'),
 Row(pclass=1, survived=0, name='Allison, Mrs. Hudson J C (Bessie Waldo Danie

Show data with three features only

In [11]:
data.select('name','age','survived').show(5)

+--------------------+------+--------+
|                name|   age|survived|
+--------------------+------+--------+
|Allen, Miss. Elis...|  29.0|       1|
|Allison, Master. ...|0.9167|       1|
|Allison, Miss. He...|   2.0|       0|
|Allison, Mr. Huds...|  30.0|       0|
|Allison, Mrs. Hud...|  25.0|       0|
+--------------------+------+--------+
only showing top 5 rows



distinct

In [12]:
data.select('sex').distinct().show()

+------+
|   sex|
+------+
|  null|
|female|
|  male|
+------+



In [13]:
data.select('sex').distinct().count()

3

In [14]:
data.select('sex').groupby('sex').count().show()

+------+-----+
|   sex|count|
+------+-----+
|  null|    1|
|female|  466|
|  male|  843|
+------+-----+



Show the number of columns

In [15]:
len(data.columns)

14

Describe Dataframe for statistical analysis

In [16]:
data.describe().show()

+-------+------------------+-------------------+--------------------+------+------------------+------------------+------------------+------------------+-----------------+-----+--------+------------------+-----------------+-------------------+
|summary|            pclass|           survived|                name|   sex|               age|             sibsp|             parch|            ticket|             fare|cabin|embarked|              boat|             body|          home.dest|
+-------+------------------+-------------------+--------------------+------+------------------+------------------+------------------+------------------+-----------------+-----+--------+------------------+-----------------+-------------------+
|  count|              1309|               1309|                1309|  1309|              1046|              1309|              1309|              1309|             1308|  295|    1307|               486|              121|                745|
|   mean| 2.294881588999236|

Describe single feature for statistical analysis

In [17]:
data.describe('sex').show()

+-------+------+
|summary|   sex|
+-------+------+
|  count|  1309|
|   mean|  null|
| stddev|  null|
|    min|female|
|    max|  male|
+-------+------+



Describe several features for statistical analysis

In [18]:
data.describe('fare','survived').show()

+-------+-----------------+-------------------+
|summary|             fare|           survived|
+-------+-----------------+-------------------+
|  count|             1308|               1309|
|   mean|33.29547928134572| 0.3819709702062643|
| stddev|51.75866823917421|0.48605517086648325|
|    min|              0.0|                  0|
|    max|         512.3292|                  1|
+-------+-----------------+-------------------+



Adding a new column

In [19]:
data=data.withColumn('new_fare',data['fare']+100) # adding 100 on the base fare to get new fare
data.select('name','pclass','fare','new_fare').show(5)

+--------------------+------+--------+--------+
|                name|pclass|    fare|new_fare|
+--------------------+------+--------+--------+
|Allen, Miss. Elis...|     1|211.3375|311.3375|
|Allison, Master. ...|     1|  151.55|  251.55|
|Allison, Miss. He...|     1|  151.55|  251.55|
|Allison, Mr. Huds...|     1|  151.55|  251.55|
|Allison, Mrs. Hud...|     1|  151.55|  251.55|
+--------------------+------+--------+--------+
only showing top 5 rows



Rename a column

In [20]:
data=data.withColumnRenamed('pclass','Passenger_Class') # Rename pclass to Passenger_Class
data.columns

['Passenger_Class',
 'survived',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked',
 'boat',
 'body',
 'home.dest',
 'new_fare']

groupby sum

In [21]:
data.groupby('sex').sum().show()

+------+--------------------+-------------+------------------+----------+----------+------------------+---------+-----------------+
|   sex|sum(Passenger_Class)|sum(survived)|          sum(age)|sum(sibsp)|sum(parch)|         sum(fare)|sum(body)|    sum(new_fare)|
+------+--------------------+-------------+------------------+----------+----------+------------------+---------+-----------------+
|  null|                null|         null|              null|      null|      null|              null|     null|             null|
|female|                1004|          339|        11130.5834|       304|       295|21528.313000000027|     1333|68128.31300000001|
|  male|                2000|          161|20125.083300000002|       349|       209|22022.173899999896|    18125|106222.1739000003|
+------+--------------------+-------------+------------------+----------+----------+------------------+---------+-----------------+



groupby mean

In [22]:
data.groupby('sex').mean().show()

+------+--------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|   sex|avg(Passenger_Class)|      avg(survived)|          avg(age)|        avg(sibsp)|        avg(parch)|         avg(fare)|         avg(body)|     avg(new_fare)|
+------+--------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|  null|                null|               null|              null|              null|              null|              null|              null|              null|
|female|  2.1545064377682404| 0.7274678111587983|  28.6870706185567|0.6523605150214592|0.6330472103004292| 46.19809656652367|           166.625|146.19809656652362|
|  male|   2.372479240806643|0.19098457888493475|30.585232978723408|0.4139976275207592|0.2479240806642942|26.154600831353797|160.39823008849558|126.15460083135427|
+------+--------

In [23]:
data.select('sex','fare').groupby('sex').mean().show()

+------+------------------+
|   sex|         avg(fare)|
+------+------------------+
|  null|              null|
|female| 46.19809656652367|
|  male|26.154600831353797|
+------+------------------+



pivot

In [24]:
data.select('sex','survived').groupby('sex').pivot('survived').sum().show()

+------+----+----+----+
|   sex|null|   0|   1|
+------+----+----+----+
|  null|null|null|null|
|female|null|   0| 339|
|  male|null|   0| 161|
+------+----+----+----+



Drop columns

In [25]:
data.columns

['Passenger_Class',
 'survived',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked',
 'boat',
 'body',
 'home.dest',
 'new_fare']

In [26]:
# Lets new_fare and home.dest columns
data=data.drop('new_fare','home.dest')
data.columns

['Passenger_Class',
 'survived',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked',
 'boat',
 'body']

Show null values for in fare column

In [27]:
data.select('fare').where('fare is null').show()

+----+
|fare|
+----+
|null|
|null|
+----+



Drop null values

In [28]:
drop_null_df=data.dropna()
drop_null_df.select('fare').where('body is null').show()

+----+
|fare|
+----+
+----+



Replace null with specific value

In [29]:
replace_null_df=data.fillna(0.00) # Replace fare null values with 0.00 
replace_null_df.filter(replace_null_df['fare']==0.00).select('name','fare').show()

+--------------------+----+
|                name|fare|
+--------------------+----+
|Andrews, Mr. Thom...| 0.0|
|Chisholm, Mr. Rod...| 0.0|
|    Fry, Mr. Richard| 0.0|
|Harrison, Mr. Wil...| 0.0|
|Ismay, Mr. Joseph...| 0.0|
|Parr, Mr. William...| 0.0|
|Reuchlin, Jonkhee...| 0.0|
|Campbell, Mr. Wil...| 0.0|
|Cunningham, Mr. A...| 0.0|
|"Frost, Mr. Antho...| 0.0|
|Knight, Mr. Robert J| 0.0|
|"Parkes, Mr. Fran...| 0.0|
|Watson, Mr. Ennis...| 0.0|
| Johnson, Mr. Alfred| 0.0|
|Johnson, Mr. Will...| 0.0|
| Leonard, Mr. Lionel| 0.0|
|  Storey, Mr. Thomas| 0.0|
|Tornquist, Mr. Wi...| 0.0|
|                null| 0.0|
+--------------------+----+



Using "like"

In [30]:
data.select("name",data.name.like("Allen")) .show(5)

+--------------------+---------------+
|                name|name LIKE Allen|
+--------------------+---------------+
|Allen, Miss. Elis...|          false|
|Allison, Master. ...|          false|
|Allison, Miss. He...|          false|
|Allison, Mr. Huds...|          false|
|Allison, Mrs. Hud...|          false|
+--------------------+---------------+
only showing top 5 rows



Using "startswith"

In [31]:
data.select("name",data.name.startswith("All")).show(5)

+--------------------+---------------------+
|                name|startswith(name, All)|
+--------------------+---------------------+
|Allen, Miss. Elis...|                 true|
|Allison, Master. ...|                 true|
|Allison, Miss. He...|                 true|
|Allison, Mr. Huds...|                 true|
|Allison, Mrs. Hud...|                 true|
+--------------------+---------------------+
only showing top 5 rows



Using "endswith"

In [32]:
data.select('name',data.name.endswith('n')).show(5)

+--------------------+-----------------+
|                name|endswith(name, n)|
+--------------------+-----------------+
|Allen, Miss. Elis...|             true|
|Allison, Master. ...|            false|
|Allison, Miss. He...|            false|
|Allison, Mr. Huds...|             true|
|Allison, Mrs. Hud...|            false|
+--------------------+-----------------+
only showing top 5 rows



Using "between"

In [33]:
data.select('age',data.age.between(40,50)).show(10)

+------+-----------------------------+
|   age|((age >= 40) AND (age <= 50))|
+------+-----------------------------+
|  29.0|                        false|
|0.9167|                        false|
|   2.0|                        false|
|  30.0|                        false|
|  25.0|                        false|
|  48.0|                         true|
|  63.0|                        false|
|  39.0|                        false|
|  53.0|                        false|
|  71.0|                        false|
+------+-----------------------------+
only showing top 10 rows



Using "contains"

In [34]:
data.select('name',data.name.contains('All')).show(5)

+--------------------+-------------------+
|                name|contains(name, All)|
+--------------------+-------------------+
|Allen, Miss. Elis...|               true|
|Allison, Master. ...|               true|
|Allison, Miss. He...|               true|
|Allison, Mr. Huds...|               true|
|Allison, Mrs. Hud...|               true|
+--------------------+-------------------+
only showing top 5 rows



Using "substr"

In [35]:
data.select('name',data['name'].substr(0,5)).show(10)

+--------------------+---------------------+
|                name|substring(name, 0, 5)|
+--------------------+---------------------+
|Allen, Miss. Elis...|                Allen|
|Allison, Master. ...|                Allis|
|Allison, Miss. He...|                Allis|
|Allison, Mr. Huds...|                Allis|
|Allison, Mrs. Hud...|                Allis|
| Anderson, Mr. Harry|                Ander|
|Andrews, Miss. Ko...|                Andre|
|Andrews, Mr. Thom...|                Andre|
|Appleton, Mrs. Ed...|                Apple|
|Artagaveytia, Mr....|                Artag|
+--------------------+---------------------+
only showing top 10 rows



Using alias

In [36]:
data.select('name',data['name'].substr(0,5).alias('First 5 String Characters')).show(10)

+--------------------+-------------------------+
|                name|First 5 String Characters|
+--------------------+-------------------------+
|Allen, Miss. Elis...|                    Allen|
|Allison, Master. ...|                    Allis|
|Allison, Miss. He...|                    Allis|
|Allison, Mr. Huds...|                    Allis|
|Allison, Mrs. Hud...|                    Allis|
| Anderson, Mr. Harry|                    Ander|
|Andrews, Miss. Ko...|                    Andre|
|Andrews, Mr. Thom...|                    Andre|
|Appleton, Mrs. Ed...|                    Apple|
|Artagaveytia, Mr....|                    Artag|
+--------------------+-------------------------+
only showing top 10 rows



Using conditional operators in filter

In [37]:
data.filter((data.age>=40) & (data.age<=50)).select('name','age').show(5)

+--------------------+----+
|                name| age|
+--------------------+----+
| Anderson, Mr. Harry|48.0|
|Astor, Col. John ...|47.0|
|Baxter, Mrs. Jame...|50.0|
|Beckwith, Mrs. Ri...|47.0|
|Bidois, Miss. Ros...|42.0|
+--------------------+----+
only showing top 5 rows



Using "startswith" in filter

In [38]:
data.filter(data.name.startswith("All")).select('name').show()

+--------------------+
|                name|
+--------------------+
|Allen, Miss. Elis...|
|Allison, Master. ...|
|Allison, Miss. He...|
|Allison, Mr. Huds...|
|Allison, Mrs. Hud...|
|Allen, Mr. Willia...|
|Allum, Mr. Owen G...|
+--------------------+



Using "contains" in filter

In [39]:
data.filter(data['name'].contains('Master')).select('name').show(5)

+--------------------+
|                name|
+--------------------+
|Allison, Master. ...|
|Carter, Master. W...|
|Dodge, Master. Wa...|
|Ryerson, Master. ...|
|Spedden, Master. ...|
+--------------------+
only showing top 5 rows



In [40]:
# sc.stop()