In [67]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,isnan, when, count

import seaborn as sns

In [68]:
spark = SparkSession.builder.appName('Titanic').getOrCreate()

In [69]:
test = spark.read.csv('test.csv',inferSchema=True,header=True)
test.show(5)

+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch| Ticket|   Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0| 330911| 7.8292| null|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0| 363272|    7.0| null|       S|
|        894|     2|Myles, Mr. Thomas...|  male|62.0|    0|    0| 240276| 9.6875| null|       Q|
|        895|     3|    Wirz, Mr. Albert|  male|27.0|    0|    0| 315154| 8.6625| null|       S|
|        896|     3|Hirvonen, Mrs. Al...|female|22.0|    1|    1|3101298|12.2875| null|       S|
+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+
only showing top 5 rows



In [78]:
train = spark.read.csv('train.csv',inferSchema=True,header=True)
train.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [71]:
train.count()

891

In [72]:
train.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in train.columns]).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [73]:
train.groupBy('Embarked').count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    null|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [74]:
train = train.na.fill('C',['Embarked'])
train.groupBy('Embarked').count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|       C|  170|
|       S|  644|
+--------+-----+



In [75]:
from pyspark.ml.feature import StringIndexer,Imputer

indexer = StringIndexer(inputCols=['Sex', 'Embarked'],outputCols=['Sex_id', 'Embarked_id'])
train = indexer.fit(df).transform(df)

imputer = Imputer(inputCols=['Age'],outputCols=['Age_imp']).setStrategy('mean')
train = imputer.fit(train).transform(train)

In [76]:
train.show(5)

+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+------+-----------+-------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch| Ticket|   Fare|Cabin|Embarked|Sex_id|Embarked_id|Age_imp|
+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+------+-----------+-------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0| 330911| 7.8292| null|       Q|   0.0|        2.0|   34.5|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0| 363272|    7.0| null|       S|   1.0|        0.0|   47.0|
|        894|     2|Myles, Mr. Thomas...|  male|62.0|    0|    0| 240276| 9.6875| null|       Q|   0.0|        2.0|   62.0|
|        895|     3|    Wirz, Mr. Albert|  male|27.0|    0|    0| 315154| 8.6625| null|       S|   0.0|        0.0|   27.0|
|        896|     3|Hirvonen, Mrs. Al...|female|22.0|    1|    1|3101298|12.2875| null|       S|   1.0|        0.0|   22.0|
+-------

In [77]:
train.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Sex_id: double (nullable = false)
 |-- Embarked_id: double (nullable = false)
 |-- Age_imp: double (nullable = true)



In [63]:
train = train.select(['Pclass','Sex_id','Age_imp','SibSp','Parch','Fare','Embarked_id','Survived'])
# train = train.drop(['PassengerId','Name','Sex','Age','Ticket','Cabin','Embarked'])

AnalysisException: cannot resolve 'Survived' given input columns: [Age, Age_imp, Cabin, Embarked, Embarked_id, Fare, Name, Parch, PassengerId, Pclass, Sex, Sex_id, SibSp, Ticket];
'Project [Pclass#55, Sex_id#2991, Age_imp#3096, SibSp#59, Parch#60, Fare#62, Embarked_id#2992, 'Survived]
+- Project [PassengerId#54, Pclass#55, Name#56, Sex#57, Age#58, SibSp#59, Parch#60, Ticket#61, Fare#62, Cabin#63, Embarked#64, Sex_id#2991, Embarked_id#2992, cast(CASE WHEN isnull(cast(Age#58 as double)) THEN 30.272590361445783 WHEN (cast(Age#58 as double) = NaN) THEN 30.272590361445783 ELSE cast(Age#58 as double) END as double) AS Age_imp#3096]
   +- Project [PassengerId#54, Pclass#55, Name#56, Sex#57, Age#58, SibSp#59, Parch#60, Ticket#61, Fare#62, Cabin#63, Embarked#64, UDF(cast(Sex#57 as string)) AS Sex_id#2991, UDF(cast(Embarked#64 as string)) AS Embarked_id#2992]
      +- Relation [PassengerId#54,Pclass#55,Name#56,Sex#57,Age#58,SibSp#59,Parch#60,Ticket#61,Fare#62,Cabin#63,Embarked#64] csv


In [48]:
train.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in train.columns]).show()

+-----------+------+----+---+---+-----+-----+------+----+-----+--------+------+-----------+
|PassengerId|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|Sex_id|Embarked_id|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+------+-----------+
|          0|     0|   0|  0| 86|    0|    0|     0|   1|  327|       0|     0|          0|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+------+-----------+

