In [152]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import LogisticRegression

In [153]:
spark = SparkSession.builder.appName('logisticRegression').getOrCreate()

In [154]:
titanic_df = spark.read.csv("D:/train.csv", inferSchema = True, header = True)

In [155]:
titanic_df.where(titanic_df['age'].isNull()).count()

177

In [156]:
titanic_df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     891|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

Average age of the passengers is 29.699 and almost 38% of them survived

In [157]:
titanic_df.describe(['Age']).show()

+-------+------------------+
|summary|               Age|
+-------+------------------+
|  count|               714|
|   mean| 29.69911764705882|
| stddev|14.526497332334035|
|    min|              0.42|
|    max|              80.0|
+-------+------------------+



In [158]:
titanic_df = titanic_df.fillna({'Age' : 29.69912})

In [159]:
titanic_df.where(titanic_df['age'].isNull()).count()

0

In [160]:
titanic_df.show()

+-----------+--------+------+--------------------+------+--------+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|     Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+--------+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|    22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|    38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|    26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|    35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|    35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. Jame

In [161]:
#titanic_df.select('Cabin').distinct.show()

In [162]:
#did this for my reference
for col in titanic_df.columns:
    print(col,'\t', 'with null values: ', titanic_df.filter(titanic_df[col].isNull()).count())

PassengerId 	 with null values:  0
Survived 	 with null values:  0
Pclass 	 with null values:  0
Name 	 with null values:  0
Sex 	 with null values:  0
Age 	 with null values:  0
SibSp 	 with null values:  0
Parch 	 with null values:  0
Ticket 	 with null values:  0
Fare 	 with null values:  0
Cabin 	 with null values:  687
Embarked 	 with null values:  0


In [163]:
titanic_df.where(titanic_df['Pclass'].isNull()).count()

0

In [164]:
titanic_df.where(titanic_df['Survived'].isNull()).count()

0

In [165]:
titanic_df.where(titanic_df['Sex'].isNull()).count()

0

In [166]:
titanic_df.where(titanic_df['SibSp'].isNull()).count()

0

In [167]:
titanic_df.where(titanic_df['Parch'].isNull()).count()

0

In [168]:
titanic_df.where(titanic_df['Cabin'].isNull()).count()

687

In [169]:
titanic_df.groupby('Cabin').count().show(100)

+-----------+-----+
|      Cabin|count|
+-----------+-----+
|        A23|    1|
|        B79|    1|
|        E44|    2|
|      F E69|    1|
|        D28|    1|
|        C78|    2|
|        C95|    1|
|      F G73|    2|
|    B58 B60|    2|
|         D7|    1|
|       C128|    1|
|        B39|    1|
|        B22|    2|
|       C110|    1|
|        D21|    1|
|         F2|    3|
|        B30|    1|
|       C104|    1|
|        B50|    1|
|         A6|    1|
|        E31|    1|
|        C90|    1|
|        D45|    1|
|        C45|    1|
|        C65|    2|
|    C22 C26|    3|
|        A19|    1|
|B51 B53 B55|    2|
|        E33|    2|
|        A16|    1|
|         B4|    1|
|    B96 B98|    4|
|        E10|    1|
|        C82|    1|
|        D20|    2|
|        E77|    1|
|        B38|    1|
|         E8|    2|
|        A32|    1|
|        C87|    1|
|        B20|    2|
|        E50|    1|
|        B80|    1|
|        D48|    1|
|        E49|    1|
|       null|  687|
|          T|    1|


In [170]:
titanic_df.groupby('Cabin').count().orderBy('count', ascending = False).show(20)

+-----------+-----+
|      Cabin|count|
+-----------+-----+
|       null|  687|
|    B96 B98|    4|
|C23 C25 C27|    4|
|         G6|    4|
|          D|    3|
|       E101|    3|
|         F2|    3|
|    C22 C26|    3|
|        F33|    3|
|        E33|    2|
|      F G73|    2|
|        B18|    2|
|    B58 B60|    2|
|        D20|    2|
|        B20|    2|
|       C126|    2|
|        B22|    2|
|        E44|    2|
|       C123|    2|
|        C83|    2|
+-----------+-----+
only showing top 20 rows



In [171]:
titanic_df = titanic_df.fillna({'Cabin' : 'G6'})

In [172]:
titanic_df.where(titanic_df['Cabin'].isNull()).count()

0

In [173]:
titanic_df.show(25,False)

+-----------+--------+------+-------------------------------------------------------+------+--------+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|Name                                                   |Sex   |Age     |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+--------+------+-------------------------------------------------------+------+--------+-----+-----+----------------+-------+-----+--------+
|1          |0       |3     |Braund, Mr. Owen Harris                                |male  |22.0    |1    |0    |A/5 21171       |7.25   |G6   |S       |
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)    |female|38.0    |1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |1       |3     |Heikkinen, Miss. Laina                                 |female|26.0    |0    |0    |STON/O2. 3101282|7.925  |G6   |S       |
|4          |1       |1     |Futrelle, Mrs. Jacques Heath (Lily May Peel)   

#### Shape of the data contained in training.csv

In [174]:
print(titanic_df.count() , len(titanic_df.columns))

891 12


This  means 891 rows and 12 columns

#### Features (or attributes) are recorded for each passenger in training.csv

In [175]:
titanic_df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked' are the columns recorded for each passenger

#### Provide a schema of the columns to be included in your model for this assignment.  Comment on columns that may require transformation(s). An example of transformation is that of creating dummy variables. List these columns and explain why and what transformation is required. 

In [176]:
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = false)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = false)
 |-- Embarked: string (nullable = true)



In [177]:
titanic_predict_df = titanic_df.select('Age','Pclass','Sex', 'Embarked', 'Survived')
titanic_predict_df.printSchema()

root
 |-- Age: double (nullable = false)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Survived: integer (nullable = true)



From the available schema above, we can see that columns Name, Sex, Ticket, Cabin and Embarked are of string types. 
So these require transformations i.e., creating dummy variables.
But fro now, we are going to concentrate only on Age, Sex, Embarked and pclass from which Sex and Embarked are of String types(categorical types), hence Sex and Embarked need transformations as the machine learning can understand only the numerical values.


#### Comment on the balance of data in training.csv with regards to each input variable as well as your target variable. Support  comments with appropriate statistics. 

In [178]:
titanic_df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               891|               891|                891|               891|              891|  891|     891|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null|29.699118114478157|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [179]:
titanic_predict_df.describe().show()

+-------+------------------+------------------+------+--------+-------------------+
|summary|               Age|            Pclass|   Sex|Embarked|           Survived|
+-------+------------------+------------------+------+--------+-------------------+
|  count|               891|               891|   891|     891|                891|
|   mean|29.699118114478157| 2.308641975308642|  null|    null| 0.3838383838383838|
| stddev|13.002015226002914|0.8360712409770491|  null|    null|0.48659245426485753|
|    min|              0.42|                 1|female|       C|                  0|
|    max|              80.0|                 3|  male|       S|                  1|
+-------+------------------+------------------+------+--------+-------------------+



In [180]:
titanic_df.select('Survived','Age','Pclass','Sex','Embarked').describe().show()

+-------+-------------------+------------------+------------------+------+--------+
|summary|           Survived|               Age|            Pclass|   Sex|Embarked|
+-------+-------------------+------------------+------------------+------+--------+
|  count|                891|               891|               891|   891|     891|
|   mean| 0.3838383838383838|29.699118114478157| 2.308641975308642|  null|    null|
| stddev|0.48659245426485753|13.002015226002914|0.8360712409770491|  null|    null|
|    min|                  0|              0.42|                 1|female|       C|
|    max|                  1|              80.0|                 3|  male|       S|
+-------+-------------------+------------------+------------------+------+--------+



In [227]:
titanic_predict_df.groupBy('Age').count().orderBy('count', ascending = False).show()

+--------+-----+
|     Age|count|
+--------+-----+
|29.69912|  177|
|    24.0|   30|
|    22.0|   27|
|    18.0|   26|
|    30.0|   25|
|    28.0|   25|
|    19.0|   25|
|    21.0|   24|
|    25.0|   23|
|    36.0|   22|
|    29.0|   20|
|    32.0|   18|
|    35.0|   18|
|    27.0|   18|
|    26.0|   18|
|    31.0|   17|
|    16.0|   17|
|    23.0|   15|
|    34.0|   15|
|    33.0|   15|
+--------+-----+
only showing top 20 rows



People aged 29.66 or 30 are the largest in the dataset

In [182]:
titanic_predict_df.groupBy('Sex').mean().show()

+------+------------------+-----------------+-------------------+
|   Sex|          avg(Age)|      avg(Pclass)|      avg(Survived)|
+------+------------------+-----------------+-------------------+
|female|28.216730445859895|2.159235668789809| 0.7420382165605095|
|  male|30.505824748700135|2.389948006932409|0.18890814558058924|
+------+------------------+-----------------+-------------------+



Male has higher average age but lesser survival rate. Survival rate is higher for female category. Which means 'Sex' impacts Survived column

In [183]:
titanic_predict_df.groupBy('Pclass').mean().show()

+------+------------------+-----------+-------------------+
|Pclass|          avg(Age)|avg(Pclass)|      avg(Survived)|
+------+------------------+-----------+-------------------+
|     1| 37.04811851851854|        1.0| 0.6296296296296297|
|     3|26.403259307535606|        3.0|0.24236252545824846|
|     2|29.866958260869573|        2.0|0.47282608695652173|
+------+------------------+-----------+-------------------+



Pclass-1 has an average age of 37 and a higher survival rate which means Pclass affects the survival rate

In [184]:
titanic_predict_df.groupBy('Embarked').mean().show()

+--------+------------------+------------------+-------------------+
|Embarked|          avg(Age)|       avg(Pclass)|      avg(Survived)|
+--------+------------------+------------------+-------------------+
|       Q| 29.53534461538462|2.8846153846153846| 0.3974358974358974|
|       C|30.562420000000028|1.8869047619047619| 0.5535714285714286|
|       S|29.494063255813938|2.3488372093023258|0.33798449612403103|
+--------+------------------+------------------+-------------------+



In [228]:
titanic_predict_df.groupBy('Survived').count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



Count of people not survived is higher than people survived

Embarked with a 'C' value also has an impact on the survival status. Survived column depends on the Embarked status too.

#### Perform the transformations, if any, identified in step # 3. Perform feature engineering if and where needed, including Vectorization of relevant input variables. Provide a printout of the schema of your feature-engineered data.

In [186]:
Pclass_indexer = StringIndexer(inputCol = 'Pclass', outputCol = 'Pclass_num').fit(titanic_predict_df)
titanic_predict_df = Pclass_indexer.transform(titanic_predict_df) 
titanic_predict_df.show(5)

+----+------+------+--------+--------+----------+
| Age|Pclass|   Sex|Embarked|Survived|Pclass_num|
+----+------+------+--------+--------+----------+
|22.0|     3|  male|       S|       0|       0.0|
|38.0|     1|female|       C|       1|       1.0|
|26.0|     3|female|       S|       1|       0.0|
|35.0|     1|female|       S|       1|       1.0|
|35.0|     3|  male|       S|       0|       0.0|
+----+------+------+--------+--------+----------+
only showing top 5 rows



In [187]:
sex_indexer = StringIndexer(inputCol = 'Sex', outputCol = 'Sex_num').fit(titanic_predict_df)
titanic_predict_df = sex_indexer.transform(titanic_predict_df) 
titanic_predict_df.show(5)

+----+------+------+--------+--------+----------+-------+
| Age|Pclass|   Sex|Embarked|Survived|Pclass_num|Sex_num|
+----+------+------+--------+--------+----------+-------+
|22.0|     3|  male|       S|       0|       0.0|    0.0|
|38.0|     1|female|       C|       1|       1.0|    1.0|
|26.0|     3|female|       S|       1|       0.0|    1.0|
|35.0|     1|female|       S|       1|       1.0|    1.0|
|35.0|     3|  male|       S|       0|       0.0|    0.0|
+----+------+------+--------+--------+----------+-------+
only showing top 5 rows



In [188]:
Embarked_indexer = StringIndexer(inputCol = 'Embarked', outputCol = 'Embarked_num').fit(titanic_predict_df)
titanic_predict_df = Embarked_indexer.transform(titanic_predict_df) 
titanic_predict_df.show(5)

+----+------+------+--------+--------+----------+-------+------------+
| Age|Pclass|   Sex|Embarked|Survived|Pclass_num|Sex_num|Embarked_num|
+----+------+------+--------+--------+----------+-------+------------+
|22.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|
|38.0|     1|female|       C|       1|       1.0|    1.0|         1.0|
|26.0|     3|female|       S|       1|       0.0|    1.0|         0.0|
|35.0|     1|female|       S|       1|       1.0|    1.0|         0.0|
|35.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|
+----+------+------+--------+--------+----------+-------+------------+
only showing top 5 rows



In [189]:
titanic_predict_df.groupBy('Sex_num').count().show()
titanic_predict_df.groupBy('Pclass_num').count().show()
titanic_predict_df.groupBy('Embarked_num').count().show()

+-------+-----+
|Sex_num|count|
+-------+-----+
|    0.0|  577|
|    1.0|  314|
+-------+-----+

+----------+-----+
|Pclass_num|count|
+----------+-----+
|       0.0|  491|
|       1.0|  216|
|       2.0|  184|
+----------+-----+

+------------+-----+
|Embarked_num|count|
+------------+-----+
|         0.0|  645|
|         1.0|  168|
|         2.0|   78|
+------------+-----+



In [190]:
Pclass_encoder = OneHotEncoder(inputCol = 'Pclass_num', outputCol = 'Pclass_Dummy_Vector')
titanic_predict_df = Pclass_encoder.transform(titanic_predict_df)
titanic_predict_df.show(5)

+----+------+------+--------+--------+----------+-------+------------+-------------------+
| Age|Pclass|   Sex|Embarked|Survived|Pclass_num|Sex_num|Embarked_num|Pclass_Dummy_Vector|
+----+------+------+--------+--------+----------+-------+------------+-------------------+
|22.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|      (2,[0],[1.0])|
|38.0|     1|female|       C|       1|       1.0|    1.0|         1.0|      (2,[1],[1.0])|
|26.0|     3|female|       S|       1|       0.0|    1.0|         0.0|      (2,[0],[1.0])|
|35.0|     1|female|       S|       1|       1.0|    1.0|         0.0|      (2,[1],[1.0])|
|35.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|      (2,[0],[1.0])|
+----+------+------+--------+--------+----------+-------+------------+-------------------+
only showing top 5 rows



In [191]:
sex_encoder = OneHotEncoder(inputCol = 'Sex_num', outputCol = 'Sex_Dummy_Vector')
titanic_predict_df = sex_encoder.transform(titanic_predict_df)
titanic_predict_df.show(5)

+----+------+------+--------+--------+----------+-------+------------+-------------------+----------------+
| Age|Pclass|   Sex|Embarked|Survived|Pclass_num|Sex_num|Embarked_num|Pclass_Dummy_Vector|Sex_Dummy_Vector|
+----+------+------+--------+--------+----------+-------+------------+-------------------+----------------+
|22.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|      (2,[0],[1.0])|   (1,[0],[1.0])|
|38.0|     1|female|       C|       1|       1.0|    1.0|         1.0|      (2,[1],[1.0])|       (1,[],[])|
|26.0|     3|female|       S|       1|       0.0|    1.0|         0.0|      (2,[0],[1.0])|       (1,[],[])|
|35.0|     1|female|       S|       1|       1.0|    1.0|         0.0|      (2,[1],[1.0])|       (1,[],[])|
|35.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|      (2,[0],[1.0])|   (1,[0],[1.0])|
+----+------+------+--------+--------+----------+-------+------------+-------------------+----------------+
only showing top 5 rows



In [192]:
Embarked_encoder = OneHotEncoder(inputCol = 'Embarked_num', outputCol = 'Embarked_Dummy_Vector')
titanic_predict_df = Embarked_encoder.transform(titanic_predict_df)
titanic_predict_df.show(5)

+----+------+------+--------+--------+----------+-------+------------+-------------------+----------------+---------------------+
| Age|Pclass|   Sex|Embarked|Survived|Pclass_num|Sex_num|Embarked_num|Pclass_Dummy_Vector|Sex_Dummy_Vector|Embarked_Dummy_Vector|
+----+------+------+--------+--------+----------+-------+------------+-------------------+----------------+---------------------+
|22.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|      (2,[0],[1.0])|   (1,[0],[1.0])|        (2,[0],[1.0])|
|38.0|     1|female|       C|       1|       1.0|    1.0|         1.0|      (2,[1],[1.0])|       (1,[],[])|        (2,[1],[1.0])|
|26.0|     3|female|       S|       1|       0.0|    1.0|         0.0|      (2,[0],[1.0])|       (1,[],[])|        (2,[0],[1.0])|
|35.0|     1|female|       S|       1|       1.0|    1.0|         0.0|      (2,[1],[1.0])|       (1,[],[])|        (2,[0],[1.0])|
|35.0|     3|  male|       S|       0|       0.0|    0.0|         0.0|      (2,[0],[1.0])|

In [193]:
titanic_predict_df.groupby('Pclass').count().show()

+------+-----+
|Pclass|count|
+------+-----+
|     1|  216|
|     3|  491|
|     2|  184|
+------+-----+



In [194]:
titanic_predict_df.groupby('Pclass_Dummy_Vector').count().show()

+-------------------+-----+
|Pclass_Dummy_Vector|count|
+-------------------+-----+
|      (2,[0],[1.0])|  491|
|      (2,[1],[1.0])|  216|
|          (2,[],[])|  184|
+-------------------+-----+



In [195]:
titanic_predict_df.groupby('Embarked').count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   78|
|       C|  168|
|       S|  645|
+--------+-----+



In [196]:
titanic_predict_df.groupby('Embarked_Dummy_Vector').count().show()

+---------------------+-----+
|Embarked_Dummy_Vector|count|
+---------------------+-----+
|        (2,[0],[1.0])|  645|
|        (2,[1],[1.0])|  168|
|            (2,[],[])|   78|
+---------------------+-----+



In [197]:
titanic_predict_df.groupby('Sex').count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [198]:
titanic_predict_df.groupby('Sex_Dummy_Vector').count().show()

+----------------+-----+
|Sex_Dummy_Vector|count|
+----------------+-----+
|   (1,[0],[1.0])|  577|
|       (1,[],[])|  314|
+----------------+-----+



In [199]:
titanic_predict_df.printSchema()

root
 |-- Age: double (nullable = false)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass_num: double (nullable = false)
 |-- Sex_num: double (nullable = false)
 |-- Embarked_num: double (nullable = false)
 |-- Pclass_Dummy_Vector: vector (nullable = true)
 |-- Sex_Dummy_Vector: vector (nullable = true)
 |-- Embarked_Dummy_Vector: vector (nullable = true)



In [200]:
titanic_predict_df_assembler = VectorAssembler(inputCols = ['Pclass_Dummy_Vector', 'Age', 'Sex_Dummy_Vector','Embarked_Dummy_Vector'], outputCol = 'features')
titanic_predict_df = titanic_predict_df_assembler.transform(titanic_predict_df)

In [201]:
titanic_model_df = titanic_predict_df.select(['Survived', 'features'])
titanic_model_df.show(3, truncate = False)

+--------+--------------------------+
|Survived|features                  |
+--------+--------------------------+
|0       |[1.0,0.0,22.0,1.0,1.0,0.0]|
|1       |[0.0,1.0,38.0,0.0,0.0,1.0]|
|1       |[1.0,0.0,26.0,0.0,1.0,0.0]|
+--------+--------------------------+
only showing top 3 rows



features has all the columns put into a single vector

#### To train and then test your model, split the data from training.csv into training and test datasets using an 80/20 split. 

In [202]:
titanic_training_df, titanic_test_df = titanic_model_df.randomSplit([0.80,0.20])

In [203]:
print(titanic_training_df.count())

709


In [204]:
print(titanic_test_df.count())

182


In [205]:
titanic_training_df.groupby('Survived').count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  273|
|       0|  436|
+--------+-----+



In [206]:
titanic_test_df.groupby('Survived').count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|   69|
|       0|  113|
+--------+-----+



Here we can see survived with value 0 has higher count in the training and test datasets. We could they replicate the actual dataset.

#### Build and train the Logistic Regression model. Generate a list of predictions for passengers survival status (survival = 1) based on the trained model. Display actual, predicted, and probability values for the first 10 rows only. 

In [207]:
log_reg = LogisticRegression(labelCol = 'Survived').fit(titanic_training_df)

In [208]:
train_results = log_reg.evaluate(titanic_training_df).predictions

In [209]:
train_results.filter(train_results['Survived'] == 1).filter(train_results['prediction'] == 1).select(['Survived', 'prediction', 'probability']).show(10,False)

+--------+----------+----------------------------------------+
|Survived|prediction|probability                             |
+--------+----------+----------------------------------------+
|1       |1.0       |[0.23858521602789326,0.7614147839721067]|
|1       |1.0       |[0.24412466946770978,0.7558753305322903]|
|1       |1.0       |[0.24412466946770978,0.7558753305322903]|
|1       |1.0       |[0.27915204045045505,0.720847959549545] |
|1       |1.0       |[0.32833170367781317,0.6716682963221868]|
|1       |1.0       |[0.32833170367781317,0.6716682963221868]|
|1       |1.0       |[0.32833170367781317,0.6716682963221868]|
|1       |1.0       |[0.32833170367781317,0.6716682963221868]|
|1       |1.0       |[0.32833170367781317,0.6716682963221868]|
|1       |1.0       |[0.32833170367781317,0.6716682963221868]|
+--------+----------+----------------------------------------+
only showing top 10 rows



Probability at index 1 represents survival status 1 which is higher and probability at index 0 represents survival status 0. SO the model is predicting likelihood of survival with high probability

#### Using the test data from the 80/20 split, evaluate the performance of trained model. 

In [210]:
titanic_test_results = log_reg.evaluate(titanic_test_df).predictions

In [211]:
titanic_test_results.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [212]:
titanic_test_results.select('Survived', 'prediction', 'probability').show(10, False)

+--------+----------+----------------------------------------+
|Survived|prediction|probability                             |
+--------+----------+----------------------------------------+
|0       |1.0       |[0.32833170367781317,0.6716682963221868]|
|0       |1.0       |[0.32833170367781317,0.6716682963221868]|
|0       |1.0       |[0.34386245001791826,0.6561375499820818]|
|0       |0.0       |[0.7775936054669245,0.2224063945330756] |
|0       |1.0       |[0.26048362860566554,0.7395163713943345]|
|0       |0.0       |[0.6708093419628011,0.3291906580371989] |
|0       |0.0       |[0.6840304212350731,0.31596957876492693]|
|0       |0.0       |[0.6840304212350731,0.31596957876492693]|
|0       |0.0       |[0.690532667942635,0.30946733205736493] |
|0       |0.0       |[0.564453961542326,0.435546038457674]   |
+--------+----------+----------------------------------------+
only showing top 10 rows



In [213]:
tp = titanic_test_results[(titanic_test_results.Survived == 1) & (titanic_test_results.prediction == 1)].count()

In [214]:
tp

48

In [215]:
tn = titanic_test_results[(titanic_test_results.Survived == 0) & (titanic_test_results.prediction == 0)].count()
tn

94

In [216]:
fp = titanic_test_results[(titanic_test_results.Survived == 0) & (titanic_test_results.prediction == 1)].count()
fp

19

In [217]:
fn = titanic_test_results[(titanic_test_results.Survived == 1) & (titanic_test_results.prediction == 0)].count()
fn

21

In [218]:
accuracy = (tp + tn)/(tp+tn + fp+fn)
accuracy

0.7802197802197802

In [219]:
recall_rate = tp/(tp+fn)
recall_rate

0.6956521739130435

In [220]:
precision = tp/(tp+fp)
precision

0.7164179104477612

In [229]:
F1_score = 2*((precision*recall_rate)/(precision+recall_rate))
F1_score

0.7058823529411765

In [223]:
titanic_test_results_summary = log_reg.summary

In [224]:
print("areaUnderROC: " + str(titanic_test_results_summary.areaUnderROC))

areaUnderROC: 0.8530219780219778


F1 is higher which means it has better model.
Accuracy is around 80%, recall_rate is around 69%, Precision is useful when the costs of false positives are high. recall_rate is useful when the cost of false negatives is high. Here, for the Survived class, the model is only right 48% of the time.
F1 score is considered to be a better model when the value is 1 and we have low false positives and low false negatives. Here we got lower false positives and false negatives.

In [146]:
titanic_training_df_summary = log_reg.summary

In [147]:
print(titanic_training_df_summary)

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary object at 0x000002462A432F08>


In [148]:
print("areaUnderROC: " + str(titanic_training_df_summary.areaUnderROC))

areaUnderROC: 0.8514561908679555


In [149]:
print(titanic_training_df_summary.accuracy)
print(titanic_training_df_summary.weightedRecall)
print(titanic_training_df_summary.weightedPrecision)
print(titanic_training_df_summary.weightedFMeasure())

0.797768479776848
0.7977684797768481
0.7957148847859118
0.795545199195282


AUC is 85% which implies its better at predicting survived and not survived passengers. Higher the AUC is, better the model is at dtstinguishing between passengers survived and those who did not.We have similar results for the test and trained models