In [29]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [6]:
spark = SparkSession.builder.appName('titanic').getOrCreate()
spark

In [7]:
df = spark.read.csv('titanic.csv',inferSchema=True,header=True)
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
df.count()

891

In [8]:
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [9]:
df.groupBy('Name').count().show()

+--------------------+-----+
|                Name|count|
+--------------------+-----+
|"Watt, Mrs. James...|    1|
|Young, Miss. Mari...|    1|
|Parr, Mr. William...|    1|
|Soholt, Mr. Peter...|    1|
|Goldsmith, Mrs. F...|    1|
|    Dimic, Mr. Jovan|    1|
|Harper, Mr. Henry...|    1|
|Reuchlin, Jonkhee...|    1|
|Fahlstrom, Mr. Ar...|    1|
|Hosono, Mr. Masabumi|    1|
| Partner, Mr. Austen|    1|
|Van Impe, Miss. C...|    1|
|Bjornstrom-Steffa...|    1|
|    Saad, Mr. Khalil|    1|
| Sirota, Mr. Maurice|    1|
|Slemen, Mr. Richa...|    1|
|McCormack, Mr. Th...|    1|
|Potter, Mrs. Thom...|    1|
|Palsson, Miss. St...|    1|
|"Nakid, Miss. Mar...|    1|
+--------------------+-----+
only showing top 20 rows



In [11]:
df.groupBy('cabin').count()

DataFrame[cabin: string, count: bigint]

In [12]:
df.groupBy('Embarked').count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    NULL|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [13]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [14]:
cols = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Survived']
filtered_df = df.select(cols)
filtered_df.show()

+------+------+----+-----+-----+-------+--------+--------+
|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|Survived|
+------+------+----+-----+-----+-------+--------+--------+
|     3|  male|22.0|    1|    0|   7.25|       S|       0|
|     1|female|38.0|    1|    0|71.2833|       C|       1|
|     3|female|26.0|    0|    0|  7.925|       S|       1|
|     1|female|35.0|    1|    0|   53.1|       S|       1|
|     3|  male|35.0|    0|    0|   8.05|       S|       0|
|     3|  male|NULL|    0|    0| 8.4583|       Q|       0|
|     1|  male|54.0|    0|    0|51.8625|       S|       0|
|     3|  male| 2.0|    3|    1| 21.075|       S|       0|
|     3|female|27.0|    0|    2|11.1333|       S|       1|
|     2|female|14.0|    1|    0|30.0708|       C|       1|
|     3|female| 4.0|    1|    1|   16.7|       S|       1|
|     1|female|58.0|    0|    0|  26.55|       S|       1|
|     3|  male|20.0|    0|    0|   8.05|       S|       0|
|     3|  male|39.0|    1|    5| 31.275|       S|       

In [15]:
final_df = filtered_df.na.drop()
final_df.show()

+------+------+----+-----+-----+-------+--------+--------+
|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|Survived|
+------+------+----+-----+-----+-------+--------+--------+
|     3|  male|22.0|    1|    0|   7.25|       S|       0|
|     1|female|38.0|    1|    0|71.2833|       C|       1|
|     3|female|26.0|    0|    0|  7.925|       S|       1|
|     1|female|35.0|    1|    0|   53.1|       S|       1|
|     3|  male|35.0|    0|    0|   8.05|       S|       0|
|     1|  male|54.0|    0|    0|51.8625|       S|       0|
|     3|  male| 2.0|    3|    1| 21.075|       S|       0|
|     3|female|27.0|    0|    2|11.1333|       S|       1|
|     2|female|14.0|    1|    0|30.0708|       C|       1|
|     3|female| 4.0|    1|    1|   16.7|       S|       1|
|     1|female|58.0|    0|    0|  26.55|       S|       1|
|     3|  male|20.0|    0|    0|   8.05|       S|       0|
|     3|  male|39.0|    1|    5| 31.275|       S|       0|
|     3|female|14.0|    0|    0| 7.8542|       S|       

In [17]:
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [18]:
embarked_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkedIndex')
embarked_encoder = OneHotEncoder(inputCol='EmbarkedIndex',outputCol='EmbarkedVec')

In [21]:
assembler = VectorAssembler(inputCols=['Pclass','SexVec','Age','SibSp','EmbarkedVec','Parch','Fare'],outputCol='features')
assembler

VectorAssembler_ed12579348cf

In [22]:
log_reg_model = LogisticRegression(featuresCol='features',labelCol='Survived')
log_reg_model

LogisticRegression_539cb118365d

In [23]:
pipeline = Pipeline(stages=[gender_indexer,embarked_indexer,gender_encoder,embarked_encoder,assembler,log_reg_model])
pipeline

Pipeline_be41a8241781

In [24]:
train_df, test_df = final_df.randomSplit([0.7,0.3])

In [25]:
lr_model = pipeline.fit(train_df)

In [26]:
test_results = lr_model.transform(test_df)
test_results

DataFrame[Pclass: int, Sex: string, Age: double, SibSp: int, Parch: int, Fare: double, Embarked: string, Survived: int, SexIndex: double, EmbarkedIndex: double, SexVec: vector, EmbarkedVec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [27]:
test_results.show()

+------+------+----+-----+-----+--------+--------+--------+--------+-------------+---------+-------------+--------------------+--------------------+--------------------+----------+
|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|Survived|SexIndex|EmbarkedIndex|   SexVec|  EmbarkedVec|            features|       rawPrediction|         probability|prediction|
+------+------+----+-----+-----+--------+--------+--------+--------+-------------+---------+-------------+--------------------+--------------------+--------------------+----------+
|     1|female|16.0|    0|    0|    86.5|       S|       1|     1.0|          0.0|(1,[],[])|(2,[0],[1.0])|(8,[0,2,4,7],[1.0...|[-3.4828536392017...|[0.02980405381359...|       1.0|
|     1|female|16.0|    0|    1| 57.9792|       C|       1|     1.0|          1.0|(1,[],[])|(2,[1],[1.0])|[1.0,0.0,16.0,0.0...|[-3.6151922017768...|[0.02620648967643...|       1.0|
|     1|female|18.0|    2|    2| 262.375|       C|       1|     1.0|          1.0|(1,[],[])|(2,

In [28]:
test_results.select('prediction','Survived').show()

+----------+--------+
|prediction|Survived|
+----------+--------+
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
+----------+--------+
only showing top 20 rows



In [30]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')
evaluator

BinaryClassificationEvaluator_b82be9095c90

In [31]:
area_under_roc = evaluator.evaluate(test_results)
area_under_roc

0.7849300427516518

In [32]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived',metricName='areaUnderPR')
area_under_pr = evaluator.evaluate(test_results)
area_under_pr

0.7448053081892788