In [6]:
# Titanic Dataset 
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Titanic').getOrCreate()

In [13]:
data=spark.read.csv('titanic.csv', inferSchema=True, header=True)

In [14]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [16]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [17]:
final_data=data.select(['PassengerId',
 'Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [19]:
# remove the missing data 
final_data=final_data.na.drop()

In [20]:
# Working on categorical data to convert into integers and then one hot encode them 
from pyspark.ml.feature import( VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [41]:
# eg varible has categories as A B C 
# one hot encoing for A means [1,0,0] a vector having 1 denoting presence of A and 0 for absence of B and C
gender_indexer=StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encode=OneHotEncoder(inputCol='SexIndex', outputCol='SexEnc')

In [42]:
embark_indexer=StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encode=OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedEnc')

In [49]:
# Now join all the data using vector assembler
assembler=VectorAssembler(inputCols=['Pclass', 'SexEnc', 'EmbarkedEnc', 'Age', 'SibSp', 'Parch', 'Fare'], outputCol='features')

In [50]:
# Pipeline sets stages for different steps 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [51]:
log_reg=LogisticRegression(featuresCol= 'features', labelCol='Survived' )

In [52]:
pipeline=Pipeline(stages=[gender_indexer, embark_indexer, gender_encode, embark_encode, assembler, log_reg])

In [53]:
# split the data into train and test 
lr_train, lr_test= final_data.randomSplit([0.7,0.3])


In [54]:
fit_model=pipeline.fit(lr_train)

In [55]:
results=fit_model.transform(lr_test)

Exception ignored in: <function JavaWrapper.__del__ at 0x0000024E1F1FEE18>
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\ml\wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LogisticRegression' object has no attribute '_java_obj'


In [56]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator 

In [57]:
my_eval=BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol='Survived')

In [63]:
# see the result
results.select(['Survived', 'prediction']).show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       1|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       1|       0.0|
|       1|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       1|       1.0|
|       0|       0.0|
|       1|       1.0|
|       0|       0.0|
|       1|       0.0|
|       0|       0.0|
|       0|       0.0|
|       1|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [68]:
# as we can see from above there are some incorrect predictions so lets check area under the curve
AUC=my_eval.evaluate(results)

In [69]:
AUC

0.7597826086956522