In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Log_reg_titanic').getOrCreate()

In [0]:
df = spark.read.csv('/FileStore/tables/titanic.csv',header=True,inferSchema=True)
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [0]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [0]:
df.columns

Out[4]: ['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
#Selecting only the required columns
my_cols = df.select(['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'])

#Dropping the columns with missing data
final_data = my_cols.na.drop()

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [0]:
#One hot encoding:

#Sex
#String Indexing first
gender_indexer = StringIndexer(inputCol='Sex',outputCol='Sex_index')
#One hot encoding next
gender_encoder = OneHotEncoder(inputCol='Sex_index',outputCol='Sex_vec')

#Embarked
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='Embark_index')
embark_encoder = OneHotEncoder(inputCol='Embark_index',outputCol='Embark_vec')

In [0]:
assembler = VectorAssembler(inputCols=['Pclass','Sex_vec','Embark_vec','Age','SibSp','Parch','Fare'],outputCol='features')

In [0]:
#Pipeline -> sets stages for different steps
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [0]:
log_reg = LogisticRegression(featuresCol='features',labelCol='Survived')

#Creating the pipeline
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,gender_encoder,embark_encoder,assembler,log_reg])

In [0]:
#Train-test split
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [0]:
#Fit the log reg model using pipeline
fit_model = pipeline.fit(train_data)

In [0]:
#Generating test predictions
test_results = fit_model.transform(test_data)

In [0]:
test_results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [0]:
#Evaluating model performance
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')
AUC = eval.evaluate(test_results)
print('AUC is {0}'.format(AUC))

AUC is 0.7482517482517482


In [0]:
eval_new = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Survived',metricName='accuracy')
accuracy = eval_new.evaluate(test_results)
print('Accuracy is {0}'.format(accuracy))

Accuracy is 0.7536231884057971
