# Logistic Regression Code Along
This is a code along of the famous titanic dataset, its always nice to start off with this dataset because it is an example you will find across pretty much every data analysis language.

In [125]:
from pyspark.sql import SparkSession

In [126]:
spark = SparkSession.builder.appName('myproj').getOrCreate()

In [127]:
data = spark.read.csv('titanic.csv',inferSchema=True,header=True)

In [128]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [129]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [130]:
# we already knew from other courses which columns are really important
my_cols = data.select(['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [131]:
my_final_data = my_cols.na.drop()

In [132]:
my_final_data.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



### Working with Categorical Columns

Let's break this down into multiple steps to make it all clear.

In [133]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [134]:
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [135]:
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [136]:
assembler = VectorAssembler(inputCols=['Pclass',
 'SexVec',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'EmbarkVec'],outputCol='features')

In [137]:
from pyspark.ml.classification import LogisticRegression

## Pipelines 

Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)

In [138]:
from pyspark.ml import Pipeline

In [139]:
log_reg_titanic = LogisticRegression(featuresCol='features',
                                     labelCol='Survived')

In [140]:
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,log_reg_titanic])

In [141]:
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7,.3])

In [142]:
fit_model = pipeline.fit(train_titanic_data)

In [143]:
results = fit_model.transform(test_titanic_data)

In [144]:
results.show(1)

+--------+------+----+----+-----+-----+-----+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass| Sex| Age|SibSp|Parch| Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+--------+------+----+----+-----+-----+-----+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|male|18.0|    1|    0|108.9|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,18.0,1.0...|[-0.8161325405474...|[0.30658523032987...|       1.0|
+--------+------+----+----+-----+-----+-----+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
only showing top 1 row



In [145]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [146]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Survived')

In [147]:
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [148]:
AUC = my_eval.evaluate(results)

In [149]:
AUC

0.7713866368465194

In [150]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [151]:
# For multiclass we can use following metrics:
# (f1|weightedPrecision|weightedRecall|accuracy)
evaluator2 = MulticlassClassificationEvaluator(predictionCol='prediction', 
                                               labelCol='Survived',
                                                metricName='weightedRecall')

In [152]:
recall = evaluator2.evaluate(results.select('Survived','prediction'))

In [153]:
recall

0.7745901639344263