# Logistic Regression Code Along

This is a code along of the famous titanic dataset, its always nice to start off with this dataset because it is an example you will find across pretty much every data analysis language.

In [84]:
# Note that usually all imports would occur at the top and
# most of this would be in an object this layout if for learning purposes only

# Logistic Regression Example
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('logreg').getOrCreate()

In [85]:
# Load training data
data = spark.read.csv('titanic.csv',inferSchema=True,header=True)

In [86]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [87]:
data.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [88]:
data.describe().show()

+-------+-----------------+-------------------+------------------+------------------+------------------+-------------------+-----------------+
|summary|      PassengerId|           Survived|            Pclass|               Age|             SibSp|              Parch|             Fare|
+-------+-----------------+-------------------+------------------+------------------+------------------+-------------------+-----------------+
|  count|              891|                891|               891|               714|               891|                891|              891|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642| 29.69911764705882|0.5230078563411896|0.38159371492704824| 32.2042079685746|
| stddev|257.3538420152301|0.48659245426485753|0.8360712409770491|14.526497332334035|1.1027434322934315| 0.8060572211299488|49.69342859718089|
|    min|                1|                  0|                 1|              0.42|                 0|                  0|              0.0|

In [104]:
# Drop missing data
clean_data = data.na.drop()

In [105]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder,VectorIndexer

### Working with Categorical Columns

Let's break this down into multiple steps to make it all clear.

In [106]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [107]:
train_data,test_data = clean_data.randomSplit([0.7,0.3])

In [108]:
gender_indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
gender_encoder = OneHotEncoder(inputCol="SexIndex", outputCol="SexVec")

In [109]:
embark_indexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkIndex")
embark_encoder = OneHotEncoder(inputCol="EmbarkIndex", outputCol="EmbarkVec")

## Pipelines 

Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)

In [110]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [111]:
assembler = VectorAssembler(
    inputCols=["Pclass", "SexVec", "Age","SibSp","Parch","Fare","EmbarkVec"],
    outputCol="features")

In [112]:
from pyspark.ml.classification import LogisticRegression

In [113]:
logreg = LogisticRegression(featuresCol='features',labelCol='Survived')

In [114]:
from pyspark.ml import Pipeline

In [115]:
train_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [116]:
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                            gender_encoder,embark_encoder,assembler,logreg])

In [117]:
model = pipeline.fit(train_data)

In [118]:
results = model.transform(test_data)

In [122]:
results.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = true)
 |-- EmbarkIndex: double (nullable = true)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [123]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [124]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [126]:
AUC = evaluator.evaluate(results)

In [127]:
AUC

0.735144312393888

In [131]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Survived',
                                             metricName='accuracy')

In [132]:
acc = evaluator.evaluate(results)

In [133]:
acc

0.76

Great, you are now ready for your consulting project!