In [0]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('mylogreg')
    .getOrCreate()
)

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
# Load the table named 'titanic' as a DataFrame.
my_data = spark.read.table('titanic')

In [0]:
# Peek at the first record; with an explicit count, head returns a list of Row objects.
my_data.head(n=1)

Out[8]: [Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S')]

In [0]:
# Keep only the rows that have no null in any column.
# NOTE(review): the first row shown by head(1) has Cabin=None, so rows like it
# are discarded here — confirm that this amount of data loss is intended.
dropped_dataframe_treatment = my_data.dropna(how='any')


In [0]:
# Display the column names of the loaded table.
my_data.columns

Out[6]: ['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [0]:
# Indexers that turn the string columns Embarked and Cabin into numeric
# index columns (embarked_treated and Treated_Cabin respectively).
strindexer_embarked = StringIndexer(
    inputCol="Embarked",
    outputCol="embarked_treated",
)
strindexer_cabin = StringIndexer(
    inputCol="Cabin",
    outputCol="Treated_Cabin",
)

In [0]:
# Fit both indexers on the null-free frame; the two fits are independent of each other.
strin_cab_fit = strindexer_cabin.fit(dropped_dataframe_treatment)
strin_emb_fit = strindexer_embarked.fit(dropped_dataframe_treatment)

In [0]:
# Apply the fitted indexers one after the other: first add embarked_treated,
# then add Treated_Cabin on top of that result.
transformed_frame_emb = strin_emb_fit.transform(dropped_dataframe_treatment)
transformed_frame_cab = strin_cab_fit.transform(transformed_frame_emb)

In [0]:
# Print the schema to confirm both indexed columns were appended.
transformed_frame_cab.printSchema()

root
 |-- PassengerId: long (nullable = true)
 |-- Survived: long (nullable = true)
 |-- Pclass: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: long (nullable = true)
 |-- Parch: long (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- embarked_treated: double (nullable = false)
 |-- Treated_Cabin: double (nullable = false)



In [0]:
# Pack the chosen numeric and indexed columns into a single 'features' vector column.
# NOTE(review): 'Sex' and 'Fare' exist in the table but are not part of the
# feature list — confirm that leaving them out is deliberate.
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'Age',
        'SibSp',
        'Parch',
        'Treated_Cabin',
        'embarked_treated',
    ],
    outputCol='features',
)
transformed_frame = assembler.transform(transformed_frame_cab)

In [0]:
# Keep only the assembled feature vector and the label column for modelling.
data = transformed_frame.select('features', 'Survived')

In [0]:
# Split the modelling frame into roughly 70% training and 30% test rows.
# A fixed seed is passed so that re-running the notebook reproduces the same
# split (given unchanged input data and partitioning); without it every run
# produced different train/test sets and therefore different metrics.
train, test = data.randomSplit([0.7, 0.3], seed=42)

# Preview the first rows of each partition.
train.show()
test.show()

+--------------------+--------+
|            features|Survived|
+--------------------+--------+
|[1.0,14.0,1.0,2.0...|       1|
|[1.0,15.0,0.0,1.0...|       1|
|[1.0,16.0,0.0,0.0...|       1|
|[1.0,16.0,0.0,1.0...|       1|
|[1.0,17.0,0.0,2.0...|       1|
|[1.0,17.0,1.0,0.0...|       1|
|[1.0,17.0,1.0,0.0...|       1|
|[1.0,18.0,0.0,2.0...|       1|
|[1.0,18.0,1.0,0.0...|       0|
|[1.0,18.0,2.0,2.0...|       1|
|[1.0,19.0,0.0,2.0...|       1|
|[1.0,19.0,1.0,0.0...|       1|
|[1.0,19.0,1.0,0.0...|       0|
|[1.0,19.0,3.0,2.0...|       0|
|[1.0,21.0,0.0,0.0...|       1|
|[1.0,21.0,0.0,1.0...|       0|
|[1.0,21.0,2.0,2.0...|       1|
|[1.0,22.0,0.0,1.0...|       1|
|[1.0,22.0,0.0,2.0...|       1|
|[1.0,22.0,1.0,0.0...|       1|
+--------------------+--------+
only showing top 20 rows

+--------------------+--------+
|            features|Survived|
+--------------------+--------+
|[1.0,0.92,1.0,2.0...|       1|
|[1.0,2.0,1.0,2.0,...|       0|
|[1.0,4.0,0.0,2.0,...|       1|
|[1.0,11.0,1.0

In [0]:
# Logistic regression that reads the 'features' vector and predicts 'Survived'.
log_reg = LogisticRegression(featuresCol='features', labelCol='Survived')

# Train on the training partition only.
fitted_model = log_reg.fit(train)

In [0]:
# Training summary attached to the fitted model.
summary_log = fitted_model.summary

In [0]:
# Show the first 20 rows of the predictions held by the training summary.
summary_log.predictions.show(n=20)

+--------------------+--------+--------------------+--------------------+----------+
|            features|Survived|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|[1.0,14.0,1.0,2.0...|     1.0|[-1.7091319510009...|[0.15327633960698...|       1.0|
|[1.0,15.0,0.0,1.0...|     1.0|[-1.6080130618665...|[0.16686465658067...|       1.0|
|[1.0,16.0,0.0,0.0...|     1.0|[-1.4354175787524...|[0.19225596593201...|       1.0|
|[1.0,16.0,0.0,1.0...|     1.0|[-1.6275227350580...|[0.16417000435800...|       1.0|
|[1.0,17.0,0.0,2.0...|     1.0|[-2.0458769274187...|[0.11446965823925...|       1.0|
|[1.0,17.0,1.0,0.0...|     1.0|[-1.2519437618144...|[0.22236384571996...|       1.0|
|[1.0,17.0,1.0,0.0...|     1.0|[-1.5640263451541...|[0.17306965166280...|       1.0|
|[1.0,18.0,0.0,2.0...|     1.0|[-1.7257633060350...|[0.15113030194946...|       1.0|
|[1.0,18.0,1.0,0.0...|     0.0|[-1.5208809002846...|[0.1793318389

In [0]:
# Evaluate the fitted model on the held-out test partition.
prediction_test = fitted_model.evaluate(test)

In [0]:
# Report test-set accuracy and area under the ROC curve, one value per line.
print(
    prediction_test.accuracy,
    prediction_test.areaUnderROC,
    sep='\n',
)

0.7384615384615385
0.6825980392156863


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [0]:
# Evaluator for the binary classifier's predictions.
# The label column in this notebook is 'Survived' (see the LogisticRegression
# setup), not the evaluator's default 'label', so it is set explicitly here;
# otherwise evaluating the prediction frames would fail on a missing column.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
)