In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("LogistcRegerssion").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/11 12:57:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

In [4]:
df = spark.read.csv("titanic.csv", inferSchema=True, header=True)

In [5]:
df.show(truncate=False)

+-----------+--------+------+-------------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|Name                                                   |Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+--------+------+-------------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|1          |0       |3     |Braund, Mr. Owen Harris                                |male  |22.0|1    |0    |A/5 21171       |7.25   |NULL |S       |
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)    |female|38.0|1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |1       |3     |Heikkinen, Miss. Laina                                 |female|26.0|0    |0    |STON/O2. 3101282|7.925  |NULL |S       |
|4          |1       |1     |Futrelle, Mrs. Jacques Heath (Lily May Peel)           |female|35.0|1  

In [6]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [8]:
my_cols = df.select(
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
)

In [9]:
my_cols.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     3|  male|NULL|    0|    0| 8.4583|       Q|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       

In [10]:
my_final_data = my_cols.dropna()

In [11]:
from pyspark.ml.feature import (
    VectorAssembler,
    VectorIndexer,
    OneHotEncoder,
    StringIndexer,
)

In [20]:
gender_indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
gender_encoder = OneHotEncoder(inputCol="SexIndex", outputCol="SexVec")

In [13]:
embark_indexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkIndex")
embark_encoder = OneHotEncoder(inputCol="EmbarkIndex", outputCol="EmbarkVec")

In [14]:
assembler = VectorAssembler(
    inputCols=["Pclass", "SexVec", "EmbarkVec", "Age", "SibSp", "Parch", "Fare"],
    outputCol="features",
)

In [15]:
from pyspark.ml import Pipeline

In [16]:
log_reg_titanic = LogisticRegression(featuresCol="features", labelCol="Survived")

In [22]:
pipeline = Pipeline(
    stages=[
        gender_indexer,
        embark_indexer,
        gender_encoder,
        embark_encoder,
        assembler,
        log_reg_titanic,
    ]
)

In [18]:
train_data, test_data = my_final_data.randomSplit([0.7, 0.3], seed=20)

In [24]:
train_data.show()

+--------+------+------+----+-----+-----+--------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|
+--------+------+------+----+-----+-----+--------+--------+
|       0|     1|female| 2.0|    1|    2|  151.55|       S|
|       0|     1|female|25.0|    1|    2|  151.55|       S|
|       0|     1|  male|18.0|    1|    0|   108.9|       C|
|       0|     1|  male|19.0|    1|    0|    53.1|       S|
|       0|     1|  male|19.0|    3|    2|   263.0|       S|
|       0|     1|  male|21.0|    0|    1| 77.2875|       S|
|       0|     1|  male|22.0|    0|    0|135.6333|       C|
|       0|     1|  male|24.0|    0|    0|    79.2|       C|
|       0|     1|  male|24.0|    0|    1|247.5208|       C|
|       0|     1|  male|27.0|    0|    2|   211.5|       C|
|       0|     1|  male|28.0|    1|    0| 82.1708|       C|
|       0|     1|  male|29.0|    0|    0|    30.0|       S|
|       0|     1|  male|29.0|    1|    0|    66.6|       S|
|       0|     1|  male|30.0|    0|    0

In [23]:
fit_model = pipeline.fit(train_data)

24/03/11 13:22:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/03/11 13:22:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [26]:
result = fit_model.transform(test_data)

In [27]:
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol="prediction", labelCol="Survived"
)

In [28]:
result.select("Survived", "prediction").show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [29]:
AUC = my_eval.evaluate(result)

In [30]:
AUC

0.7625603432862507