In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the notebook's SparkSession: getOrCreate() hands back an existing
# session when there is one and otherwise builds a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Load the Titanic CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
TITANIC_CSV_PATH = "/content/titanic.csv"
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(TITANIC_CSV_PATH)
)

In [None]:
# Print each column's name, inferred type and nullability for the loaded frame.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- GrpSize: string (nullable = true)
 |-- FareCat: string (nullable = true)
 |-- AgeCat: string (nullable = true)



In [None]:
# Preview the first 20 rows (show()'s defaults, spelled out explicitly).
df.show(n=20, truncate=True)

+-----------+--------+------+------+-----+-----+--------+------+-----------+-------+------+
|PassengerId|Survived|Pclass|   Sex|SibSp|Parch|Embarked| Title|    GrpSize|FareCat|AgeCat|
+-----------+--------+------+------+-----+-----+--------+------+-----------+-------+------+
|          1|       0|     3|  male|    1|    0|       S|    Mr|     couple|   0-10| 16-32|
|          2|       1|     1|female|    1|    0|       C|   Mrs|     couple| 70-100| 32-48|
|          3|       1|     3|female|    0|    0|       S|  Miss|       solo|   0-10| 16-32|
|          4|       1|     1|female|    1|    0|       S|   Mrs|     couple|  40-70| 32-48|
|        138|       0|     1|  male|    1|    0|       S|    Mr|     couple|  40-70| 32-48|
|          5|       0|     3|  male|    0|    0|       S|    Mr|       solo|   0-10| 32-48|
|          6|       0|     3|  male|    0|    0|       Q|    Mr|       solo|   0-10| 16-32|
|          7|       0|     1|  male|    0|    0|       S|    Mr|       solo|  40

In [None]:
# Count total rows
total_rows = df.count()
print(f"Total Rows: {total_rows}")

Total Rows: 891


In [None]:
# Check class balance: number of rows for each value of the Survived label
survival_counts = df.groupBy("Survived").count()
survival_counts.show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [None]:
# Check distribution of categorical variables: print one frequency table per
# column, with the most common value first.
for column_name in ("Sex", "Embarked", "Title", "GrpSize", "FareCat", "AgeCat"):
    value_counts = df.groupBy(column_name).count()
    value_counts.orderBy("count", ascending=False).show()

+------+-----+
|   Sex|count|
+------+-----+
|  male|  577|
|female|  314|
+------+-----+

+--------+-----+
|Embarked|count|
+--------+-----+
|       S|  646|
|       C|  168|
|       Q|   77|
+--------+-----+

+----------+-----+
|     Title|count|
+----------+-----+
|        Mr|  517|
|      Miss|  184|
|       Mrs|  126|
|    Master|   40|
|Rare Title|   24|
+----------+-----+

+-----------+-----+
|    GrpSize|count|
+-----------+-----+
|       solo|  462|
|     couple|  185|
|      group|  170|
|large group|   74|
+-----------+-----+

+-------+-----+
|FareCat|count|
+-------+-----+
|   0-10|  325|
|  10-25|  227|
|  25-40|  158|
|  40-70|   76|
|   100+|   53|
| 70-100|   52|
+-------+-----+

+------+-----+
|AgeCat|count|
+------+-----+
| 16-32|  490|
| 32-48|  216|
|  0-16|  104|
| 48-64|   70|
|   64+|   11|
+------+-----+



In [None]:
# Summary stats for numeric columns
summary_columns = ["Pclass", "SibSp", "Parch"]
numeric_view = df.select(*summary_columns)
numeric_view.describe().show()

+-------+------------------+------------------+-------------------+
|summary|            Pclass|             SibSp|              Parch|
+-------+------------------+------------------+-------------------+
|  count|               891|               891|                891|
|   mean| 2.308641975308642|0.5230078563411896|0.38159371492704824|
| stddev|0.8360712409770492|1.1027434322934326| 0.8060572211299486|
|    min|                 1|                 0|                  0|
|    max|                 3|                 8|                  6|
+-------+------------------+------------------+-------------------+



In [None]:
# Correlation check (if numeric): correlate each integer feature with the
# Survived label and print the coefficient.
for feature in ("Pclass", "SibSp", "Parch"):
    coefficient = df.stat.corr("Survived", feature)
    print(f"Correlation between Survived and {feature}: {coefficient}")


Correlation between Survived and Pclass: -0.33848103596101514
Correlation between Survived and SibSp: -0.03532249888573567
Correlation between Survived and Parch: 0.08162940708348336


In [None]:
from pyspark.ml.feature import StringIndexer,VectorAssembler,OneHotEncoder,VectorIndexer

In [None]:
# Stand-alone index + one-hot encoding of the passenger's sex, producing a
# "gender" vector column.
#
# The intermediate index column is deliberately NOT called "sex": that name
# differs from the source column "Sex" only by letter case, and Spark resolves
# column names case-insensitively by default (spark.sql.caseSensitive=false),
# so writing to "sex" risks replacing the original "Sex" column that the
# categorical pipeline stages read later.
# NOTE(review): these two stages are not referenced by any later cell visible
# here — confirm whether they are still needed.
gender_indexer = StringIndexer(inputCol='Sex', outputCol='gender_index')
gender_encoder = OneHotEncoder(inputCol='gender_index', outputCol='gender')

In [None]:
# Feature columns, grouped by how the preprocessing stages treat them.

# String-valued columns: indexed and one-hot encoded before assembly.
categorical_cols = [
    "Sex",
    "Embarked",
    "Title",
    "GrpSize",
    "FareCat",
    "AgeCat",
]

# Integer columns: handed to the assembler as they are.
numeric_cols = [
    "Pclass",
    "SibSp",
    "Parch",
]

In [None]:
# Index and encode categorical variables.
#
# For every categorical column <name>:
#   StringIndexer : <name>        -> <name>_Index  (label -> numeric index)
#   OneHotEncoder : <name>_Index  -> <name>_Vec    (index -> sparse vector)
#
# handleInvalid="keep" routes labels that were not seen while fitting (and
# nulls) to an extra index instead of raising. With the default ("error"),
# model.transform() fails as soon as the test split or any new data contains
# a category value that was absent from the training split.
# Trade-off: the extra "unknown" index becomes the category the encoder drops,
# so every observed label keeps its own indicator column.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_Index", handleInvalid="keep")
    for name in categorical_cols
]
encoders = [
    OneHotEncoder(inputCol=name + "_Index", outputCol=name + "_Vec")
    for name in categorical_cols
]

In [None]:
# Assemble all features: the one-hot vectors of the categorical columns come
# first, followed by the raw numeric columns, merged into a single "features"
# vector column.
assembler_inputs = [f"{name}_Vec" for name in categorical_cols]
assembler_inputs.extend(numeric_cols)
assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features",
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic-regression estimator: reads the assembled "features" vector and is
# trained against the "Survived" label column.
lr = LogisticRegression(
    labelCol="Survived",
    featuresCol="features",
)

In [None]:
# Chain preprocessing and model into one pipeline. Stage order matters:
# indexers -> encoders -> assembler -> classifier.
stages = [*indexers, *encoders, assembler, lr]
pipeline = Pipeline(stages=stages)

In [None]:
# Train-Test Split: weights 0.8 (train) / 0.2 (test), with a fixed seed of 42
SPLIT_WEIGHTS = [0.8, 0.2]
train_data, test_data = df.randomSplit(weights=SPLIT_WEIGHTS, seed=42)

In [None]:
# Fit model: runs every pipeline stage on the training split only
model = pipeline.fit(dataset=train_data)

In [None]:
# Predict on test data: apply the fitted pipeline to the held-out split
predictions = model.transform(dataset=test_data)

In [None]:
# Show predictions: true label, predicted label and probability vector for
# the first 10 test rows
prediction_preview = predictions.select("Survived", "prediction", "probability")
prediction_preview.show(n=10)

+--------+----------+--------------------+
|Survived|prediction|         probability|
+--------+----------+--------------------+
|       1|       1.0|[0.37065980714879...|
|       0|       0.0|[0.55433639291226...|
|       1|       1.0|[0.24355031793992...|
|       0|       0.0|[0.99761802222560...|
|       1|       1.0|[0.12272747916269...|
|       1|       0.0|[0.65630062990053...|
|       0|       0.0|[0.91665228219005...|
|       0|       0.0|[0.65518288988353...|
|       0|       0.0|[0.91665228219005...|
|       0|       0.0|[0.96166813811554...|
+--------+----------+--------------------+
only showing top 10 rows



In [None]:
# Evaluate
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Score the test-set predictions with the area under the ROC curve, using
# "Survived" as the ground-truth label.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="Survived",
)
auc = evaluator.evaluate(dataset=predictions)
print(f"AUC on test data: {auc:.4f}")

AUC on test data: 0.9081
