In [10]:
import pyspark
# Show which PySpark version this kernel is running.
print(pyspark.__version__)



4.0.0


In [11]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already
# active in the kernel.
spark = (
    SparkSession.builder
    .appName("TitanicPreprocessing")
    .getOrCreate()
)


In [19]:
# Read the Titanic CSV. The first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

# Inspect what was loaded: the inferred schema first, then the column names.
df.printSchema()
print(df.columns)


root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [20]:
# Impute Age with its approximate median.
# percentile_approx yields NULL when Age has no non-null values; fillna does
# not accept None as a replacement, so the fill is skipped in that case.
age_median = df.selectExpr("percentile_approx(Age, 0.5)").collect()[0][0]
if age_median is not None:
    df = df.fillna({"Age": age_median})

# Impute Embarked with its most frequent non-null value.
# Nulls are dropped before counting so the null group can never be picked as
# the "mode"; ties on count are broken alphabetically so repeated runs choose
# the same value.
embarked_top = (
    df.na.drop(subset=["Embarked"])
    .groupby("Embarked")
    .count()
    .orderBy(["count", "Embarked"], ascending=[False, True])
    .first()
)
# first() returns None when there are no rows, i.e. every Embarked was null.
embarked_mode = embarked_top["Embarked"] if embarked_top is not None else None
if embarked_mode is not None:
    df = df.fillna({"Embarked": embarked_mode})

# Drop rows where Fare is missing
df = df.na.drop(subset=["Fare"])


In [21]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()


In [22]:
# Make sure the continuous numeric columns are stored as doubles.
# Fare is cast first, then Age; each cast replaces the column in place, so the
# column order of the frame does not change.
for numeric_col in ("Fare", "Age"):
    df = df.withColumn(numeric_col, df[numeric_col].cast("double"))


In [23]:
# Preview the first five rows of the cleaned frame.
df.show(n=5)


+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|PC 17599| 71.2833|        C85|       C|
|         24|       1|     1|Sloper, Mr. Willi...|  male|28.0|    0|    0|  113788|    35.5|         A6|       S|
|        680|       1|     1|Cardeza, Mr. Thom...|  male|36.0|    0|    1|PC 17755|512.3292|B51 B53 B55|       C|
|        292|       1|     1|Bishop, Mrs. Dick...|female|19.0|    1|    0|   11967| 91.0792|        B49|       C|
|        330|       1|     1|Hippach, Miss. Je...|female|16.0|    0|    1|  111361| 57.9792|        B18|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-----