In [1]:
import kagglehub

# Fetch the latest version of the Titanic dataset through kagglehub and keep
# the location it reports in `path`.
path = kagglehub.dataset_download("yasserh/titanic-dataset")
print(f"Path to dataset files: {path}")

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/yasserh/titanic-dataset?dataset_version_number=1...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22.0k/22.0k [00:00<00:00, 13.9MB/s]

Extracting files...
Path to dataset files: /home/zhan/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1





In [1]:
import numpy as np
import pandas as pd
import polars as pl
# import matplotlib as plt
import os 
import sklearn 

# TODO: clear cache! 

In [None]:
"""
Data cleaning strategy:

1. check the data columns - what kind of data is in each, and is a random split appropriate? Is any time-based or other data leakage possible?
2. drop the ID and name column 
3. train test split 
4. check for missing values and outliers, impute the whole dataset using values calculated from the train set  
5. encode categorical variables 
6. scale features using values calculated from the train set
7. save both train and test dataset splits as separate CSVs for the DIY machine learning library to test with  
"""

In [2]:
# Load the Titanic CSV into `df`.
# The original hard-coded location is tried first so existing setups keep
# working; on any other machine fall back to the first CSV found inside the
# kagglehub download directory (`path`, assigned by the download cell).
LOCAL_CSV = "/home/zhan/titanic.csv"
if os.path.exists(LOCAL_CSV):
    csv_path = LOCAL_CSV
else:
    csv_files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".csv")
    )
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {path!r}")
    csv_path = os.path.join(path, csv_files[0])

df = pd.read_csv(csv_path)

# check data columns
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Print the column labels of df.
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [11]:
# Summarise the object-dtype columns (count / unique / top / freq).
# Nothing is dropped here; PassengerId is removed in a later cell.
df.describe(include="object")

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,1601,G6,S
freq,1,577,7,4,644


In [8]:
# Show dtype and non-null count for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [10]:
# Count missing values per column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
# Remove the identifier-like columns (PassengerId, Name).
# Still to do in later cells: inspect Survived / Pclass / SibSp / Embarked and encode Sex.
id_like_columns = ['PassengerId', 'Name']
df = df.drop(id_like_columns, axis=1)

['male' 'female']


In [18]:
# Print the distinct values of each low-cardinality column, one line per column.
for column in ('Sex', 'Survived', 'Pclass', 'SibSp', 'Parch', 'Embarked'):
    print(df[column].unique())

['male' 'female']
[0 1]
[3 1 2]
[1 0 3 4 2 5 8]
[0 1 2 5 3 4 6]
['S' 'C' 'Q' nan]


In [19]:
# Cabin is missing for most rows, so remove the column entirely.
df = df.drop('Cabin', axis=1)

# Binary-encode Sex as a single 0/1 column (male -> 1, female -> 0).
sex_mapping = {'male': 1, 'female': 0}
df = df.assign(Sex=df['Sex'].map(sex_mapping))

In [21]:
# Preview the first rows of df after the preceding edits.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,1,22.0,1,0,A/5 21171,7.25,S
1,1,1,0,38.0,1,0,PC 17599,71.2833,C
2,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,0,35.0,1,0,113803,53.1,S
4,0,3,1,35.0,0,0,373450,8.05,S


In [22]:
# Replace the missing Embarked entries with 'S', the most frequent port.
df = df.assign(Embarked=df['Embarked'].fillna('S'))

In [23]:
# Frequency of each Embarked category.
df['Embarked'].value_counts()

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [24]:
# Inspect the rows whose Fare is below 1 (Ticket is still present for context).
is_low_fare = df['Fare'].lt(1)
df.loc[is_low_fare]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
179,0,3,1,36.0,0,0,LINE,0.0,S
263,0,1,1,40.0,0,0,112059,0.0,S
271,1,3,1,25.0,0,0,LINE,0.0,S
277,0,2,1,,0,0,239853,0.0,S
302,0,3,1,19.0,0,0,LINE,0.0,S
413,0,2,1,,0,0,239853,0.0,S
466,0,2,1,,0,0,239853,0.0,S
481,0,2,1,,0,0,239854,0.0,S
597,0,3,1,49.0,0,0,LINE,0.0,S
633,0,1,1,,0,0,112052,0.0,S


In [25]:
# Preview the first rows of df again.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,1,22.0,1,0,A/5 21171,7.25,S
1,1,1,0,38.0,1,0,PC 17599,71.2833,C
2,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,0,35.0,1,0,113803,53.1,S
4,0,3,1,35.0,0,0,373450,8.05,S


In [26]:
# Expand Embarked into indicator columns; drop_first omits the first category
# so the remaining indicators are not redundant. Ticket is dropped in the next cell.
embarked_dummy_options = dict(columns=['Embarked'], prefix='Embarked', drop_first=True)
df = pd.get_dummies(df, **embarked_dummy_options)
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [27]:
# Remove the Ticket column.
df = df.drop('Ticket', axis=1)

In [None]:
# Derive FamilySize: siblings/spouses + parents/children + the passenger themself.
df = df.assign(FamilySize=df['SibSp'] + df['Parch'] + 1)

In [None]:
# Expand Pclass into indicator columns, omitting the first class level.
pclass_dummy_options = dict(columns=['Pclass'], prefix='Pclass', drop_first=True)
df = pd.get_dummies(df, **pclass_dummy_options)
df.columns

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target from the features, then hold out 20% of the rows as the
# test set. Age/Fare imputation is done in later cells so that its statistics
# come from the training rows only.
target_column = 'Survived'
X, y = df.drop(target_column, axis=1), df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,False,True
1,1,1,0,38.0,1,0,71.2833,False,False
2,1,3,0,26.0,0,0,7.925,False,True
3,1,1,0,35.0,1,0,53.1,False,True
4,0,3,1,35.0,0,0,8.05,False,True


In [None]:
# Impute missing Age values with the median computed on the training split
# only, so the fill value carries no information from the test rows.
age_median = X_train['Age'].median()

# Reassign the column instead of calling `X_train['Age'].fillna(..., inplace=True)`:
# that form is a chained assignment on a temporary Series, which pandas warns
# about and which leaves the frame unchanged under Copy-on-Write.
# `assign` also returns a new frame, so the split frames are no longer views
# or flagged copies of `X`.
X_train = X_train.assign(Age=X_train['Age'].fillna(age_median))
X_test = X_test.assign(Age=X_test['Age'].fillna(age_median))

# Fail loudly if the imputation did not take effect.
assert X_train['Age'].notna().all(), "Age still has missing values in X_train"
assert X_test['Age'].notna().all(), "Age still has missing values in X_test"

# check for nulls in train and test
print(X_train['Age'].isnull().value_counts())
print(X_test['Age'].isnull().value_counts())

In [None]:
# Fares below 1.0 are treated as missing: they are replaced by the median of
# the remaining ("normal") fares, computed on the training split only.
train_fares = X_train['Fare']
normal_fares_train = train_fares[train_fares >= 1.0]
fare_median = np.median(normal_fares_train)

# Apply the same training-derived median to both splits.
for split in (X_train, X_test):
    split.loc[split['Fare'] < 1.0, 'Fare'] = fare_median

# Both displays should now be empty frames.
for split in (X_train, X_test):
    display(split[split['Fare'] < 1])

In [None]:
# Column labels of df (the pre-split frame, which still contains 'Survived').
df.columns

In [None]:
# TODO: Look for outliers !!!!! 

In [None]:
# Standardise the numeric features using statistics learned from the training
# split only, and keep the results as DataFrames that still contain every
# feature column.
#
# `scaler.transform` returns a plain NumPy array holding just the columns it
# was given. Using that array directly would drop the encoded features
# (Sex, Embarked_*, Pclass_*) and has no `.to_csv` method, so the scaled
# values are written back into copies of the split frames instead.
numerical_cols = ["Age", "Fare", "SibSp", "Parch", "FamilySize"]

scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])  # mean / std come from the training rows only

# Copies keep the unscaled X_train / X_test intact for inspection.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Transform both splits with the *same* fitted mean and std.
X_train_scaled[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Scaling must not add, remove or reorder columns.
assert list(X_train_scaled.columns) == list(X_train.columns)
assert list(X_test_scaled.columns) == list(X_test.columns)

In [None]:
# Write the processed feature splits and the target splits to CSV files,
# leaving out the index column.
exports = {
    "X_train_processed.csv": X_train_scaled,
    "X_test_processed.csv": X_test_scaled,
    "y_train.csv": y_train.to_frame(),
    "y_test.csv": y_test.to_frame(),
}
for filename, frame in exports.items():
    frame.to_csv(filename, index=False)