In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [61]:
df = pd.read_csv("train_eda.csv")

In [62]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [63]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [65]:
from sklearn.model_selection import train_test_split

In [66]:
X = df.drop(columns=["Survived"])
y = df["Survived"]

In [67]:
X = X.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
X["Sex"] = X["Sex"].map({"male": 0, "female": 1})

In [68]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [69]:
# Fill missing Age values with the median age from training data
# This ensures we don't have missing values and uses the same value for both datasets
X_train["Age"] = X_train["Age"].fillna(X_train["Age"].median())
X_test["Age"]  = X_test["Age"].fillna(X_train["Age"].median())


# Fill missing Fare values with the median fare from training data
# Using training data median for both datasets maintains consistency
X_train["Fare"] = X_train["Fare"].fillna(X_train["Fare"].median())
X_test["Fare"]  = X_test["Fare"].fillna(X_train["Fare"].median())


# Fill missing Embarked values with the most common port (mode) from training data
# Mode()[0] returns the first value in case there are multiple modes
X_train["Embarked"] = X_train["Embarked"].fillna(X_train["Embarked"].mode()[0])
X_test["Embarked"]  = X_test["Embarked"].fillna(X_train["Embarked"].mode()[0])


# Convert categorical Embarked feature to one-hot encoded columns
# drop_first=True removes one category to avoid multicollinearity
X_train = pd.get_dummies(X_train, columns=["Embarked"], drop_first=True)
X_test  = pd.get_dummies(X_test, columns=["Embarked"], drop_first=True)


# Align the training and test datasets to ensure they have the same columns
# This handles any categories that might appear in one dataset but not the other
# fill_value=0 ensures any missing columns are filled with zeros
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

In [73]:
# Align the training and test feature matrices to have the same columns
# - join="left" keeps all columns from X_train and adds missing ones from X_test
# - axis=1 aligns along columns (features)
# - fill_value=0 fills any missing values with zeros
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

In [74]:
# Create a new feature 'FamilySize' by adding the number of siblings/spouses (SibSp),
# parents/children (Parch), and the passenger themselves (+1)
X_train["FamilySize"] = X_train["SibSp"] + X_train["Parch"] + 1
X_test["FamilySize"]  = X_test["SibSp"] + X_test["Parch"] + 1

# Create a binary feature 'IsAlone' that indicates whether the passenger is traveling alone (1) or not (0)
# A passenger is alone if their family size is exactly 1 (just themselves)
X_train["IsAlone"] = (X_train["FamilySize"] == 1).astype(int)
X_test["IsAlone"]  = (X_test["FamilySize"] == 1).astype(int)

In [72]:
X_train.isnull().sum()
X_test.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_Q    0
Embarked_S    0
FamilySize    0
IsAlone       0
dtype: int64