In [1]:
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import joblib

In [3]:
# Directory holding the cleaned feature matrix and target written by the
# previous preprocessing step.
PROCESSED_DIR = Path("../data/processed")

x_clean = pd.read_csv(PROCESSED_DIR / "X_clean.csv")
# Squeeze only the column axis: a bare .squeeze() would also collapse the row
# axis and hand back a scalar if y.csv ever contained a single row.
y = pd.read_csv(PROCESSED_DIR / "y.csv").squeeze("columns")

# Quick sanity check that features and target have the same number of rows.
x_clean.shape, y.shape

((891, 7), (891,))

In [4]:
# 1) Ensure no missing values.
#    A bare expression in the middle of a cell is evaluated and then thrown
#    away (only the last expression is displayed), so enforce the check with
#    an assertion instead of relying on output that never appears.
missing_per_column = x_clean.isna().sum()
assert missing_per_column.sum() == 0, (
    f"x_clean still has missing values:\n{missing_per_column[missing_per_column > 0]}"
)

# 2) See which columns are non-numeric (these must be encoded)
x_clean.select_dtypes(include=["object"]).columns.tolist()


['Sex', 'Embarked']

In [7]:
# One-hot encode the two text columns. drop_first=True drops the first level of
# each column so the dummies are not perfectly collinear (dummy variable trap).
categorical_cols = ["Sex", "Embarked"]
x_encoded = pd.get_dummies(x_clean, columns=categorical_cols, drop_first=True)

# Preview the encoded frame.
x_encoded.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,True,False,True
1,1,38.0,1,0,71.2833,False,False,False
2,3,26.0,0,0,7.925,False,False,True
3,1,35.0,1,0,53.1,False,False,True
4,3,35.0,0,0,8.05,True,False,True


In [8]:
# After encoding, no object-dtype columns should remain (expect an empty list).
list(x_encoded.select_dtypes(include=["object"]).columns)


[]

In [9]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both subsets, and the fixed random_state makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_encoded, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:
# Verify that the stratified split kept the class proportions aligned between
# the training and test targets.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)
train_class_share, test_class_share


(Survived
 0    0.616573
 1    0.383427
 Name: proportion, dtype: float64,
 Survived
 0    0.614525
 1    0.385475
 Name: proportion, dtype: float64)

In [11]:
# Continuous columns to standardise; the remaining columns are left unscaled.
num_cols = ["Age", "Fare"]
scaler = StandardScaler()

# The frames returned by train_test_split are row selections of x_encoded.
# Take explicit copies before assigning columns so the writes target frames
# this notebook owns (and so older pandas versions do not flag the assignment
# with SettingWithCopyWarning).
x_train = x_train.copy()
x_test = x_test.copy()

# Fit on the training rows only, then reuse those statistics for the test rows
# so no information from the test set leaks into the scaler.
x_train[num_cols] = scaler.fit_transform(x_train[num_cols])
x_test[num_cols] = scaler.transform(x_test[num_cols])

In [12]:
# Verify scaling: the standardised training columns should show a mean close
# to 0 and a standard deviation close to 1.
scaled_summary = x_train[num_cols].describe()
scaled_summary.transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,712.0,2.120651e-16,1.000703,-2.229476,-0.572716,-0.112078,0.502106,3.880117
Fare,712.0,-1.746418e-17,1.000703,-0.662563,-0.498154,-0.361593,-0.017071,10.005329


In [13]:
# Persist each split next to the cleaned data as an index-free CSV.
split_outputs = {
    "X_train.csv": x_train,
    "X_test.csv": x_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for file_name, split in split_outputs.items():
    split.to_csv(PROCESSED_DIR / file_name, index=False)


In [14]:
# Store the fitted scaler so the same Age/Fare transformation can be reapplied
# later without refitting.
scaler_path = PROCESSED_DIR / "standard_scaler.joblib"
joblib.dump(scaler, scaler_path)


['..\\data\\processed\\standard_scaler.joblib']

In [15]:
# Final sanity report on the saved splits: shapes, total missing-value counts,
# and any object-dtype columns that slipped through.
shape_report = [("X_train shape:", x_train), ("X_test shape:", x_test)]
for label, frame in shape_report:
    print(label, frame.shape)

missing_report = [("Missing in X_train:", x_train), ("Missing in X_test:", x_test)]
for label, frame in missing_report:
    print(label, frame.isna().sum().sum())

object_columns = x_train.select_dtypes(include=["object"]).columns.tolist()
print("Non-numeric columns:", object_columns)

X_train shape: (712, 8)
X_test shape: (179, 8)
Missing in X_train: 0
Missing in X_test: 0
Non-numeric columns: []
