# Titanic — Data Preprocessing Notebook

Notebook này thực hiện **pipeline tiền xử lý** cho bài Kaggle Titanic (train.csv).  
Mục tiêu: xử lý giá trị thiếu (missing), tạo feature, mã hoá biến phân loại, chuẩn hoá các cột số, chia train/validation và lưu dataset đã xử lý.

In [None]:
# 0. Environment setup
import os
import random

import numpy as np
import pandas as pd

# Seed both Python's and NumPy's global random number generators.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Output folders (created on the first run if they do not exist yet).
for _out_dir in ('processed', 'pic'):
    os.makedirs(_out_dir, exist_ok=True)

# Path to the raw data - adjust it if the file lives somewhere else.
DATA_PATH = 'train.csv'

In [None]:
# 1. Load the data
# Read the CSV at DATA_PATH into the raw DataFrame `df` and report its (rows, columns) shape.
df = pd.read_csv(DATA_PATH)
print('Shape:', df.shape)
# NOTE(review): `display` is not imported anywhere in this notebook; presumably
# the IPython/Jupyter built-in - this cell would fail in a plain Python script.
display(df.head())

## 2) Kiểm tra missing và thống kê cơ bản

In [None]:
# 2. Missing values, dtypes and basic statistics
# Number of missing values in each column.
print('Missing values per column:\n', df.isna().sum())
# Data type pandas inferred for each column.
print('\nDtypes:\n', df.dtypes)
# Summary statistics for every column (include='all'), transposed so that
# each row of the output describes one column of df.
display(df.describe(include='all').T)

## 3) Chiến lược xử lý missing & feature engineering
- Age: impute bằng median theo (Pclass, Title)
- Embarked: fill bằng mode
- Fare: fill bằng median theo Pclass nếu cần
- Cabin: tạo HasCabin + CabinLetter
- Name: trích Title
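- SibSp, Parch: tạo FamilySize (= SibSp + Parch + 1) và IsAlone
- Age, Fare: rời rạc hoá thành AgeBin (theo khoảng tuổi) và FareBand (theo tứ phân vị)
- Ticket: trích prefix (optional)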

In [None]:
# 3. Preprocessing & feature engineering (runnable)
# Work on a copy so the raw frame `df` stays untouched.
df2 = df.copy()

# Embarked: fill missing values with the most frequent one.
df2['Embarked'] = df2['Embarked'].fillna(df2['Embarked'].mode()[0])

# Fare: fill missing fares with the median fare of the row's Pclass.
# transform('median') returns a Series aligned to df2's own index, so it can
# be passed straight to fillna. groupby(...).apply(...) can instead return a
# result indexed by (Pclass, original index) on recent pandas versions, which
# does not align with df2 when assigned back to a column.
if df2['Fare'].isna().any():
    df2['Fare'] = df2['Fare'].fillna(df2.groupby('Pclass')['Fare'].transform('median'))

# Cabin features: HasCabin is 1 when a cabin value is present; CabinLetter is
# the first character of the cabin string, with 'X' standing in for a missing cabin.
df2['HasCabin'] = df2['Cabin'].notna().astype(int)
df2['CabinLetter'] = df2['Cabin'].fillna('X').map(lambda x: str(x)[0])

# Title extraction
def extract_title(name):
    """Return the honorific found in a passenger name.

    The title is taken as the text between the first comma and the next
    period, i.e. names shaped like ``"Surname, Title. Given names"``.

    Args:
        name: Raw name string, or a missing value (NaN/None).

    Returns:
        'Mr', 'Mrs', 'Miss' or 'Master' when the extracted title is one of
        those; 'Rare' for any other title; 'Unknown' when the name is
        missing or has no comma to locate the title with.
    """
    if pd.isna(name):
        return 'Unknown'
    parts = name.split(',')
    if len(parts) < 2:
        # No "Surname, Title." structure: report it as unknown rather than
        # raising IndexError in the middle of a column-wide apply.
        return 'Unknown'
    title = parts[1].split('.')[0].strip()
    if title in ('Mr', 'Mrs', 'Miss', 'Master'):
        return title
    return 'Rare'

# Derive the Title of every passenger from the Name column.
df2['Title'] = df2['Name'].map(extract_title)

# Family features: FamilySize = SibSp + Parch + 1; IsAlone is 1 when FamilySize == 1.
df2['FamilySize'] = df2['SibSp'].add(df2['Parch']).add(1)
df2['IsAlone'] = df2['FamilySize'].eq(1).astype(int)

# Age imputation by Pclass + Title median
age_median = df2.groupby(['Pclass','Title'])['Age'].median()
# Fallback for (Pclass, Title) groups without a usable median. Computed once
# here, from the not-yet-imputed column, instead of on every fallback row.
overall_age_median = df2['Age'].median()

def fill_age(row):
    """Return the Age to store for one passenger row.

    Args:
        row: A row of df2 providing 'Age', 'Pclass' and 'Title'.

    Returns:
        The row's own Age when it is present; otherwise the median Age of
        its (Pclass, Title) group, or the overall median Age when that group
        has no median available.
    """
    if not pd.isna(row['Age']):
        return row['Age']
    med = age_median.get((row['Pclass'], row['Title']), np.nan)
    return overall_age_median if pd.isna(med) else med

df2['Age'] = df2.apply(fill_age, axis=1)

# Age bin and FareBand
# pd.cut intervals are right-closed by default ((0, 12], (12, 20], ...), so an
# age of exactly 0 would fall outside every bin and become NaN.
# include_lowest=True makes the first interval include its left edge.
df2['AgeBin'] = pd.cut(
    df2['Age'],
    bins=[0, 12, 20, 40, 60, 120],
    labels=['Child', 'Teen', 'Adult', 'MidAge', 'Senior'],
    include_lowest=True,
)
# Any fare that is still missing gets the overall median fare.
df2['Fare'] = df2['Fare'].fillna(df2['Fare'].median())
# FareBand: quantile-based split of Fare into 4 bins; labels=False returns
# the integer bin index instead of interval labels.
df2['FareBand'] = pd.qcut(df2['Fare'], 4, labels=False)

# Ticket prefix (optional)
def ticket_prefix(t):
    """Return the first non-numeric token of a ticket, or 'NONE'.

    The value is converted to a string, every '.' and '/' is removed, and the
    result is split on whitespace. The first token that is not made up purely
    of digits is returned.

    Args:
        t: Ticket value; anything accepted by ``str()``.

    Returns:
        The first non-digit token, or 'NONE' when there is no such token.
    """
    cleaned = str(t).translate(str.maketrans('', '', './'))
    for token in cleaned.split():
        if not token.isdigit():
            return token
    return 'NONE'

# Store the extracted prefix of every ticket in its own column.
df2['TicketPrefix'] = df2['Ticket'].map(ticket_prefix)

# Preview the first rows of the engineered frame.
display(df2.head())

## 4) Mã hoá categorical và lưu dataset đã xử lý

In [None]:
# 4. Select & encode features
keep_cols = [
    'PassengerId','Survived','Pclass','Sex','Age','Fare',
    'Embarked','HasCabin','CabinLetter','Title','FamilySize','IsAlone',
    'AgeBin','FareBand','TicketPrefix'
]

# One copy of just the selected columns, so later edits do not touch df2.
df_final = df2[keep_cols].copy()

# One-hot encode; drop_first=True drops the first level of each variable.
# dtype=int pins the indicator columns to 0/1 integers. Without it the dtype
# follows the pandas default (bool on pandas >= 2.0), and the saved CSV would
# contain True/False instead of 0/1 depending on the installed version.
to_onehot = ['Embarked','Title','CabinLetter','TicketPrefix','AgeBin']
df_final = pd.get_dummies(df_final, columns=to_onehot, drop_first=True, dtype=int)

# Sex to binary (male -> 1, female -> 0)
df_final['Sex'] = df_final['Sex'].map({'male':1,'female':0}).astype(int)

# Check missing: total count of null cells left in the encoded frame.
print('Any nulls left:', df_final.isna().sum().sum())

# Save processed (unscaled)
os.makedirs('processed', exist_ok=True)
df_final.to_csv('processed/titanic_train_preprocessed.csv', index=False)
print('Saved processed/titanic_train_preprocessed.csv')

## 5) Chuẩn hoá số liệu (Scaling) & lưu scaler

In [None]:
# 5. Scaling numeric features
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

num_cols = ['Age','Fare','FamilySize']
scaler = StandardScaler()

# Fit the scaler on the training rows only. Fitting on every row would let
# the validation rows influence the mean/std used to scale the training data
# (data leakage). The row positions are obtained with the same test_size,
# random_state and stratify column as the split in step 6, so they must be
# kept in sync with that cell.
_train_pos, _val_pos = train_test_split(
    np.arange(len(df_final)),
    test_size=0.2,
    random_state=SEED,
    stratify=df_final['Survived'],
)
scaler.fit(df_final.iloc[_train_pos][num_cols])

# Apply the training-set statistics to all rows.
df_final[num_cols] = scaler.transform(df_final[num_cols])

# Persist the fitted scaler and the scaled dataset.
joblib.dump(scaler, 'processed/standard_scaler_titanic.pkl')
df_final.to_csv('processed/titanic_train_preprocessed_scaled.csv', index=False)
print('Saved scaled processed data and scaler')

## 6) Train/Validation split (lưu file)

In [None]:
# 6. Train/validation split
from sklearn.model_selection import train_test_split

# Features are every column except the identifier and the target.
X = df_final.drop(['PassengerId', 'Survived'], axis=1)
y = df_final['Survived']

# 80/20 split, stratified on the target and seeded with SEED.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Write each part to its own CSV file, without the index column.
for _part, _path in (
    (X_train, 'processed/X_train.csv'),
    (X_val, 'processed/X_val.csv'),
    (y_train, 'processed/y_train.csv'),
    (y_val, 'processed/y_val.csv'),
):
    _part.to_csv(_path, index=False)
print('Saved train/val splits to processed/')

## 7) Kiểm tra & visualization mẫu

In [None]:
# Quick visual checks (saves into pic/)
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 4))
# One KDE curve of Age per value of Survived, drawn on the same axes.
for _flag, _label in ((0, 'Not Survived'), (1, 'Survived')):
    sns.kdeplot(df2.loc[df2['Survived'] == _flag, 'Age'], label=_label)
plt.title('Age distribution by Survival')
plt.legend()
# Save the figure before showing it.
plt.savefig('pic/titanic_age_dist.png', bbox_inches='tight', dpi=200)
plt.show()