## Import Libraries

In [1]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [2]:
params_cfg = {
    "action"   : "main_feat01",  
    "feat_path": "../../../exps/featbase_19102025/data.npz",
    "seed"    : 42, # Set random seed
    "exp_dir" : os.path.abspath('../../../exps'),
    'exp_name': 'trainbase_19102025',
    "data_dir": os.path.abspath("../../data/titanic"),
    "verbose" : True,
}
params_cfg.update(**{
    "save_dir": os.path.abspath(f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}')
})

for v in params_cfg:
    print(f'+ {v}: {params_cfg[v]}')

globals().update(**params_cfg)

+ action: main_feat01
+ feat_path: ../../../exps/featbase_19102025/data.npz
+ seed: 42
+ exp_dir: d:\exps
+ exp_name: trainbase_19102025
+ data_dir: d:\ai_practice_prj\data\titanic
+ verbose: True
+ save_dir: d:\exps\trainbase_19102025


## Data Load

In [3]:
# data_dir = '../../../data/titanic'
# df_train = pd.read_csv(f'{data_dir}/train.csv')
# df_test = pd.read_csv(f'{data_dir}/test.csv')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df_train = pd.read_csv("/kaggle/input/titanic/train.csv")
df_train.head()
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
df_test.head()

if params_cfg["verbose"]:
    print("-"*10, "information", "-"*10)
    print(f'train-col: {set(df_train.columns)}')
    print(f'test-col: {set(df_test.columns)}')
    print("Union:", set(df_train.columns).intersection(set(df_test.columns)))
    print("Difference:", set(df_train.columns).difference(set(df_test.columns)))

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/titanic/train.csv'

## Preprocessing

In [None]:
def preprocessing_feature_01(df_data, is_train = True, is_debug = True, **kwargs):
    df_output = pd.DataFrame()

    # Sex: gioi tinh
    cls_sex = {'female': 0, 'male' : 1}
    df_output["Sex"] = df_data["Sex"].apply(lambda x: cls_sex[x])
    # Age: median
    df_output["Age"] = df_data["Age"].fillna(df_data["Age"].median())
    # Fare, Pclass
    for name in ['Fare', 'Pclass', 'SibSp', 'Parch']:
        df_output[name] = df_data[name]
    # Cabin
    cls_cabin = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'Z':0}
    df_output["Cabin"] = df_data['Cabin'].apply(lambda x: cls_cabin['Z'] if pd.isna(x) else cls_cabin[x[0]])
    # Embarked
    cls_embarked = {'0': 0, 'C':1, 'Q':2, 'S':3}
    df_output["Embarked"] =  df_data['Embarked'].apply(lambda x: cls_embarked['0'] if pd.isna(x) else cls_embarked[x])
    # Surname
    surnames = ['Capt.', 'Col.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.', 
            'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.', 'Rev.', 'Sir.', 'the']
    cls_surnames = dict(zip(surnames, range(len(surnames))))
    df_output["Surname"] = df_data['Name'].apply(lambda x: cls_surnames[x.split(',')[1].split(' ')[1]])

    if is_train:
        df_output["Output"] = df_data["Survived"]

    # display.display(df_output)

    if is_debug:
        print("head(10)")
        print(display.display(df_data.head(5)))
        print("tail(10)")
        print(display.display(df_data.tail(5)))
        print("isna")
        display.display(df_data.isna().sum())
        # Sex: gioi tinh
        print("sex")
        display.display(np.unique(df_data['Sex'], return_counts=True))
        # Age: lay median
        print(f'Age IsNa: {df_data["Age"].isna().sum()}')
        print(f"Age Median: {df_data['Age'].median()}")
        # Fare
        display.display(df_data["Fare"].describe())
        # Cabin
        print("-*10", "Cabin")
        display.display(np.unique(df_data['Cabin'].apply(
            lambda x: 'Z0' if pd.isna(x) else x), return_counts=True))
        # Embarked
        display.display(
            np.unique(df_data['Embarked'].apply(lambda x: '0' if pd.isna(x) else x), return_counts=True)
        )
        globals().update(**locals())
    
    return df_output, None
    pass

preprocessing_feature_01(df_train)

head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


None
tail(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


None
isna


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

sex


(array(['female', 'male'], dtype=object), array([314, 577]))

Age IsNa: 177
Age Median: 28.0


count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

-*10 Cabin


(array(['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A24', 'A26', 'A31',
        'A32', 'A34', 'A36', 'A5', 'A6', 'A7', 'B101', 'B102', 'B18',
        'B19', 'B20', 'B22', 'B28', 'B3', 'B30', 'B35', 'B37', 'B38',
        'B39', 'B4', 'B41', 'B42', 'B49', 'B5', 'B50', 'B51 B53 B55',
        'B57 B59 B63 B66', 'B58 B60', 'B69', 'B71', 'B73', 'B77', 'B78',
        'B79', 'B80', 'B82 B84', 'B86', 'B94', 'B96 B98', 'C101', 'C103',
        'C104', 'C106', 'C110', 'C111', 'C118', 'C123', 'C124', 'C125',
        'C126', 'C128', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C30',
        'C32', 'C45', 'C46', 'C47', 'C49', 'C50', 'C52', 'C54', 'C62 C64',
        'C65', 'C68', 'C7', 'C70', 'C78', 'C82', 'C83', 'C85', 'C86',
        'C87', 'C90', 'C91', 'C92', 'C93', 'C95', 'C99', 'D', 'D10 D12',
        'D11', 'D15', 'D17', 'D19', 'D20', 'D21', 'D26', 'D28', 'D30',
        'D33', 'D35', 'D36', 'D37', 'D45', 'D46', 'D47', 'D48', 'D49',
        'D50', 'D56', 'D6', 'D7', 'D9', 'E10', 'E101', 'E12', 'E121',

(array(['0', 'C', 'Q', 'S'], dtype=object), array([  2, 168,  77, 644]))

(     Sex   Age     Fare  Pclass  SibSp  Parch  Cabin  Embarked  Surname  \
 0      1  22.0   7.2500       3      1      0      0         3       11   
 1      0  38.0  71.2833       1      1      0      3         1       12   
 2      0  26.0   7.9250       3      0      0      0         3        8   
 3      0  35.0  53.1000       1      1      0      3         3       12   
 4      1  35.0   8.0500       3      0      0      0         3       11   
 ..   ...   ...      ...     ...    ...    ...    ...       ...      ...   
 886    1  27.0  13.0000       2      0      0      0         3       14   
 887    0  19.0  30.0000       1      0      0      2         3        8   
 888    0  28.0  23.4500       3      1      2      0         3        8   
 889    1  26.0  30.0000       1      0      0      3         1       11   
 890    1  32.0   7.7500       3      0      0      0         2       11   
 
      Output  
 0         0  
 1         1  
 2         1  
 3         1  
 4         

## FEATURE ENGINEERING

In [None]:
print("\n=== FEATURE ENGINEERING ===")

# 4.1. Trích xuất Title từ Name
df_train['Title'] = df_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df_test['Title'] = df_test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Nhóm các Title hiếm
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
    'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
    'Capt': 'Rare', 'Sir': 'Rare'
}
df_train['Title'] = df_train['Title'].map(title_mapping)
df_test['Title'] = df_test['Title'].map(title_mapping)

# 4.2. Nhóm Age theo khoảng
df_train['AgeGroup'] = pd.cut(df_train['Age'], bins=[0, 12, 18, 35, 60, 100],
                              labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
df_test['AgeGroup'] = pd.cut(df_test['Age'], bins=[0, 12, 18, 35, 60, 100],
                             labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

# 4.3. Nhóm Fare thành 4 mức
df_train['FareGroup'] = pd.qcut(df_train['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])
df_test['FareGroup'] = pd.qcut(df_test['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])

# 4.4. Tạo tính năng FamilySize
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

# 4.5. Tính năng IsAlone
df_train['IsAlone'] = (df_train['FamilySize'] == 1).astype(int)
df_test['IsAlone'] = (df_test['FamilySize'] == 1).astype(int)

print("✓ Hoàn thành Feature Engineering")


=== FEATURE ENGINEERING ===
✓ Hoàn thành Feature Engineering


## ENCODING

In [None]:
# Sex → số
df_train['Sex'] = df_train['Sex'].map({'male': 1, 'female': 0})
df_test['Sex'] = df_test['Sex'].map({'male': 1, 'female': 0})

# One-hot encoding
categorical_cols = ['Embarked', 'Pclass', 'Title', 'AgeGroup', 'FareGroup']
df_train = pd.get_dummies(df_train, columns=categorical_cols, drop_first=False)
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=False)

# Đồng bộ cột giữa train và test
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

print("✓ Hoàn thành Encoding")

✓ Hoàn thành Encoding


## SCALING

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_train['Fare_Scaled'] = scaler.fit_transform(df_train[['Fare']])
df_test['Fare_Scaled'] = scaler.transform(df_test[['Fare']])

print("✓ Hoàn thành Scaling")

drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age', 'Fare']
df_train.drop(columns=drop_cols, inplace=True, errors='ignore')
df_test.drop(columns=drop_cols, inplace=True, errors='ignore')

✓ Hoàn thành Scaling


## Train

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

print("\n=== TRAINING FULL MODEL ===")

X_train_full = df_train.drop('Survived', axis=1)
y_train_full = df_train['Survived']
X_test = df_test.reindex(columns=X_train_full.columns, fill_value=0)
X_train_full = X_train_full.fillna(X_train_full.median(numeric_only=True))
X_test = X_test.fillna(X_test.median(numeric_only=True))

model = KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=-1)
model.fit(X_train_full, y_train_full)

print("✓ Đã huấn luyện mô hình trên toàn bộ dữ liệu train")


=== TRAINING FULL MODEL ===
✓ Đã huấn luyện mô hình trên toàn bộ dữ liệu train


In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Đánh giá trên tập train
y_pred_train = model.predict(X_train_full)
y_pred_proba = model.predict_proba(X_train_full)[:, 1]  # lấy xác suất để tính ROC-AUC

# Tính các chỉ số
train_acc = accuracy_score(y_train_full, y_pred_train)
train_f1 = f1_score(y_train_full, y_pred_train)
train_roc = roc_auc_score(y_train_full, y_pred_proba)

# In kết quả
print("\n===================== KẾT QUẢ TRÊN TẬP TRAIN =====================")
print(f"1. Accuracy (Độ chính xác):     {train_acc:.4f}")
print(f"2. F1 Score:                    {train_f1:.4f}")
print(f"3. ROC-AUC Score:               {train_roc:.4f}")
print("=================================================================")

# Báo cáo chi tiết
print("\n--- Classification Report ---")
print(classification_report(y_train_full, y_pred_train, target_names=['Không sống sót', 'Sống sót']))

# Ma trận nhầm lẫn
print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_train_full, y_pred_train)
print(cm)


1. Accuracy (Độ chính xác):     0.8395
2. F1 Score:                    0.7810
3. ROC-AUC Score:               0.8956

--- Classification Report ---
                precision    recall  f1-score   support

Không sống sót       0.85      0.90      0.87       549
      Sống sót       0.82      0.75      0.78       342

      accuracy                           0.84       891
     macro avg       0.83      0.82      0.83       891
  weighted avg       0.84      0.84      0.84       891


--- Confusion Matrix ---
[[493  56]
 [ 87 255]]


In [None]:
# Orijinal test csv'sini tekrar yükle (PassengerId için)
test_data_orig = pd.read_csv("/kaggle/input/titanic/test.csv")

# best_model ile test verisi üzerinde tahmin yap
test_preds = model.predict(X_test)

# Submission dataframe'i oluştur
submission = pd.DataFrame({
    "PassengerId": test_data_orig["PassengerId"],
    "Survived": test_preds
})

# CSV olarak kaydet (kaggle ortamında)
submission.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' created.")

Submission file 'submission.csv' created.


## End