## Import Libraries

In [19]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [20]:
params_cfg = {
    "action"   : "main_feat01",  
    "feat_path": "../../../exps/featbase_19102025/data.npz",
    "seed"    : 42, # Set random seed
    "exp_dir" : os.path.abspath('../../../exps'),
    'exp_name': 'trainbase_19102025',
    "data_dir": os.path.abspath("../../data/titanic"),
    "verbose" : True,
}
params_cfg.update(**{
    "save_dir": os.path.abspath(f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}')
})

for v in params_cfg:
    print(f'+ {v}: {params_cfg[v]}')

globals().update(**params_cfg)

+ action: main_feat01
+ feat_path: ../../../exps/featbase_19102025/data.npz
+ seed: 42
+ exp_dir: /exps
+ exp_name: trainbase_19102025
+ data_dir: /data/titanic
+ verbose: True
+ save_dir: /exps/trainbase_19102025


## Data Load

In [21]:
# data_dir = '../../../data/titanic'
# df_train = pd.read_csv(f'{data_dir}/train.csv')
# df_test = pd.read_csv(f'{data_dir}/test.csv')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df_train = pd.read_csv("/kaggle/input/titanic/train.csv")
df_train.head()
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
df_test.head()

if params_cfg["verbose"]:
    print("-"*10, "information", "-"*10)
    print(f'train-col: {set(df_train.columns)}')
    print(f'test-col: {set(df_test.columns)}')
    print("Union:", set(df_train.columns).intersection(set(df_test.columns)))
    print("Difference:", set(df_train.columns).difference(set(df_test.columns)))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
---------- information ----------
train-col: {'Embarked', 'PassengerId', 'Cabin', 'Survived', 'Fare', 'Pclass', 'Ticket', 'Parch', 'Name', 'Age', 'Sex', 'SibSp'}
test-col: {'Embarked', 'PassengerId', 'Cabin', 'Fare', 'Pclass', 'Ticket', 'Parch', 'Name', 'Age', 'Sex', 'SibSp'}
Union: {'Embarked', 'PassengerId', 'Cabin', 'Fare', 'Pclass', 'Ticket', 'Parch', 'Name', 'Age', 'Sex', 'SibSp'}
Difference: {'Survived'}


## Preprocessing

In [22]:
from sklearn.preprocessing import StandardScaler
def preprocessing_feature_02(df_data, is_train=True, is_debug=True, **kwargs):
    df_output = pd.DataFrame()

    # === 1. SEX ===
    cls_sex = {'female': 0, 'male': 1}
    df_output["Sex"] = df_data["Sex"].map(cls_sex)

    # === 2. AGE ===
    # Điền thiếu theo nhóm Sex + Pclass
    df_data["Age"] = df_data["Age"].fillna(
        df_data.groupby(["Sex", "Pclass"])["Age"].transform("median")
    )
    # Nếu vẫn còn thiếu thì dùng median toàn bộ
    df_data["Age"].fillna(df_data["Age"].median(), inplace=True)
    df_output["Age"] = df_data["Age"]

    # === 3. FARE ===
    df_data["Fare"] = df_data["Fare"].fillna(df_data["Fare"].median())
    # Log-transform để giảm ngoại lai
    df_output["Fare"] = df_data["Fare"].apply(lambda x: np.log1p(x))

    # === 4. PCLASS, SIBSP, PARCH ===
    for col in ['Pclass', 'SibSp', 'Parch']:
        df_output[col] = df_data[col]

    # === 5. CABIN ===
    df_data["Cabin"] = df_data["Cabin"].fillna("Z0")
    df_output["HasCabin"] = df_data["Cabin"].notnull().astype(int)
    # Lấy ký tự đầu tiên để xác định deck
    df_data["Deck"] = df_data["Cabin"].apply(lambda x: str(x)[0])
    cls_deck = {d: i for i, d in enumerate(sorted(df_data["Deck"].unique()))}
    df_output["Deck"] = df_data["Deck"].map(cls_deck)

    # === 6. EMBARKED ===
    df_data["Embarked"].fillna(df_data["Embarked"].mode()[0], inplace=True)
    cls_embarked = {'C': 1, 'Q': 2, 'S': 3}
    df_output["Embarked"] = df_data['Embarked'].map(cls_embarked)

    # === 7. TITLE (trích xuất từ Name) ===
    df_data['Title'] = df_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
        'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
        'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
        'Capt': 'Rare', 'Sir': 'Rare'
    }
    df_data['Title'] = df_data['Title'].map(title_mapping)
    cls_title = {t: i for i, t in enumerate(df_data['Title'].dropna().unique())}
    df_output["Title"] = df_data["Title"].map(cls_title)

    # === 8. FAMILY FEATURES ===
    df_output["FamilySize"] = df_data["SibSp"] + df_data["Parch"] + 1
    df_output["IsAlone"] = (df_output["FamilySize"] == 1).astype(int)
    df_output["Fare_per_Person"] = df_data["Fare"] / df_output["FamilySize"]

    # === 9. SCALING ===
    scale_cols = ['Age', 'Fare', 'Fare_per_Person']
    scaler = StandardScaler()
    df_output[scale_cols] = scaler.fit_transform(df_output[scale_cols])

    # === 10. OUTPUT (nếu là train set) ===
    if is_train and "Survived" in df_data.columns:
        df_output["Output"] = df_data["Survived"]

    # === 11. DEBUG MODE ===
    if is_debug:
        print("===== PREPROCESSING V2 SUMMARY =====")
        print(f"Shape: {df_output.shape}")
        print("Missing values per column:")
        print(df_output.isna().sum())
        print("\nFeature sample (head):")
        display.display(df_output.head(5))
        print("\nDescribe numeric features:")
        display.display(df_output.describe())
        print("\nUnique Embarked:", df_output["Embarked"].unique())
        print("Unique Deck:", df_output["Deck"].unique())
        print("Unique Title:", df_output["Title"].unique())
        print("====================================\n")
        globals().update(**locals())

    return df_output, None
    pass

preprocessing_feature_02(df_train)

===== PREPROCESSING V2 SUMMARY =====
Shape: (891, 14)
Missing values per column:
Sex                0
Age                0
Fare               0
Pclass             0
SibSp              0
Parch              0
HasCabin           0
Deck               0
Embarked           0
Title              0
FamilySize         0
IsAlone            0
Fare_per_Person    0
Output             0
dtype: int64

Feature sample (head):


Unnamed: 0,Sex,Age,Fare,Pclass,SibSp,Parch,HasCabin,Deck,Embarked,Title,FamilySize,IsAlone,Fare_per_Person,Output
0,1,-0.534891,-0.879741,3,1,0,1,8,3,0,2,0,-0.454798,0
1,0,0.668392,1.36122,1,1,0,1,2,1,1,2,0,0.438994,1
2,0,-0.23407,-0.79854,3,0,0,1,8,3,2,1,1,-0.334757,1
3,0,0.442776,1.062038,1,1,0,1,2,3,1,2,0,0.185187,1
4,1,0.442776,-0.784179,3,0,0,1,8,3,0,1,1,-0.331267,0



Describe numeric features:


Unnamed: 0,Sex,Age,Fare,Pclass,SibSp,Parch,HasCabin,Deck,Embarked,Title,FamilySize,IsAlone,Fare_per_Person,Output
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.647587,2.03354e-16,-1.61487e-16,2.308642,0.523008,0.381594,1.0,6.716049,2.536476,0.794613,1.904602,0.602694,-2.791133e-17,0.383838
std,0.47799,1.000562,1.000562,0.836071,1.102743,0.806057,0.0,2.460739,0.791503,1.076519,1.613459,0.489615,1.000562,0.486592
min,0.0,-2.157819,-3.058578,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.555995,0.0
25%,0.0,-0.5724938,-0.8002318,2.0,0.0,0.0,1.0,8.0,2.0,0.0,1.0,0.0,-0.3536006,0.0
50%,1.0,-0.2340704,-0.2316613,3.0,0.0,0.0,1.0,8.0,3.0,0.0,1.0,1.0,-0.3242883,0.0
75%,1.0,0.5179814,0.5198636,3.0,1.0,0.0,1.0,8.0,3.0,2.0,2.0,1.0,0.1046949,1.0
max,1.0,3.827009,3.385294,3.0,8.0,6.0,1.0,8.0,3.0,4.0,11.0,1.0,13.74643,1.0



Unique Embarked: [3 1 2]
Unique Deck: [8 2 4 6 3 0 1 5 7]
Unique Title: [0 1 2 3 4]



(     Sex       Age      Fare  Pclass  SibSp  Parch  HasCabin  Deck  Embarked  \
 0      1 -0.534891 -0.879741       3      1      0         1     8         3   
 1      0  0.668392  1.361220       1      1      0         1     2         1   
 2      0 -0.234070 -0.798540       3      0      0         1     8         3   
 3      0  0.442776  1.062038       1      1      0         1     2         3   
 4      1  0.442776 -0.784179       3      0      0         1     8         3   
 ..   ...       ...       ...     ...    ...    ...       ...   ...       ...   
 886    1 -0.158865 -0.333698       2      0      0         1     8         3   
 887    0 -0.760507  0.487082       1      0      0         1     1         3   
 888    0 -0.572494  0.242007       3      1      2         1     8         3   
 889    1 -0.234070  0.487082       1      0      0         1     2         1   
 890    1  0.217161 -0.818987       3      0      0         1     8         2   
 
      Title  FamilySize  I

## FEATURE ENGINEERING

In [23]:
print("\n=== FEATURE ENGINEERING ===")

# 4.1. Trích xuất Title từ Name
df_train['Title'] = df_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df_test['Title'] = df_test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Nhóm các Title hiếm
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
    'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
    'Capt': 'Rare', 'Sir': 'Rare'
}
df_train['Title'] = df_train['Title'].map(title_mapping)
df_test['Title'] = df_test['Title'].map(title_mapping)

# 4.2. Nhóm Age theo khoảng
df_train['AgeGroup'] = pd.cut(df_train['Age'], bins=[0, 12, 18, 35, 60, 100],
                              labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
df_test['AgeGroup'] = pd.cut(df_test['Age'], bins=[0, 12, 18, 35, 60, 100],
                             labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

# 4.3. Nhóm Fare thành 4 mức
df_train['FareGroup'] = pd.qcut(df_train['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])
df_test['FareGroup'] = pd.qcut(df_test['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])

# 4.4. Tạo tính năng FamilySize
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

# 4.5. Tính năng IsAlone
df_train['IsAlone'] = (df_train['FamilySize'] == 1).astype(int)
df_test['IsAlone'] = (df_test['FamilySize'] == 1).astype(int)

# 4.6 Fare cho 1 người
df_train['Fare_per_Person'] = df_train['Fare'] / df_train['FamilySize']
df_test['Fare_per_Person'] =  df_test['Fare'] / df_test['FamilySize']

# 4.7 Thêm has Cabin từ Cabin
df_train['Has_Cabin'] = df_train['Cabin'].notnull().astype(int)
df_test['Has_Cabin'] = df_test['Cabin'].notnull().astype(int)

# 4.8 Deck từ Cabin
df_train['Deck'] = df_train['Cabin'].astype(str).str[0]
df_test['Deck'] = df_test['Cabin'].astype(str).str[0]

# 4.9 --- Các đặc trưng kết hợp ---
df_train['Sex_Pclass'] = df_train['Sex'].astype(str) + "_" + df_train['Pclass'].astype(str)
df_test['Sex_Pclass'] = df_test['Sex'].astype(str) + "_" + df_test['Pclass'].astype(str)

df_train['Is_Mother'] = ((df_train['Sex'] == 0) & (df_train['Parch'] > 0) & (df_train['Age'] > 18)).astype(int)
df_test['Is_Mother'] = ((df_test['Sex'] == 0) & (df_test['Parch'] > 0) & (df_test['Age'] > 18)).astype(int)

df_train['Is_Boy'] = ((df_train['Sex'] == 1) & (df_train['Age'] < 16)).astype(int)
df_test['Is_Boy'] = ((df_test['Sex'] == 1) & (df_test['Age'] < 16)).astype(int)

print("✓ Hoàn thành Feature Engineering")

print("\n=== AFTER FEATURE ENGINEERING ===")
cols_new = ['Title', 'AgeGroup', 'FareGroup', 'FamilySize', 'IsAlone', 'Fare_per_Person', 'Has_Cabin', 'Deck', 'Sex_Pclass', 'Is_Mother', 'Is_Boy']
print(df_train[cols_new].head(10))

print("Danh sách cột hiện tại:")
print(df_train.columns.tolist())

print(df_train[['Title', 'AgeGroup', 'FareGroup', 'FamilySize', 'IsAlone', 'Fare_per_Person', 'Has_Cabin', 'Deck', 'Sex_Pclass', 'Is_Mother', 'Is_Boy']].describe(include='all'))


=== FEATURE ENGINEERING ===
✓ Hoàn thành Feature Engineering

=== AFTER FEATURE ENGINEERING ===
    Title AgeGroup FareGroup  FamilySize  IsAlone  Fare_per_Person  Has_Cabin  \
0      Mr    Adult       Low           2        0          3.62500          1   
1     Mrs   Middle  VeryHigh           2        0         35.64165          1   
2    Miss    Adult       Mid           1        1          7.92500          1   
3     Mrs    Adult  VeryHigh           2        0         26.55000          1   
4      Mr    Adult       Mid           1        1          8.05000          1   
5      Mr    Adult       Mid           1        1          8.45830          1   
6      Mr   Middle  VeryHigh           1        1         51.86250          1   
7  Master    Child      High           5        0          4.21500          1   
8     Mrs    Adult       Mid           3        0          3.71110          1   
9     Mrs     Teen      High           2        0         15.03540          1   

  Deck Sex_

## ENCODING

In [24]:
# Sex → số
df_train['Sex'] = df_train['Sex'].map({'male': 1, 'female': 0})
df_test['Sex'] = df_test['Sex'].map({'male': 1, 'female': 0})

# One-hot encoding
categorical_cols = ['Embarked', 'Pclass', 'Title', 'AgeGroup', 'FareGroup', 'Has_Cabin', 'Deck', 'Sex_Pclass', 'Is_Mother', 'Is_Boy']
df_train = pd.get_dummies(df_train, columns=categorical_cols, drop_first=False)
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=False)

# Đồng bộ cột giữa train và test
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

print("✓ Hoàn thành Encoding")

missing_in_test = set(df_train.columns) - set(df_test.columns)
extra_in_test = set(df_test.columns) - set(df_train.columns)

print("Thiếu trong test:", missing_in_test)
print("Thừa trong test:", extra_in_test)

print(df_train.head())
encoded_cols = [col for col in df_train.columns if any(prefix in col for prefix in ['Embarked_', 'Pclass_', 'Title_', 'AgeGroup_', 'FareGroup_', 'Has_Cabin_', 'Deck_', 'Sex_Pclass_', 'Is_Mother_', 'Is_Boy_'])]
print(df_train[encoded_cols].head())

✓ Hoàn thành Encoding
Thiếu trong test: set()
Thừa trong test: set()
   PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

   Sex   Age  SibSp  Parch            Ticket     Fare Cabin  ...  Deck_T  \
0    1  22.0      1      0         A/5 21171   7.2500    Z0  ...   False   
1    0  38.0      1      0          PC 17599  71.2833   C85  ...   False   
2    0  26.0      0      0  STON/O2. 3101282   7.9250    Z0  ...   False   
3    0  35.0      1      0            113803  53.1000  C123  ...   False   
4    1  35.0      0      0            373450   8.0500    Z0  ...   False   


## SCALING

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# === Chuẩn bị dữ liệu ===
target = 'Survived'
feature_cols = [c for c in df_train.columns if c != target]

# Encode các cột dạng chuỗi
for col in df_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col].astype(str))
    if col in df_test.columns:
        df_test[col] = df_test[col].map(lambda x: x if x in le.classes_ else le.classes_[0])
        df_test[col] = le.transform(df_test[col].astype(str))

# Khớp cột giữa train/test
df_test = df_test.reindex(columns=feature_cols, fill_value=0)

X_train_full = df_train[feature_cols].fillna(0)
y_train_full = df_train[target]
X_test = df_test.fillna(0)

# === Xác định các cột ===
num_cols = [c for c in ['Age', 'Fare', 'SibSp', 'Parch'] if c in df_train.columns]
cat_cols = [c for c in ['Sex', 'Embarked', 'Pclass', 'Title', 'AgeGroup', 'FareGroup', 'Has_Cabin', 'Deck', 'Sex_Pclass', 'Is_Mother', 'Is_Boy'] if c in df_train.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='drop'
)

print("\nTrước khi Scaled:")
print(display.display(df_train[num_cols].describe()))

scaler = StandardScaler()
df_train[num_cols] = scaler.fit_transform(df_train[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])


print("\nSau khi Scaled:")
print(display.display(df_train[num_cols].describe()))

print("✓ Hoàn thành Pipeline")


Trước khi Scaled:


Unnamed: 0,Age,Fare,SibSp,Parch
count,891.0,891.0,891.0,891.0
mean,29.112424,32.204208,0.523008,0.381594
std,13.304424,49.693429,1.102743,0.806057
min,0.42,0.0,0.0,0.0
25%,21.5,7.9104,0.0,0.0
50%,26.0,14.4542,0.0,0.0
75%,36.0,31.0,1.0,0.0
max,80.0,512.3292,8.0,6.0


None

Sau khi Scaled:


Unnamed: 0,Age,Fare,SibSp,Parch
count,891.0,891.0,891.0,891.0
mean,2.03354e-16,-1.993666e-18,3.5886e-17,5.681949000000001e-17
std,1.000562,1.000562,1.000562,1.000562
min,-2.157819,-0.6484217,-0.4745452,-0.4736736
25%,-0.5724938,-0.4891482,-0.4745452,-0.4736736
50%,-0.2340704,-0.3573909,-0.4745452,-0.4736736
75%,0.5179814,-0.02424635,0.4327934,-0.4736736
max,3.827009,9.667167,6.784163,6.974147


None
✓ Hoàn thành Pipeline


## Train

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, 
    precision_score, recall_score, classification_report, confusion_matrix
)
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils.multiclass import type_of_target
import numpy as np
import pandas as pd

# === Chuẩn bị dữ liệu ===
target = 'Survived'
feature_cols = [c for c in df_train.columns if c != target]

# Encode các cột dạng chuỗi
for col in df_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col].astype(str))
    if col in df_test.columns:
        # tránh lỗi khi test có giá trị chưa thấy trong train
        df_test[col] = df_test[col].map(lambda x: x if x in le.classes_ else le.classes_[0])
        df_test[col] = le.transform(df_test[col].astype(str))

# Khớp cột giữa train/test
df_test = df_test.reindex(columns=feature_cols, fill_value=0)

X_train_full = df_train[feature_cols].fillna(0)
y_train_full = df_train[target]
X_test = df_test.fillna(0)

# === Thiết lập tham số ===
params = {"random_state": 42}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === Tạo các pipeline riêng ===
log_clf = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(max_iter=2000, solver='lbfgs', random_state=42))
])

svc_clf = Pipeline([
    ('preprocess', preprocessor),
    ('clf', SVC(kernel='rbf', probability=True, random_state=42))
])

knn_clf = Pipeline([
    ('preprocess', preprocessor),
    ('clf', KNeighborsClassifier(n_neighbors=10, weights='distance', n_jobs=-1))
])

# === Lưới tham số ===
param_grids = {
    'LogisticRegression': {
        'clf__C': [0.01, 0.1, 1, 10],
        'clf__solver': ['lbfgs', 'liblinear']
    },
    'SVC': {
        'clf__C': [0.1, 1, 10],
        'clf__gamma': ['scale', 'auto']
    },
    'KNeighbors': {
        'clf__n_neighbors': [3, 5, 7, 9, 11],
        'clf__weights': ['uniform', 'distance']
    }
}

# === Huấn luyện GridSearchCV cho từng model ===
grid_results = {}
for name, clf in zip(['LogisticRegression', 'SVC', 'KNeighbors'], [log_clf, svc_clf, knn_clf]):
    print(f"\n=== GridSearchCV cho {name} ===")
    grid = GridSearchCV(
        clf,
        param_grids[name],
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train_full, y_train_full)
    grid_results[name] = grid.best_estimator_
    print(f"Best Params for {name}: {grid.best_params_}")
    print(f"Best CV Accuracy: {grid.best_score_:.4f}")

# === Voting Ensemble ===
voting_clf = VotingClassifier(
    estimators=[
        ('lr', grid_results['LogisticRegression']),
        ('svc', grid_results['SVC']),
        ('knn', grid_results['KNeighbors'])
    ],
    voting='soft',
    weights=[2, 1, 3]
)

# === Đánh giá mô hình ===
model_map = {
    'LogisticRegression': grid_results['LogisticRegression'],
    'SVC': grid_results['SVC'],
    'KNeighbors': grid_results['KNeighbors'],
    'Voting': voting_clf
}

results = {}
best_score = -1
best_name = None
best_model = None

for name, clf in model_map.items():
    print(f"\nĐang đánh giá {name} ...")
    acc_scores = cross_val_score(clf, X_train_full, y_train_full, cv=cv, scoring='accuracy')
    f1_scores  = cross_val_score(clf, X_train_full, y_train_full, cv=cv, scoring='f1')
    roc_scores = cross_val_score(clf, X_train_full, y_train_full, cv=cv, scoring='roc_auc')

    results[name] = {
        'acc_mean': np.mean(acc_scores),
        'f1_mean': np.mean(f1_scores),
        'roc_mean': np.mean(roc_scores)
    }

    print(f"{name} | Accuracy: {np.mean(acc_scores):.4f} ± {np.std(acc_scores):.4f}")
    print(f"F1-score: {np.mean(f1_scores):.4f} | ROC-AUC: {np.mean(roc_scores):.4f}")

    if np.mean(acc_scores) > best_score:
        best_score = np.mean(acc_scores)
        best_name = name
        best_model = clf

print("\n=== Tổng hợp KFold results ===")
for name, met in results.items():
    print(f"{name:20s} acc={met['acc_mean']:.4f} f1={met['f1_mean']:.4f} roc={met['roc_mean']:.4f}")
print(f"\nBest model by CV accuracy: {best_name} ({best_score:.4f})")


=== GridSearchCV cho LogisticRegression ===
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params for LogisticRegression: {'clf__C': 10, 'clf__solver': 'lbfgs'}
Best CV Accuracy: 0.7890

=== GridSearchCV cho SVC ===
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params for SVC: {'clf__C': 1, 'clf__gamma': 'auto'}
Best CV Accuracy: 0.8260

=== GridSearchCV cho KNeighbors ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Params for KNeighbors: {'clf__n_neighbors': 5, 'clf__weights': 'uniform'}
Best CV Accuracy: 0.8103

Đang đánh giá LogisticRegression ...
LogisticRegression | Accuracy: 0.7890 ± 0.0168
F1-score: 0.7100 | ROC-AUC: 0.8280

Đang đánh giá SVC ...
SVC | Accuracy: 0.8260 ± 0.0172
F1-score: 0.7609 | ROC-AUC: 0.8362

Đang đánh giá KNeighbors ...
KNeighbors | Accuracy: 0.8103 ± 0.0225
F1-score: 0.7488 | ROC-AUC: 0.8460

Đang đánh giá Voting ...
Voting | Accuracy: 0.8170 ± 0.0185
F1-score: 0.7489 | ROC-AUC: 0.8572

=== Tổng h

In [27]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

final_model = best_model
final_model.fit(X_train_full, y_train_full)
y_pred_train = final_model.predict(X_train_full)

try:
    y_pred_proba_train = final_model.predict_proba(X_train_full)[:, 1]
    train_roc = roc_auc_score(y_train_full, y_pred_proba_train)
except:
    train_roc = np.nan

train_acc = accuracy_score(y_train_full, y_pred_train)
train_f1 = f1_score(y_train_full, y_pred_train)
train_prec = precision_score(y_train_full, y_pred_train)
train_rec = recall_score(y_train_full, y_pred_train)

# ======================
# Báo cáo & Ma trận nhầm lẫn
# ======================
print("\n===================== KẾT QUẢ TRÊN TẬP TRAIN (Mô hình Tối ưu) =====================")
print(f"Mô hình: {best_name}")
print(f"Accuracy : {train_acc:.4f}")
print(f"Precision: {train_prec:.4f}")
print(f"Recall   : {train_rec:.4f}")
print(f"F1 Score : {train_f1:.4f}")
if not np.isnan(train_roc):
    print(f"ROC-AUC  : {train_roc:.4f}")
else:
    print("ROC-AUC  : N/A")
print("===================================================================================")

print("\n--- Classification Report ---")
print(classification_report(y_train_full, y_pred_train, target_names=['Không sống sót (0)', 'Sống sót (1)']))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_train_full, y_pred_train)
print("Ma trận nhầm lẫn (Dòng: Thực tế, Cột: Dự đoán):")
print(cm)

TN, FP, FN, TP = cm.ravel()
print(f"\nTrue Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"True Positives (TP): {TP}")


Mô hình: SVC
Accuracy : 0.8328
Precision: 0.8123
Recall   : 0.7339
F1 Score : 0.7711
ROC-AUC  : 0.8635

--- Classification Report ---
                    precision    recall  f1-score   support

Không sống sót (0)       0.84      0.89      0.87       549
      Sống sót (1)       0.81      0.73      0.77       342

          accuracy                           0.83       891
         macro avg       0.83      0.81      0.82       891
      weighted avg       0.83      0.83      0.83       891


--- Confusion Matrix ---
Ma trận nhầm lẫn (Dòng: Thực tế, Cột: Dự đoán):
[[491  58]
 [ 91 251]]

True Negatives (TN): 491
False Positives (FP): 58
False Negatives (FN): 91
True Positives (TP): 251


In [28]:
# Orijinal test csv'sini tekrar yükle (PassengerId için)
test_data_orig = pd.read_csv("/kaggle/input/titanic/test.csv")

# best_model ile test verisi üzerinde tahmin yap
test_preds = best_model.predict(X_test)

# Submission dataframe'i oluştur
submission = pd.DataFrame({
    "PassengerId": test_data_orig["PassengerId"],
    "Survived": test_preds
})

# CSV olarak kaydet (kaggle ortamında)
submission.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' created.")

Submission file 'submission.csv' created.


## End