## Import Libraries

In [10]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [11]:
params_cfg = {
    "action"   : "main_feat01",  
    "feat_path": "../../../exps/featbase_19102025/data.npz",
    "seed"    : 42, # Set random seed
    "exp_dir" : os.path.abspath('../../../exps'),
    'exp_name': 'trainbase_19102025',
    "data_dir": os.path.abspath("../../data/titanic"),
    "verbose" : True,
}
params_cfg.update(**{
    "save_dir": os.path.abspath(f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}')
})

for v in params_cfg:
    print(f'+ {v}: {params_cfg[v]}')

globals().update(**params_cfg)

+ action: main_feat01
+ feat_path: ../../../exps/featbase_19102025/data.npz
+ seed: 42
+ exp_dir: /exps
+ exp_name: trainbase_19102025
+ data_dir: /data/titanic
+ verbose: True
+ save_dir: /exps/trainbase_19102025


## Data Load

In [12]:
# data_dir = '../../../data/titanic'
# df_train = pd.read_csv(f'{data_dir}/train.csv')
# df_test = pd.read_csv(f'{data_dir}/test.csv')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df_train = pd.read_csv("/kaggle/input/titanic/train.csv")
df_train.head()
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
df_test.head()

if params_cfg["verbose"]:
    print("-"*10, "information", "-"*10)
    print(f'train-col: {set(df_train.columns)}')
    print(f'test-col: {set(df_test.columns)}')
    print("Union:", set(df_train.columns).intersection(set(df_test.columns)))
    print("Difference:", set(df_train.columns).difference(set(df_test.columns)))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
---------- information ----------
train-col: {'Fare', 'SibSp', 'Parch', 'Sex', 'Age', 'Cabin', 'Embarked', 'Ticket', 'Survived', 'Name', 'Pclass', 'PassengerId'}
test-col: {'Fare', 'SibSp', 'Parch', 'Sex', 'Age', 'Cabin', 'Embarked', 'Ticket', 'Name', 'Pclass', 'PassengerId'}
Union: {'Fare', 'SibSp', 'Parch', 'Sex', 'Age', 'Embarked', 'Ticket', 'Name', 'Pclass', 'Cabin', 'PassengerId'}
Difference: {'Survived'}


## Preprocessing

In [13]:
def preprocessing_feature_01(df_data, is_train = True, is_debug = True, **kwargs):
    df_output = pd.DataFrame()

    # Sex: gioi tinh
    cls_sex = {'female': 0, 'male' : 1}
    df_output["Sex"] = df_data["Sex"].apply(lambda x: cls_sex[x])
    # Age: median
    df_output["Age"] = df_data["Age"].fillna(df_data["Age"].median())
    # Fare, Pclass
    for name in ['Fare', 'Pclass', 'SibSp', 'Parch']:
        df_output[name] = df_data[name]
    # Cabin
    cls_cabin = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'Z':0}
    df_output["Cabin"] = df_data['Cabin'].apply(lambda x: cls_cabin['Z'] if pd.isna(x) else cls_cabin[x[0]])
    # Embarked
    cls_embarked = {'0': 0, 'C':1, 'Q':2, 'S':3}
    df_output["Embarked"] =  df_data['Embarked'].apply(lambda x: cls_embarked['0'] if pd.isna(x) else cls_embarked[x])
    # Surname
    surnames = ['Capt.', 'Col.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.', 
            'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.', 'Rev.', 'Sir.', 'the']
    cls_surnames = dict(zip(surnames, range(len(surnames))))
    df_output["Surname"] = df_data['Name'].apply(lambda x: cls_surnames[x.split(',')[1].split(' ')[1]])

    if is_train:
        df_output["Output"] = df_data["Survived"]

    # display.display(df_output)

    if is_debug:
        print("head(10)")
        print(display.display(df_data.head(5)))
        print("tail(10)")
        print(display.display(df_data.tail(5)))
        print("isna")
        display.display(df_data.isna().sum())
        # Sex: gioi tinh
        print("sex")
        display.display(np.unique(df_data['Sex'], return_counts=True))
        # Age: lay median
        print(f'Age IsNa: {df_data["Age"].isna().sum()}')
        print(f"Age Median: {df_data['Age'].median()}")
        # Fare
        display.display(df_data["Fare"].describe())
        # Cabin
        print("-*10", "Cabin")
        display.display(np.unique(df_data['Cabin'].apply(
            lambda x: 'Z0' if pd.isna(x) else x), return_counts=True))
        # Embarked
        display.display(
            np.unique(df_data['Embarked'].apply(lambda x: '0' if pd.isna(x) else x), return_counts=True)
        )
        globals().update(**locals())
    
    return df_output, None
    pass

preprocessing_feature_01(df_train)

head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


None
tail(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


None
isna


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

sex


(array(['female', 'male'], dtype=object), array([314, 577]))

Age IsNa: 177
Age Median: 28.0


count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

-*10 Cabin


(array(['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A24', 'A26', 'A31',
        'A32', 'A34', 'A36', 'A5', 'A6', 'A7', 'B101', 'B102', 'B18',
        'B19', 'B20', 'B22', 'B28', 'B3', 'B30', 'B35', 'B37', 'B38',
        'B39', 'B4', 'B41', 'B42', 'B49', 'B5', 'B50', 'B51 B53 B55',
        'B57 B59 B63 B66', 'B58 B60', 'B69', 'B71', 'B73', 'B77', 'B78',
        'B79', 'B80', 'B82 B84', 'B86', 'B94', 'B96 B98', 'C101', 'C103',
        'C104', 'C106', 'C110', 'C111', 'C118', 'C123', 'C124', 'C125',
        'C126', 'C128', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C30',
        'C32', 'C45', 'C46', 'C47', 'C49', 'C50', 'C52', 'C54', 'C62 C64',
        'C65', 'C68', 'C7', 'C70', 'C78', 'C82', 'C83', 'C85', 'C86',
        'C87', 'C90', 'C91', 'C92', 'C93', 'C95', 'C99', 'D', 'D10 D12',
        'D11', 'D15', 'D17', 'D19', 'D20', 'D21', 'D26', 'D28', 'D30',
        'D33', 'D35', 'D36', 'D37', 'D45', 'D46', 'D47', 'D48', 'D49',
        'D50', 'D56', 'D6', 'D7', 'D9', 'E10', 'E101', 'E12', 'E121',

(array(['0', 'C', 'Q', 'S'], dtype=object), array([  2, 168,  77, 644]))

(     Sex   Age     Fare  Pclass  SibSp  Parch  Cabin  Embarked  Surname  \
 0      1  22.0   7.2500       3      1      0      0         3       11   
 1      0  38.0  71.2833       1      1      0      3         1       12   
 2      0  26.0   7.9250       3      0      0      0         3        8   
 3      0  35.0  53.1000       1      1      0      3         3       12   
 4      1  35.0   8.0500       3      0      0      0         3       11   
 ..   ...   ...      ...     ...    ...    ...    ...       ...      ...   
 886    1  27.0  13.0000       2      0      0      0         3       14   
 887    0  19.0  30.0000       1      0      0      2         3        8   
 888    0  28.0  23.4500       3      1      2      0         3        8   
 889    1  26.0  30.0000       1      0      0      3         1       11   
 890    1  32.0   7.7500       3      0      0      0         2       11   
 
      Output  
 0         0  
 1         1  
 2         1  
 3         1  
 4         

## FEATURE ENGINEERING

In [14]:
print("\n=== FEATURE ENGINEERING ===")

# 4.1. Trích xuất Title từ Name
df_train['Title'] = df_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df_test['Title'] = df_test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Nhóm các Title hiếm
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
    'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
    'Capt': 'Rare', 'Sir': 'Rare'
}
df_train['Title'] = df_train['Title'].map(title_mapping)
df_test['Title'] = df_test['Title'].map(title_mapping)

# 4.2. Nhóm Age theo khoảng
df_train['AgeGroup'] = pd.cut(df_train['Age'], bins=[0, 12, 18, 35, 60, 100],
                              labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
df_test['AgeGroup'] = pd.cut(df_test['Age'], bins=[0, 12, 18, 35, 60, 100],
                             labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

# 4.3. Nhóm Fare thành 4 mức
df_train['FareGroup'] = pd.qcut(df_train['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])
df_test['FareGroup'] = pd.qcut(df_test['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])

# 4.4. Tạo tính năng FamilySize
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

# 4.5. Tính năng IsAlone
df_train['IsAlone'] = (df_train['FamilySize'] == 1).astype(int)
df_test['IsAlone'] = (df_test['FamilySize'] == 1).astype(int)

# 4.6 Fare cho 1 người
df_train['Fare_per_Person'] = df_train['Fare'] / df_train['FamilySize']

print("✓ Hoàn thành Feature Engineering")

print("\n=== AFTER FEATURE ENGINEERING ===")
cols_new = ['Title', 'AgeGroup', 'FareGroup', 'FamilySize', 'IsAlone', 'Fare_per_Person']
print(df_train[cols_new].head(10))

print("Danh sách cột hiện tại:")
print(df_train.columns.tolist())

print(df_train[['Title', 'AgeGroup', 'FareGroup', 'FamilySize', 'IsAlone', 'Fare_per_Person']].describe(include='all'))


=== FEATURE ENGINEERING ===
✓ Hoàn thành Feature Engineering

=== AFTER FEATURE ENGINEERING ===
    Title AgeGroup FareGroup  FamilySize  IsAlone  Fare_per_Person
0      Mr    Adult       Low           2        0          3.62500
1     Mrs   Middle  VeryHigh           2        0         35.64165
2    Miss    Adult       Mid           1        1          7.92500
3     Mrs    Adult  VeryHigh           2        0         26.55000
4      Mr    Adult       Mid           1        1          8.05000
5      Mr      NaN       Mid           1        1          8.45830
6      Mr   Middle  VeryHigh           1        1         51.86250
7  Master    Child      High           5        0          4.21500
8     Mrs    Adult       Mid           3        0          3.71110
9     Mrs     Teen      High           2        0         15.03540
Danh sách cột hiện tại:
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'AgeGroup', 'Far

## ENCODING

In [15]:
# Sex → số
df_train['Sex'] = df_train['Sex'].map({'male': 1, 'female': 0})
df_test['Sex'] = df_test['Sex'].map({'male': 1, 'female': 0})

# One-hot encoding
categorical_cols = ['Embarked', 'Pclass', 'Title', 'AgeGroup', 'FareGroup']
df_train = pd.get_dummies(df_train, columns=categorical_cols, drop_first=False)
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=False)

# Đồng bộ cột giữa train và test
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

print("✓ Hoàn thành Encoding")

missing_in_test = set(df_train.columns) - set(df_test.columns)
extra_in_test = set(df_test.columns) - set(df_train.columns)

print("Thiếu trong test:", missing_in_test)
print("Thừa trong test:", extra_in_test)

print(df_train.head())
encoded_cols = [col for col in df_train.columns if any(prefix in col for prefix in ['Embarked_', 'Pclass_', 'Title_', 'AgeGroup_', 'FareGroup_'])]
print(df_train[encoded_cols].head())

✓ Hoàn thành Encoding
Thiếu trong test: set()
Thừa trong test: set()
   PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

   Sex   Age  SibSp  Parch            Ticket     Fare Cabin  ...  Title_Rare  \
0    1  22.0      1      0         A/5 21171   7.2500   NaN  ...       False   
1    0  38.0      1      0          PC 17599  71.2833   C85  ...       False   
2    0  26.0      0      0  STON/O2. 3101282   7.9250   NaN  ...       False   
3    0  35.0      1      0            113803  53.1000  C123  ...       False   
4    1  35.0      0      0            373450   8.0500   

## SCALING

In [16]:
from sklearn.preprocessing import StandardScaler

target = 'Survived'
feature_cols = [
    c for c in df_train.columns 
    if c != target and c not in ['PassengerId', 'Name', 'Ticket', 'Cabin']
]
num_cols = ['Age', 'Fare', 'SibSp', 'Parch'] 
cat_cols = ['Sex', 'Embarked', 'Pclass', 'Title', 'AgeGroup', 'FareGroup']
num_cols = [c for c in df_train.select_dtypes(include=['int64', 'float64']).columns if c in feature_cols]
if 'Survived' in num_cols:
    num_cols.remove('Survived') 

# One-hot encode các biến phân loại
df_train = pd.get_dummies(df_train, drop_first=True)
df_test = pd.get_dummies(df_test, drop_first=True)

# Căn chỉnh cột giữa train/test cho khớp nhau
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

print("\nTrước khi Scaled:")
print(display.display(df_train[num_cols].describe()))

scaler = StandardScaler()
df_train[num_cols] = scaler.fit_transform(df_train[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])


print("\nSau khi Scaled:")
print(display.display(df_train[num_cols].describe()))

print("✓ Hoàn thành Scaling")


Trước khi Scaled:


Unnamed: 0,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Fare_per_Person
count,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.647587,29.699118,0.523008,0.381594,32.204208,1.904602,0.602694,19.916375
std,0.47799,14.526497,1.102743,0.806057,49.693429,1.613459,0.489615,35.841257
min,0.0,0.42,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,20.125,0.0,0.0,7.9104,1.0,0.0,7.25
50%,1.0,28.0,0.0,0.0,14.4542,1.0,1.0,8.3
75%,1.0,38.0,1.0,0.0,31.0,2.0,1.0,23.666667
max,1.0,80.0,8.0,6.0,512.3292,11.0,1.0,512.3292


None

Sau khi Scaled:


Unnamed: 0,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Fare_per_Person
count,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,-1.31582e-16,2.388379e-16,3.5886e-17,5.681949000000001e-17,-1.993666e-18,-2.3924000000000003e-17,-7.974666000000001e-17,-2.791133e-17
std,1.000562,1.000701,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562
min,-1.355574,-2.016979,-0.4745452,-0.4736736,-0.6484217,-0.5609748,-1.231645,-0.555995
25%,-1.355574,-0.6595416,-0.4745452,-0.4736736,-0.4891482,-0.5609748,-1.231645,-0.3536006
50%,0.7376951,-0.1170488,-0.4745452,-0.4736736,-0.3573909,-0.5609748,0.8119223,-0.3242883
75%,0.7376951,0.571831,0.4327934,-0.4736736,-0.02424635,0.05915988,0.8119223,0.1046949
max,0.7376951,3.465126,6.784163,6.974147,9.667167,5.640372,0.8119223,13.74643


None
✓ Hoàn thành Scaling


## Train

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, 
    precision_score, recall_score, classification_report, confusion_matrix
)
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils.multiclass import type_of_target
import numpy as np
import pandas as pd

# === Chuẩn bị dữ liệu ===
target = 'Survived'
feature_cols = [c for c in df_train.columns if c != target]

# Encode các cột dạng chuỗi
for col in df_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col].astype(str))
    if col in df_test.columns:
        # tránh lỗi khi test có giá trị chưa thấy trong train
        df_test[col] = df_test[col].map(lambda x: x if x in le.classes_ else le.classes_[0])
        df_test[col] = le.transform(df_test[col].astype(str))

# Khớp cột giữa train/test
df_test = df_test.reindex(columns=feature_cols, fill_value=0)

X_train_full = df_train[feature_cols].fillna(0)
y_train_full = df_train[target]
X_test = df_test.fillna(0)

# === Thiết lập tham số ===
params = {"random_state": 42}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === Định nghĩa mô hình với pipeline (tự động scale) ===
log_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=2000, solver='lbfgs', random_state=params["random_state"])
)
svc_clf = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', probability=True, random_state=params["random_state"])
)
knn_clf = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=10, weights='distance', n_jobs=-1)
)

# === Lưới tham số để GridSearchCV ===
param_grids = {
    'LogisticRegression': {
        'logisticregression__C': [0.01, 0.1, 1, 10],
        'logisticregression__solver': ['lbfgs', 'liblinear']
    },
    'SVC': {
        'svc__C': [0.1, 1, 10],
        'svc__gamma': ['scale', 'auto']
    },
    'KNeighbors': {
        'kneighborsclassifier__n_neighbors': [3, 5, 7, 9, 11],
        'kneighborsclassifier__weights': ['uniform', 'distance']
    }
}

# === Huấn luyện GridSearchCV cho từng model ===
grid_results = {}
for name, clf in zip(['LogisticRegression', 'SVC', 'KNeighbors'], [log_clf, svc_clf, knn_clf]):
    print(f"\n=== GridSearchCV cho {name} ===")
    grid = GridSearchCV(
        clf,
        param_grids[name],
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train_full, y_train_full)
    grid_results[name] = grid.best_estimator_
    print(f"Best Params for {name}: {grid.best_params_}")
    print(f"Best CV Accuracy: {grid.best_score_:.4f}")

# === Tạo Voting Ensemble ===
voting_clf = VotingClassifier(
    estimators=[
        ('lr', grid_results['LogisticRegression']),
        ('svc', grid_results['SVC']),
        ('knn', grid_results['KNeighbors'])
    ],
    voting='soft',
    weights=[2, 1, 3]
)

# === Đánh giá ===
model_map = {
    'LogisticRegression': grid_results['LogisticRegression'],
    'SVC': grid_results['SVC'],
    'KNeighbors': grid_results['KNeighbors'],
    'Voting': voting_clf
}

results = {}
best_score = -1
best_name = None
best_model = None

for name, clf in model_map.items():
    print(f"\nĐang đánh giá {name} ...")
    acc_scores = cross_val_score(clf, X_train_full, y_train_full, cv=cv, scoring='accuracy')
    f1_scores  = cross_val_score(clf, X_train_full, y_train_full, cv=cv, scoring='f1')

    target_type = type_of_target(y_train_full)
    if target_type == 'binary':
        roc_scores = cross_val_score(clf, X_train_full, y_train_full, cv=cv, scoring='roc_auc')
    else:
        roc_scores = [np.nan] * cv.get_n_splits()

    results[name] = {
        'acc_mean': np.mean(acc_scores),
        'f1_mean': np.mean(f1_scores),
        'roc_mean': np.nanmean(roc_scores)
    }

    print(f"{name} | Accuracy: {np.mean(acc_scores):.4f} ± {np.std(acc_scores):.4f}")
    print(f"F1-score: {np.mean(f1_scores):.4f} | ROC-AUC: {np.nanmean(roc_scores):.4f}")

    if np.mean(acc_scores) > best_score:
        best_score = np.mean(acc_scores)
        best_name = name
        best_model = clf

# === Tổng hợp kết quả ===
print("\n=== Tổng hợp KFold results ===")
for name, met in results.items():
    print(f"{name:20s} acc={met['acc_mean']:.4f} f1={met['f1_mean']:.4f} roc={met['roc_mean']:.4f}")
print(f"\nBest model by CV accuracy: {best_name} ({best_score:.4f})")


=== GridSearchCV cho LogisticRegression ===
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params for LogisticRegression: {'logisticregression__C': 0.01, 'logisticregression__solver': 'liblinear'}
Best CV Accuracy: 0.8339

=== GridSearchCV cho SVC ===
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params for SVC: {'svc__C': 10, 'svc__gamma': 'auto'}
Best CV Accuracy: 0.7183

=== GridSearchCV cho KNeighbors ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Params for KNeighbors: {'kneighborsclassifier__n_neighbors': 11, 'kneighborsclassifier__weights': 'uniform'}
Best CV Accuracy: 0.5872

Đang đánh giá LogisticRegression ...
LogisticRegression | Accuracy: 0.8339 ± 0.0175
F1-score: 0.7842 | ROC-AUC: 0.8815

Đang đánh giá SVC ...
SVC | Accuracy: 0.7183 ± 0.0225
F1-score: 0.4536 | ROC-AUC: 0.8548

Đang đánh giá KNeighbors ...
KNeighbors | Accuracy: 0.5872 ± 0.0850
F1-score: 0.1637 | ROC-AUC: 0.5405

Đang đánh giá Voting ...
Voting | 

In [18]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

final_model = best_model
final_model.fit(X_train_full, y_train_full)
y_pred_train = final_model.predict(X_train_full)

try:
    y_pred_proba_train = final_model.predict_proba(X_train_full)[:, 1]
    train_roc = roc_auc_score(y_train_full, y_pred_proba_train)
except:
    train_roc = np.nan

train_acc = accuracy_score(y_train_full, y_pred_train)
train_f1 = f1_score(y_train_full, y_pred_train)
train_prec = precision_score(y_train_full, y_pred_train)
train_rec = recall_score(y_train_full, y_pred_train)

# ======================
# Báo cáo & Ma trận nhầm lẫn
# ======================
print("\n===================== KẾT QUẢ TRÊN TẬP TRAIN (Mô hình Tối ưu) =====================")
print(f"Mô hình: {best_name}")
print(f"Accuracy : {train_acc:.4f}")
print(f"Precision: {train_prec:.4f}")
print(f"Recall   : {train_rec:.4f}")
print(f"F1 Score : {train_f1:.4f}")
if not np.isnan(train_roc):
    print(f"ROC-AUC  : {train_roc:.4f}")
else:
    print("ROC-AUC  : N/A")
print("===================================================================================")

print("\n--- Classification Report ---")
print(classification_report(y_train_full, y_pred_train, target_names=['Không sống sót (0)', 'Sống sót (1)']))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_train_full, y_pred_train)
print("Ma trận nhầm lẫn (Dòng: Thực tế, Cột: Dự đoán):")
print(cm)

TN, FP, FN, TP = cm.ravel()
print(f"\nTrue Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"True Positives (TP): {TP}")


Mô hình: LogisticRegression
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1 Score : 1.0000
ROC-AUC  : 1.0000

--- Classification Report ---
                    precision    recall  f1-score   support

Không sống sót (0)       1.00      1.00      1.00       549
      Sống sót (1)       1.00      1.00      1.00       342

          accuracy                           1.00       891
         macro avg       1.00      1.00      1.00       891
      weighted avg       1.00      1.00      1.00       891


--- Confusion Matrix ---
Ma trận nhầm lẫn (Dòng: Thực tế, Cột: Dự đoán):
[[549   0]
 [  0 342]]

True Negatives (TN): 549
False Positives (FP): 0
False Negatives (FN): 0
True Positives (TP): 342


In [19]:
# Orijinal test csv'sini tekrar yükle (PassengerId için)
test_data_orig = pd.read_csv("/kaggle/input/titanic/test.csv")

# best_model ile test verisi üzerinde tahmin yap
test_preds = best_model.predict(X_test)

# Submission dataframe'i oluştur
submission = pd.DataFrame({
    "PassengerId": test_data_orig["PassengerId"],
    "Survived": test_preds
})

# CSV olarak kaydet (kaggle ortamında)
submission.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' created.")

Submission file 'submission.csv' created.


## End