1. Use the same dataset from the previous task
2. Reuse validation strategy and preprocessing without changes
3. Train xgboost model
4. Train lightgbm model
5. Train catboost model
6. Compare performance on local validation and on the Kaggle test set

In [7]:
!pip install xgboost
!pip install lightgbm
!pip install catboost



In [8]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.base import clone
from sklearn.utils import check_random_state

import warnings
from sklearn.exceptions import ConvergenceWarning

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

Collecting catboost
  Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   --- ------------------------------------ 9.7/101.7 MB 50.2 MB/s eta 0:00:02
   ------- -------------------------------- 18.6/101.7 MB 45.2 MB/s eta 0:00:02
   ---------- ----------------------------- 26.5/101.7 MB 44.1 MB/s eta 0:00:02
   ------------- -------------------------- 34.9/101.7 MB 42.6 MB/s eta 0:00:02
   ---------------- ----------------------- 42.7/101.7 MB 41.8 MB/s eta 0:00:02
   -------------------- ------------------- 51.1/101.7 MB 41.2 MB/s eta 0:00:02
   ----------------------- ---------------- 59.2/101.7 MB 40.6 MB/s eta 0:00:02
   ------------------------- -------------- 65.8/101.7 MB 39.6 MB/s eta 0:00:01
   ----------------------------- ---------- 75.5/101.7 MB 40.5 MB/s eta 0:00:01
   -------------------------------- ------- 83.6/101.

In [10]:
# Google Drive direct-download links for the train and test CSV files.
# NOTE(review): the test link carries extra `confirm`/`uuid`/`at` query
# parameters that look session-specific -- confirm it still resolves.
csv_train = r'https://drive.usercontent.google.com/download?id=1RIKv1X_XWYAPCsmH6UPKOue3nHu1pkGJ&export=download&authuser=0'
csv_test = r'https://drive.usercontent.google.com/download?id=1dNHk0lq04yQDpfz3_lf4PX2rZEQWrHnp&export=download&authuser=0&confirm=t&uuid=668130f8-48df-4ba3-ac54-d187c7f65b60&at=AO7h07cWJg4cofkmU9mkqR7XAEl7:1724719828412'

# Raw, unprocessed frames; later cells derive features from these.
train_raw = pd.read_csv(filepath_or_buffer=csv_train)
test_raw = pd.read_csv(filepath_or_buffer=csv_test)

In [11]:
# Fill values for missing data, computed once from the training table and
# read as module-level globals by enhance_features().
fare_mean_train = train_raw['Fare'].mean()
age_mean_train = train_raw['Age'].mean()
# value_counts() orders by descending frequency, so index[0] is the most
# common embarkation value.
embarked_top_train = train_raw['Embarked'].value_counts().index[0]

def enhance_features(data):
    """Turn a raw passenger table into the feature frame used for modelling.

    The input is not modified; a copy indexed by ``PassengerId`` is returned.

    Steps, in order:
      1. add 0/1 ``missing_*`` indicator columns for Age, Fare, Embarked and
         Cabin (computed before any imputation);
      2. derive ``title`` (from Name), ``ticket_prefix`` (from Ticket) and
         ``cabin_letter`` (from Cabin);
      3. fill missing Age / Fare / Embarked with the module-level
         ``age_mean_train`` / ``fare_mean_train`` / ``embarked_top_train``
         values, and missing ``cabin_letter`` with ``'Unknown'``;
      4. drop the raw ``Name``, ``Ticket`` and ``Cabin`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns PassengerId, Age, Fare, Embarked, Cabin,
        Name and Ticket.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` indexed by PassengerId, with the derived columns
        added and Name / Ticket / Cabin removed.
    """
    data = data.copy().set_index('PassengerId')

    # Missingness flags must be taken before the fillna calls below.
    data['missing_age'] = data['Age'].isnull().astype(int)
    data['missing_fare'] = data['Fare'].isnull().astype(int)
    data['missing_embarked'] = data['Embarked'].isnull().astype(int)
    data['missing_cabin'] = data['Cabin'].isnull().astype(int)

    # Title: the text between the first ", " and the following "." (e.g. "Mr").
    data['title'] = data['Name'].str.extract(r'[^,]*, ([^.]*)\. .*')
    # Ticket prefix: everything before the last space, with dots removed;
    # tickets without a space yield no match and become ''.
    # The dot pattern is a raw string: '\.' in a plain literal is an invalid
    # escape sequence and triggers a SyntaxWarning.
    data['ticket_prefix'] = data['Ticket'].str.extract(r'(.*) \d*').replace(r'\.', '', regex=True).fillna('')
    # Cabin letter: the leading run of letters in the cabin string.
    data['cabin_letter'] = data['Cabin'].str.extract(r'([A-Za-z]*)\d*')

    # Impute with statistics computed from the training table.
    data['Age'] = data['Age'].fillna(age_mean_train)
    data['Fare'] = data['Fare'].fillna(fare_mean_train)
    data['Embarked'] = data['Embarked'].fillna(embarked_top_train)
    data['cabin_letter'] = data['cabin_letter'].fillna('Unknown')

    return data.drop(columns=['Name', 'Ticket', 'Cabin'])


# Target vector, taken directly from the raw training table.
train_y = train_raw['Survived']
# Feature matrix: the raw table without the target, run through enhance_features.
# NOTE: train_X ends up indexed by PassengerId while train_y keeps train_raw's
# own index, so the two line up by row position, not by index label.
train_X = train_raw.drop(columns='Survived').pipe(enhance_features)

train_X

  data['ticket_prefix'] = data['Ticket'].str.extract(r'(.*) \d*').replace('\.', '', regex=True).fillna('')


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,missing_age,missing_fare,missing_embarked,missing_cabin,title,ticket_prefix,cabin_letter
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,3,male,22.000000,1,0,7.2500,S,0,0,0,1,Mr,A/5,Unknown
2,1,female,38.000000,1,0,71.2833,C,0,0,0,0,Mrs,PC,C
3,3,female,26.000000,0,0,7.9250,S,0,0,0,1,Miss,STON/O2,Unknown
4,1,female,35.000000,1,0,53.1000,S,0,0,0,0,Mrs,,C
5,3,male,35.000000,0,0,8.0500,S,0,0,0,1,Mr,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,2,male,27.000000,0,0,13.0000,S,0,0,0,1,Rev,,Unknown
888,1,female,19.000000,0,0,30.0000,S,0,0,0,0,Miss,,B
889,3,female,29.699118,1,2,23.4500,S,1,0,0,1,Miss,W/C,Unknown
890,1,male,26.000000,0,0,30.0000,C,0,0,0,0,Mr,,C


In [None]:
# One-hot encode the categorical columns; every other column is passed
# through to the model unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories not seen during fit do not raise
        # at transform time.
        ('one_hot', OneHotEncoder(categories='auto', handle_unknown='ignore'), ['Sex', 'Embarked', 'title', 'ticket_prefix', 'cabin_letter']),
    ],
    remainder='passthrough'
)


# Baseline XGBoost pipeline. The estimator step is named after the model it
# actually holds (it was previously labelled 'BaggingClassifier'), so
# named_steps lookups and '<step>__<param>' keys use 'XGBClassifier'.
pipeline_xgb_1 = Pipeline(steps=[
    ('Preprocessor', preprocessor),
    ('XGBClassifier', xgb.XGBClassifier(random_state=0, n_jobs=-1))
])


In [None]:
# define the xgboost model (from xgboost package)
# define the hyperparameters
# train the model
# try to improve the model by tuning the hyperparameters on local validation (remember that grid search is a poor fit here because it cannot use early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal

In [None]:
# define the lightgbm model (from lightgbm package)
# define the hyperparameters
# train the model
# try to improve the model by tuning the hyperparameters on local validation (remember that grid search is a poor fit here because it cannot use early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal

In [None]:
# define the catboost model (from catboost package)
# define the hyperparameters
# train the model
# try to improve the model by tuning the hyperparameters on local validation (remember that grid search is a poor fit here because it cannot use early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal

In [None]:
# compare the results of the three models from this homework with each other and with the models from the previous homework
# make a conclusion on which model is better and why
# if your boosting is worse than the RF, try to improve it

In [None]:
# load test data
# do the same preprocessing as for train data

# using the retrained models, make predictions on the test data for all three new models
# save the predictions to a file
# upload the predictions to Kaggle and make a submission
# report the score you got and compare it with the score you got on the validation data
# make a conclusion on how well the models generalize