### Exercise

* Train and compare LightGBM, CatBoost, and XGBoost models on a dataset, focusing on their ability to handle large datasets and categorical data

#### Step 01: Load and Preprocess the Data

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Titanic passenger data from the Data Science Dojo public datasets repository.
# Canonical raw-file URL, with a single slash between every path segment.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Preview the first rows to confirm the expected columns were loaded.
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
# Model inputs and the prediction target
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

# Column dtypes and non-null counts, inspected before imputation
df.info()

# Handle missing data in a single pass: median for Age, most frequent value
# for Embarked. Reassigning instead of using inplace=True keeps the step a
# plain expression that yields a new frame rather than mutating one in place.
# NOTE(review): both statistics are computed on the full frame, i.e. before the
# train/test split done further down, so test rows influence the fill values.
# Consider deriving them from the training rows only.
df = df.fillna({
    'Age': df['Age'].median(),
    'Embarked': df['Embarked'].mode()[0],
})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
# Label-encode the string-valued categorical columns. Each fitted encoder is
# stored under its column name so the integer codes can be mapped back to the
# original labels later (e.g. label_encoder['Sex'].inverse_transform(...)).
label_encoder = {}
for col in ['Sex', 'Embarked']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    # Store per column; rebinding `label_encoder = le` would discard the dict
    # and keep only the encoder of the last column.
    label_encoder[col] = le

# Split dataset into model inputs and target
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Data Shape: {X_train.shape}")
print(f"Test Data Shape: {X_test.shape}")

Training Data Shape: (712, 5)
Test Data Shape: (179, 5)


#### Train the LightGBM Model

In [13]:
# Fit a LightGBM classifier with the library's default hyperparameters
model_lgb = lgb.LGBMClassifier().fit(X_train, y_train)

# Predict on the held-out rows and score against the true labels
y_pred_lgb = model_lgb.predict(X_test)
acc_lgb = accuracy_score(y_true=y_test, y_pred=y_pred_lgb)

print(f"Accuracy with LightGBM: {acc_lgb:.4f}")

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
Accuracy with LightGBM: 0.8045


#### Train the CatBoost Model

In [18]:
# Columns CatBoost should treat as categorical rather than numeric
features_cat = ['Sex', 'Pclass', 'Embarked']

# CatBoost with the categorical columns declared; verbose=0 silences the
# per-iteration training log.
# For reference: an earlier version of this cell, without cat_features,
# recorded "Accuracy with CatBoost: 0.8101".
model_cat = CatBoostClassifier(cat_features=features_cat, verbose=0)
model_cat.fit(X_train, y_train)

# Predictions
y_pred_cat = model_cat.predict(X_test)

# Evaluate
acc_cat = accuracy_score(y_test, y_pred_cat)
print(f"Accuracy with CatBoost: {acc_cat:.4f}")

Accuracy with CatBoost: 0.8156


#### Train the XGBoost Model

In [20]:
# Fit an XGBoost classifier with log loss set as its evaluation metric
model_xgb = XGBClassifier(eval_metric='logloss').fit(X_train, y_train)

# Predict on the held-out rows and score against the true labels
y_pred_xgb = model_xgb.predict(X_test)
acc_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)

print(f"Accuracy with XGBoost: {acc_xgb:.4f}")

Accuracy with XGBoost: 0.7709


### Comparing the Accuracy of All Models

In [21]:
# Side-by-side test accuracy of the three models, one per line
print(
    f"Accuracy with CatBoost: {acc_cat:.4f}",
    f"Accuracy with LightGBM: {acc_lgb:.4f}",
    f"Accuracy with XGBoost: {acc_xgb:.4f}",
    sep="\n",
)

Accuracy with CatBoost: 0.8156
Accuracy with LightGBM: 0.8045
Accuracy with XGBoost: 0.7709
