In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

## Load datasets

In [2]:
# Read the competition files. Train and test are indexed by "id" right away,
# which saves work later; the sample submission keeps its default index.
train, test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("./train.csv", "./test.csv")
)
sub = pd.read_csv("./sample_submission.csv")

## Handle missing values

In [3]:
# Work on copies so the frames read from disk stay untouched.
train_df, test_df = train.copy(), test.copy()

In [4]:
def handle_missed(df, n_freq=50):
    """Impute missing values and collapse rare categories in ``df``.

    The numeric columns 'stem-width', 'stem-height' and 'cap-diameter' are
    filled with their own column mean when they contain missing values.
    Every other column except 'class' is treated as categorical: if it
    contains missing values, those become the label 'NO' and afterwards every
    value of that column occurring ``n_freq`` times or fewer is replaced by
    'LF'. Categorical columns without missing values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    n_freq : int, default 50
        A categorical value is kept only if it occurs strictly more than
        ``n_freq`` times in its column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numeric = ['stem-width', 'stem-height', 'cap-diameter']
    category = [x for x in df.columns if x not in numeric and x != 'class']

    # Numeric columns: mean imputation.
    for col in numeric:
        # Skip numeric columns the frame does not have instead of raising.
        if col in df.columns and df[col].isnull().any():
            # Assign the result back: calling fillna(inplace=True) on the
            # df[col] selection is chained assignment, which leaves df
            # unchanged when pandas Copy-on-Write is active.
            df[col] = df[col].fillna(df[col].mean())

    # Categorical columns: explicit 'NO' label, then rare values -> 'LF'.
    for col in category:
        if df[col].isnull().any():
            filled = df[col].fillna('NO')
            # One pass over the column instead of one scan per unique value.
            counts = filled.value_counts()
            frequent = counts.index[counts > n_freq]
            df[col] = filled.where(filled.isin(frequent), 'LF')

    return df

In [5]:
# Clean train first, then test, both with the same threshold (n_freq=100).
train_df, test_df = (
    handle_missed(frame, n_freq=100) for frame in (train_df, test_df)
)

In [8]:
numeric = ['stem-width', 'stem-height', 'cap-diameter']
category = [x for x in test_df.columns if x not in numeric]

# Cast each categorical column with ONE dtype shared by train and test.
# Casting the frames independently lets each infer its own category list, so
# the same label can receive different integer codes in train and in test;
# models that consume the codes would then misread test rows.
for col in category:
    shared_dtype = pd.concat(
        [train_df[col], test_df[col]], ignore_index=True
    ).astype('category').dtype
    train_df[col] = train_df[col].astype(shared_dtype)
    test_df[col] = test_df[col].astype(shared_dtype)

In [9]:
# Log transformation to bring the numerical columns closer to a normal
# distribution. The 1.00001 offset is kept exactly as before; the transform is
# applied to whole columns at once instead of one Python call per value.
train_temp = train_df.copy()
train_temp[numeric] = np.log(train_temp[numeric] + 1.00001)

In [10]:
# Same log transformation for the test features (same 1.00001 offset), applied
# to whole columns at once instead of one Python call per value.
test_temp = test_df.copy()
test_temp[numeric] = np.log(test_temp[numeric] + 1.00001)

In [11]:
# Encode the target labels as integers. LabelEncoder numbers the sorted
# labels, so with 'e'/'p' targets this gives e -> 0 and p -> 1.
le = LabelEncoder().fit(train_temp['class'])
train_temp['class'] = le.transform(train_temp['class'])

In [12]:
# Separate the encoded target from the feature matrix.
y = train_temp['class']
X = train_temp.drop(columns='class')

## Evaluation metric (MCC)

In [14]:
from sklearn.metrics import matthews_corrcoef

def mcc_metric(y_pred, dmatrix):
    """Return the Matthews correlation coefficient as a ('mcc', value) pair.

    ``y_pred`` is thresholded at 0.5 to obtain 0/1 predictions, which are
    compared with the labels returned by ``dmatrix.get_label()``.
    """
    return 'mcc', matthews_corrcoef(
        dmatrix.get_label(),
        (y_pred > 0.5).astype(int),
    )

## XGBoost

In [15]:
# Baseline XGBoost classifier; pandas 'category' columns are consumed directly
# via enable_categorical.
model = XGBClassifier(**{
    # ensemble size and step
    'n_estimators': 512,
    'learning_rate': 0.023,
    # tree shape
    'max_depth': 18,
    'min_child_weight': 9,
    'gamma': 0.0004,
    # row / column sampling
    'subsample': 0.7,
    'colsample_bytree': 0.52,
    # regularisation
    'reg_alpha': 0.008,
    'reg_lambda': 0.92,
    # data handling and class weighting
    'enable_categorical': True,
    'scale_pos_weight': 1.01,
    'random_state': 3,
})

In [45]:
# 5-fold stratified cross-validation: fit one XGBoost model per fold, report
# its MCC on the held-out fold and keep the fitted model for the ensemble.
skf = StratifiedKFold(n_splits=5, random_state = 42, shuffle = True)
print(skf)
xgb_list = []
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    # The seed is given once, through random_state. The former extra
    # `random_seed=57` keyword is not an XGBoost parameter, so it never took
    # effect and only suggested a second, conflicting seed.
    model = XGBClassifier(
                        colsample_bytree=0.52,
                        max_depth=21,
                        min_child_weight=9,
                        random_state=3,
                        n_estimators=512,
                        learning_rate = 0.023,
                        gamma = 0.0004,
                        subsample = 0.7,
                        reg_alpha = 0.008,
                        reg_lambda = 0.92,
                        enable_categorical = True,
                        scale_pos_weight = 1.01,
                        device = 'cuda'
                        )

    XGB = model.fit(
    X_train,
    y_train)

    y_pred = XGB.predict(X_test)
    score = matthews_corrcoef(y_test, y_pred)
    print('MCC:', score)

    xgb_list.append(XGB)

StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
MCC: 0.9848779549139681
MCC: 0.9848982944912599
MCC: 0.9849437206070658
MCC: 0.9847403713353182
MCC: 0.9849426964551093


## CatBoost

In [29]:
from catboost import CatBoostClassifier
# Single CatBoost fit, trained and scored on the X_train / X_test fold
# variables that are in scope when this cell runs.
catb_params = dict(
    iterations=1000,
    depth=12,
    learning_rate=0.2,
    random_strength=10,
    bagging_temperature=0.9,
    od_type='IncToDec',
    l2_leaf_reg=20,
    border_count=215,
    verbose=100,
    loss_function='Logloss',  # binary classification objective
    eval_metric='MCC',
    random_seed=57,
    task_type='GPU',  # train on GPU
    # NOTE(review): in CatBoost's device-list syntax '0:1' names devices 0 and
    # 1, not only the first GPU - confirm which was intended.
    devices='0:1',
)

catModel = CatBoostClassifier(
    cat_features=category,
    early_stopping_rounds=50,
    **catb_params,
)
catModel.fit(X_train, y_train)

y_pred = catModel.predict(X_test)
score = matthews_corrcoef(y_test, y_pred)
print('MCC:', score)


0:	learn: 0.8116119	total: 294ms	remaining: 4m 54s
100:	learn: 0.9827630	total: 23.3s	remaining: 3m 27s
200:	learn: 0.9833781	total: 44.9s	remaining: 2m 58s
300:	learn: 0.9838991	total: 1m 6s	remaining: 2m 35s
400:	learn: 0.9840642	total: 1m 30s	remaining: 2m 14s
500:	learn: 0.9843168	total: 1m 54s	remaining: 1m 53s
600:	learn: 0.9845693	total: 2m 19s	remaining: 1m 32s
700:	learn: 0.9847829	total: 2m 44s	remaining: 1m 10s
800:	learn: 0.9849059	total: 3m 9s	remaining: 47s
900:	learn: 0.9850726	total: 3m 35s	remaining: 23.7s
999:	learn: 0.9851891	total: 4m	remaining: 0us
MCC: 0.9845423879608713


In [40]:
# 5-fold stratified cross-validation for CatBoost: fit one model per fold,
# report its MCC on the held-out fold and keep the fitted model.
skf = StratifiedKFold(n_splits=5, random_state = 40, shuffle = True)
print(skf)
catb_list = []
catb_params = {
    'iterations': 1000,
    'depth': 10,
    'learning_rate': 0.21,
    'random_strength': 10,
    'bagging_temperature': 0.9,
    'od_type': 'IncToDec',
    'l2_leaf_reg': 21,
    'border_count': 215,
    'verbose': 100,
    'loss_function': 'Logloss',  # Use 'Logloss' for binary classification
    'eval_metric': 'MCC',
    'random_seed': 57,
    'task_type': 'GPU',  # Use GPU for training
    # NOTE(review): in CatBoost's device-list syntax '0:1' names devices 0 and
    # 1, not only the first GPU - confirm which was intended.
    'devices': '0:1'
}
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # random_seed is supplied only through catb_params. Passing it again as
    # an explicit keyword next to **catb_params raises
    # "TypeError: ... got multiple values for keyword argument 'random_seed'".
    catb = CatBoostClassifier(**catb_params,
                              cat_features = category,
                              early_stopping_rounds=50)
    catb.fit(X_train, y_train)
    catb_y_pred = catb.predict(X_test)
    score = matthews_corrcoef(y_test, catb_y_pred)
    print('MCC:', score)

    catb_list.append(catb)

StratifiedKFold(n_splits=5, random_state=40, shuffle=True)
0:	learn: 0.8168353	total: 298ms	remaining: 4m 57s
100:	learn: 0.9832865	total: 23.4s	remaining: 3m 28s
200:	learn: 0.9838303	total: 45.7s	remaining: 3m 1s
300:	learn: 0.9841749	total: 1m 8s	remaining: 2m 38s
400:	learn: 0.9843374	total: 1m 31s	remaining: 2m 15s
500:	learn: 0.9844920	total: 1m 53s	remaining: 1m 53s
600:	learn: 0.9846441	total: 2m 18s	remaining: 1m 31s
700:	learn: 0.9847711	total: 2m 42s	remaining: 1m 9s
800:	learn: 0.9848988	total: 3m 6s	remaining: 46.4s
900:	learn: 0.9850397	total: 3m 32s	remaining: 23.4s
999:	learn: 0.9851642	total: 3m 58s	remaining: 0us
MCC: 0.9843645674908398
0:	learn: 0.8046836	total: 298ms	remaining: 4m 57s
100:	learn: 0.9829112	total: 23.1s	remaining: 3m 25s
200:	learn: 0.9835572	total: 44.6s	remaining: 2m 57s
300:	learn: 0.9839602	total: 1m 6s	remaining: 2m 35s
400:	learn: 0.9842522	total: 1m 29s	remaining: 2m 13s
500:	learn: 0.9845096	total: 1m 53s	remaining: 1m 52s
600:	learn: 0.98467

## LGBM

In [53]:
from lightgbm import LGBMClassifier
# 5-fold stratified cross-validation for LightGBM: fit one model per fold,
# report its MCC on the held-out fold and keep the fitted model.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=315)
print(skf)
lgb_list = []
params = dict(
    verbose=-1,
    objective='binary',
    num_leaves=512,
    max_depth=9,
    learning_rate=0.075,
    n_estimators=512,
    class_weight='balanced',
    min_child_samples=41,
    subsample=0.9,
    colsample_bytree=0.75,
    reg_alpha=0.08,
    reg_lambda=4,
    random_state=57,
)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    lgb = LGBMClassifier(**params)
    lgb.fit(X_train, y_train)

    lgb_y_pred = lgb.predict(X_test)
    score = matthews_corrcoef(y_test, lgb_y_pred)
    print('MCC:', score)

    lgb_list.append(lgb)

StratifiedKFold(n_splits=5, random_state=315, shuffle=True)
MCC: 0.9847001805640095
MCC: 0.9846392745918029
MCC: 0.9841337868751824
MCC: 0.9848382652180192
MCC: 0.9843544277099249


## Hard Voting

In [54]:
# Majority vote over the five cross-validated XGBoost models: average their
# 0/1 predictions and label a row 'p' when at least half of them vote 1.
test_result = [xgb_list[i].predict(test_temp) for i in range(5)]

final = np.asarray(test_result).mean(axis=0)
final = np.where(final >= 0.5, 'p', 'e')

sub['class'] = final
sub.to_csv('sub_Xgb.csv', index=False)

In [56]:
# Majority vote over all XGBoost fold models plus the two best LightGBM fold
# models. In the recorded LightGBM run the highest validation MCCs belong to
# fold 3 (0.98484) and fold 0 (0.98470); fold 4 (0.98435), which was selected
# before, ranks second-worst. Update the tuple if the fold scores change.
LGB_BEST_FOLDS = (0, 3)

test_result = []
for i in range(len(xgb_list)):
    test_result.append(xgb_list[i].predict(test_temp))
    if i in LGB_BEST_FOLDS:
        test_result.append(lgb_list[i].predict(test_temp))

# Average the 0/1 votes; a row is 'p' when at least half of the models say 1.
final = np.average(test_result, axis = 0)
final = np.where(final>=0.5, 'p', 'e')

sub['class'] = final
sub.to_csv('sub_Xgb_Lgbm.csv', index = False)

In [44]:
# Majority vote over every fold model of all three libraries, collected fold
# by fold in the order XGBoost, CatBoost, LightGBM.
test_result = [
    fold_models[i].predict(test_temp)
    for i in range(5)
    for fold_models in (xgb_list, catb_list, lgb_list)
]

final = np.asarray(test_result).mean(axis=0)
final = np.where(final >= 0.5, 'p', 'e')

sub['class'] = final
sub.to_csv('sub_Xgb_Catb_Lgbm.csv', index=False)

In [28]:
# Preview the first 20 rows of the submission frame.
sub.iloc[:20]

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
