In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

### Load datasets

In [2]:
# Read the competition files. For train/test the "id" column becomes the
# index, which keeps it out of the feature columns used further down.
train_df = pd.read_csv("./train.csv", index_col="id")
test_df = pd.read_csv("./test.csv", index_col="id")

# Submission frame; read with the default integer index (no index_col given).
sub = pd.read_csv("./submission.csv")

## Handle missing values

In [3]:
def handle_missed(df, n_freq=50, numeric_cols=None):
    """Impute missing values in ``df`` and collapse rare categorical levels.

    Numeric columns: NaNs are replaced by that column's mean, computed on
    ``df`` itself.

    Every other column except ``'class'``: if the column contains at least one
    NaN, the NaNs are replaced by the literal ``'NO'`` and afterwards every
    level occurring ``n_freq`` times or fewer (``'NO'`` included) is replaced
    by ``'LF'``.  Columns without any NaN are left untouched, rare levels
    included -- this mirrors the original behaviour.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.  It is modified in place (columns are reassigned on
        the passed object).
    n_freq : int, default 50
        A level must occur strictly more than ``n_freq`` times to be kept.
    numeric_cols : sequence of str, optional
        Names of the numeric columns.  Defaults to
        ``['stem-width', 'stem-height', 'cap-diameter']``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If one of the numeric column names is not present in ``df``.
    """
    if numeric_cols is None:
        numeric = ['stem-width', 'stem-height', 'cap-diameter']
    else:
        numeric = list(numeric_cols)
    category = [c for c in df.columns if c not in numeric and c != 'class']

    # Numeric columns: mean imputation.
    # Reassign the column instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object under pandas
    # Copy-on-Write and would silently leave `df` unchanged.
    for col in numeric:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].mean())

    # Categorical columns: 'NO' placeholder, then rare levels -> 'LF'.
    for col in category:
        if not df[col].isnull().any():
            continue
        filled = df[col].fillna('NO')
        # One pass over the column instead of one scan per unique value.
        counts = filled.value_counts()
        frequent = counts.index[counts > n_freq]
        df[col] = filled.where(filled.isin(frequent), 'LF')

    return df

In [4]:
# Clean the train and test frames with the same rarity threshold.
# Each frame is processed on its own, so means and level counts are
# computed per frame.
train_df, test_df = (
    handle_missed(frame, n_freq=200) for frame in (train_df, test_df)
)

In [5]:
# Column groups reused by later cells: the three continuous measurements and
# every other column of test_df (test_df is presumed not to carry 'class',
# so the target stays out of this list).
numeric = ['stem-width', 'stem-height', 'cap-diameter']
category = [x for x in test_df.columns if x not in numeric]

# Cast to pandas 'category' with ONE shared level set per column.
# Casting the two frames independently gives each its own level -> integer
# code mapping whenever their level sets differ, and a model fitted on the
# training codes would then read the test codes as different levels.
for col in category:
    train_df[col] = train_df[col].astype('category')
    shared_dtype = pd.CategoricalDtype(categories=train_df[col].cat.categories)
    # Levels that occur only in test_df become NaN (missing) after this cast.
    test_df[col] = test_df[col].astype(shared_dtype)

In [6]:
# Log transformation so the numerical columns are closer to normally
# distributed. The 1.00001 offset keeps the log argument strictly positive
# for any value >= -1.
# Vectorised np.log on the whole column replaces the per-element Python
# lambda, which is slow on frames of this size.
# NOTE: this cell is not idempotent -- re-running it applies the log again.
for col in numeric:
    for frame in (train_df, test_df):
        frame[col] = np.log(frame[col] + 1.00001)

In [7]:
# Encode the target labels as integers (e=0, p=1). The fitted encoder is kept
# in `le` so predictions can be mapped back to the original labels later.
le = LabelEncoder()
le.fit(train_df['class'])
train_df['class'] = le.transform(train_df['class'])

In [8]:
# Separate the encoded target from the feature columns.
y = train_df['class']
X = train_df.drop(columns='class')

## XGBoost

In [9]:
# Hold out 20% of the labelled rows, stratified on the target so both parts
# keep the same class balance. X_test / y_test are this held-out part of the
# training data, not the competition test set (that one lives in test_df).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=3,
)

print(X_train.shape)

(2493556, 20)


In [10]:
from sklearn.metrics import matthews_corrcoef

def mcc_metric(y_pred, dmatrix):
    """Return the Matthews correlation coefficient as a ``('mcc', value)`` pair.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores; values above 0.5 count as class 1, all others as
        class 0.  Must support ``>`` against a scalar and ``.astype``.
    dmatrix : object exposing ``get_label()``
        Source of the true labels.
        NOTE(review): the (predictions, data) argument order looks like the
        custom-metric convention of XGBoost's native training API; the
        function is not called in the visible cells -- confirm intended use.

    Returns
    -------
    tuple
        ``('mcc', <coefficient>)``.
    """
    labels = dmatrix.get_label()
    hard_preds = (y_pred > 0.5).astype(int)
    return 'mcc', matthews_corrcoef(labels, hard_preds)

In [11]:
# Classifier configuration; the keyword arguments are grouped by what they
# control, with values unchanged.
model = XGBClassifier(
    # execution and reproducibility
    device='cuda',
    random_state=35,
    enable_categorical=True,  # consume pandas 'category' columns directly
    # ensemble size and step size
    n_estimators=1200,
    learning_rate=0.016,
    # tree shape
    max_depth=16,
    min_child_weight=7,
    gamma=0.001,
    # row / column subsampling
    subsample=0.7,
    colsample_bytree=0.52,
    # regularisation and class weighting
    reg_alpha=0.01,
    reg_lambda=0.9,
    scale_pos_weight=1,
)

In [12]:
# Train on the 80% part; XGB is bound to whatever fit() returns.
XGB = model.fit(X_train, y_train)

# Score the held-out 20% with the Matthews correlation coefficient.
y_pred = XGB.predict(X_test)
score = matthews_corrcoef(y_test, y_pred)
print('MCC:', score)

MCC: 0.984966200272008


In [13]:
# Predict on the competition test set. Despite its name, `test_pred_prob`
# holds the output of predict() (encoded class labels, which are mapped back
# through the label encoder), not probabilities.
test_pred_prob = model.predict(test_df)
test_pred_class = le.inverse_transform(test_pred_prob)

# The predictions are written into `sub` by position, so verify that its rows
# line up with test_df (whose index is the "id" column) before assigning;
# a different order would silently attach predictions to the wrong ids.
assert len(sub) == len(test_df), "submission frame and test set differ in length"
assert (sub['id'].to_numpy() == test_df.index.to_numpy()).all(), \
    "submission ids are not in the same order as test_df"

sub['class'] = test_pred_class
sub.to_csv('submission_xgb.csv', index=False)

# Read the written file back as a sanity check; displayed as the cell output.
pd.read_csv('submission_xgb.csv')

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e


In [14]:
# Parameters that gave the best score (0.98510), kept here for reference:

# model = XGBClassifier(                                      
#     colsample_bytree=0.52,      
#     max_depth=18,             
#     min_child_weight=9,                
#     random_state=3,                 
#     n_estimators=512, 
#     learning_rate = 0.023,
#     gamma = 0.0004,
#     subsample = 0.7,
#     reg_alpha = 0.008,
#     reg_lambda = 0.92,
#     enable_categorical = True  ,
#     scale_pos_weight = 1.01          
#     )