In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import datatable as dt
import time

from catboost import CatBoostClassifier

from numba import jit #to speed up loops

from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import roc_auc_score

In [3]:
# Getting Data from folder
start = time.time()
train = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\train.csv").to_pandas()
test = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\test.csv").to_pandas()
end = time.time()
total_time = end-start
print("DONE LOADING! Time taken:{:.2f}".format(total_time))

DONE LOADING! Time taken:10.33


Reducing memeory usage (change data type)

In [None]:
# this function will help to reduce momory 
# data will be samller with the same value

@jit(forceobj=True)
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
        
            
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
#reducing the memory of data types
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
train.head()

In [4]:
features = [col for col in test.columns if 'f' in col]
TARGET = 'target'

target = train.iloc[:,-1:].copy()
train = train.drop('target', axis=1)

In [5]:
ID_test = test.iloc[:,:1]
X_test = test.iloc[:,1:]

LIGHT GBM FIT

In [23]:
cat_params = {'iterations': 2866,
 'od_wait': 3385,
 'learning_rate': 0.04280810491488757,
 'reg_lambda': 0.32139709692279206,
 'subsample': 0.8442605943226449,
 'random_strength': 22.468752639603235,
 'depth': 4,
 'min_data_in_leaf': 31,
 'leaf_estimation_iterations': 15
             }

N_SPLITS = 10
N_ESTIMATORS = 5000
EARLY_STOPPING_ROUNDS = 100
VERBOSE = 500
SEED = 2021

In [None]:
cat_pred = np.zeros(test.shape[0])

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=target)):
    print(f"===== CatBoostClassifier fold {fold} =====")
    X_train = train[features].iloc[trn_idx]
    y_train = target.iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = target.iloc[val_idx]
    print("training-CV split DONE!")
    start = time.time()
    model = CatBoostClassifier(**cat_params)
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )

    pred = model.predict_proba(X_valid)[:, -1]
    cat_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, pred)
    print(f"fold {fold} - cat auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

===== CatBoostClassifier fold 0 =====
training-CV split DONE!
0:	learn: 0.6808974	test: 0.6809157	best: 0.6809157 (0)	total: 302ms	remaining: 14m 26s
500:	learn: 0.4841184	test: 0.4862579	best: 0.4862579 (500)	total: 2m 48s	remaining: 13m 15s
1000:	learn: 0.4722281	test: 0.4751388	best: 0.4751388 (1000)	total: 5m 29s	remaining: 10m 13s
1500:	learn: 0.4676746	test: 0.4717071	best: 0.4717071 (1500)	total: 8m 9s	remaining: 7m 24s
2000:	learn: 0.4647631	test: 0.4700886	best: 0.4700886 (2000)	total: 10m 48s	remaining: 4m 40s
2500:	learn: 0.4624931	test: 0.4693016	best: 0.4693002 (2498)	total: 13m 24s	remaining: 1m 57s
2865:	learn: 0.4609984	test: 0.4689027	best: 0.4688998 (2862)	total: 15m 17s	remaining: 0us

bestTest = 0.4688997969
bestIteration = 2862

Shrink model to first 2863 iterations.
fold 0 - cat auc: 0.855690, elapsed time: 950.42sec

===== CatBoostClassifier fold 1 =====
training-CV split DONE!
0:	learn: 0.6808831	test: 0.6808379	best: 0.6808379 (0)	total: 357ms	remaining: 17m 3s

GET RESULTS FOR SUBMISSION

In [16]:
submission_proba = model.predict_proba(X_test)
submission_proba

array([[0.22691564, 0.77308436],
       [0.74979458, 0.25020542],
       [0.09013634, 0.90986366],
       ...,
       [0.69432056, 0.30567944],
       [0.48345983, 0.51654017],
       [0.58040396, 0.41959604]])

In [20]:
submit_final = pd.concat([ID_test,pd.DataFrame(submission_proba[:,1])],axis=1)
submit_final.shape

(500000, 2)

In [21]:
submit_final.to_csv('oct_{}_lightgbm_.csv'.format(time.time()),index=False,header=['id','target'])

In [22]:
#0.855...