In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import datatable as dt
import time
import lightgbm as lgb

from numba import jit #to speed up loops

from sklearn.model_selection import train_test_split,StratifiedKFold 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [2]:
# Getting Data from folder
start = time.time()
train = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\train.csv").to_pandas()
test = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\test.csv").to_pandas()
end = time.time()
total_time = end-start
print("DONE LOADING! Time taken:{:.2f}".format(total_time))

DONE LOADING! Time taken:43.04


Reducing memeory usage (change data type)

In [3]:
# this function will help to reduce momory 
# data will be samller with the same value

@jit(forceobj=True)
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
        
            
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
#reducing the memory of data types
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 1878.74 MB
Memory usage after optimization is: 549.32 MB
Decreased by 70.8%
Memory usage of dataframe is 938.89 MB
Memory usage after optimization is: 273.70 MB
Decreased by 70.8%


In [37]:
# seperate test into ID and Data
ID_test = test.iloc[:,:1]
X_test = test.iloc[:,1:]
X_test.shape

(500000, 285)

LIGHT GBM FIT

In [20]:
  Params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': '-1',
    'boosting_type': 'gbdt',
    'feature_pre_filter': False,
    'lambda_l1': 8.533875942246594,
    'lambda_l2': 2.0533270677941314e-06,
    'num_leaves': 13,
    'feature_fraction': 0.4,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 50,
    'early_stopping_round': 100,
    'num_iterations':1000
  }

In [1]:
folds = StratifiedKFold(n_splits = 10, random_state = 2021, shuffle = True)

predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print(f"Fold: {fold}")
    training= lgb.Dataset(train.iloc[trn_idx,1:-1],label = train.iloc[trn_idx,-1:])
    CV =lgb.Dataset(train.iloc[val_idx,1:-1],label = train.iloc[val_idx,-1:])
    
    model = lgb.train(
        Params, 
        training,
        valid_sets=[CV], 
        verbose_eval=100, 
        early_stopping_rounds=100)
   
    pred = model.predict(train.iloc[val_idx,1:-1])
    roc = roc_auc_score(train.iloc[val_idx,-1:], pred)
    print(f" roc_auc_score: {roc}")
    print("-"*50)
    
    predictions += model.predict(X_test)/folds.n_splits 

NameError: name 'StratifiedKFold' is not defined

GET RESULTS FOR SUBMISSION

In [34]:
submit_final = pd.concat([ID_test,pd.DataFrame(predictions)],axis=1)
submit_final.shape

(500000, 2)

In [35]:
submit_final.to_csv('oct_{}_lightgbm_.csv'.format(time.time()),index=False,header=['id','target'])