# Libraries  📚

In [None]:
import pandas as pd
import numpy as np
import os
import gc

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

import warnings
warnings.simplefilter('ignore')

# Load Datasets 🗃️

In [None]:
# Directory the competition CSV files are read from.
INPUT = "../input/tabular-playground-series-feb-2022/"

# Load train.csv, test.csv and sample_submission.csv (in that order) from INPUT.
df_train, df_test, df_submission = (
    pd.read_csv(f"{INPUT}{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Data Manipulation ⚙️

In [None]:
# Constants used by the cells below.
TARGET = 'target'
SEED = 2022

# Working copies of the raw frames without the "row_id" column.
train = df_train.drop(columns=["row_id"])
test = df_test.drop(columns=["row_id"])

print(f'Size of train data: {train.shape}')
print(f'Size of test data: {test.shape}')

# Every remaining training column except the target is used as a feature.
FEATURES = [col for col in train.columns if col != 'row_id' and col != TARGET]

# Number of rows per target value (shown as the cell output).
train[TARGET].value_counts()

In [None]:
# Replace the target values with integer class ids. The fitted encoder is
# kept in `lb` so predictions can be mapped back with inverse_transform.
lb = LabelEncoder().fit(train[TARGET])
train[TARGET] = lb.transform(train[TARGET])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns. The scaler is fitted on the training
# features only; the same fitted transform is then applied to both frames.
scale = StandardScaler().fit(train[FEATURES])
train[FEATURES] = scale.transform(train[FEATURES])
test[FEATURES] = scale.transform(test[FEATURES])

# Model Training 🏋️

In [None]:
# Keyword arguments unpacked into the LGBMClassifier constructor:
# a multiclass objective evaluated with multi-class log loss.
fit_params = dict(
    objective='multiclass',
    metric='multi_logloss',
)

In [None]:
# Run CV
from lightgbm import early_stopping
from sklearn.model_selection import KFold, cross_val_score

# Number of cross-validation folds.
K = 5

# Shuffled K-fold splitting strategy; the fixed seed keeps the folds
# identical between runs.
kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

lgb_predictions = []  # one array of predicted test labels per fold
lgb_scores = []       # validation accuracy of each fold

# split() yields the row positions of the training part and of the
# validation part for every fold.
for i, (train_index, test_index) in enumerate(kf.split(train[FEATURES], train[TARGET])):

    # Create data for this fold
    y_train, y_valid = train[TARGET].iloc[train_index], train[TARGET].iloc[test_index]
    X_train, X_valid = train.iloc[train_index][FEATURES], train.iloc[test_index][FEATURES]
    print("\nFold ", i)

    lgb_model = LGBMClassifier(**fit_params)

    # Early stopping is requested through the callback API. The former
    # `early_stopping_rounds` keyword of fit() was deprecated in LightGBM 3.3
    # and removed in 4.0, where passing it raises a TypeError.
    # NOTE(review): fit_params does not set n_estimators; if the library
    # default number of boosting rounds is below 150, this patience can never
    # cut training short -- confirm the intended number of rounds.
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[early_stopping(stopping_rounds=150)],
    )

    # Generate validation predictions for this fold
    lgb_predict = lgb_model.predict(X_valid)
    acc = accuracy_score(y_valid, lgb_predict)
    lgb_scores.append(acc)

    # Accumulate test set predictions
    y_test_pred = lgb_model.predict(test[FEATURES])
    lgb_predictions.append(y_test_pred)

    # Running mean of the validation accuracy over the folds finished so far.
    print("Mean Accuracy :", np.mean(lgb_scores))


# Submit To Kaggle 🇰

In [None]:
from scipy.stats import mode

# Majority vote across folds: each fold's predicted labels become one column,
# and the most frequent label in every row is taken as the final prediction.
fold_votes = np.column_stack(lgb_predictions)
majority_label = np.squeeze(mode(fold_votes, axis=1)[0]).astype('int')

# Map the integer class ids back to the original target values and write
# the submission file.
lgb_submission = df_submission.copy()
lgb_submission["target"] = lb.inverse_transform(majority_label)
lgb_submission.to_csv("submission.csv", index=False)
lgb_submission.head()