<br>
<h1 style="color:pink; text-align:center; font-size:30px; font-family:Arial Black; border-radius:30px 30px; background-color:black; line-height: 50px; padding: 15px 15px 15px 2.5%;">💥LightGBM Ensemble💥</h1>
<br>

# Approach

<div class="alert alert-block alert-info"><p style='color:black;'>
    1. Import libraries<br>
    2. Read the data<br>
    3. Check for missing values and target distribution<br>
    4. Train different LGBMClassifiers and predict for test data<br>
    5. Take the average of the predictions<br>
    6. Create the submission file<br>
</p>
</div>

# ✅ Importing Required Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from tqdm.notebook import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Let pandas show up to 100 rows before it truncates a displayed frame.
pd.options.display.max_rows = 100

# ✅Reading the Data

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
# The unused directory list is bound to `_subdirs` rather than `_`: in IPython,
# assigning to `_` at top level stops the shell from updating its
# "last output" variable for the rest of the session.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

In [None]:
# Single place to change if the competition data is mounted somewhere else.
DATA_DIR = "/kaggle/input/tabular-playground-series-jun-2021"

# Training rows, test rows, and the sample submission used as the output template.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
ss = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# 🔍Basic Data Checks

In [None]:
# Report (rows, columns) for each loaded frame, one line per dataset.
print('\n'.join(
    f'Shape of {label} dataset is : {frame.shape}'
    for label, frame in (('Train', train), ('Test', test), ('Sample Submission', ss))
))

In [None]:
def check_NAN_columns(df, df_name):
    """Print which columns of ``df`` contain at least one missing value.

    When no column has nulls a single all-clear line is printed; otherwise a
    header line is printed followed by the pandas ``Index`` of affected columns.

    Args:
        df: DataFrame to inspect.
        df_name: Label for the dataset, interpolated into the printed messages.

    Returns:
        None. The result is reported on stdout only.
    """
    # Boolean Series, one entry per column: True where the column has any null.
    has_null = df.isnull().any()
    nan_columns = df.columns[has_null]
    if len(nan_columns) > 0:
        print(f'The following columns are having missing data in {df_name} dataset:')
        print(nan_columns)
        return
    print(f'No missing data in {df_name} dataset')

In [None]:
# Pair each frame with the label that should appear in the missing-data report.
datasets = {'Train': train, 'Test': test, 'Sample Submission': ss}

# Run the null check on every frame, in the order listed above.
for df_name, df in datasets.items():
    check_NAN_columns(df, df_name)

In [None]:
# Summary statistics for the training columns (id excluded), one row per column.
train.drop(columns='id').describe().transpose()

In [None]:
# Summary statistics for the test columns (id excluded), one row per column.
test.drop(columns='id').describe().transpose()

In [None]:
# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots(figsize=(12, 6))
# Pass the axes explicitly instead of relying on pyplot's implicit "current axes".
sns.countplot(x='target', data=train, ax=ax)
# The trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_title('Target Distribution');

# LGBMClassifier model

In [None]:
# Model inputs: every training column except the row id and the label.
X = train.drop(columns=["id", "target"])
# Label column the classifiers are fitted on.
y = train["target"]
# Test-set inputs (row id removed).
X_test = test.drop(columns="id")

<div class="alert alert-block alert-info"><p style='color:black;'>We are going to create <b>100 different instances of LGBMClassifier with different random states</b> and then take the average of the predictions.</p></div>

In [None]:
iterations = 100

# Fixed master seed so that Restart & Run All reproduces the same ensemble
# (and therefore the same submission file).
SEED = 42
rng = np.random.default_rng(SEED)

# Draw all per-model seeds up front and WITHOUT replacement. With independent
# draws from 2000 values, 100 iterations would very likely repeat a seed, and a
# repeated seed yields an identical split and model, i.e. a wasted ensemble member.
seed_pool = max(2000, iterations)  # pool must be at least as large as the sample
random_states = rng.choice(seed_pool, size=iterations, replace=False)

lgbm_pred = 0   # running average of test-set class probabilities
logloss = []    # validation log loss of each ensemble member
for i in tqdm_notebook(range(iterations)):
    # Plain int: the same seed drives both the split and the model.
    random_state = int(random_states[i])
    # Fresh stratified 80/20 split for every ensemble member.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )
    lgbm = LGBMClassifier(random_state=random_state)
    lgbm.fit(X_train, y_train)
    valid_preds = lgbm.predict_proba(X_valid)
    # Dividing by `iterations` here makes the accumulated sum the mean prediction.
    lgbm_pred += lgbm.predict_proba(X_test) / iterations
    # Score once and reuse the value for both the log and the running list.
    loss = log_loss(y_valid, valid_preds)
    logloss.append(loss)
    print(f'Iteration {i} : Random State = {random_state}, Log Loss = {loss}')
print(f'Average Log Loss = {sum(logloss)/len(logloss)}')

# 📁 Submission file

In [None]:
# Copy the averaged probabilities into the submission template positionally:
# prediction column k goes to "Class_{k+1}".
# NOTE(review): assumes the model's class order is Class_1..Class_9 - confirm against lgbm.classes_.
for col_idx in range(9):
    ss[f"Class_{col_idx + 1}"] = lgbm_pred[:, col_idx]

# Write the filled-in template without the pandas index column.
ss.to_csv("/kaggle/working/lgbm_100_ensemble_sub.csv", index=False)

# Kindly upvote if you like this notebook.