In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 🚀 Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from lightgbm import LGBMClassifier
import warnings
# Suppress FutureWarning messages for the rest of the notebook
warnings.filterwarnings("ignore", category=FutureWarning)
# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# 🚀 Loading Train and Test dataset

## 🛰️ Loading train dataset

In [None]:
# Read the competition training CSV into a DataFrame and preview its first rows
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
train.head()

## 🛰️ Loading test dataset

In [None]:
# Read the competition test CSV into a DataFrame and preview its first rows
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
test.head()

## 🛰️ Loading sample submission dataset

In [None]:
# Read the sample submission file; it is reused later as the template for the output CSVs
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')
sample_submission.head()

# 🚀 Data Preprocessing

## 🛰️ Checking for null values in train dataset

In [None]:
# Number of missing values in each column of the training data
train.isnull().sum()

## 🛰️ Checking for null values in test dataset

In [None]:
# Number of missing values in each column of the test data
test.isnull().sum()

## 🛰️ Check data information

In [None]:
# Print the column names, non-null counts and dtypes of the training data
train.info()

## 🛰️ Check train data shape

In [None]:
# (number of rows, number of columns) of the training data
train.shape

## 🛰️ Check test data shape

In [None]:
# (number of rows, number of columns) of the test data
test.shape

## 🛰️ Drop 'id' column from train dataset

In [None]:
# Remove the 'id' column so it is not treated as a feature
train = train.drop(columns='id')

## 🛰️ Drop 'id' column from test dataset

In [None]:
# Remove the 'id' column so the test frame is left without it
test = test.drop(columns='id')

# 🚀 Exploratory Data Analysis (EDA)

## 🛰️ Verify the values under each feature

In [None]:
# Print the frequency table of every column, one column at a time
for column_name in train.columns:
    counts = train[column_name].value_counts()
    print(counts)

## 🛰️ Check the data distribution

In [None]:
# Draw one histogram per column, each rendered as its own figure
for column_name in train.columns:
    column_values = train[column_name]
    sns.histplot(column_values)
    plt.show()

## 🛰️ Separating dependent and independent variables

In [None]:
# Independent variables: the first 50 columns of `train`.
# .copy() gives X its own data, so assigning transformed columns into X later
# does not raise SettingWithCopyWarning and cannot write through to `train`.
X = train.iloc[:,0:50].copy()
# Dependent variable: every column from position 50 onwards.
# NOTE(review): assumes exactly one column remains here — confirm against train.shape.
y = train.iloc[:,50:]

# Flatten the target frame into a 1-D array, the shape the estimators expect for y
y = np.ravel(y)

## 🛰️ Log Transform Train data

In [None]:
# Kept from the original cell: silences NumPy divide-by-zero warnings globally.
np.seterr(divide = 'ignore')
# Shift every feature so that its minimum becomes 1, then take the natural log
# (log(x - min + 1)), which maps each column's smallest value to 0.
# A single vectorised expression replaces the per-column loop: DataFrame.min()
# avoids iterating every column in Python through the builtin min(), and
# rebinding X avoids assigning columns into a frame sliced from `train`.
X = np.log(X - X.min() + 1)

## 🛰️ Log Transform Test data

In [None]:
# Kept from the original cell: silences NumPy divide-by-zero warnings globally.
np.seterr(divide = 'ignore')
# Apply the same log(x - min + 1) transform that is applied to the training features.
# The shift has to come from the *training* data: taking it from the test set's
# own minimum maps identical raw values to different transformed values whenever
# the two minima differ, so the model would be scored on a different scale from
# the one it was fitted on.
# NOTE(review): assumes `train` still holds the untransformed feature values and
# uses the same feature column names as `test` — confirm.
train_feature_min = train[test.columns].min()
# Test values below the training minimum are clipped to 1 before the log so the
# result stays finite; they map to 0, the same value as the training minimum.
test = np.log((test - train_feature_min + 1).clip(lower=1))

## 🛰️  Label Encoding target variable

In [None]:
# Learn the class-label -> integer mapping, then replace y with the integer codes.
# `label` is kept so the mapping stays available after this cell.
label = LabelEncoder()
label.fit(y)
y = label.transform(y)

## 🛰️ Splitting the data into training and validation

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 5-fold stratified splitter, passed as `cv` to the cross_val_score cells below
skfold = StratifiedKFold(n_splits=5)

# 🚀 Building Model Pipeline

## 🛰️ Trying different models

In [None]:
# Standardise the features, then fit a Gaussian naive Bayes classifier
pipe_1 = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('gnb', GaussianNB()),
])

In [None]:
# Cross-validate pipe_1 on the full training data and report the mean fold score
cross_score = cross_val_score(estimator=pipe_1, X=X, y=y, cv=skfold)
print(cross_score.mean())

In [None]:
# Standardise the features, then fit a Bernoulli naive Bayes classifier
pipe_2 = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('bnb', BernoulliNB()),
])

In [None]:
# Cross-validate pipe_2 on the full training data and report the mean fold score
cross_score = cross_val_score(estimator=pipe_2, X=X, y=y, cv=skfold)
print(cross_score.mean())

In [None]:
# Standardise the features, then fit a multinomial logistic regression.
# The 'sag' solver shuffles the data while optimising, so a fixed random_state
# (the same seed as the train/validation split) keeps its scores repeatable.
pipe_3 = Pipeline([
    ('scale',StandardScaler()),
    ('lr',LogisticRegression(solver = 'sag',multi_class='multinomial',random_state=42))
])

In [None]:
# Cross-validate pipe_3 on the full training data and report the mean fold score
cross_score = cross_val_score(estimator=pipe_3, X=X, y=y, cv=skfold)
print(cross_score.mean())

In [None]:
# Standardise the features, then fit a random forest.
# Random forests draw random bootstrap samples and feature subsets, so without a
# seed every run gives a different cross-validation score. Fixing random_state
# (the same seed as the train/validation split) makes the result reproducible.
pipe_4 = Pipeline([
    ('scale',StandardScaler()),
    ('rf',RandomForestClassifier(random_state=42))
])

In [None]:
# Cross-validate pipe_4 on the full training data and report the mean fold score
cross_score = cross_val_score(estimator=pipe_4, X=X, y=y, cv=skfold)
print(cross_score.mean())

In [None]:
# Standardise the features, then fit a LightGBM classifier with default settings
pipe_5 = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('lgbm', LGBMClassifier()),
])

In [None]:
# Cross-validate pipe_5 on the full training data and report the mean fold score
cross_score = cross_val_score(estimator=pipe_5, X=X, y=y, cv=skfold)
print(cross_score.mean())

In [None]:
# Standardise the features, then fit an XGBoost classifier with default settings
pipe_6 = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('xgb', XGBClassifier()),
])

In [None]:
# Cross-validate pipe_6 on the full training data and report the mean fold score
cross_score = cross_val_score(estimator=pipe_6, X=X, y=y, cv=skfold)
print(cross_score.mean())

In [None]:
# Standardise the features, then fit a k-nearest-neighbours classifier
pipe_7 = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

In [None]:
# Cross-validate pipe_7 on the full training data and report the mean fold score
cross_score = cross_val_score(estimator=pipe_7, X=X, y=y, cv=skfold)
print(cross_score.mean())

# 🚀 Hyperparameter Tuning

## 🛰️ Tuning hyperparameters for LightGBM

In [None]:
# Search grid for the 'lgbm' step of the pipeline; the `lgbm__` prefix routes
# each entry to that step. Only num_leaves and learning_rate actually vary.
params = dict(
    lgbm__boosting_type=['gbdt'],
    lgbm__objective=['multiclass'],
    lgbm__num_leaves=[30, 35, 40],
    lgbm__learning_rate=[0.001, 0.01, 0.1],
)

In [None]:
# Exhaustive grid search over `params` for the LightGBM pipeline
lgbm_search = GridSearchCV(estimator=pipe_5, param_grid=params)

In [None]:
# Run the grid search on the training portion of the split only
lgbm_search.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score
lgbm_search.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination
lgbm_search.best_score_ 

## 🛰️ Tuning hyperparameters for BernoulliNB

In [None]:
# Search grid for the 'bnb' step: candidate smoothing values for BernoulliNB.
# Note that this rebinds `params`, replacing the LightGBM grid.
params = dict(
    bnb__alpha=[1.0, 2.0, 3.0, 4.0, 5.0],
)

In [None]:
# Exhaustive grid search over `params` for the BernoulliNB pipeline
bnb_search = GridSearchCV(estimator=pipe_2, param_grid=params)

In [None]:
# Run the grid search on the full X, y.
# NOTE(review): the LightGBM search is fitted on X_train, y_train instead, so this
# search also sees the rows held out as X_test — confirm that is intended.
bnb_search.fit(X,y)

In [None]:
# Parameter combination that achieved the best cross-validated score
bnb_search.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination
bnb_search.best_score_ 

## 🛰️ Predicting probabilities for all classes for BernoulliNB algorithm

In [None]:
# Rebuild the BernoulliNB pipeline with a fixed smoothing value.
# NOTE(review): alpha=4.0 is hard-coded — presumably the grid search winner; confirm
# against bnb_search.best_params_.
pipe_2 = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('bnb', BernoulliNB(alpha=4.0)),
])

In [None]:
# Fit the BernoulliNB pipeline on the training portion of the split
pipe_2.fit(X_train,y_train)

In [None]:
# Predicted (label-encoded) class for every row of the validation split
predictions = pipe_2.predict(X_test)
print(predictions)

In [None]:
# Per-class probabilities for every row of the validation split
probability_predictions = pipe_2.predict_proba(X_test)
probability_predictions

## 🛰️ Predicting probabilities on test dataset using BernoulliNB algorithm

In [None]:
# Per-class probabilities for the competition test set; this rebinds
# probability_predictions, replacing the validation-split probabilities
probability_predictions = pipe_2.predict_proba(test)
probability_predictions

In [None]:
# Overwrite columns 1-4 of the submission template with the predicted probabilities
# NOTE(review): assumes those four columns are the class-probability columns — confirm
sample_submission.iloc[:,1:5] = probability_predictions
sample_submission

In [None]:
# Write the BernoulliNB submission to the working directory, without the index column
sample_submission.to_csv('submission.csv',index=False)

## 🛰️ Predicting probabilities for all classes for LightGBM algorithm

In [None]:
# Rebuild the LightGBM pipeline with explicit hyperparameters.
# The keyword has to be spelled `learning_rate`: the previous `learning_rat` is not
# a LightGBM parameter, so it only reached the model as an unknown extra keyword
# and the intended learning rate was never actually set.
# NOTE(review): the values are hard-coded — presumably the grid search winners;
# confirm against lgbm_search.best_params_.
pipe_5 = Pipeline([
    ('scale',StandardScaler()),
    ('lgbm',LGBMClassifier(boosting_type='gbdt',num_leaves=35, learning_rate=0.1,objective='multiclass'))
])

In [None]:
# Fit the LightGBM pipeline on the training portion of the split
pipe_5.fit(X_train,y_train)

In [None]:
# Predicted (label-encoded) class for every row of the validation split
predictions = pipe_5.predict(X_test)
print(predictions)

In [None]:
# Per-class probabilities for every row of the validation split
probability_predictions = pipe_5.predict_proba(X_test)
probability_predictions

## 🛰️ Predicting probabilities on test dataset using LightGBM algorithm

In [None]:
# Per-class probabilities for the competition test set; this rebinds
# probability_predictions, replacing the validation-split probabilities
probability_predictions = pipe_5.predict_proba(test)
probability_predictions

In [None]:
# Overwrite columns 1-4 of the submission template with the LightGBM probabilities;
# this replaces the BernoulliNB values written into the same frame earlier
sample_submission.iloc[:,1:5] = probability_predictions
sample_submission

In [None]:
# Write the LightGBM submission to its own file, without the index column
sample_submission.to_csv('submission_lgbm.csv',index=False)