In [None]:
import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# 👉 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import BernoulliNB
from scipy import stats
from scipy.special import boxcox1p
from catboost import CatBoostClassifier,  Pool
from xgboost import XGBClassifier

# 👉  Load train and test datasets

In [None]:
# Read the competition's training CSV into a DataFrame and preview the first rows.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
train.head()

In [None]:
# Read the competition's test CSV (the rows to predict) and preview the first rows.
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
test.head()

In [None]:
# Read the sample submission; it is reused further down as the template for the
# final prediction file.
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')
sample_submission.head()

# 👉  Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Number of missing values per training column.
train.isnull().sum()

In [None]:
# Number of missing values per test column.
test.isnull().sum()

In [None]:
# Remove the identifier column from the training frame.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for the already-missing 'id' column.
train = train.drop(columns='id', errors='ignore')

In [None]:
# Remove the identifier column from the test frame.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for the already-missing 'id' column.
test = test.drop(columns='id', errors='ignore')

In [None]:
# Transposed summary statistics (one row per column), styled for quick scanning:
# bars on the mean, colour gradients on the standard deviation and the median.
summary_stats = train.describe().T
(
    summary_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# One histogram per column of the training frame, laid out on a 10 x 5 grid.
# Observation from the plots: the data is right-skewed.
train.plot(
    kind='hist',
    subplots=True,
    legend=True,
    layout=(10, 5),
    figsize=(100, 80),
    title="Histogram for all features",
)

In [None]:
# One histogram per column of the test frame, laid out on a 10 x 5 grid.
# Observation from the plots: the data is right-skewed.
test.plot(
    kind='hist',
    subplots=True,
    legend=True,
    layout=(10, 5),
    figsize=(100, 80),
    title="Histogram for all features",
)

In [None]:
# Print the frequency table of every training column; the tables also show
# that the frame contains some negative values.
for column in train.columns:
    print(train[column].value_counts())

In [None]:
# Print the frequency table of every test column; the tables also show
# that the frame contains some negative values.
for column in test.columns:
    print(test[column].value_counts())

# 👉 Data Preprocessing

In [None]:
# Split the training frame by position: the first 50 columns become the feature
# matrix, everything from column 50 onwards becomes the target.
# .copy() turns both pieces into independent frames. The features are
# overwritten further down, and doing that on a bare slice of `train` is
# chained assignment (pandas only warns about it, and warnings are silenced in
# this notebook), which leaves it ambiguous whether `train` is affected.
X = train.iloc[:, 0:50].copy()
y = train.iloc[:, 50:].copy()

### 🚀 Log-transforming the features to reduce the right skew of the data

In [None]:
# NOTE(review): the log argument below is always >= 1, so no divide-by-zero can
# occur here; the call is kept because it changes NumPy's global error state.
np.seterr(divide = 'ignore')
# Shift every feature so that its minimum maps to 1, then take the natural log:
#     x -> log(x - (min(x) - 1))
# The shift is needed because some features hold negative values. This is the
# same arithmetic as the former per-column loop, but evaluated for the whole
# frame at once with pandas' vectorised column minima instead of Python's
# builtin min(), which walks each Series element by element.
X = np.log(X - (X.min() - 1))

In [None]:
# Kept for its effect on NumPy's global error state; after the clip below the
# log argument is always >= 1, so no divide-by-zero can occur here.
np.seterr(divide = 'ignore')
# The test set must go through exactly the same mapping as the training
# features, so the shift comes from the *training* minima. Shifting by the test
# set's own minima (as before) maps the same raw value to different model
# inputs whenever the two minima differ.
train_feature_min = train.iloc[:, 0:50].min()
assert list(test.columns) == list(train_feature_min.index), \
    "test columns do not line up with the training feature columns"
# Values below the training minimum are clipped to 1 before the log, i.e. they
# are treated like the training minimum (log(1) == 0) instead of producing
# NaN / -inf.
test = np.log((test - (train_feature_min - 1)).clip(lower=1))

In [None]:
# Histograms of the transformed training features, again on a 10 x 5 grid.
X.plot(
    kind='hist',
    subplots=True,
    legend=True,
    layout=(10, 5),
    figsize=(100, 80),
    title="Histogram for all features",
)

In [None]:
# Histograms of the transformed test features, again on a 10 x 5 grid.
test.plot(
    kind='hist',
    subplots=True,
    legend=True,
    layout=(10, 5),
    figsize=(100, 80),
    title="Histogram for all features",
)

In [None]:
# Encode the class labels as integers 0..n_classes-1.
# The target is flattened to 1-D *before* encoding: LabelEncoder expects a 1-D
# array, and handing it a single-column frame only works through an implicit
# reshape that scikit-learn reports with a DataConversionWarning.
# `label` is kept so the original class names remain available via
# label.classes_ / label.inverse_transform.
label = LabelEncoder()
y = label.fit_transform(np.ravel(y))

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded for
# reproducibility and stratified on the target so that both parts keep the
# class proportions of the full data, in line with the stratified folds used
# for cross-validation in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 5-fold stratified splitter (no shuffling) for the cross-validation calls in
# this notebook.
skfold = StratifiedKFold(n_splits=5)

# 👉 Building Model Pipelines

## 🚀 Logistic Regression

In [None]:
# Baseline pipeline: standardise the features, then fit a multinomial logistic
# regression on all available cores.
# random_state is fixed so that the stochastic solvers tried during tuning
# ('sag', 'saga') give reproducible results; the default 'lbfgs' solver does
# not depend on it.
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(multi_class='multinomial', n_jobs=-1, random_state=42))
])

In [None]:
# Fit scaler and logistic regression on the training split.
pipe_lr.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted pipeline on the hold-out split.
pipe_lr.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_lr, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀  Catboost

In [None]:
# Baseline pipeline: standardise the features, then fit a CatBoost classifier
# that optimises and reports the multi-class loss, with training logs silenced.
pipe_cat = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('cat', CatBoostClassifier(
        loss_function='MultiClass',
        eval_metric='MultiClass',
        verbose=False,
    )),
])

In [None]:
# train_pool = Pool(data=X_train, label=y_train)
# test_pool = Pool(data=X_test, label=y_test.values) 

In [None]:
# Fit scaler and CatBoost on the training split.
pipe_cat.fit(X_train,y_train)
# Alternative kept from an earlier experiment (Pool-based fitting):
# pipe_cat.fit(train_pool)

In [None]:
# Mean accuracy of the fitted pipeline on the hold-out split.
pipe_cat.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_cat, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 SGDClassifier

In [None]:
# Baseline pipeline: standardise the features, then fit a linear model trained
# with stochastic gradient descent.
# random_state is fixed because both the per-epoch shuffling and the validation
# subset carved out by early_stopping=True are random; without a seed the
# scores change from run to run.
# NOTE(review): with the default hinge loss the estimator has no predict_proba;
# it can only take part in soft voting once the tuning step has switched the
# loss to a probabilistic one.
pipe_sgd = Pipeline([
    ('scaler', StandardScaler()),
    ('sgd', SGDClassifier(alpha=0.001, early_stopping=True, n_jobs=-1, random_state=42))
])

In [None]:
# Fit scaler and SGD classifier on the training split.
pipe_sgd.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted pipeline on the hold-out split.
pipe_sgd.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_sgd, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 RandomForestClassifier

In [None]:
# Baseline pipeline: standardise the features, then fit a random forest.
# random_state is fixed because bootstrap sampling and feature sub-sampling are
# random; without a seed the forest (and its score) differs on every run.
# n_jobs=-1 builds the trees on all cores, matching the other estimators in
# this notebook, and does not change the result.
pipe_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_jobs=-1, random_state=42))
])

In [None]:
# Fit scaler and random forest on the training split.
pipe_rf.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted pipeline on the hold-out split.
pipe_rf.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_rf, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 LightGBM

In [None]:
# Baseline pipeline: standardise the features, then fit a LightGBM multi-class
# model with 150 boosting rounds and at most 35 leaves per tree.
pipe_lgbm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lgbm', LGBMClassifier(
        n_estimators=150,
        num_leaves=35,
        objective='multiclass',
    )),
])

In [None]:
# Fit scaler and LightGBM on the training split.
pipe_lgbm.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted pipeline on the hold-out split.
pipe_lgbm.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_lgbm, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 BernoulliNB

In [None]:
# Baseline pipeline: standardise the features, then fit a Bernoulli naive Bayes
# model with smoothing parameter alpha = 7.
pipe_bnb = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('bnb', BernoulliNB(alpha=7.0)),
])

In [None]:
# Fit scaler and Bernoulli naive Bayes on the training split.
pipe_bnb.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted pipeline on the hold-out split.
pipe_bnb.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_bnb, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 XGBoost

In [None]:
# Baseline pipeline: standardise the features, then fit an XGBoost classifier
# with its default settings.
pipe_xgb = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier()),
])

In [None]:
# Fit scaler and XGBoost on the training split.
pipe_xgb.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted pipeline on the hold-out split.
pipe_xgb.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_xgb, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

# 👉 Hyperparameter Tuning

## 🚀 Tuning Logistic Regression

In [None]:
# Search space for the logistic-regression step ('lr') of the pipeline.
# Two sub-grids are needed because the L1 penalty is only combined with the
# 'saga' solver, while L2 is tried with every listed solver.
params_lr = [
    dict(
        lr__penalty=['l2'],
        lr__C=[0.01, 0.1, 1.0, 2.0],
        lr__solver=['newton-cg', 'lbfgs', 'sag', 'saga'],
    ),
    dict(
        lr__penalty=['l1'],
        lr__C=[0.01, 0.1, 1.0, 2.0],
        lr__solver=['saga'],
    ),
]

In [None]:
# Exhaustive search over params_lr with 5-fold stratified cross-validation.
# The search is fitted on the training split only. Fitting it on the full
# (X, y) would let the hold-out rows influence which hyper-parameters are
# chosen, so the later score on (X_test, y_test) would be optimistically biased.
# cv=skfold spells out the scheme GridSearchCV would otherwise pick implicitly
# for a classifier.
hyper_search = GridSearchCV(pipe_lr, params_lr, cv=skfold)
hyper_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
hyper_search.best_params_

In [None]:
# Replace the baseline pipeline with the best pipeline found by the search.
pipe_lr = hyper_search.best_estimator_

In [None]:
# Fit the tuned pipeline on the training split.
pipe_lr.fit(X_train,y_train)

In [None]:
# Mean accuracy of the tuned pipeline on the hold-out split.
pipe_lr.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_lr, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 Tuning SGDClassifier

In [None]:
# Search space for the SGD step ('sgd') of the pipeline: two losses, two
# penalties, two regularisation strengths and three learning-rate schedules,
# all with an initial learning rate of 0.1.
params_sgd = dict(
    sgd__loss=['log', 'modified_huber'],
    sgd__penalty=['l2', 'l1'],
    sgd__alpha=[0.0001, 0.001],
    sgd__learning_rate=['constant', 'optimal', 'invscaling'],
    sgd__eta0=[0.1],
)

In [None]:
# Exhaustive search over params_sgd with 5-fold stratified cross-validation.
# The search is fitted on the training split only. Fitting it on the full
# (X, y) would let the hold-out rows influence which hyper-parameters are
# chosen, so the later score on (X_test, y_test) would be optimistically biased.
# cv=skfold spells out the scheme GridSearchCV would otherwise pick implicitly
# for a classifier.
hyper_search = GridSearchCV(pipe_sgd, params_sgd, cv=skfold)
hyper_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
hyper_search.best_params_

In [None]:
# Replace the baseline pipeline with the best pipeline found by the search.
pipe_sgd = hyper_search.best_estimator_

In [None]:
# Fit the tuned pipeline on the training split.
pipe_sgd.fit(X_train,y_train)

In [None]:
# Mean accuracy of the tuned pipeline on the hold-out split.
pipe_sgd.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_sgd, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 Tuning RandomForestClassifier

In [None]:
# params_rf = {
#     'rf__n_estimators': [50,100,150],
#     'rf__criterion' : ['gini', 'entropy'],
#     'rf__min_samples_split' : [2,3,4],
#     'rf__max_features': ['auto', 'sqrt', 'log2']
#     }

In [None]:
# hyper_search = GridSearchCV(pipe_rf, params_rf)
# hyper_search.fit(X,y)

In [None]:
# hyper_search.best_params_

In [None]:
# pipe_rf = hyper_search.best_estimator_

In [None]:
# pipe_rf.fit(X_train,y_train)

In [None]:
# pipe_rf.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_rf, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 Tuning LightGBM

In [None]:
# Search space for the LightGBM step ('lgbm') of the pipeline: one sub-grid per
# boosting type ('gbdt' first, then 'dart'), each covering the same tree
# sizes, learning rates and numbers of boosting rounds.
params_lgbm = [
    {
        'lgbm__boosting_type': [booster],
        'lgbm__num_leaves': [30, 35, 40, 45],
        'lgbm__learning_rate': [0.001, 0.01, 0.1],
        'lgbm__n_estimators': [50, 100, 150],
    }
    for booster in ('gbdt', 'dart')
]

In [None]:
# Exhaustive search over params_lgbm with 5-fold stratified cross-validation.
# The search is fitted on the training split only. Fitting it on the full
# (X, y) would let the hold-out rows influence which hyper-parameters are
# chosen, so the later score on (X_test, y_test) would be optimistically biased.
# cv=skfold spells out the scheme GridSearchCV would otherwise pick implicitly
# for a classifier.
hyper_search = GridSearchCV(pipe_lgbm, params_lgbm, cv=skfold)
hyper_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
hyper_search.best_params_

In [None]:
# Replace the baseline pipeline with the best pipeline found by the search.
pipe_lgbm = hyper_search.best_estimator_

In [None]:
# Fit the tuned pipeline on the training split.
pipe_lgbm.fit(X_train,y_train)

In [None]:
# Mean accuracy of the tuned pipeline on the hold-out split.
pipe_lgbm.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_lgbm, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

## 🚀 Tuning BernoulliNB

In [None]:
# Search space for the naive-Bayes step ('bnb') of the pipeline: smoothing
# strengths 1.0 and 3.0 through 10.0 in steps of 1 (2.0 is not part of the grid).
params_bnb = {
    'bnb__alpha': [1.0] + [float(alpha) for alpha in range(3, 11)],
}

In [None]:
# Exhaustive search over params_bnb with 5-fold stratified cross-validation.
# The search is fitted on the training split only. Fitting it on the full
# (X, y) would let the hold-out rows influence which hyper-parameters are
# chosen, so the later score on (X_test, y_test) would be optimistically biased.
# cv=skfold spells out the scheme GridSearchCV would otherwise pick implicitly
# for a classifier.
hyper_search = GridSearchCV(pipe_bnb, params_bnb, cv=skfold)
hyper_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
hyper_search.best_params_

In [None]:
# Replace the baseline pipeline with the best pipeline found by the search.
pipe_bnb = hyper_search.best_estimator_

In [None]:
# Fit the tuned pipeline on the training split.
pipe_bnb.fit(X_train,y_train)

In [None]:
# Mean accuracy of the tuned pipeline on the hold-out split.
pipe_bnb.score(X_test,y_test)

In [None]:
# cross_validation_score = cross_val_score(pipe_bnb, X, y, cv=skfold)
# print(np.mean(cross_validation_score))

# 👉 Voting Classifier

In [None]:
# Soft-voting ensemble: the final class probabilities are the average of the
# probabilities predicted by each member pipeline.
ensemble_members = [
    ('lgbm', pipe_lgbm),
    ('cat', pipe_cat),
    ('xgb', pipe_xgb),
    ('bnb', pipe_bnb),
    ('lr', pipe_lr),
    ('sgd', pipe_sgd),
    ('rf', pipe_rf),
]
vote = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Fit every member of the ensemble on the training split.
vote.fit(X_train, y_train)

In [None]:
# Mean accuracy of the ensemble on the hold-out split.
vote.score(X_test,y_test)

# 👉 Submission

In [None]:
# Class probabilities for every row of the (already transformed) test frame;
# one column per class.
probability_predictions= vote.predict_proba(test)
probability_predictions

In [None]:
# Write the predicted probabilities into columns 1-4 (by position) of the
# submission template, leaving its first column untouched.
# NOTE(review): assumes the template has exactly four probability columns in
# the same order as the encoded classes (label.classes_) — confirm.
sample_submission.iloc[:,1:5] = probability_predictions
sample_submission

In [None]:
# Save the filled-in template as the submission file, without the row index.
sample_submission.to_csv('submission.csv',index=False)