# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 15px;">Tabular Playground Series - May 2021</p>

The algorithms used are:

* GradientBoostingClassifier
* AdaBoostClassifier
* ExtraTreesClassifier
* RandomForestClassifier
* BaggingClassifier
* DecisionTreeClassifier
* ExtraTreeClassifier
* XGBClassifier
* LGBMClassifier

In [None]:
# Data handling.
import pandas as pd
import numpy as np

# Plotting. The settings below are global and affect every figure drawn later.
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set_style("dark")
sns.set(rc={'figure.figsize':(10,6)})  # request a 10x6 default figure size
sns.set(font_scale=1.3)  # scale seaborn's font sizes by 1.3
plt.style.use('fivethirtyeight')  # apply matplotlib's 'fivethirtyeight' style sheet

# Target encoding and hold-out splitting.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Candidate classifiers.
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

# NOTE: train_test_split is imported a second time here (already imported above).
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import roc_curve, log_loss, accuracy_score

# Silence every warning for the rest of the session; this also hides
# deprecation warnings raised by the libraries above.
import warnings
warnings.filterwarnings("ignore")

# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 15px;">Input</p>

In [None]:
# Load the competition files in one pass: training set, test set and the
# sample submission (in that order).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
        '../input/tabular-playground-series-may-2021/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Print pandas' summary of the training frame (columns, dtypes, non-null counts).
train.info()

# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 15px;">Preprocess</p>

In [None]:
# Replace the `target` labels with the integer codes learned by a LabelEncoder.
# `le` is kept so the codes can be mapped back to the original labels later.
le = LabelEncoder().fit(train['target'])
encoder = le.transform(train['target'])
train = train.assign(target=encoder)

In [None]:
# Bar chart of how many training rows fall into each (encoded) target class.
plt.figure(figsize=(12,6))
contagem = train['target'].value_counts()  # per-class frequencies; not used by the plot itself

ax = sns.countplot(data=train, x='target')

# Write each bar's count just above it. The x position is the bar's centre, so
# the text must be centre-aligned to actually sit over the bar, and the height
# (a float) is formatted without decimals so the label reads as a count.
for bar in ax.patches:
    count = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, count + 0.9, f'{count:.0f}',
            ha='center', fontsize=16)

ax.set_xlabel('N° of Class', fontsize = 15)
ax.set_ylabel('Count', fontsize = 15);

In [None]:
# Summary statistics of the training data, transposed so each column becomes a row.
train.describe().T

In [None]:
# Draw one KDE per feature of the training set on a two-column grid.
# Feature names are taken from `test` by position (columns 1 to 50, skipping
# column 0); the same names are then looked up in `train`.
columns = test.iloc[:, 1:51].columns
length = len(columns)
# plt.subplot needs an integer row count; round up so an odd number of
# features still gets a slot for the last plot.
n_rows = (length + 1) // 2

# plt.figure (rather than plt.subplots) so no extra full-size Axes is created
# behind the grid.
plt.figure(figsize=(18, 120))

for position, column in enumerate(columns, start=1):
    plt.subplot(n_rows, 2, position)
    sns.kdeplot(train[column])

# Spacing is a figure-level setting, so apply it once instead of per subplot.
plt.subplots_adjust(wspace=.2, hspace=.5)
plt.show()

In [None]:
# Summary statistics of the test data, transposed so each column becomes a row.
test.describe().T

In [None]:
# Draw one KDE per feature of the test set on a two-column grid.
# Features are the columns of `test` at positions 1 to 50 (column 0 is skipped).
columns = test.iloc[:, 1:51].columns
length = len(columns)
# plt.subplot needs an integer row count; round up so an odd number of
# features still gets a slot for the last plot.
n_rows = (length + 1) // 2

# plt.figure (rather than plt.subplots) so no extra full-size Axes is created
# behind the grid.
plt.figure(figsize=(18, 120))

for position, column in enumerate(columns, start=1):
    plt.subplot(n_rows, 2, position)
    sns.kdeplot(test[column])

# Spacing is a figure-level setting, so apply it once instead of per subplot.
plt.subplots_adjust(wspace=.2, hspace=.5)
plt.show()

# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 15px;">Model</p>

In [None]:
# Features are every column of `train` except the encoded target.
# NOTE(review): only `target` is dropped, so any identifier column (presumably
# `id`) stays in `x`. The later `model.predict_proba(test)` call relies on
# `test` having the same columns, so drop it in both places or in neither.
x = train.drop(columns=['target'])
y = train['target']

# Stratified 80/20 hold-out split. The seed is fixed so the model comparison
# and the final submission are reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, stratify=y, random_state=42
)

In [None]:
# Fit every candidate classifier with default parameters on the training split
# and rank them by multi-class log loss on the hold-out split (lower is better).
# NOTE: the 'Probability' column holds the log loss, not a probability.
colunas = ['Modelo','Probability']

models = [
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier()),
    ('BaggingClassifier', BaggingClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('ExtraTreeClassifier', ExtraTreeClassifier()),
    ("XGBClassifier", XGBClassifier()),
    ("LGBMClassifier", LGBMClassifier()),
]

# Collect one [name, loss] row per model and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
linhas = []
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_test)
    linhas.append([name, log_loss(y_test, y_pred)])

resultado = pd.DataFrame(linhas, columns=colunas)
resultado = resultado.sort_values(by='Probability', ascending=True).reset_index(drop=True)
resultado

In [None]:
# Final model: a default-parameter LightGBM classifier fitted on the training split.
model = LGBMClassifier().fit(x_train, y_train)
# Class probabilities for the hold-out rows.
pred = model.predict_proba(x_test)

In [None]:
# Class probabilities for the competition test set (one column per class).
# `test` is passed unmodified, so its columns must match those the model was fitted on.
result = model.predict_proba(test)

# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 15px;">Submission</p>

In [None]:
# Assemble the submission: ids from the sample submission, followed by one
# probability column per class, where column k of `result` feeds the k-th name.
class_names = ('Class_1', 'Class_2', 'Class_3', 'Class_4')
submission = pd.DataFrame({'id': sub['id']})
for position, class_name in enumerate(class_names):
    submission[class_name] = result[:, position]

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)