# Tabular Playground Series - Junio 2021

Para esta edición de Tabular Playground se nos pide hacer una clasificación para evaluar las distintas clases de un dataset con más de 70 características. Para ello, como viene siendo costumbre, empezaremos con una exploración de los datos para limpiarlos de ser necesario; después se muestran los datos por pantalla para observar su distribución. Tras esto, comparamos los distintos modelos seleccionados y elegimos el que nos da mejor resultado de base; en el modelo seleccionado ajustamos los hiperparámetros para obtener un mejor resultado que con el modelo al desnudo. Por último, subimos los resultados a la competición.

In [None]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
#
import seaborn as sns
import matplotlib.pyplot as plt

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgb
from catboost import CatBoostClassifier as cbc
from tpot import TPOTClassifier

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Summary statistics for the numeric columns of the training data.
train.describe()

In [None]:
# Summary statistics for the numeric columns of the test data.
test.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the training data.
train.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the test data.
test.info()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

In [None]:
# Distinct class labels present in the target column.
train['target'].unique()

In [None]:
# Bar chart of how many training rows fall into each target class,
# drawn on an explicitly created 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='target', data=train, ax=ax)

In [None]:
# Styled summary table: one row per column (id excluded), with a bar on the
# mean and colour gradients on std, min and median.
# This cell belongs to Kaustubh B Bhargav (@kaustubh93)
feature_stats = train.drop(columns=['id']).describe().T
styled_stats = feature_stats.style.bar(subset=['mean'])
for stat_name in ('std', 'min', '50%'):
    styled_stats = styled_stats.background_gradient(subset=[stat_name])
styled_stats

In [None]:
# Encode the string class labels in `target` as consecutive integers.
# fit_transform learns the label set and encodes in one step, so no separate
# fit() call is needed beforehand.
encoder = LabelEncoder()

# NOTE: train1 is a second name for the same DataFrame (not a copy), so the
# preview below shows the already-encoded target.
train1 = train
train['target'] = encoder.fit_transform(train['target'])

# The nine labels Class_1..Class_9 (see the submission columns) become 0..8.
train1.head(10)

In [None]:
# Standardizer that is fitted on the training features and then applied to the test set.
scaler = StandardScaler()

In [None]:
# Separate the target from the features and discard the id column, which is
# not a feature in either table.
y = train['target']
feature_frame = train.drop(columns=['target', 'id'])
test = test.drop(columns=['id'])

# Learn the scaling statistics from the training features only, then apply
# the same transformation to the test features.
X = scaler.fit_transform(feature_frame)
test = scaler.transform(test)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Baseline 1: random forest with default hyperparameters, scored with
# log loss on the held-out split.
model = RandomForestClassifier().fit(X_train, y_train)
y_preds = model.predict_proba(X_test)
log_loss1 = log_loss(y_test, y_preds)
log_loss1

In [None]:
# Baseline 2: CatBoost with default hyperparameters, scored with log loss
# on the held-out split.
model2 = cbc()

model2.fit(X_train, y_train, verbose=False)
# Predict with model2 itself; the earlier version called the random forest
# (`model`) here, so log_loss2 merely repeated the first baseline's score.
y_preds2 = model2.predict_proba(X_test)
log_loss2 = log_loss(y_test, y_preds2)
log_loss2

In [None]:
# Baseline 3: XGBoost with default hyperparameters, scored with log loss
# on the held-out split.
model3 = xgb().fit(X_train, y_train)
y_preds3 = model3.predict_proba(X_test)
log_loss3 = log_loss(y_test, y_preds3)
log_loss3

In [None]:
# Baseline 4: LightGBM with default hyperparameters, scored with log loss
# on the held-out split.
model4 = lgb().fit(X_train, y_train)
y_preds4 = model4.predict_proba(X_test)
log_loss4 = log_loss(y_test, y_preds4)
log_loss4

In [None]:
# I tried to run this part, but after waiting 4 hours it still had not finished.

# import h2o
# from h2o.automl import H2OAutoML
# h2o.init()

# # Run AutoML for 20 base models (limited to 1 hour max runtime by default)
# train = h2o.import_file("../input/tabular-playground-series-jun-2021/train.csv")
# test = h2o.import_file("../input/tabular-playground-series-jun-2021/test.csv")

# x = train.columns
# y = 'target'
# x.remove(y)

# aml = H2OAutoML(max_models = 20, seed = 1)
# aml.train(x = x, y = 'target', training_frame = train)

# # View the AutoML Leaderboard
# lb = aml.leaderboard
# lb.head(rows = lb.nrows)  # Print all rows instead of default (10 rows)
# preds = aml.leader.predict(test)
# lb = h2o.automl.get_leaderboard(aml, extra_columns = 'ALL')
# lb

In [None]:
# Running this part, I obtain: 1.9927355643445277

# def objective(trial, data = X, target = y):
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

#     model = RandomForestClassifier()  
#     model.fit(X_train,y_train)

#     y_preds = model.predict_proba(X_test)


#     log_loss1 = log_loss(y_test, y_preds)

#     return log_loss1

# study = optuna.create_study(direction = 'minimize')
# study.optimize(objective, n_trials = 100)

In [None]:
# tpot = TPOTClassifier(generations = 1, random_state = 42, verbosity = 1)
# tpot.fit(X_train, y_train)
# print(tpot.score(X_test, y_test))

In [None]:
# Final model: LightGBM with hand-picked hyperparameters (low learning rate,
# 1000 trees, large leaf budget, light L2 regularisation).
# The target has nine classes, so the objective is declared as 'multiclass'
# rather than 'binary'.
modelfinal = lgb(
    boosting_type='gbdt',
    objective='multiclass',
    num_leaves=150,
    learning_rate=0.005,
    n_estimators=1000,
    max_depth=300,
    reg_lambda=0.05,
)

# Fit on the training split and score with log loss on the held-out split so
# the result is comparable with the baseline models.
modelfinal.fit(X_train, y_train)
y_predsfinal = modelfinal.predict_proba(X_test)
log_lossfinal = log_loss(y_test, y_predsfinal)
log_lossfinal

In [None]:
# Build the submission table: one probability column per class plus the row id.
sample_submission = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')
test_pred = modelfinal.predict_proba(test)

# predict_proba returns one column per encoded class (0..8), which maps in
# order onto Class_1..Class_9.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
                 'Class_6', 'Class_7', 'Class_8', 'Class_9']
submission = pd.DataFrame(test_pred, columns=class_columns)

# Put id in the first column instead of appending it last.
# NOTE(review): assumes the sample submission lists id first and has its rows
# in the same order as test.csv - confirm against the file.
submission.insert(0, 'id', sample_submission['id'])
submission.head()

In [None]:
# Write the submission without the DataFrame index. The output file name is
# spelled 'submission.csv' (it was previously misspelled 'submision.csv').
submission.to_csv('submission.csv', index=False)