In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
# Let pandas render up to 80 columns before truncating wide frames.
pd.set_option('display.max_columns', 80)

### Caminho de treino e teste

In [None]:
# Both CSV files live in the same Kaggle input directory.
input_dir = '/kaggle/input/tabular-playground-series-jun-2021'
train_path = input_dir + '/train.csv'
test_path = input_dir + '/test.csv'

In [None]:
# Read both splits the same way, using the `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (train_path, test_path)
)

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()  # author's note: no NaN or null values

In [None]:
# Number of rows per target label.
train.loc[:, 'target'].value_counts()

In [None]:
# Histogram grid over the training frame on a 120x80-inch figure.
train.hist(figsize=(120,80))
# Render explicitly (as the later plt.hist cell does) so the cell shows the
# figure rather than the repr of the returned array of Axes.
plt.show()

In [None]:
# Restrict this preview to the first ten columns of the training frame.
first_ten = train.iloc[:, :10]
first_ten.hist(figsize=(20,15))

# Standardise the same ten columns; `k` is plotted in the next cell.
s = StandardScaler()
k = s.fit_transform(first_ten)

In [None]:
# Histogram of the standardised columns in `k`, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(k)
plt.show()

### Separar o target

In [None]:
# Separate the label column from the feature columns.
train_y = train['target']
train_X = train.drop(columns='target')

# Feature names, kept as a plain list and displayed as the cell output.
features = train_X.columns.tolist()
features

In [None]:
# Preprocessing pipeline with a single standard-scaling step.
scaling_steps = [('scaler', StandardScaler())]
pipeline = Pipeline(steps=scaling_steps)

In [None]:
# Fit the preprocessing pipeline on the training features, then transform them.
train_prep = pipeline.fit(train_X).transform(train_X)

In [None]:
# Linear classifier trained with stochastic gradient descent.
# random_state is fixed (same seed as the random forest below) because SGD
# shuffles the training data; without it every Restart & Run All yields a
# different model and different scores.
sgd_class = SGDClassifier(n_jobs=8, random_state=41)
sgd_class.fit(train_prep,train_y)

# Predictions on the training rows themselves; scored with f1_score further down.
predictions = sgd_class.predict(train_prep)

In [None]:
# 3-fold cross-validated accuracy of the SGD classifier.
cross_val_score(
    estimator=sgd_class,
    X=train_prep,
    y=train_y,
    scoring='accuracy',
    cv=3,
    n_jobs=8,
)

In [None]:
# Weighted F1 of the SGD predictions against the training labels.
f1_score(y_true=train_y, y_pred=predictions, average='weighted')

### RandomForest Classifier

In [None]:
# Shallow random forest: 1000 trees of depth 4, seeded for reproducibility.
rf_clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=4,
    random_state=41,
    n_jobs=8,
).fit(train_prep, train_y)

# Show the fitted estimator as the cell output.
rf_clf

In [None]:
# Class predictions of the fitted forest on the training rows.
rf_predictions = rf_clf.predict(X=train_prep)

In [None]:
# Weighted F1 of the forest's predictions against the training labels.
f1_score(y_true=train_y, y_pred=rf_predictions, average='weighted')

In [None]:
# Pair each feature name with its importance, then order from most to least important.
importance_pairs = list(zip(features, rf_clf.feature_importances_))
importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
importance_pairs

In [None]:
# 3-fold cross-validated score of the forest (estimator's default scorer).
cross_val_score(estimator=rf_clf, X=train_prep, y=train_y, cv=3, n_jobs=8)

In [None]:
# Out-of-fold class predictions from 3-fold cross-validation.
t_train_pred = cross_val_predict(
    estimator=rf_clf,
    X=train_prep,
    y=train_y,
    cv=3,
    n_jobs=8,
)

### Confusion Matrix

In [None]:
# Confusion matrix of the training labels versus the cross-validated predictions.
conf_mx = confusion_matrix(y_true=train_y, y_pred=t_train_pred)

In [None]:
# Draw the confusion matrix as a grayscale image.
plt.matshow(conf_mx, cmap='gray')

In [None]:
# Show the raw confusion-matrix counts behind the plot.
conf_mx

### Probabilidades por classe

In [None]:
# Out-of-fold class probabilities: each row is scored by a forest that did not
# see it during fitting. Predicting with rf_clf on train_prep would score the
# model on its own training rows and understate the log loss.
# Uses the same cv=3 / n_jobs=8 settings as the other cross-validation cells.
y_pred = cross_val_predict(
    rf_clf,
    train_prep,
    train_y,
    cv=3,
    n_jobs=8,
    method='predict_proba',
)

### Log Loss

In [None]:
# Multi-class log loss of the predicted probabilities against the training labels.
logloss = log_loss(y_true=train_y, y_pred=y_pred)
print(f'Log loss: {logloss}')

### Submission

In [None]:
# Class probabilities for the test set, scaled with the already-fitted pipeline.
# Stored under its own name so the SGD `predictions` used by the earlier
# f1_score cell is not overwritten (re-running that cell would otherwise fail).
test_proba = rf_clf.predict_proba(pipeline.transform(test))

# Column labels come from rf_clf.classes_, which is the order predict_proba
# uses, so each probability column is guaranteed to carry the right label.
# NOTE(review): assumes the target labels are the 'Class_1'..'Class_9' strings
# the original hard-coded header used - confirm against train['target'].
sub = pd.DataFrame(test_proba, columns=list(rf_clf.classes_))

# Put `id` first, the conventional layout for a submission file.
sub.insert(0, 'id', test.index.to_numpy())

sub.to_csv('submission.csv', index=False)

