<img src="https://i.imgur.com/bGpKLYh.png">

Although the data used for this competition is synthetic, it is based on a real dataset and was generated using a CTGAN. The original dataset deals with predicting the category of an eCommerce product given various attributes of the listing. 
> 🎯 Goal: Predict, for each id, the probability that it belongs to each class

> 📖 Data:
> - ```train.csv``` - *training data*, one product (id) per row, with the associated features (feature_*) and class label (target)
> - ```test.csv``` - *test data*

# Import libraries 📚

In [None]:
!pip install pycomp

import numpy as np 
import pandas as pd
import cudf
import cupy
import time
import seaborn as sns

from pycomp.viz.insights import *
from cuml.preprocessing import LabelEncoder
from cuml.preprocessing.model_selection import train_test_split
from cuml.metrics import accuracy_score
from cuml import PCA
from cuml.manifold import UMAP, TSNE
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.metrics import log_loss as logloss
from tpot import TPOTClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import roc_curve, log_loss

import warnings
warnings.filterwarnings("ignore")

In [None]:
def custom_palette(custom_colors):
    """Make ``custom_colors`` the active seaborn palette and draw a swatch strip.

    Parameters
    ----------
    custom_colors : list
        Colour specifications (e.g. hex strings) accepted by
        ``sns.color_palette``.
    """
    # NOTE(review): `plt` is never imported explicitly in this notebook;
    # presumably it arrives via the star import from pycomp.viz.insights -- confirm.
    palette = sns.color_palette(custom_colors)
    sns.set_palette(palette)
    sns.palplot(palette, size=0.8)
    # Hide tick marks and labels so only the colour swatches remain visible.
    plt.tick_params(axis='both', labelsize=0, length = 0)

# Four hex colours used as the notebook palette; `b` is indexed again further
# down when colouring the target donut chart.
b = ["#2a9d8f","#e9c46a","#f4a261","#e76f51"]
custom_palette(b)

In [None]:
# Read the competition CSV files into cuDF DataFrames.
train = cudf.read_csv("../input/tabular-playground-series-may-2021/train.csv")
test = cudf.read_csv("../input/tabular-playground-series-may-2021/test.csv")

In [None]:
# Display the training frame as the cell output.
train

In [None]:
# Display the test frame as the cell output.
test

# EDA 📊

In [None]:
# Pandas copy of the training frame, used by the seaborn/matplotlib plots.
train_p = train.to_pandas()
# Every column except the first and the last one
# (presumably `id` and `target` -- confirm against the CSV header).
lic = list(train_p.columns[1:-1])

In [None]:
# Donut chart of the `target` column. `plot_donut_chart` is presumably provided
# by the star import from pycomp.viz.insights -- confirm. The palette `b` is
# passed in a rotated order (second, third, fourth, first colour).
plot_donut_chart(df=train.to_pandas(), col='target',
                 title='Target Value Distribution',colors=[b[1],b[2],b[3],b[0]])

In [None]:
def plot(col):
    """Draw a count plot of one column of ``train_p``, split by target class.

    Parameters
    ----------
    col : str
        Name of the column of the module-level ``train_p`` frame to plot on
        the x axis; bars are grouped by the ``target`` column.
    """
    plt.figure(figsize = (18, 8),dpi=80)
    # rcParams is process-wide, so the thicker axes border also applies to
    # figures created after this call.
    plt.rcParams["axes.linewidth"] = 3
    sns.countplot(x = col, hue = 'target', data = train_p)
    plt.title("Distribution of "+ col,fontsize=15)
    # A single legend call: an earlier untitled legend would be replaced anyway.
    plt.legend(title='Target',loc='upper right')
    plt.show()

# One count plot per column name collected in `lic`.
for col in lic:
    plot(col)

<center><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRrRyySk4pDN6tju38z-r8oVA6oha9WSJBl0gVxTNALk3gz8TXZaNjQQfPSjSisodD-upo&usqp=CAU" ></center>

# Label Encoding 🏷️

In [None]:
# Replace the `target` column with the integer codes produced by cuML's
# LabelEncoder; `le` keeps the fitted encoder.
le = LabelEncoder()
encoded = le.fit_transform(train.target)
# assign() returns a new frame, so `train` is rebound rather than edited in place.
train = train.assign(target=encoded)
train.head()

In [None]:
# Heatmap of the pairwise correlations between the columns of `train`.
plt.figure(figsize=(16,16),dpi=80)
# `corr` is kept at module level; the next cell reads its `target` column.
corr=train.to_pandas().corr()
# Boolean upper-triangle mask (diagonal included): masked cells are not drawn,
# so each pair of columns appears only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='PuRd', robust=True, center=0,
            square=True, linewidths=.5)
plt.title('Correlation', fontsize=15)
plt.show()

In [None]:
def styling(cell):
    """Return the CSS declaration used to colour one cell of a styled table.

    Parameters
    ----------
    cell : number
        The value held by the table cell.

    Returns
    -------
    str
        The pink/black style for values below zero, otherwise the
        purple/white style.
    """
    negative_css = 'background: #fde2e4; color:black'
    non_negative_css = 'background: #deaaff; color: white'
    return negative_css if cell < 0 else non_negative_css

# Single-row table of each column's correlation with `target`: take the
# `target` column of `corr`, drop its last row and transpose.
# NOTE(review): the dropped last row is presumably target's correlation with
# itself, which assumes `target` is the last column -- confirm.
target_df = pd.DataFrame(corr.target).iloc[:-1,:].T
# Colour each cell with the CSS returned by styling().
target_df.style.applymap(styling)

# Dimensionality Reduction 💭

In [None]:
# Pandas copy of `train` as it stands in this cell; plot_dr reads its `target`
# column to colour the scatter points.
train_pp = train.to_pandas()
def plot_dr(technique,title):
    """Project the feature columns to two dimensions and scatter-plot them.

    Parameters
    ----------
    technique : class
        Estimator class that accepts ``n_components`` and offers
        ``fit_transform`` (the notebook passes ``UMAP`` and ``TSNE``).
    title : str
        Title drawn above the scatter plot.

    The features come from the module-level ``train_p[lic]`` and the point
    colours from ``train_pp['target']``. The elapsed time that is printed
    covers the fit, the plotting and ``plt.show()``.
    """
    t0 = time.time()
    reducer = technique(n_components=2)
    embedding = reducer.fit_transform(train_p[lic].values)

    plt.figure(figsize = (16, 8))
    xs = embedding[:,0]
    ys = embedding[:,1]
    plt.scatter(xs, ys, c = train_pp['target'].values, s = 0.7, cmap='cool')
    plt.title(title,fontsize=18, fontweight='bold')
    # Embedding axes carry no meaningful units, so the ticks are removed.
    plt.xticks([])
    plt.yticks([])
    plt.show()

    elapsed = time.time() - t0
    print('Duration: {} seconds'.format(elapsed))

In [None]:
# Two-component UMAP projection of the feature columns.
plot_dr(UMAP,"UMAP")

In [None]:
# Two-component t-SNE projection of the feature columns.
plot_dr(TSNE,"TSNE")

In [None]:
%%time

# Separate the features from the encoded label.
# NOTE(review): X presumably still contains the `id` column, so it is fed to
# the models as a feature -- confirm whether that is intended.
X = train.drop(["target"],axis=1)
y = train["target"]

# 70/30 hold-out split. Stratification needs shuffling: with shuffle=False the
# `stratify` argument cannot take effect (depending on the library version it
# is ignored or rejected). A fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3,
                                                    shuffle=True, stratify=y,
                                                    random_state=42)

# Model training ⚙️

In [None]:
def training(model, X_train, y_train, X_val, y_val, model_name):
    """Fit ``model``, score it on the validation split and print a short report.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    X_train, y_train :
        Training features and labels passed to ``model.fit``.
    X_val, y_val :
        Validation features and labels used for the log-loss score.
    model_name : str
        Label printed in the report header.

    Returns
    -------
    None
        The log loss and the fit duration (in seconds) are printed only.
    """
    fit_start = time.time()
    model.fit(X_train, y_train)
    # Stop the clock right after fit so the figure reported as "Training time"
    # does not also include inference and metric computation.
    training_time = time.time() - fit_start

    predicts = model.predict_proba(X_val)
    logl = logloss(y_val, predicts)

    print("\t\t\t--- Model:", model_name,"---")
    print("Log loss: ", logl,"\t\t\t","Training time:",training_time,"\n")

In [None]:
# Cast every feature column of both splits to float32, column by column and
# in place on the existing frames.
for frame in (X_train, X_val):
    for name in frame.columns:
        frame[name] = frame[name].astype('float32')

# Labels are cast to int32.
y_train, y_val = y_train.astype('int32'), y_val.astype('int32')

In [None]:
# Two cuML baselines: L1-penalised logistic regression and a 500-tree
# random forest.
lr = LogisticRegression(fit_intercept=True,penalty='l1')
rf = cuRFC(n_estimators=500)

# Parallel lists: estimator and the name shown in its report.
m = [lr,rf]
mn = ["Logistic Regression","Random Forest"]

for estimator, label in zip(m, mn):
    training(model=estimator, X_train=X_train, y_train=y_train,
             X_val=X_val, y_val=y_val, model_name=label)

<center><img src="https://raw.githubusercontent.com/EpistasisLab/tpot/master/images/tpot-logo.jpg" width="250"></center>

In [None]:
# AutoML pipeline search with TPOT: 5 generations of 100 candidate pipelines,
# each evaluated with 5-fold CV and scored by one-vs-rest ROC AUC.
# config_dict="TPOT cuML" selects one of TPOT's named built-in configurations.
tpot = TPOTClassifier(
   generations=5,
   population_size=100,
   scoring = 'roc_auc_ovr',
   config_dict="TPOT cuML",
   cv=5,
   verbosity=2
)

# for cuML with TPOT, we need to use CPU data
tpot.fit(X_train.to_pandas(), y_train.to_pandas())
# Write the exported pipeline to a file in the working directory.
tpot.export('tps-pipeline.py')

In [None]:
# tpot.score() evaluates with the scoring function given to TPOTClassifier
# ('roc_auc_ovr' in this notebook), so the value is labelled as ROC AUC rather
# than accuracy.
print('ROC AUC (OvR) :', tpot.score(X_val.to_pandas(), y_val.to_pandas()))
# Class probabilities for the test set; used later for the submission file.
fin_preds = tpot.predict_proba(test.to_pandas())

<center><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRAWAwYLg9InZxOdcxr7mEKMyVsNM8MHNXB0GDDKC5tqR0I71h9LbvXQLMCoSQk82vh3Zw&usqp=CAU"></center>

In [None]:
%%time

# Stratified 5-fold cross-validation of a LightGBM multiclass model.
# Out-of-fold (OOF) probabilities are collected for scoring, and the test-set
# probabilities of all folds are averaged.
NUM_SPLITS = 5
model = LGBMClassifier(**{'learning_rate': 0.05,
                    'max_depth': 10,
                    'num_leaves' : 63,
                    'objective': 'multiclass',
                    'metric': 'multi_logloss',
                    'bagging_seed': 42,
                    'boosting_type': 'gbdt',
                    'is_unbalance': True})

# Re-join the earlier hold-out split (as pandas objects) so that every
# training row takes part in the cross-validation.
X_c = pd.concat([X_train.to_pandas(), X_val.to_pandas()])
y_c = pd.concat([y_train.to_pandas(), y_val.to_pandas()])

test_p = test.to_pandas()
# One row per sample, one column per class (4 classes).
oof_pred = np.zeros((len(X_c), 4))
test_pred = np.zeros((len(test_p), 4))

folds = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=2021)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_c,y_c)):
    print('-- Fold:', fold_,'--' )
    model = model.fit(X_c.iloc[trn_idx], y_c.iloc[trn_idx], eval_set=[(X_c.iloc[trn_idx],y_c.iloc[trn_idx]),(X_c.iloc[val_idx], y_c.iloc[val_idx])],
                          eval_metric = 'multi_logloss',
                          early_stopping_rounds = 100,verbose=250)

    # Fold indices are positional, so they index the OOF array directly.
    temp_oof = model.predict_proba(X_c.iloc[val_idx])
    oof_pred[val_idx] =  temp_oof

    print(f"Log Loss: {log_loss(y_c.iloc[val_idx], temp_oof)}")

    # Accumulate this fold's test probabilities. Adding test_pred to itself
    # here would leave the array at all zeros.
    temp_test = model.predict_proba(test_p)
    test_pred += temp_test/NUM_SPLITS

print(f"Overall Log Loss: {log_loss(y_c, oof_pred)}")

<center><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSjifalk1omESSaUXBBKVI16qaoPQYPxya-Sd5Gm__po7WPeP8R3aDBZD-hnYZbWYeSdg&usqp=CAU"></center>

In [None]:
%%time

# Stratified 10-fold cross-validation of a CatBoost classifier with default
# hyper-parameters. Reuses X_c, y_c and test_p built in the LightGBM cell and
# overwrites that cell's oof_pred / test_pred arrays.
NUM_SPLITS = 10
model2 = CatBoostClassifier()

# One row per sample, one column per class (4 classes).
oof_pred = np.zeros((len(X_c), 4))
test_pred = np.zeros((len(test_p), 4))

folds = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=2021)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_c,y_c)):
    print('-- Fold:', fold_,'--' )
    model2 = model2.fit(X_c.iloc[trn_idx], y_c.iloc[trn_idx],eval_set = [(X_c.iloc[val_idx], y_c.iloc[val_idx])],
                          early_stopping_rounds = 100,verbose=250)

    # Score the CatBoost model fitted in this fold (model2), not the LightGBM
    # estimator `model` left over from the previous cell.
    temp_oof = model2.predict_proba(X_c.iloc[val_idx])
    oof_pred[val_idx] =  temp_oof

    print(f"Log Loss: {log_loss(y_c.iloc[val_idx], temp_oof)}")

    # Accumulate this fold's test probabilities. Adding test_pred to itself
    # here would leave the array at all zeros.
    temp_test = model2.predict_proba(test_p)
    test_pred += temp_test/NUM_SPLITS

print(f"Overall Log Loss: {log_loss(y_c, oof_pred)}")

# Submission file 📝

In [None]:
# Build the submission from the TPOT probabilities (`fin_preds`): one column
# per class, with `id` moved to the front.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']

predictions = cudf.DataFrame(fin_preds)
predictions.columns = class_columns
predictions['id'] = test['id']
predictions = predictions[['id'] + class_columns]

predictions.to_csv("/kaggle/working/Predictions_teapot.csv", index=False)
predictions

Work in progress 🚧