<center><h1>TPS - EDA + TAB-Net with Optuna Tuning</h1></center>
<hr>

![forest image](https://media.cntraveler.com/photos/5eb18e42fc043ed5d9779733/16:9/w_2580,c_limit/BlackForest-Germany-GettyImages-147180370.jpg)

The dataset that is used for this competition is synthetic but based on a real dataset and generated using a CTGAN. This dataset is based off of the original [Forest Cover Type Prediction](https://www.kaggle.com/c/forest-cover-type-prediction/overview) competition.
<ul>
<li>For each Id in the test set, we must predict the Cover_Type class.</li>
<li>In this notebook we take a meaningful look at the data.</li>
<li>A TabNet classifier is used, with Optuna for hyperparameter tuning.</li>
</ul>

In [None]:
!pip install pytorch-tabnet
!pip install optuna

In [None]:
import torch
import optuna
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import chi2
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from sklearn.model_selection import train_test_split

# Importing the data and Reducing memory usage

In [None]:
# Read the competition's training and test tables from the input directory.
train_csv = '../input/tabular-playground-series-dec-2021/train.csv'
test_csv = '../input/tabular-playground-series-dec-2021/test.csv'

df_main = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [None]:
# Report the size of the training frame.
n_rows, n_cols = df_main.shape
print("The dataframe has {} rows and {} columns".format(n_rows, n_cols))

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    inclusive range contains both the column minimum and maximum. Other
    numeric columns are cast to float32 when their min and max lie inside the
    float32 range, otherwise to float64. Columns whose dtype is not listed in
    ``numerics`` are left untouched.

    Args:
        df: DataFrame to shrink. Columns are reassigned on this same object,
            so the caller's frame is modified.
        verbose: When True, print the final memory usage and the percentage
            reduction.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the dtype's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN or infinite extremes fail this comparison and fall back to float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting footprint to avoid dividing by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the training frame's numeric columns; only df_main is processed here.
df_main = reduce_mem_usage(df_main)

# Exploratory Data Analysis

In [None]:
# Bar chart of how many rows fall into each Cover_Type class,
# with the count written just above every bar.
plt.figure(figsize=(12, 8))
ax = sns.countplot(x="Cover_Type", data=df_main, palette="Set2")
plt.title('Distribution of Cover Type')
plt.xlabel('Cover Type')
plt.ylabel('Count')
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() + 0.2, bar_height * 1.015)
    ax.annotate('{}'.format(bar_height), label_position)

<h3><span style="font-family:cursive;">Here we can see the severe class imbalance with Cover Type 4 and 5 having very few instances.</span></h3>

In [None]:
# Report every column that contains nulls together with its null COUNT
# (the count, not the boolean mask), or confirm that there are none.
null_counts = df_main.isnull().sum()
flag = False
for col, n_null in null_counts[null_counts > 0].items():
    flag = True
    print("{} column has {} null values".format(col, n_null))
if not flag:
    print("No null values present! 😁")

In [None]:
# List the columns that hold a single distinct value (NaN counted as a value).
constant_columns = [
    name for name in df_main.columns
    if df_main[name].nunique(dropna=False) == 1
]
for name in constant_columns:
    print("{} column has only one value".format(name))

<h2>Chi Square Test for Association</h2>
<blockquote><span style="font-family:cursive;">The Chi-Square Test for Association is used to determine if there is any association between two variables. It is really a hypothesis test of independence. The null hypothesis is that the two variables are not associated, i.e., independent. The alternate hypothesis is that the two variables are associated.</span></blockquote>

In [None]:
# Chi-square scores of the columns at positions 15..54 against the target.
# NOTE(review): presumably these are the Soil_Type indicator columns - confirm
# against the dataset's column order.
y = df_main[["Cover_Type"]]
cols = df_main.columns[15:55]
Soil_type_df = df_main.loc[:, cols]
chi_scores = chi2(Soil_type_df, y)

In [None]:
# The second element of the chi2 result is used as the p-values; label them
# by column name and order them from largest to smallest.
p_values = pd.Series(
    chi_scores[1],
    index=Soil_type_df.columns,
).sort_values(ascending=False)

In [None]:
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent that works on old and new versions alike.
# `not x < 0.05` is deliberately kept: it is also true for NaN p-values.
for i, x in p_values.items():
    if not x < 0.05:
        print("Column {} is independent of Cover Type".format(i))

In [None]:
# Columns at positions 1..10 are treated as the numerical features;
# cols_norm is reused by the plotting and scaling cells below.
cols_norm = df_main.columns[1:11]
# Summary statistics for those columns (displayed as the cell output).
df_main[cols_norm].describe()

<h3>Numerical Columns vs Cover Type Distribution</h3>

In [None]:
# One strip plot per numerical column, laid out on a 5x2 grid in row-major order.
fig, axes = plt.subplots(5, 2, figsize=(30,50))
for panel, feature in zip(axes.flat, cols_norm):
    sns.stripplot(ax=panel, x="Cover_Type", y=str(feature), data=df_main, palette="Set2")
    plt.setp(panel.get_xticklabels(), visible = True)
    panel.set_title("{} vs Cover Type".format(feature))

<h3> <span style="font-family:cursive;">Correlation Matrix for Numerical Columns</span></h3>

In [None]:
# Heatmap of the pairwise correlations between the columns in cols_norm.
sns.heatmap(df_main[cols_norm].corr())

<h3><span style="font-family:cursive;">Prepare the data for modelling</span> </h3>

In [None]:
# Remove Soil_Type7 and Soil_Type15 from both the training and the test frame.
dropped_soil_columns = ["Soil_Type7", "Soil_Type15"]
for frame in (df_main, df_test):
    frame.drop(columns=dropped_soil_columns, inplace=True)

# Robust scaling: fit on the training columns only, then apply the same
# fitted transform to both frames.
scaler = RobustScaler()
scaler.fit(df_main[cols_norm])
df_main[cols_norm] = scaler.transform(df_main[cols_norm])
df_test[cols_norm] = scaler.transform(df_test[cols_norm])

<h3> <span style="font-family:cursive;">Oversample class 5 by appending six extra copies of its rows</span> </h3>

In [None]:
# Append six extra copies of the Cover_Type == 5 rows to the training frame.
class_5_rows = df_main[df_main.Cover_Type == 5]
df_main = pd.concat([df_main] + [class_5_rows] * 6, ignore_index=True)

In [None]:
# Target as a single-column DataFrame; it is flattened to a 1-D array further down.
y = df_main[["Cover_Type"]]

In [None]:
# Keep only feature columns in the training frame: remove the target and the row id.
df_main.drop(columns=["Cover_Type", "Id"], inplace=True)

In [None]:
# Keep the test ids for the submission file. An explicit .copy() makes Id_test
# an independent frame, so assigning the prediction column to it later does not
# raise SettingWithCopyWarning.
Id_test = df_test[["Id"]].copy()
df_test.drop(labels=["Id"],axis=1,inplace=True)

# TabNet Classifier + Optuna

In [None]:
# Convert the features to a NumPy array and the target to a flat 1-D array.
df_main = np.array(df_main)
y = np.array(y).ravel()

In [None]:
# Optuna hyperparameter search, disabled by wrapping the whole cell in a
# triple-quoted string (the cell therefore only evaluates to that string).
# It splits the data 80/20, trains a TabNetClassifier per trial and maximises
# validation accuracy.
# NOTE(review): the sampled 'n_steps' value is only passed to StepLR as
# `step_size`; TabNetClassifier itself is never given an `n_steps` argument
# here - confirm whether that is intended before re-enabling the search.
'''
X_train, X_val, y_train, y_val = train_test_split(df_main, y, test_size=0.2)

def objective(trial):

    hyperparams = {
        'n_a_d': trial.suggest_categorical('n_a_d', [8, 16, 24, 32, 64, 128]),
        'n_steps': trial.suggest_int('n_steps', 3, 10, 1),
        'gamma': trial.suggest_categorical('gamma', [1.0, 1.2, 1.5, 2.0]),
        'lambda': trial.suggest_categorical('lambda', [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0]),
        'batch_size': trial.suggest_categorical('batch_size', [1024, 2048, 4096, 8192, 16384, 32768]),
        'virtual_batch_size': trial.suggest_categorical('virtual_batch_size', [128, 256, 512, 1024]),
        'lr': trial.suggest_categorical('lr', [0.005, 0.01, 0.02, 0.025]),
        'gamma_decay': trial.suggest_categorical('gamma_decay', [0.4, 0.8, 0.9, 0.95]),
        'mask_type': trial.suggest_categorical('mask_type', ['entmax', 'sparsemax']),
        'batch_momentum': trial.suggest_categorical('batch_momentum', [0.6, 0.7, 0.8, 0.9, 0.95, 0.98]),
    }
    MAX_EPOCHS = 35
    PATIENCE = 5
    model = TabNetClassifier(
        n_d=hyperparams['n_a_d'],
        n_a=hyperparams['n_a_d'],
        gamma=hyperparams['gamma'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr':hyperparams['lr']},
        scheduler_params={"step_size":hyperparams['n_steps'],
                          "gamma":hyperparams['gamma_decay']},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type=hyperparams['mask_type'],
        lambda_sparse=hyperparams['lambda'],
        momentum=hyperparams['batch_momentum'],
        verbose = 0
    )
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_name=['train', 'val'],
        max_epochs=MAX_EPOCHS, 
        patience=PATIENCE,
        batch_size=hyperparams['batch_size'],
        virtual_batch_size=hyperparams['virtual_batch_size'],
        num_workers=0,
        drop_last=False
    )

    val_preds = model.predict(X_val)
    
    del model

    return accuracy_score(y_val,val_preds)

study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(multivariate=True, seed=123))

study.optimize(objective, 
               timeout=60*60*6, 
               n_trials=15, 
               gc_after_trial=False)
'''

In [None]:
#best_params = {'n_a_d': 32, 'n_steps': 7, 'gamma': 1.0, 'lambda': 0.001, 'batch_size': 4096, 'virtual_batch_size': 256, 'lr': 0.025, 'gamma_decay': 0.4, 'mask_type': 'entmax', 'batch_momentum': 0.8}

<h3><span style="font-family:cursive;">Uncomment the cells above to run Optuna for hyperparameter tuning. The model below is configured with the best parameters obtained from that tuning.</span></h3>

In [None]:
# TabNet configuration; the values match the `best_params` recorded above.
tabnet_params = dict(
    n_d=32,
    n_a=32,
    gamma=1.0,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 0.025},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 7, "gamma": 0.4},
    mask_type='entmax',
    lambda_sparse=0.001,
    momentum=0.8,
    verbose=0,
)
clf = TabNetClassifier(**tabnet_params)

In [None]:
# Reuse a previously saved model when one can be loaded; otherwise train the
# configured classifier on the full training data and save it for next time.
MODEL_PATH = "tabnetclf"
X_test = np.array(df_test)

try:
    # NOTE: torch.load unpickles arbitrary Python objects - only load a file
    # that this notebook itself produced.
    clf_ = torch.load(MODEL_PATH)
except Exception as err:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and report why the cached model was not used.
    print("Could not load a saved model ({!r}); training from scratch.".format(err))
    max_epochs=30
    clf.fit(
        X_train=df_main, y_train=y,
        max_epochs=max_epochs,
        batch_size=4096, virtual_batch_size=256,
        eval_metric = ['accuracy']
    )
    torch.save(clf, MODEL_PATH)
    clf_ = clf

# Single prediction path for both the loaded and the freshly trained model.
y_pred = clf_.predict(X_test)

# Create Solution File

In [None]:
# Add the predicted class for every test row next to its Id.
Id_test["Cover_Type"] = y_pred

In [None]:
# Write the Id / Cover_Type pairs to the submission file, without the index column.
Id_test.to_csv("submission.csv", index=False)