# Banking77 Experiments with LGBM and UMAP

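Compares 5-fold cross-validated LightGBM classification on the original and trimmed versions of the Banking77 dataset, using the sentence-embedding features directly and after UMAP dimensionality reduction.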

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, f1_score, classification_report

In [2]:
from sklearn.metrics import accuracy_score,zero_one_loss,balanced_accuracy_score, brier_score_loss,precision_score, recall_score, f1_score, mean_squared_error, classification_report

In [3]:
# Notebook display settings: render matplotlib figures inline in the cell output.
%matplotlib inline
np.set_printoptions(precision=5, suppress=True)  # print floats with 5 decimals, no scientific notation

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
def run_experiment(X, y, n_splits=5, random_state=77):
    """Cross-validate an LGBMClassifier and collect per-fold scores.

    The data is split with a shuffled ``StratifiedKFold``. For every fold a
    fresh ``LGBMClassifier`` is fitted on the training part and scored on the
    validation part.

    Parameters
    ----------
    X : indexable of shape (n_samples, n_features)
        Feature matrix. Rows are selected with integer index arrays
        (``X[train_ix]``), so a NumPy array is expected.
    y : indexable of shape (n_samples,)
        Class labels, selected the same way as ``X``.
    n_splits : int, default=5
        Number of stratified folds.
    random_state : int or None, default=77
        Seed for the fold shuffle, so repeated runs score the same folds.
        Pass ``None`` for a different shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``split`` (1-based fold number),
        ``accuracy_score``, ``balanced_accuracy_score``, ``f1_score_macro``
        and ``f1_score_weighted``.
    """
    # The model configuration is identical for every fold, so build it once.
    params = {
        "learning_rate": 0.1,
        "max_depth": 4,
        "num_leaves": 15,
        "n_estimators": 1000,
        "n_jobs": 5,
        "verbosity": -1,
        "seed": 77,
    }

    # Seeding the shuffle makes the folds (and therefore the scores)
    # comparable across runs and across the different experiments.
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    res = []
    for split, (train_ix, val_ix) in enumerate(kfold.split(X, y), start=1):
        print("split {}".format(split))

        X_train, X_val = X[train_ix], X[val_ix]
        y_train, y_val = y[train_ix], y[val_ix]

        estimator = LGBMClassifier(**params).fit(X_train, y_train)

        # Only hard predictions are scored; class probabilities are not
        # needed by any of the metrics below, so they are not computed.
        y_val_pred = estimator.predict(X_val)

        res.append({
            'split': split,
            'accuracy_score': accuracy_score(y_val, y_val_pred),
            'balanced_accuracy_score': balanced_accuracy_score(y_val, y_val_pred),
            'f1_score_macro': f1_score(y_val, y_val_pred, average='macro'),
            'f1_score_weighted': f1_score(y_val, y_val_pred, average='weighted'),
        })

    return pd.DataFrame(res)

# Original Dataset

In [5]:
# Read the embedding CSV; it holds the feature columns plus a 'category' label column.
df = pd.read_csv('/global/project/hpcg1614_shared/ca/data/banking77/clean_embed_all-mpnet-base-v2.csv')

# Targets: map the category values to integer class ids.
y_cat = df['category'].to_numpy()
label_transformer = LabelEncoder().fit(y_cat)
y = label_transformer.transform(y_cat)

# Features: every column except the label.
X = df.drop(columns=['category']).to_numpy()

In [6]:
# Sanity check: (n_samples, n_features) of the feature matrix.
X.shape

(10003, 768)

In [7]:
# Experiment 1: cross-validate on the full feature matrix and labels.
res1 = run_experiment(X, y)

split 1
split 2
split 3
split 4
split 5


In [8]:
# Display the per-fold metrics stored in res1.
res1

Unnamed: 0,split,accuracy_score,balanced_accuracy_score,f1_score_macro,f1_score_weighted
0,1,0.882059,0.878338,0.878739,0.881876
1,2,0.887056,0.886343,0.885905,0.8869
2,3,0.896552,0.891017,0.891703,0.896229
3,4,0.892,0.886242,0.887559,0.891571
4,5,0.879,0.875199,0.874678,0.878324


# Trimmed Dataset

In [9]:
# Load the trimmed dataset description and collect its 'id' column.
# NOTE(review): the ids are used below as positional row indices into X / y,
# which presumably means 'id' is the 0-based row number in the original CSV — confirm.
df_trimmed = pd.read_csv('Banking77_trimmed_updatedLabels_load.csv')
trimmed_ids = list(df_trimmed['id'].to_numpy())

In [10]:
# Select the rows of the original features and labels that belong to the trimmed dataset.
# NOTE(review): labels are taken from the original y; the trimmed file name mentions
# "updatedLabels" — confirm whether its own label column should be used instead.
X_trimmed = X[trimmed_ids]
y_trimmed = y[trimmed_ids]
X_trimmed.shape

(8575, 768)

In [None]:
# Experiment 2: cross-validate on the trimmed subset.
res2 = run_experiment(X_trimmed, y_trimmed)

split 1
split 2


In [None]:
# Display the per-fold metrics stored in res2.
res2

# Original Dataset + UMAP

In [None]:
from umap import UMAP


# UMAP configuration: project the features down to 30 dimensions.
# random_state pins the otherwise stochastic embedding so that re-running the
# notebook reproduces X_dims (per the umap-learn docs this may disable
# parallelism and run slower).
u_params = {
        'n_components': 30,
        'n_neighbors': 20,
        'min_dist': 0.1,
        'metric': "euclidean",
        'random_state': 77,
}

# NOTE(review): the reducer is fitted on all rows, including those that later
# serve as validation folds inside run_experiment. UMAP is unsupervised here,
# but consider fitting it per fold if a strict evaluation is required.
dim_reducer = UMAP(**u_params)

# fit_transform fits the reducer in place and returns the training embedding
# directly, avoiding a second pass over X through transform().
X_dims = dim_reducer.fit_transform(X)

In [None]:
# Experiment 3: cross-validate on the UMAP-reduced features (X_dims), not the
# raw X — passing X here would only repeat experiment 1.
res3 = run_experiment(X_dims, y)

In [None]:
# Display the per-fold metrics stored in res3.
res3

# Trimmed Dataset + UMAP

In [None]:
# Select the trimmed rows from the UMAP-reduced features.
X_dims_trimmed = X_dims[trimmed_ids]
# UMAP only transforms the features, so the labels still come from y
# (there is no separate y_dims array).
y_dims_trimmed = y[trimmed_ids]

In [None]:
# Experiment 4: cross-validate on X_dims_trimmed / y_dims_trimmed.
res4 = run_experiment(X_dims_trimmed, y_dims_trimmed)

In [None]:
# Display the per-fold metrics stored in res4.
res4