<a href="https://colab.research.google.com/github/TharindaDilshan/Artificial_Neural_Networks_With_Tensorflow/blob/master/Evaluation/3.%20SMOTE/UCI/LR/SMOTE_UCI_Dataset_Evaluation(LR).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [None]:
!pip install -U cluster-over-sampling
!pip install som-learn

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from imblearn.over_sampling import SMOTE
from imblearn.datasets import fetch_datasets
from imblearn.pipeline import make_pipeline
from clover.over_sampling import SOMO, KMeansSMOTE

from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_auc_score

### Grid Search

In [None]:
DATASETS = fetch_datasets(filter_data=['abalone', 'mammography'])
N_RND_SEEDS= 2 ** 32
ESTIMATORS = {
	'smote': make_pipeline(StandardScaler(), SMOTE(), LogisticRegression()),
	'somo':  make_pipeline(StandardScaler(), SOMO(), LogisticRegression()),
  'kmeans_smote':  make_pipeline(StandardScaler(), KMeansSMOTE(), LogisticRegression())
}
PARAM_GRIDS = {
	'smote': {'smote__k_neighbors': [2, 3, 4, 5]},
	'somo': {'somo__k_neighbors': [2, 3, 4, 5], 'somo__distribution_ratio': np.linspace(0.1, 1, 10)},
        'kmeans_smote': {'kmeanssmote__k_neighbors': [2, 3, 4, 5]}
}
	

def fit_grid_search_cv(estimator_name, dataset_name, n_splits, seed):
    # X, y = X, y
    rnd_params = [param for param in ESTIMATORS[estimator_name].get_params() if param.endswith('random_state')]
    rnd_seeds = check_random_state(seed).randint(N_RND_SEEDS, size=len(rnd_params))
    gscv = GridSearchCV(
        estimator=ESTIMATORS[estimator_name].set_params(**dict(zip(rnd_params, rnd_seeds))),
        param_grid=PARAM_GRIDS[estimator_name],
        scoring=['roc_auc', 'f1', 'f1_micro', 'f1_macro'],
        cv=StratifiedKFold(n_splits=n_splits, shuffle=True),
        refit=False
    )  
    return gscv.fit(X, y)

gs = fit_grid_search_cv('smote', 'mammography', 5, 2)

np.mean(gs.cv_results_['mean_test_roc_auc'])

### Parameter tuning using Cross Validation

#### Custom Cross Validation

In [None]:
def custom_cross_val_score(params, X, y, n_splits=5, random_state=0, scoring=roc_auc_score):
  skf = StratifiedKFold(n_splits=n_splits)
  scores_roc = []
  scores_f1_micro = []
  scores_f1_macro = []
  scores_f1_weighted = []

  for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    sm = SMOTE(random_state=42, k_neighbors=params['neighbors'])
    X_res, y_res = sm.fit_resample(X_train, y_train)

    clf = LogisticRegression(random_state=9).fit(X_res, y_res)
    pred = clf.predict(X_test)

    scores_roc.append(roc_auc_score(y_test, pred))
    scores_f1_micro.append(f1_score(y_test, pred, average='micro'))
    scores_f1_macro.append(f1_score(y_test, pred, average='macro'))
    scores_f1_weighted.append(f1_score(y_test, pred, average='weighted'))

  return np.mean(scores_roc), np.mean(scores_f1_micro), np.mean(scores_f1_macro), np.mean(scores_f1_weighted)
     

#### Parameter Tuning

In [None]:
def hyper_param_tuner(X, y):
  neighbors = [2, 3, 4, 5]

  best_params = {}
  best_auc_score = 0.0
  best_f1_micro = 0.0
  best_f1_macro = 0.0
  best_f1_weighted = 0.0

  for neighbor in neighbors:
    params = {'neighbors': neighbor}
    auc_score, f1_micro, f1_macro, f1_weighted = custom_cross_val_score(params, X, y)
    if auc_score > best_auc_score:
      best_auc_score = auc_score
      best_params = params
      best_f1_micro = f1_micro
      best_f1_macro = f1_macro
      best_f1_weighted = f1_weighted
  
  print("\n\nBest ROC AUC: ", best_auc_score)
  print("F1 micro: ", best_f1_micro)
  print("F1 macro: ", best_f1_macro)
  print("F1 weighted: ", best_f1_weighted)
  print("Best params: ", best_params)


### Data

#### Ecoli

In [None]:
ecoli = pd.read_fwf('ecoli.data', header=None)

ecoli.drop(ecoli.columns[0], axis=1, inplace=True)
ecoli = ecoli.T.reset_index(drop=True).T
ecoli.loc[(ecoli[7] == 'pp'), ecoli.columns[7]] = 1
ecoli.loc[(ecoli[7] != 1), ecoli.columns[7]] = 0

X = ecoli.iloc[:,0:7]
y = ecoli.iloc[:,7] 
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.8862793347003874
F1 micro:  0.8749341527655838
F1 macro:  0.8223493889230215
F1 weighted:  0.8871104907626478
Best params:  {'neighbors': 4}


#### Haberman

In [None]:
haberman = pd.read_csv('haberman.data', header=None)

X = haberman.iloc[:,0:3]
y = haberman.iloc[:,3]
z = (y == 2)
y = z.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.6630228758169935
F1 micro:  0.6993654151242729
F1 macro:  0.6324015114336807
F1 weighted:  0.7008816075831039
Best params:  {'neighbors': 5}


#### Iris

In [None]:
iris = pd.read_csv('iris.data', header=None)

iris.loc[(iris[4] == 'Iris-versicolor'), iris.columns[4]] = 1
iris.loc[(iris[4] != 1), iris.columns[4]] = 0

X = iris.iloc[:,0:4]
y = iris.iloc[:,4]
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.74
F1 micro:  0.66
F1 macro:  0.6256311510483525
F1 weighted:  0.5932391955498502
Best params:  {'neighbors': 2}


#### Libra

In [None]:
libra = pd.read_csv('movement_libras.data', header=None)

libra.loc[(libra[90] == 2), libra.columns[90]] = 1
libra.loc[(libra[90] == 3), libra.columns[90]] = 1
libra.loc[(libra[90] != 1), libra.columns[90]] = 0

X = libra.iloc[:,0:90]
y = libra.iloc[:,90]
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.4643652234033359
F1 micro:  0.4972222222222221
F1 macro:  0.451015150195533
F1 weighted:  0.5110258339474625
Best params:  {'neighbors': 3}


#### Liver

In [None]:
liver = pd.read_csv('liver.data', header=None)
liver.loc[(liver[6] != 1), liver.columns[6]] = 0

X = liver.iloc[:,0:6]
y = liver.iloc[:,6]
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.6411206896551723
F1 micro:  0.6289855072463768
F1 macro:  0.6249327339132135
F1 weighted:  0.6266711580506942
Best params:  {'neighbors': 5}


#### Breast

In [None]:
breast = pd.read_csv('BreastTissue.csv')

breast.drop(breast.columns[0], axis=1, inplace=True)
breast.loc[(breast.Class == 'car'), 'Class'] = 1
breast.loc[(breast.Class == 'fad'), 'Class'] = 1
breast.loc[(breast.Class != 1), 'Class'] = 0

X = breast.iloc[:,1:10]
y = breast.iloc[:,0]
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.5714285714285714
F1 micro:  0.5536796536796538
F1 macro:  0.5472630545147708
F1 weighted:  0.56043463596324
Best params:  {'neighbors': 2}


#### Glass

In [None]:
glass = pd.read_csv('glass.data', header=None)

glass.drop(glass.columns[0], axis=1, inplace=True)
glass = glass.T.reset_index(drop=True).T
glass.loc[(glass[9] != 1), glass.columns[9]] = 0

X = glass.iloc[:,0:9]
y = glass.iloc[:,9]
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.6849753694581281
F1 micro:  0.5951273532668881
F1 macro:  0.566042932987066
F1 weighted:  0.5414986293659637
Best params:  {'neighbors': 3}


#### Heart

In [None]:
heart = pd.read_csv('heart.dat', sep='\s', header=None)

heart.loc[(heart[13] != 2), heart.columns[13]] = 0
heart.loc[(heart[13] == 2), heart.columns[13]] = 1

X = heart.iloc[:,0:13]
y = heart.iloc[:,13]
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.6783333333333333
F1 micro:  0.6777777777777778
F1 macro:  0.6754620892028131
F1 weighted:  0.6781072273047922
Best params:  {'neighbors': 2}


  """Entry point for launching an IPython kernel.


#### Wine

In [None]:
wine = pd.read_csv('wine.data', header=None)

wine.loc[(wine[0] != 2), wine.columns[0]] = 0
wine.loc[(wine[0] == 2), wine.columns[0]] = 1

X = wine.iloc[:,1:14]
X = X.T.reset_index(drop=True).T
y = wine.iloc[:,0]
y = y.astype(int)
y = y.to_numpy()
X = normalize(X)

try:
  hyper_param_tuner(X, y)
except Exception as e:
  print(str(e))



Best ROC AUC:  0.7152380952380952
F1 micro:  0.7236507936507935
F1 macro:  0.7052106431866494
F1 weighted:  0.7134748940811891
Best params:  {'neighbors': 2}
