<a href="https://colab.research.google.com/github/TharindaDilshan/somo_example/blob/main/SOMO_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
!pip install -U imbalanced-learn
!pip install -U cluster-over-sampling
!pip install som-learn

In [2]:
from clover.over_sampling import SOMO
import math
import numpy as np
import pandas as pd
from random import *
from math import isnan

from sklearn.metrics import f1_score 
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import normalize
from imblearn.datasets import fetch_datasets 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

## Custom Cross Validation

In [3]:
def custom_cross_val_score(params, X, y, n_splits=5, random_state=0, scoring=roc_auc_score):
  skf = StratifiedKFold(n_splits=n_splits)
  scores_roc = []
  scores_f1_micro = []
  scores_f1_macro = []
  scores_f1_weighted = []

  for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    somo = SOMO(random_state=42, k_neighbors=params['neigh'], distribution_ratio=params['dist'], raise_error=False)
    X_res, y_res = somo.fit_resample(X_train, y_train)

    clf = LogisticRegression(random_state=9).fit(X_res, y_res)
    pred = clf.predict(X_test)

    scores_roc.append(roc_auc_score(y_test, pred))
    scores_f1_micro.append(f1_score(y_test, pred, average='micro'))
    scores_f1_macro.append(f1_score(y_test, pred, average='macro'))
    scores_f1_weighted.append(f1_score(y_test, pred, average='weighted'))

  return np.mean(scores_roc), np.mean(scores_f1_micro), np.mean(scores_f1_macro), np.mean(scores_f1_weighted)

## Abalone

In [9]:
abalone = fetch_datasets()["abalone"]

X, y = abalone.data, abalone.target

X = normalize(X)
y = [0 if i == -1 else i for i in y]
y = np.array(y)

neighs = [2, 3, 4, 5]
dists = np.linspace(0.1, 1, 10)

best_params = {}
best_auc_score = 0.0
best_f1_micro = 0.0
best_f1_macro = 0.0
best_f1_weighted = 0.0

for neigh in neighs:
  for dist in dists:
    params = {'neigh': neigh, 'dist': dist}
    auc_score, f1_micro, f1_macro, f1_weighted = custom_cross_val_score(params, X, y)
    if auc_score > best_auc_score:
      best_auc_score = auc_score
      best_params = params
      best_f1_micro = f1_micro
      best_f1_macro = f1_macro
      best_f1_weighted = f1_weighted
  
print("\n\nBest ROC AUC: ", best_auc_score)
print("F1 micro: ", best_f1_micro)
print("F1 macro: ", best_f1_macro)
print("F1 weighted: ", best_f1_weighted)
print("Best params: ", best_params)



Best ROC AUC:  0.5
F1 micro:  0.9063922871959429
F1 macro:  0.4754489609903329
F1 weighted:  0.8618866524112201
Best params:  {'neigh': 2, 'dist': 0.1}


## Mammography

In [7]:
mammography = fetch_datasets()["mammography"]

X, y = mammography.data, mammography.target

X = normalize(X)
y = [0 if i == -1 else i for i in y]
y = np.array(y)

neighs = [2, 3, 4, 5]
dists = np.linspace(0.1, 1, 10)

best_params = {}
best_auc_score = 0.0
best_f1_micro = 0.0
best_f1_macro = 0.0
best_f1_weighted = 0.0

for neigh in neighs:
  for dist in dists:
    params = {'neigh': neigh, 'dist': dist}
    auc_score, f1_micro, f1_macro, f1_weighted = custom_cross_val_score(params, X, y)
    if auc_score > best_auc_score:
      best_auc_score = auc_score
      best_params = params
      best_f1_micro = f1_micro
      best_f1_macro = f1_macro
      best_f1_weighted = f1_weighted
  
print("\n\nBest ROC AUC: ", best_auc_score)
print("F1 micro: ", best_f1_micro)
print("F1 macro: ", best_f1_macro)
print("F1 weighted: ", best_f1_weighted)
print("Best params: ", best_params)



Best ROC AUC:  0.5018773103326879
F1 micro:  0.9767504636208569
F1 macro:  0.49789230627907416
F1 weighted:  0.9654368559771893
Best params:  {'neigh': 2, 'dist': 0.1}
