In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imblearn as debias # https://imbalanced-learn.org/stable/
from collections import Counter


In [7]:
def XYsplit(df: pd.DataFrame) -> list:
  """Split a DataFrame into its feature columns and its "diagnosis" column.

  Args:
    df: Frame that contains a "diagnosis" column. It is not modified.

  Returns:
    A two-element list ``[X, Y]``: ``X`` is a new DataFrame holding every
    column of ``df`` except "diagnosis", and ``Y`` is the "diagnosis" column
    as a Series. Callers can unpack it directly: ``X, Y = XYsplit(df)``.

  Raises:
    KeyError: If ``df`` has no "diagnosis" column.
  """
  # Look the target up first so a missing column fails before any other work is done.
  Y = df["diagnosis"]
  X = df.drop(columns = ["diagnosis"])
  return [X, Y]

def XYmerge(X: pd.DataFrame, Y: pd.Series) -> pd.DataFrame:
  """Combine a feature frame and a target into one frame with a "diagnosis" column.

  Args:
    X: Feature columns. It is left untouched.
    Y: Target values to store under the "diagnosis" column; pandas aligns a
      Series to ``X`` by index when assigning it.

  Returns:
    A new DataFrame with the columns of ``X`` plus "diagnosis" (an existing
    "diagnosis" column in ``X`` is overwritten in the returned frame only).
  """
  # assign() works on a copy, so the caller's X does not silently gain the
  # label column (which would leak the target into later fit/resample calls).
  return X.assign(diagnosis = Y)

In [8]:
# RANDOM SAMPLING
def rndSample(X, Y, mode = "under") -> list:
  """Resample (X, Y) with imbalanced-learn's random under- or over-sampler.

  Args:
    X: Feature data, passed unchanged to the sampler's ``fit_resample``.
    Y: Target labels, passed unchanged to the sampler's ``fit_resample``.
    mode: ``"under"`` to use ``RandomUnderSampler`` (the default) or
      ``"over"`` to use ``RandomOverSampler``.

  Returns:
    A two-element list ``[Xresampled, Yresampled]`` as returned by
    ``fit_resample``.

  Raises:
    ValueError: If ``mode`` is neither ``"under"`` nor ``"over"``.
  """
  # Both samplers are seeded with a fixed random_state so repeated runs give the same result.
  if mode == "under":
    sampler = debias.under_sampling.RandomUnderSampler(random_state = 0)
  elif mode == "over":
    sampler = debias.over_sampling.RandomOverSampler(random_state = 0)
  else:
    # Fail loudly here instead of crashing later on `None.fit_resample`.
    raise ValueError(f'mode must be "under" or "over", got {mode!r}')
  Xresampled, Yresampled = sampler.fit_resample(X, Y)
  return [Xresampled, Yresampled]

In [9]:
# debias.over_sampling.SMOTE/ADASYN
# Synthetic Minority Over-sampling TEchnique (SMOTE)

def smoteSample(X, Y, random_state = 0) -> list:
  """Over-sample (X, Y) with imbalanced-learn's SMOTE.

  Args:
    X: Feature data, passed unchanged to ``SMOTE.fit_resample``.
    Y: Target labels, passed unchanged to ``SMOTE.fit_resample``.
    random_state: Seed forwarded to ``SMOTE``. Defaults to 0 so that repeated
      runs produce the same synthetic samples; pass ``None`` for the
      library's unseeded behaviour.

  Returns:
    A two-element list ``[newX, newY]`` as returned by ``fit_resample``.
  """
  sampler = debias.over_sampling.SMOTE(random_state = random_state)
  newX, newY = sampler.fit_resample(X, Y)
  return [newX, newY]

# ADAptive SYNthetic Technique (ADASYN)
def adasynSample(X, Y, random_state = 0) -> list:
  """Over-sample (X, Y) with imbalanced-learn's ADASYN.

  Args:
    X: Feature data, passed unchanged to ``ADASYN.fit_resample``.
    Y: Target labels, passed unchanged to ``ADASYN.fit_resample``.
    random_state: Seed forwarded to ``ADASYN``. Defaults to 0 so that repeated
      runs produce the same synthetic samples; pass ``None`` for the
      library's unseeded behaviour.

  Returns:
    A two-element list ``[newX, newY]`` as returned by ``fit_resample``.
  """
  sampler = debias.over_sampling.ADASYN(random_state = random_state)
  newX, newY = sampler.fit_resample(X, Y)
  return [newX, newY]

In [10]:
# Example driver (disabled).
# Everything below is wrapped in a bare triple-quoted string, so none of it runs:
# the cell only evaluates to that string. The sketch loads a frame through
# `dataPreProcessing` (imported from the `preprocessing` notebook via `ipynb.fs.full`),
# applies each sampler defined in this notebook, and prints the resulting
# "diagnosis" class counts.
# NOTE(review): before re-enabling, remove the stray '"' that follows the final
# `cmpSamplingTechniques()` call, and consider renaming the locals `vars` and `set`,
# which shadow Python builtins.
"""
from ipynb.fs.full.preprocessing import dataPreProcessing

def cmpSamplingTechniques(dataFile = "data"):
  vars = dataPreProcessing(dsFile = dataFile, processes = ["clean", "predMap"])
  df = vars["df"]
  X, Y = XYsplit(df)

  debiasedSets = [
    {"type": "original", "set": {"X": X, "Y": Y, "merged": df}},
    {"type": "random over", "set": {"X": None, "Y": None, "merged": None}},
    {"type": "random under", "set": {"X": None, "Y": None, "merged": None}},
    {"type": "smote", "set": {"X": None, "Y": None, "merged": None}},
    {"type": "adasyn", "set": {"X": None, "Y": None, "merged": None}}
  ]

  debiasedSets[1]["set"]["X"], debiasedSets[1]["set"]["Y"] = rndSample(X, Y, mode = "over") # Random Over-Sampling
  debiasedSets[2]["set"]["X"], debiasedSets[2]["set"]["Y"] = rndSample(X, Y, mode = "under") # Random Under-Sampling
  debiasedSets[3]["set"]["X"], debiasedSets[3]["set"]["Y"] = smoteSample(X, Y) # SMOTE
  debiasedSets[4]["set"]["X"], debiasedSets[4]["set"]["Y"] = adasynSample(X, Y) # ADASYN

  print(type(Y))
  # Ensure the sampling worked:
  for set in  debiasedSets:
    print("Diagnosis distribution", set["type"], ":", sorted(Counter(set["set"]["Y"]).items()))
    
  smoteX, smoteY = debiasedSets[3]["set"]["X"], debiasedSets[3]["set"]["Y"]
  df = XYmerge(smoteX, smoteY)
  print(df)
cmpSamplingTechniques()"
"""

'\nfrom ipynb.fs.full.preprocessing import dataPreProcessing\n\ndef cmpSamplingTechniques(dataFile = "data"):\n  vars = dataPreProcessing(dsFile = dataFile, processes = ["clean", "predMap"])\n  df = vars["df"]\n  X, Y = XYsplit(df)\n\n  debiasedSets = [\n    {"type": "original", "set": {"X": X, "Y": Y, "merged": df}},\n    {"type": "random over", "set": {"X": None, "Y": None, "merged": None}},\n    {"type": "random under", "set": {"X": None, "Y": None, "merged": None}},\n    {"type": "smote", "set": {"X": None, "Y": None, "merged": None}},\n    {"type": "adasyn", "set": {"X": None, "Y": None, "merged": None}}\n  ]\n\n  debiasedSets[1]["set"]["X"], debiasedSets[1]["set"]["Y"] = rndSample(X, Y, mode = "over") # Random Over-Sampling\n  debiasedSets[2]["set"]["X"], debiasedSets[2]["set"]["Y"] = rndSample(X, Y, mode = "under") # Random Under-Sampling\n  debiasedSets[3]["set"]["X"], debiasedSets[3]["set"]["Y"] = smoteSample(X, Y) # SMOTE\n  debiasedSets[4]["set"]["X"], debiasedSets[4]["set"]