**Importing all necessary library**

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import cohen_kappa_score
from imblearn.over_sampling import SMOTE
from collections import Counter

- Here we used a mammography data, which have 6 different input feature and one target. In the target feature it has two classes, '-1' which stand for the majority class and '1' which stand for the minority class, hence the data is imbalanced. The type of the target data is string, where the type of feature variable is integer, so for that reason we have to preprocess the our data.
#### Preprocessing:

1.   First we will split the file into lines and then right strip for every line.
2.   Then we store the feature and target values  separately into some list.

In [4]:
with open('mammography_data.txt') as f:
  lines = [line.rstrip('\n')for line in f]

In [5]:
data_list=[]
for i in range(len(lines)):
    l= lines[i].split(',')
    a=[float(i) for i in l[:-1]]
    data_list.append(a)
data= np.array(data_list)

In [6]:
data.shape

(11183, 6)

In [7]:
labels_string=[]
for i in range(len(lines)):
  labels_string.append(lines[i].split(',')[-1])

In [8]:
labels=[]
for i in range(len(labels_string)):
  if labels_string[i]==np.unique(labels_string)[0]:
    labels.append(-1)
  else:
    labels.append(1)
labels=np.array(labels)

### **K-fold :**
Here we break down our input feature and target class, into 'k' number of folds separately, such that each folds have (k-1) training data and there should be a single test data in each fold.
\
After that, we take the first fold (input feature,target class) from k-folds and we split that into traning data and test data, Then by looking the target unique values,we split the training data into majority which is for '-1' class and minority class which is for '1'.


In [10]:
def get_folds(data,labels,k):
  np.random.seed(42)
  shuffle=np.random.permutation(len(data))
  data=data[shuffle]
  labels=labels[shuffle]

  block_length=int(len(data)/k)
  data_folds=[]
  label_folds=[]
  for i in range(k):
    if i==(k-1):
      data_fold=data[i*block_length:]
      data_folds.append(data_fold)
      label_fold=labels[i*block_length:]
      label_folds.append(label_fold)
    else:
      data_fold=data[i*block_length:block_length*(i+1)]
      data_folds.append(data_fold)
      label_fold=labels[i*block_length:block_length*(i+1)]
      label_folds.append(label_fold)
  return np.array(data_folds,dtype=object),np.array(label_folds,dtype=object)

In [11]:
def divide_data_into_k_folds(data,labels,k):
  data_folds_array,label_folds_array=get_folds(data,labels,k)
  all_data_folds_list=[]
  all_labels_folds_list=[]
  for i in range(k):
    data_fold_k=[np.concatenate(data_folds_array[np.where(np.array(list(range(k)))!=i)[0]]), data_folds_array[i]]
    all_data_folds_list.append(data_fold_k)

    label_fold_k=[np.concatenate(label_folds_array[np.where(np.array(list(range(k)))!=i)[0]]), label_folds_array[i]]
    all_labels_folds_list.append(label_fold_k)

  return all_data_folds_list, all_labels_folds_list

In [12]:
all_data_folds_list, all_labels_folds_list=divide_data_into_k_folds(data,labels,3)

In [13]:
X_train,X_test=all_data_folds_list[0][0],all_data_folds_list[0][1]

In [14]:
X_train.shape

(7456, 6)

In [15]:
y_train,y_test=all_labels_folds_list[0][0],all_labels_folds_list[0][1]

In [16]:
y_train.shape

(7456,)

In [17]:
np.unique(y_train, return_counts=True)

(array([-1,  1]), array([7280,  176], dtype=int64))

In [18]:
X_train_minority=[]
for i in np.where(y_train==1):
    a=X_train[i]
    X_train_minority.append(a)

In [19]:
X_train_minority_class=np.array(X_train_minority[0])

In [20]:
X_train_majority=[]
for i in np.where(y_train==-1):
    a=X_train[i]
    X_train_majority.append(a)

In [21]:
X_train_majority_class=np.array(X_train_majority[0])

 ### **Kernel Density Function :**
At this point we designed our kernel function. We used multidimensional gaussian Kernel function which is expressed mathematically as:
$$
K(x) = {2\pi}^{-\frac{d}{2}} {det(H)}^{-\frac{1}{2}} e^{-\frac{1}{2} {x^T} {H^{-1}}x}
$$
where $d$ is a number of input feature and $H$ is an identitiy matrix of dimension $d$. We calculate the Kernel density at some point $x$ using the formula:
$$
f(x) = {\frac{1}{n}}{\sum_{i=1}^{n}K(x-x_{i})}
$$
Where $x_{i}$ denotes the $i$-th data point of the minority class which belongs to the training data which is used to calculate the density at point any point $x$. We can put each test data as an argument to this function to obtain densities at the test data points. Note that in the multi-dimensional case $x$ and $x_i$ are both vectors of dimension $d$.

In [23]:
def k(H,x,d):
  H_inv = np.linalg.inv(H)
  x_tras=np.matrix.transpose(x)
  h= np.linalg.det(H)
  m=np.matmul(x_tras,H_inv)
  m1=np.matmul(m,x)
  e=np.exp(-0.5*m1)
  n=2*(np.pi)**(-d/2)
  v=n*(h)**(-1/2)*(e)
  return v

In [24]:
type(X_train_majority_class)

numpy.ndarray

In [25]:
def density_gen(X, X_minority):
    density_list=[]
    for j in range(len(X)):
        s=0
        for i in range(len(X_minority)):
            xi=X_minority[i]
            dif=X[j]-xi
            s=s+k(np.identity(X_minority.shape[1]), dif, X_minority.shape[1])
        density_list.append(s/len(X_minority))
    return np.array(density_list)

In [26]:
unique_labels,counts=np.unique(y_train, return_counts=True)

### **Metropolis hastings** :
" Metropolis hasting is technique of generating random samples from the any probability distribution. we can use this for the oversampling of data ."
\
**Steps of metropolis hastings:**
1. First we take the initial guess $x_t$, which is random data points for the minority class.
2. Then we find the candidate value $ x_{t+1}$  of $x_t$, such that it's Normal Distributed from point $x_t$ with $\sigma^2$.
3. After this we check the acceptance probability $α$ which is a ratio of kernel density of candidate value $ x_{t+1}$ and Kernel density of initial guess which is $x_t$.
$α=\frac{f(x_{x+1})}{f(x_t)}$
4. So there are few condition about accepting the $α$ values:

*   We take uniform distribution in interval between $(0,1)$, if the Uniform value is smaller than the $α$ which means the Kernel density of candidate value is higher during the finding of $α$, so there is maximum likelihood of accepting the $α$, and it store the $α$ values into a some sample list, otherwise if uniform distribution value is greater than the  $α$ which means the kernel density of initial guess ($x_t$) is higher so,there is maximum likelihood of rejecting the $α$ hence the value wont store into the sample list it will remains the same.

In [28]:
def metropolis_hastings(target_density, X_minority, size=100):
    np.random.seed(43)
    burnin_size = 10000
    size += burnin_size
    x0 =X_minority[np.random.choice(list(range(len(X_train_minority_class))),1)]
    xt = x0
    samples = []
    for i in tqdm(range(size)):
        xt_candidate = np.array([np.random.multivariate_normal(xt[0], 0.2*np.eye(X_minority.shape[1]))])
        accept_odds = (target_density(xt_candidate,X_minority))/(target_density(xt,X_minority))
        if np.random.uniform(0, 1) < accept_odds:
            xt = xt_candidate
        samples.append(xt)
    samples = np.array(samples[burnin_size:])
    samples = np.reshape(samples, [samples.shape[0], X_minority.shape[1]])
    return samples

In [29]:
!pip install smote_variants



### **SMOTE and their Variants :**
Here we used top three different varients of the SMOTE:

1.   Polynome fit Smote: Polynome fit refers 4 different strategies by the topology parameter (star, mesh, bus,polynome) among this we used star topology is that it generates the point along the line segmets between relative far samples of minority class, thus the minority class is more scattered.
2.   ProWSyn(Proximate weighted synthesis): It generate the new points by sampling the line segments between minority instances which having the similar distances to the majority instances.
3.   Lee : it is a similar approach to using KNN for noise filtering.


#### SOMOTE:
This is also a oversampling method, it oversamples the minority calss and make it equal to the majority class.
\
**Steps of generating a synthetic data:**


- First we find the $k$ nearest neighbour for the minority calss, then we
generate the data points in between this $k$ nearest data points using formula:\
$x_i'= x_i +λ(x_j-x_i)$
\
So here the $x_i'$ is the new generated point, where $λ$ is factor which range between $(0,1)$.


In [31]:
import smote_variants as sv

oversampler_SMOTE=sv.SMOTE()
oversampler_polynome_fit = sv.polynom_fit_SMOTE_star()
oversampler_ProWSyn= sv.ProWSyn()
oversampler_SMOTE_IPF= sv.SMOTE_IPF()
oversampler_Lee= sv.Lee()


In [32]:
oversampler_object_list = [oversampler_SMOTE,oversampler_polynome_fit,oversampler_ProWSyn, oversampler_Lee, metropolis_hastings]

In [33]:
oversampler_name_list = ['SMOTE','polynome_fit_SMOTE','ProWSyn', 'Lee', 'Metropolis_hastings']

In [34]:
def get_imb_oversampling(X_train,y_train,X_test,y_test,oversampler_index,classifier,visualize=True):
  ## MH is in the end of the list oversampler_object_list
  if oversampler_index!=len(oversampler_object_list)-1:
    X_samp, y_samp = oversampler_object_list[oversampler_index].sample(X_train, y_train)

  else:
    X_train_minority=[]
    for i in np.where(y_train==1):
      a=X_train[i]
      X_train_minority.append(a)

    X_train_majority=[]
    for i in np.where(y_train==-1):
      a=X_train[i]
      X_train_majority.append(a)
    X_train_minority_class=np.array(X_train_minority[0])
    X_train_majority_class=np.array(X_train_majority[0])

    unique_labels,counts=np.unique(y_train, return_counts=True)

    synthetic_minority_data = metropolis_hastings(density_gen, X_train_minority_class, size=counts[0]-counts[1])
    balanced_minority=np.concatenate([X_train_minority_class, synthetic_minority_data])
    balanced_data=np.concatenate([X_train_majority_class,balanced_minority])
    balanced_labels_for_classification=np.concatenate([np.zeros(len(X_train_majority_class))-1,np.ones(len(balanced_minority))])
    X_samp=balanced_data
    y_samp=balanced_labels_for_classification

  classifier.fit(X_samp,y_samp)
  y_pred=classifier.predict(X_test)
  cm=confusion_matrix(y_test, y_pred)
  acc=accuracy_score(y_test, y_pred)
  b_acc=balanced_accuracy_score(y_test, y_pred)
  gm=geometric_mean_score(y_test, y_pred)
  f1=f1_score(y_test, y_pred)
  ck=cohen_kappa_score(y_test, y_pred)

  if visualize==True:
    scaler = StandardScaler()
    scaler.fit(X_samp)
    scaled_X_samp = scaler.transform(X_samp)
    pca = PCA(n_components =2)
    X_samp_pca=pca.fit_transform(scaled_X_samp)

    plt.figure(figsize=(8,6))
    color=["r","b"]
    color_counter=0
    for i in [-1,1]:
      plt.scatter(X_samp_pca[:,0][np.where(y_samp==i)],X_samp_pca[:,1][np.where(y_samp==i)], s=10, c=color[color_counter])
      color_counter=color_counter+1
    plt.legend(['Majority','Manority'],fontsize=10, loc='lower right')
    plt.xlabel("Principal Component 1",fontsize=12)
    plt.ylabel("Principlal Component 2 ",fontsize=12)
    plt.title("PCA plot of majority class + minority class using"+""+ oversampler_name_list[oversampler_index]+"technique",fontsize=14)


  return [cm, acc,b_acc,gm, f1, ck]

In [35]:
def calc_avg_performance(performance_index,results_all_folds):
  ## performance index takes the index of the performance measure foe which we want to average over folds
  ## results_all_folds are the results of a particular oversampler run over all folds recording all performance measures
  s=0
  for result in results_all_folds:
    s=s+result[performance_index]
  avg=s/len(results_all_folds)

  s=0
  for result in results_all_folds:
    s=s+(result[performance_index]-avg)**2
  std=np.sqrt(s/len(results_all_folds))
  return avg, std

In [36]:
clf=KNeighborsClassifier(n_neighbors=5)

In [37]:
perfromance_measures=['accuracy','balanced accuracy','g-mean','f1-score','cohen_kappa']
def get_results_all_folds(oversampler_index,classifier, visualize=False, get_detailed_results=False):

  results_all_folds=[]
  for i in range(len(all_data_folds_list)):
    X_train,X_test=all_data_folds_list[i][0],all_data_folds_list[i][1]
    y_train,y_test=all_labels_folds_list[i][0],all_labels_folds_list[i][1]
    results_single_fold=get_imb_oversampling(X_train,y_train,X_test,y_test,oversampler_index,clf,visualize=visualize)
    results_all_folds.append(results_single_fold)

  performance_summary_oversampler_classifer={}
  for performance_index in range(1,6):
    avg_std=calc_avg_performance(performance_index,results_all_folds)
    performance_summary_oversampler_classifer[perfromance_measures[performance_index-1]]=avg_std

  if get_detailed_results==True:
    return results_all_folds, performance_summary_oversampler_classifer
  else:
    return performance_summary_oversampler_classifer

In [38]:
get_results_all_folds(0,clf)

2024-08-20 14:46:09,802:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:46:09,806:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:46:09,808:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2024-08-20 14:46:09,809:INFO:SMOTE: simplex sampling with n_dim 2
2024-08-20 14:46:09,978:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:46:09,979:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:46:09,980:INFO:NearestNeighborsW

{'accuracy': (0.9574351184628734, 0.0014663471702170215),
 'balanced accuracy': (0.8942360139816947, 0.010394482688024155),
 'g-mean': (0.89169441205919, 0.011208422354584316),
 'f1-score': (0.47393935016116173, 0.02983274134227196),
 'cohen_kappa': (0.4560043722541991, 0.029679972944954943)}

In [39]:
summary_performace_for_all_oversampling_algorithms_all_folds=[]
for i in range(len(oversampler_object_list)):
  summary_performace_for_all_oversampling_algorithms_all_folds.append(get_results_all_folds(i,clf))

2024-08-20 14:46:10,338:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:46:10,340:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:46:10,341:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2024-08-20 14:46:10,344:INFO:SMOTE: simplex sampling with n_dim 2
2024-08-20 14:46:10,582:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:46:10,582:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:46:10,584:INFO:NearestNeighborsW

  0%|          | 0/17104 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17130 [00:00<?, ?it/s]

In [40]:
summary_performace_for_all_oversampling_algorithms_all_folds_list=np.array([[[round(i,3) for i in summary_performace_for_all_oversampling_algorithms_all_folds[oversampler_index][key]] for key in summary_performace_for_all_oversampling_algorithms_all_folds[0].keys()] for oversampler_index in range(len(oversampler_name_list))])

In [41]:
oversamplers_result_summary_list=[]
for i in range(summary_performace_for_all_oversampling_algorithms_all_folds_list.shape[0]):
  performance_results_summary_list=[]
  for j in range(summary_performace_for_all_oversampling_algorithms_all_folds_list.shape[1]):
    performance_results_summary_list.append(str(summary_performace_for_all_oversampling_algorithms_all_folds_list[i][j][0])+"\u00B1"+str(summary_performace_for_all_oversampling_algorithms_all_folds_list[i][j][1]))
  oversamplers_result_summary_list.append(performance_results_summary_list)
oversamplers_result_summary_list=np.array(oversamplers_result_summary_list)

In [42]:
performance_results_summary_list

['0.98±0.002', '0.802±0.032', '0.778±0.04', '0.588±0.035', '0.578±0.036']

In [43]:
oversamplers_result_summary_list

array([['0.957±0.001', '0.892±0.012', '0.889±0.013', '0.473±0.031',
        '0.455±0.031'],
       ['0.97±0.002', '0.877±0.028', '0.871±0.031', '0.544±0.021',
        '0.529±0.021'],
       ['0.958±0.002', '0.898±0.019', '0.896±0.02', '0.478±0.019',
        '0.46±0.018'],
       ['0.981±0.002', '0.863±0.021', '0.854±0.024', '0.648±0.037',
        '0.639±0.038'],
       ['0.98±0.002', '0.802±0.032', '0.778±0.04', '0.588±0.035',
        '0.578±0.036']], dtype='<U11')

In [44]:
df=pd.DataFrame(oversamplers_result_summary_list,columns=perfromance_measures,index=oversampler_name_list)

In [45]:
df

Unnamed: 0,accuracy,balanced accuracy,g-mean,f1-score,cohen_kappa
SMOTE,0.957±0.001,0.892±0.012,0.889±0.013,0.473±0.031,0.455±0.031
polynome_fit_SMOTE,0.97±0.002,0.877±0.028,0.871±0.031,0.544±0.021,0.529±0.021
ProWSyn,0.958±0.002,0.898±0.019,0.896±0.02,0.478±0.019,0.46±0.018
Lee,0.981±0.002,0.863±0.021,0.854±0.024,0.648±0.037,0.639±0.038
Metropolis_hastings,0.98±0.002,0.802±0.032,0.778±0.04,0.588±0.035,0.578±0.036


In [46]:
with pd.ExcelWriter('KNN_clf_result_for_data1.xlsx') as excel_writer:
  df.to_excel(excel_writer,sheet_name='sheet_1')