In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import cohen_kappa_score
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
# Read the raw dataset file into a list of text rows, one per line,
# with the trailing newline removed from each row.
with open('mammography_data.txt') as f:
  lines = [row.rstrip('\n') for row in f]

In [3]:
# Every comma-separated field except the last one is a numeric feature;
# the last field is the class label and is parsed in a later cell.
data_list=[[float(field) for field in row.split(',')[:-1]] for row in lines]
data= np.array(data_list)

In [4]:
# Sanity check on the parsed feature matrix: (number of rows, number of feature columns).
data.shape

(11183, 6)

In [5]:
# The class label is the last comma-separated field of every row (kept as raw text here).
labels_string=[row.split(',')[-1] for row in lines]

In [6]:
# Encode the textual labels as -1 / +1.
# The label string that sorts first among the unique values is mapped to -1,
# every other label string is mapped to +1.
# np.unique is computed once up front instead of once per sample.
unique_label_values=np.unique(labels_string)
labels=np.array([-1 if label_string==unique_label_values[0] else 1 for label_string in labels_string])


In [7]:
def get_folds(data,labels,k):
  """Shuffle the samples and split them into k consecutive folds.

  The global NumPy RNG is re-seeded with 42 on every call, so the same
  permutation is produced each time; anything that later draws from
  np.random continues from that seeded state.

  Args:
    data: array of samples, indexed along its first axis.
    labels: array of labels aligned with data.
    k: number of folds.

  Returns:
    (data_folds, label_folds): two 1-D object arrays of length k. Each of the
    first k-1 folds holds int(len(data)/k) samples; the last fold also
    receives the remainder.
  """
  np.random.seed(42)
  shuffle=np.random.permutation(len(data))
  data=data[shuffle]
  labels=labels[shuffle]

  block_length=int(len(data)/k)
  # Pre-allocate 1-D object arrays and fill them slot by slot. Building them with
  # np.array(list_of_folds, dtype=object) would collapse equally sized folds into
  # one 3-D object array, so the return shape would depend on len(data) % k.
  data_folds=np.empty(k,dtype=object)
  label_folds=np.empty(k,dtype=object)
  for i in range(k):
    start=i*block_length
    # The last fold runs to the end so that no sample is dropped.
    stop=None if i==(k-1) else start+block_length
    data_folds[i]=data[start:stop]
    label_folds[i]=labels[start:stop]
  return data_folds,label_folds

In [8]:
def divide_data_into_k_folds(data,labels,k):
  """Build the k train/test splits of a k-fold cross-validation.

  For split i, fold i is the held-out part and the remaining folds
  (in their original order) are concatenated into the training part.

  Returns:
    (all_data_folds_list, all_labels_folds_list): two lists of length k whose
    i-th entries are [train, test] pairs for the data and the labels.
  """
  data_folds_array,label_folds_array=get_folds(data,labels,k)
  fold_ids=np.arange(k)

  all_data_folds_list=[]
  all_labels_folds_list=[]
  for held_out in range(k):
    # Indices of every fold except the held-out one.
    train_ids=fold_ids[fold_ids!=held_out]
    all_data_folds_list.append([np.concatenate(data_folds_array[train_ids]), data_folds_array[held_out]])
    all_labels_folds_list.append([np.concatenate(label_folds_array[train_ids]), label_folds_array[held_out]])

  return all_data_folds_list, all_labels_folds_list

In [9]:
# Pre-compute the 5-fold train/test splits once; later cells and functions read these globals.
all_data_folds_list, all_labels_folds_list=divide_data_into_k_folds(data,labels,5)

In [10]:
# Each entry of all_data_folds_list is a [train, test] pair; take the pair of the first split.
X_train,X_test=all_data_folds_list[0]

In [11]:
# Size of the training part of the first split.
X_train.shape

(8947, 6)

In [12]:
# Labels matching X_train / X_test: the [train, test] pair of the first split.
y_train,y_test=all_labels_folds_list[0]

In [13]:
# Class balance of the first training split (label values and how often each occurs).
np.unique(y_train, return_counts=True)

(array([-1,  1]), array([8737,  210], dtype=int64))

In [14]:
# np.where returns a 1-tuple of index arrays here, so this list ends up with a
# single element: the rows of X_train whose label is +1.
X_train_minority=[X_train[row_idx] for row_idx in np.where(y_train==1)]

In [15]:
# Unwrap the single element collected above into a plain array of the label +1 rows.
X_train_minority_class=np.array(X_train_minority[0])

In [16]:
# np.where returns a 1-tuple of index arrays here, so this list ends up with a
# single element: the rows of X_train whose label is -1.
X_train_majority=[X_train[row_idx] for row_idx in np.where(y_train==-1)]

In [17]:
# Unwrap the single element collected above into a plain array of the label -1 rows.
X_train_majority_class=np.array(X_train_majority[0])

In [18]:
def k(H,x,d):
  """Evaluate a d-dimensional Gaussian kernel with bandwidth matrix H at offset x.

  Computes (2*pi)^(-d/2) * det(H)^(-1/2) * exp(-0.5 * x^T H^-1 x).

  Args:
    H: (d, d) bandwidth (covariance) matrix; must be invertible.
    x: offset vector at which the kernel is evaluated.
    d: dimensionality used in the normalising constant.

  Returns:
    The kernel value at x.
  """
  H_inv = np.linalg.inv(H)
  h = np.linalg.det(H)
  # Quadratic form x^T H^-1 x.
  quad_form=np.matmul(np.matmul(np.transpose(x),H_inv),x)
  # The Gaussian normalising constant is (2*pi)^(-d/2); the parentheses matter,
  # because 2*(np.pi)**(-d/2) evaluates to 2*pi^(-d/2) instead.
  norm_const=(2*np.pi)**(-d/2)*(h)**(-1/2)
  return norm_const*np.exp(-0.5*quad_form)

In [19]:
def density_gen(X, X_minority):
    """Kernel density estimate built from the minority samples, evaluated at each row of X.

    For every row of X, the kernel `k` (identity bandwidth matrix, dimension
    X_minority.shape[1]) is evaluated at the offset to every minority sample
    and the values are averaged.

    Returns:
        Array with one density value per row of X.
    """
    def _mean_kernel(point):
        # Sum the kernel over all minority samples, then average.
        total = sum(
            k(np.identity(X_minority.shape[1]), point - centre, X_minority.shape[1])
            for centre in X_minority
        )
        return total / len(X_minority)

    return np.array([_mean_kernel(point) for point in X])

In [20]:
# Label values and their counts for the first training split.
# NOTE(review): get_imb_oversampling recomputes these locally, so these globals look unused below — confirm.
unique_labels,counts=np.unique(y_train, return_counts=True)

In [21]:
def metropolis_hastings(target_density, X_minority, size=100):
    """Draw synthetic samples from target_density with a random-walk Metropolis-Hastings chain.

    The chain starts at a randomly chosen row of X_minority, proposes moves from
    an isotropic Gaussian (covariance 0.2 * I) centred on the current state, and
    discards the first 10000 states as burn-in. The global NumPy RNG is
    re-seeded with 43 on every call.

    Args:
        target_density: callable (points, X_minority) -> array of densities,
            one value per row of `points`.
        X_minority: 2-D array of minority samples.
        size: number of samples to return after burn-in.

    Returns:
        Array of shape (size, X_minority.shape[1]) holding the retained chain states.
    """
    np.random.seed(43)
    burnin_size = 10000
    size += burnin_size
    # Draw the starting point from the array that was passed in, not from a
    # notebook-level global that may belong to a different fold.
    x0 = X_minority[np.random.choice(len(X_minority), 1)]
    xt = x0
    # Density of the current state; only updated when a proposal is accepted,
    # so it is not re-evaluated on every iteration.
    xt_density = target_density(xt, X_minority)
    proposal_cov = 0.2*np.eye(X_minority.shape[1])
    samples = []
    for i in tqdm(range(size)):
        xt_candidate = np.array([np.random.multivariate_normal(xt[0], proposal_cov)])
        candidate_density = target_density(xt_candidate, X_minority)
        # The proposal is symmetric, so the acceptance ratio is just the density ratio.
        accept_odds = candidate_density/xt_density
        if np.random.uniform(0, 1) < accept_odds:
            xt = xt_candidate
            xt_density = candidate_density
        samples.append(xt)
    samples = np.array(samples[burnin_size:])
    samples = np.reshape(samples, [samples.shape[0], X_minority.shape[1]])
    return samples

In [22]:
# Baseline oversamplers from smote_variants, all created with their default parameters.
import smote_variants as sv

oversampler_SMOTE=sv.SMOTE()
oversampler_polynome_fit = sv.polynom_fit_SMOTE_star()
oversampler_ProWSyn= sv.ProWSyn()
# NOTE(review): SMOTE_IPF is instantiated here but is not added to oversampler_object_list below.
oversampler_SMOTE_IPF= sv.SMOTE_IPF()
oversampler_Lee= sv.Lee()


In [23]:
# Oversamplers under comparison. metropolis_hastings must stay the LAST entry:
# get_imb_oversampling treats the last index as the Metropolis-Hastings branch.
oversampler_object_list = [oversampler_SMOTE,oversampler_polynome_fit,oversampler_ProWSyn, oversampler_Lee, metropolis_hastings]

In [24]:
# Display names, in the same order as oversampler_object_list (used for plot titles and the result table index).
oversampler_name_list = ['SMOTE','polynome_fit_SMOTE','ProWSyn', 'Lee', 'Metropolis_hastings']

In [25]:
def get_imb_oversampling(X_train,y_train,X_test,y_test,oversampler_index,classifier,visualize=True):
  """Oversample one training split, fit the classifier and score it on the test split.

  Args:
    X_train, y_train: training samples and their labels (-1 / +1).
    X_test, y_test: held-out samples and labels used for scoring.
    oversampler_index: position in oversampler_object_list. The last position
      selects the Metropolis-Hastings sampler; every other position is used
      through its .sample(X, y) method.
    classifier: estimator exposing fit / predict.
    visualize: when true, draw a 2-component PCA scatter plot of the oversampled
      training set.

  Returns:
    [confusion matrix, accuracy, balanced accuracy, g-mean, f1-score, Cohen's kappa].
  """
  ## MH is in the end of the list oversampler_object_list
  if oversampler_index!=len(oversampler_object_list)-1:
    X_samp, y_samp = oversampler_object_list[oversampler_index].sample(X_train, y_train)

  else:
    X_train_minority_class=X_train[y_train==1]
    X_train_majority_class=X_train[y_train==-1]

    # Generate exactly as many synthetic minority points as are needed to match the majority class.
    n_synthetic=len(X_train_majority_class)-len(X_train_minority_class)
    synthetic_minority_data = metropolis_hastings(density_gen, X_train_minority_class, size=n_synthetic)
    balanced_minority=np.concatenate([X_train_minority_class, synthetic_minority_data])
    X_samp=np.concatenate([X_train_majority_class,balanced_minority])
    y_samp=np.concatenate([np.zeros(len(X_train_majority_class))-1,np.ones(len(balanced_minority))])

  classifier.fit(X_samp,y_samp)
  y_pred=classifier.predict(X_test)
  cm=confusion_matrix(y_test, y_pred)
  acc=accuracy_score(y_test, y_pred)
  b_acc=balanced_accuracy_score(y_test, y_pred)
  gm=geometric_mean_score(y_test, y_pred)
  f1=f1_score(y_test, y_pred)
  ck=cohen_kappa_score(y_test, y_pred)

  if visualize:
    # Standardise before PCA so that no single feature dominates the projection.
    scaled_X_samp = StandardScaler().fit_transform(X_samp)
    X_samp_pca=PCA(n_components =2).fit_transform(scaled_X_samp)

    fig, ax = plt.subplots(figsize=(8,6))
    for class_label, class_color, class_name in zip([-1,1],["r","b"],['Majority','Minority']):
      class_mask=(y_samp==class_label)
      ax.scatter(X_samp_pca[class_mask,0],X_samp_pca[class_mask,1], s=10, c=class_color, label=class_name)
    ax.legend(fontsize=10, loc='lower right')
    ax.set_xlabel("Principal Component 1",fontsize=12)
    ax.set_ylabel("Principal Component 2",fontsize=12)
    ax.set_title("PCA plot of majority class + minority class using "+oversampler_name_list[oversampler_index]+" technique",fontsize=14)

  return [cm, acc,b_acc,gm, f1, ck]

In [26]:
def calc_avg_performance(performance_index,results_all_folds):
  """Mean and population standard deviation of one performance measure across folds.

  Args:
    performance_index: position of the measure inside each per-fold result.
    results_all_folds: per-fold results of one oversampler; each entry is
      indexable by performance_index.

  Returns:
    (avg, std), where std divides by the number of folds (not by n-1).
  """
  n_folds=len(results_all_folds)
  avg=sum(fold_result[performance_index] for fold_result in results_all_folds)/n_folds
  squared_deviation_total=sum((fold_result[performance_index]-avg)**2 for fold_result in results_all_folds)
  std=np.sqrt(squared_deviation_total/n_folds)
  return avg, std

In [27]:
# Base classifier shared by all oversampling experiments; random_state is fixed for repeatable trees.
clf = DecisionTreeClassifier(random_state=0)

In [28]:
# Names of the scalar measures, in the order get_imb_oversampling returns them
# after the confusion matrix (which sits at index 0 of each per-fold result).
perfromance_measures=['accuracy','balanced accuracy','g-mean','f1-score','cohen_kappa']
def get_results_all_folds(oversampler_index,classifier, visualize=False, get_detailed_results=False):
  """Run one oversampler over every pre-computed fold and summarise its performance.

  Args:
    oversampler_index: position of the oversampler in oversampler_object_list.
    classifier: estimator that is fitted and evaluated on every fold.
    visualize: forwarded to get_imb_oversampling.
    get_detailed_results: when true, also return the raw per-fold results.

  Returns:
    A dict mapping each name in perfromance_measures to (mean, std) across folds;
    if get_detailed_results is true, the tuple (per-fold results, that dict).
  """
  results_all_folds=[]
  for (X_train,X_test),(y_train,y_test) in zip(all_data_folds_list,all_labels_folds_list):
    # Use the classifier that was passed in rather than the notebook-level `clf`.
    results_single_fold=get_imb_oversampling(X_train,y_train,X_test,y_test,oversampler_index,classifier,visualize=visualize)
    results_all_folds.append(results_single_fold)

  performance_summary_oversampler_classifer={}
  # Index 0 of each result is the confusion matrix, so the scalar measures start at 1.
  for performance_index,measure_name in enumerate(perfromance_measures,start=1):
    performance_summary_oversampler_classifer[measure_name]=calc_avg_performance(performance_index,results_all_folds)

  if get_detailed_results:
    return results_all_folds, performance_summary_oversampler_classifer
  return performance_summary_oversampler_classifer

In [29]:
# Quick check with the first oversampler (index 0, SMOTE) before running the full comparison.
get_results_all_folds(0,clf)

2024-08-20 14:52:58,836:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:52:58,836:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:52:58,836:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2024-08-20 14:52:58,844:INFO:SMOTE: simplex sampling with n_dim 2
2024-08-20 14:52:58,943:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:52:58,943:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:52:58,943:INFO:NearestNeighborsW

{'accuracy': (0.9715641406486573, 0.0021997232931447444),
 'balanced accuracy': (0.8329272500729792, 0.0520663927349183),
 'g-mean': (0.8176830747201942, 0.062219743679275595),
 'f1-score': (0.5263367939114246, 0.04692265859265942),
 'cohen_kappa': (0.5124875119128701, 0.048087444355024815)}

In [30]:
# One summary dict (measure name -> (mean, std)) per oversampler, in list order.
summary_performace_for_all_oversampling_algorithms_all_folds=[
    get_results_all_folds(oversampler_index,clf)
    for oversampler_index in range(len(oversampler_object_list))
]

2024-08-20 14:52:59,375:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:52:59,376:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:52:59,378:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2024-08-20 14:52:59,381:INFO:SMOTE: simplex sampling with n_dim 2
2024-08-20 14:52:59,510:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-08-20 14:52:59,512:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-08-20 14:52:59,512:INFO:NearestNeighborsW

  0%|          | 0/18527 [00:00<?, ?it/s]

  0%|          | 0/18521 [00:00<?, ?it/s]

  0%|          | 0/18533 [00:00<?, ?it/s]

  0%|          | 0/18535 [00:00<?, ?it/s]

  0%|          | 0/18536 [00:00<?, ?it/s]

In [31]:
# Turn the list of summary dicts into a 3-D array indexed as
# [oversampler, measure, (mean, std)], with every number rounded to 3 decimals.
# The measure order is taken from the keys of the first oversampler's summary.
summary_performace_for_all_oversampling_algorithms_all_folds_list=np.array([
    [
        [round(value,3) for value in summary_performace_for_all_oversampling_algorithms_all_folds[oversampler_index][measure_key]]
        for measure_key in summary_performace_for_all_oversampling_algorithms_all_folds[0].keys()
    ]
    for oversampler_index in range(len(oversampler_name_list))
])

In [32]:
# Format every (mean, std) pair as the text "mean±std", one row of strings per oversampler.
oversamplers_result_summary_list=[]
for oversampler_rows in summary_performace_for_all_oversampling_algorithms_all_folds_list:
  # Kept as a named variable: the next cell displays the row of the last oversampler.
  performance_results_summary_list=["\u00B1".join((str(mean_value),str(std_value))) for mean_value,std_value in oversampler_rows]
  oversamplers_result_summary_list.append(performance_results_summary_list)
oversamplers_result_summary_list=np.array(oversamplers_result_summary_list)

In [33]:
# This name is the per-row variable of the previous cell's loop, so it holds the row of the last oversampler.
performance_results_summary_list

['0.971±0.003', '0.817±0.034', '0.8±0.041', '0.515±0.049', '0.501±0.05']

In [34]:
# Full table of formatted "mean±std" strings: one row per oversampler, one column per measure.
oversamplers_result_summary_list

array([['0.971±0.004', '0.825±0.056', '0.807±0.069', '0.514±0.063',
        '0.5±0.065'],
       ['0.977±0.004', '0.828±0.04', '0.812±0.049', '0.575±0.056',
        '0.564±0.058'],
       ['0.954±0.004', '0.851±0.037', '0.843±0.042', '0.428±0.03',
        '0.408±0.031'],
       ['0.981±0.003', '0.81±0.043', '0.788±0.054', '0.601±0.05',
        '0.591±0.052'],
       ['0.971±0.003', '0.817±0.034', '0.8±0.041', '0.515±0.049',
        '0.501±0.05']], dtype='<U11')

In [35]:
# Label the formatted results: rows are oversamplers, columns are performance measures.
df=pd.DataFrame(oversamplers_result_summary_list,columns=perfromance_measures,index=oversampler_name_list)

In [36]:
# Render the final comparison table.
df

Unnamed: 0,accuracy,balanced accuracy,g-mean,f1-score,cohen_kappa
SMOTE,0.971±0.004,0.825±0.056,0.807±0.069,0.514±0.063,0.5±0.065
polynome_fit_SMOTE,0.977±0.004,0.828±0.04,0.812±0.049,0.575±0.056,0.564±0.058
ProWSyn,0.954±0.004,0.851±0.037,0.843±0.042,0.428±0.03,0.408±0.031
Lee,0.981±0.003,0.81±0.043,0.788±0.054,0.601±0.05,0.591±0.052
Metropolis_hastings,0.971±0.003,0.817±0.034,0.8±0.041,0.515±0.049,0.501±0.05


In [37]:
# Save the comparison table to an Excel workbook (single sheet named 'sheet_1').
df.to_excel('Decision_tree_clf_result_for_data1.xlsx',sheet_name='sheet_1')