In [0]:
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from pprint import pprint
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
from sklearn.metrics import accuracy_score

Load Datasets

In [0]:
df_75_25=pd.read_csv('/content/75_25_haberman.csv')
print(df_75_25)
print(df_75_25['Class'].value_counts())

print("Class imbalanced for {:.2f}% of the dataset"
      .format(df_75_25['Class'].value_counts()[0]/len(df_75_25)*100))

     Age  YearOfOper  AXNDetected  Class
0     30          62            3      0
1     30          65            0      0
2     31          59            2      0
3     31          65            4      0
4     33          58           10      0
..   ...         ...          ...    ...
301   76          67            0      0
302   77          65            3      0
303   78          65            1      1
304   83          58            2      1
305   30          64            1      0

[306 rows x 4 columns]
0    225
1     81
Name: Class, dtype: int64
Class imbalanced for 73.53% of the dataset


In [0]:
df_65_35=pd.read_csv('/content/65_35_diabetes.csv')
print(df_65_35)
print(df_65_35['Class'].value_counts())

print("Class imbalanced for {:.2f}% of the dataset"
      .format(df_65_35['Class'].value_counts()[0]/len(df_65_35)*100))

     Tpreg  Pgluc  DBP  TSFT  Insulin   BMI    DPF  Age  Class
0        1     85   66    29        0  26.6  0.351   31      0
1        8    183   64     0        0  23.3  0.672   32      1
2        1     89   66    23       94  28.1  0.167   21      0
3        0    137   40    35      168  43.1  2.288   33      1
4        5    116   74     0        0  25.6  0.201   30      0
..     ...    ...  ...   ...      ...   ...    ...  ...    ...
763      2    122   70    27        0  36.8  0.340   27      0
764      5    121   72    23      112  26.2  0.245   30      0
765      1    126   60     0        0  30.1  0.349   47      1
766      1     93   70    31        0  30.4  0.315   23      0
767      6    148   72    35        0  33.6  0.627   50      1

[768 rows x 9 columns]
0    500
1    268
Name: Class, dtype: int64
Class imbalanced for 65.10% of the dataset


In [0]:
df_90_10=pd.read_csv('/content/90_10_creditcard.csv')
print(df_90_10)
print(df_90_10['Class'].value_counts())

print("Class imbalanced for {:.2f}% of the dataset"
      .format(df_90_10['Class'].value_counts()[0]/len(df_90_10)*100))

        Time        V1        V2        V3  ...       V27       V28  Amount  Class
0          0 -1.359807 -0.072781  2.536347  ...  0.133558 -0.021053  149.62      0
1          0  1.191857  0.266151  0.166480  ... -0.008983  0.014724    2.69      0
2          1 -1.358354 -1.340163  1.773209  ... -0.055353 -0.059752  378.66      0
3          1 -0.966272 -0.185226  1.792993  ...  0.062723  0.061458  123.50      0
4          2 -1.158233  0.877737  1.548718  ...  0.219422  0.215153   69.99      0
...      ...       ...       ...       ...  ...       ...       ...     ...    ...
4915  169142 -1.927883  1.125653 -4.518331  ...  0.292680  0.147968  390.00      1
4916  169347  1.378559  1.289381 -5.004247  ...  0.389152  0.186637    0.76      1
4917  169351 -0.676143  1.126366 -2.213700  ...  0.385107  0.194361   77.89      1
4918  169966 -3.113832  0.585864 -5.399730  ...  0.884876 -0.253700  245.00      1
4919  170348  1.991976  0.158476 -2.583441  ...  0.002988 -0.015309   42.53      1

[49

Create Functions

In [0]:
def getBestK(data):
  range_n_clusters = [2, 3, 4, 5, 6]
  sil_scores=[]

  for n_clusters in range_n_clusters:

      # Initialize the clusterer with n_clusters value and a random generator
      # seed of 10 for reproducibility.
      clusterer = KMeans(n_clusters=n_clusters, random_state=10)
      cluster_labels = clusterer.fit_predict(data)

      # The silhouette_score gives the average value for all the samples.
      # This gives a perspective into the density and separation of the formed
      # clusters
      silhouette_avg = silhouette_score(data, cluster_labels)
      #print("For n_clusters =", n_clusters,
      #      "The average silhouette_score is :", silhouette_avg)
      sil_scores.append(silhouette_avg)

  for j in range (len(sil_scores)):
    if sil_scores[j]==max(sil_scores):
      best_k=range_n_clusters[j]
  return(best_k)

In [0]:
def getCluster(k,X_train,y_train):  
  clf =  KMeans(n_clusters=k)
  clf.fit(X_train,y_train)

  #Coordinates of cluster centers with shape [n_clusters, n_features]
  clf.cluster_centers_
  #Labels of each point
  clf.labels_

  # Nice Pythonic way to get the indices of the points for each corresponding cluster
  mydict = {i: np.where(clf.labels_ == i)[0] for i in range(clf.n_clusters)}
  return mydict

In [0]:
def findModpred(X_train,y_train,x_samp,mydict,clust):
  clustClasses=y_train[mydict[clust]]
  x_samp=np.reshape((np.asarray(x_samp)),(1,len(x_samp)))
  if len(Counter(clustClasses).keys()) > 1:
    rf = RandomForestClassifier(max_depth=2, random_state=0)
    rf.fit(X_train[mydict[clust]], y_train[mydict[clust]])
    mod_pred=rf.predict(x_samp)
  else:
    mod_pred=clustClasses[0] #as all the values in y_train from this cluster are same so selecting any one
  return mod_pred

In [0]:
def getyPred(k,X_train,y_train,X_test):
  mydict=getCluster(k,X_train,y_train)
  clf =  KMeans(n_clusters=k)
  clf.fit(X_train,y_train)
  Y_pred_kmeans=clf.predict(X_test)
  for pred in range (len(Y_pred_kmeans)):
    Y_pred_kmeans[pred]=findModpred(X_train,y_train,X_test[pred,:],mydict,Y_pred_kmeans[pred])
  return Y_pred_kmeans

In [0]:
def stratCV(df):
  accuracy=[]
  bin=0
  results_baseline=np.empty((2,12))

  df=df.dropna() # Drop rows with Nan values
  print("Class original value counts:")
  print(df['Class'].value_counts())

  # Separate attributes and labels
  X=df.drop(columns=['Class']).values
  y=df['Class'].values

  # Establishing baseline using Decision Tree
  clf = DecisionTreeClassifier()
  clf = clf.fit(X, y)
  scores = cross_val_score(clf, X, y, cv=10)
  scores=np.hstack((scores,scores.mean(),scores.std()))
  results_baseline[0,:]=scores

  # Establishing baseline using Random Forest
  rf = RandomForestClassifier()
  rf = rf.fit(X,y)
  rf_scores = cross_val_score(rf, X, y, cv=10)
  rf_scores=np.hstack((rf_scores,rf_scores.mean(),rf_scores.std()))
  results_baseline[1,:]=rf_scores

  # Performing stratified cross validation
  skf = StratifiedKFold(n_splits=10)
  skf.get_n_splits(X, y)
  for train_index, test_index in skf.split(X, y):
    print("\nBin: ",bin)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("TRAIN:", len(X_train), "TEST:", len(X_test))
    k=getBestK(X_train)
    print("No. of Clusters= "+str(k)+"\n")
    y_pred=getyPred(k,X_train,y_train,X_test)
    acc=accuracy_score(y_test, y_pred)
    accuracy.append(acc)
    bin+=1
  skf_res=np.hstack([accuracy,np.array(accuracy).mean(),np.array(accuracy).std()])
  skf_res=skf_res.reshape(1,12)
  results=np.concatenate((results_baseline,skf_res),axis=0)
  return results

The main loop

In [0]:
if __name__ == "__main__":
  results=np.zeros((1,12))
  print("-----------------------------------------------")
  print("Performing operations on 90% imbalanced dataset")
  print("-----------------------------------------------\n")
  r=stratCV(df_90_10)
  results=np.concatenate((results,r),axis=0)

  print("\n-----------------------------------------------")
  print("Performing operations on 75% imbalanced dataset")
  print("-----------------------------------------------\n")
  r=stratCV(df_75_25)
  results=np.concatenate((results,r),axis=0)

  print("\n-----------------------------------------------")
  print("Performing operations on 65% imbalanced dataset")
  print("-----------------------------------------------\n")
  r=stratCV(df_65_35)
  results=np.concatenate((results,r),axis=0)

  results=results[1:,:]
  df_out=pd.DataFrame(results)
  df_out.to_csv('Results.csv', sep=',',index=False)

-----------------------------------------------
Performing operations on 90% imbalanced dataset
-----------------------------------------------

Class original value counts:
0    4428
1     492
Name: Class, dtype: int64

Bin:  0
TRAIN: 4428 TEST: 492
No. of Clusters= 3


Bin:  1
TRAIN: 4428 TEST: 492
No. of Clusters= 3


Bin:  2
TRAIN: 4428 TEST: 492
No. of Clusters= 3


Bin:  3
TRAIN: 4428 TEST: 492
No. of Clusters= 2


Bin:  4
TRAIN: 4428 TEST: 492
No. of Clusters= 2


Bin:  5
TRAIN: 4428 TEST: 492
No. of Clusters= 2


Bin:  6
TRAIN: 4428 TEST: 492
No. of Clusters= 3


Bin:  7
TRAIN: 4428 TEST: 492
No. of Clusters= 3


Bin:  8
TRAIN: 4428 TEST: 492
No. of Clusters= 2


Bin:  9
TRAIN: 4428 TEST: 492
No. of Clusters= 2


-----------------------------------------------
Performing operations on 75% imbalanced dataset
-----------------------------------------------

Class original value counts:
0    225
1     81
Name: Class, dtype: int64

Bin:  0
TRAIN: 275 TEST: 31
No. of Clusters= 3


B

In [0]:
print(df_out)

         0         1         2   ...        9         10        11
0  0.939024  0.995935  1.000000  ...  0.400407  0.933130  0.178477
1  0.995935  1.000000  1.000000  ...  1.000000  0.999593  0.001220
2  0.977642  0.975610  1.000000  ...  1.000000  0.900000  0.275164
3  0.741935  0.354839  0.290323  ...  0.733333  0.625161  0.162251
4  0.741935  0.516129  0.580645  ...  0.733333  0.680430  0.081822
5  0.741935  0.741935  0.258065  ...  0.300000  0.585161  0.220016
6  0.636364  0.753247  0.701299  ...  0.723684  0.695301  0.060219
7  0.740260  0.805195  0.740260  ...  0.815789  0.766917  0.048233
8  0.766234  0.740260  0.766234  ...  0.763158  0.709706  0.065195

[9 rows x 12 columns]
