Because of dependency problems, the first part of the project continues in this notebook.

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

The next cell repeats the dataset preparation steps that are described in the first part.

In [2]:
# Load the whitespace-separated dataset. unpack=True returns one row per
# attribute, so the array is transposed back to one row per record below.
heartArray = np.loadtxt('heart.dat', unpack=True)
# NOTE(review): 'age ' carries a trailing space; kept unchanged in case other
# cells look the column up by that exact name.
columns = ['age ', 'sex', 'cpt', 'rbp', 'sc', 'fbs',
           'rer', 'mhr', 'eia', 'oldpeak', 'slope',
           'nmv', 'thal', 'presence']
df = pd.DataFrame(heartArray.T, columns=columns)

# Drop 'fbs' and recode the target from {1, 2} to {0, 1}.
# The recoding is done explicitly on the column instead of mutating the array
# returned by `.values`: that only worked while `.values` happened to be a
# writable view of the frame, and it fails (read-only array) under pandas
# Copy-on-Write.
dfx = df.drop(columns=["fbs"]).assign(
    presence=lambda frame: frame["presence"].replace({1.0: 0.0, 2.0: 1.0})
)

# Feature matrix and label vector used for the split.
X = dfx.drop(columns=["presence"]).to_numpy()
y = dfx["presence"].to_numpy()

# Hold out 30% of the records as the test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)


In [3]:
# Models under comparison, keyed by the label printed in the reports.
# `names` and `classifiers` remain two parallel lists, in the same order.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=0),
    'Linear SVM': SVC(kernel='linear', C=10),
    'Naive Bayes': GaussianNB(),
}
names = list(models)
classifiers = list(models.values())

**Rebalancing**

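Cost matrix (rows: true class, columns: predicted class): missing a case of presence (false negative) costs 5, a false alarm (false positive) costs 1.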

In [4]:
# Misclassification costs, multiplied element-wise with scikit-learn's
# confusion matrix (rows = true class, columns = predicted class).
cost_matrix = [
    [0, 1],  # true class 0: correct -> 0, predicted 1 -> cost 1
    [5, 0],  # true class 1: predicted 0 -> cost 5, correct -> 0
]

Current class distribution of the training set

In [5]:
# Class distribution of the training labels before any resampling.
train_label_counts = Counter(y_train)
print(train_label_counts)

Counter({0.0: 102, 1.0: 87})


*Undersampling*

In [6]:
# Randomly reduce class 0 to 74 samples; class 1 is requested at 87 samples.
undersample_targets = {0: 74, 1: 87}
sampler = RandomUnderSampler(sampling_strategy=undersample_targets, random_state=1)
X_train_us, y_train_us = sampler.fit_resample(X_train, y_train)

# Show the class distribution after resampling.
resampled_counts = Counter(y_train_us)
print(resampled_counts)

Counter({1.0: 87, 0.0: 74})


In [7]:
for name, clf in zip(names, classifiers):
    # Underlined model name (ANSI escape codes) followed by a separator line.
    print("\033[4m" +  name + "\033[0m")
    print("------------------------------------------------------")

    # Train on the resampled training set, then predict the held-out test set.
    y_pred = clf.fit(X_train_us, y_train_us).predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['absence','presence'])
    print(report)

    conf_m = confusion_matrix(y_test, y_pred)
    print("\033[4m" + "Confusion matrix" + "\033[0m")
    print(conf_m , "\n")

    # Total cost: every confusion-matrix cell weighted by its cost_matrix entry.
    loss = (conf_m * cost_matrix).sum()
    print("loss: %d" %loss)
    print("------------------------------------------------------\n")

[4mRandom Forest[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.89      0.71      0.79        48
    presence       0.67      0.88      0.76        33

    accuracy                           0.78        81
   macro avg       0.78      0.79      0.78        81
weighted avg       0.80      0.78      0.78        81

[4mConfusion matrix[0m
[[34 14]
 [ 4 29]] 

loss: 34
------------------------------------------------------

[4mLinear SVM[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.89      0.71      0.79        48
    presence       0.67      0.88      0.76        33

    accuracy                           0.78        81
   macro avg       0.78      0.79      0.78        81
weighted avg       0.80      0.78      0.78        81

[4mConfusion matrix[0m
[[34 14]
 [ 4 29]] 

loss: 34
-----------------------------

Use a different undersampling strategy




In [8]:
# Stronger undersampling: reduce class 0 to 43 samples; class 1 is requested at 87.
undersample_targets = {0: 43, 1: 87}
sampler = RandomUnderSampler(sampling_strategy=undersample_targets, random_state=1)
X_train_us, y_train_us = sampler.fit_resample(X_train, y_train)

# Show the class distribution after resampling.
resampled_counts = Counter(y_train_us)
print(resampled_counts)

Counter({1.0: 87, 0.0: 43})


In [9]:
for name, clf in zip(names, classifiers):
    # Underlined model name (ANSI escape codes) followed by a separator line.
    print("\033[4m" +  name + "\033[0m")
    print("------------------------------------------------------")

    # Train on the resampled training set, then predict the held-out test set.
    y_pred = clf.fit(X_train_us, y_train_us).predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['absence','presence'])
    print(report)

    conf_m = confusion_matrix(y_test, y_pred)
    print("\033[4m" + "Confusion matrix" + "\033[0m")
    print(conf_m , "\n")

    # Total cost: every confusion-matrix cell weighted by its cost_matrix entry.
    loss = (conf_m * cost_matrix).sum()
    print("loss: %d" %loss)
    print("------------------------------------------------------\n")

[4mRandom Forest[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.90      0.54      0.68        48
    presence       0.58      0.91      0.71        33

    accuracy                           0.69        81
   macro avg       0.74      0.73      0.69        81
weighted avg       0.77      0.69      0.69        81

[4mConfusion matrix[0m
[[26 22]
 [ 3 30]] 

loss: 37
------------------------------------------------------

[4mLinear SVM[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.92      0.69      0.79        48
    presence       0.67      0.91      0.77        33

    accuracy                           0.78        81
   macro avg       0.79      0.80      0.78        81
weighted avg       0.81      0.78      0.78        81

[4mConfusion matrix[0m
[[33 15]
 [ 3 30]] 

loss: 30
-----------------------------

**Oversampling**

In [10]:
# Randomly duplicate class 1 samples up to 100; class 0 is requested at 102.
# NOTE: the result keeps the `_us` variable names even though this is oversampling.
oversample_targets = {0: 102, 1: 100}
sampler = RandomOverSampler(sampling_strategy=oversample_targets, random_state=1)
X_train_us, y_train_us = sampler.fit_resample(X_train, y_train)

# Show the class distribution after resampling.
resampled_counts = Counter(y_train_us)
print(resampled_counts)

Counter({0.0: 102, 1.0: 100})


In [11]:
for name, clf in zip(names, classifiers):
    # Underlined model name (ANSI escape codes) followed by a separator line.
    print("\033[4m" +  name + "\033[0m")
    print("------------------------------------------------------")

    # Train on the resampled training set, then predict the held-out test set.
    y_pred = clf.fit(X_train_us, y_train_us).predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['absence','presence'])
    print(report)

    conf_m = confusion_matrix(y_test, y_pred)
    print("\033[4m" + "Confusion matrix" + "\033[0m")
    print(conf_m , "\n")

    # Total cost: every confusion-matrix cell weighted by its cost_matrix entry.
    loss = (conf_m * cost_matrix).sum()
    print("loss: %d" %loss)
    print("------------------------------------------------------\n")

[4mRandom Forest[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.88      0.75      0.81        48
    presence       0.70      0.85      0.77        33

    accuracy                           0.79        81
   macro avg       0.79      0.80      0.79        81
weighted avg       0.81      0.79      0.79        81

[4mConfusion matrix[0m
[[36 12]
 [ 5 28]] 

loss: 37
------------------------------------------------------

[4mLinear SVM[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.87      0.81      0.84        48
    presence       0.75      0.82      0.78        33

    accuracy                           0.81        81
   macro avg       0.81      0.82      0.81        81
weighted avg       0.82      0.81      0.82        81

[4mConfusion matrix[0m
[[39  9]
 [ 6 27]] 

loss: 39
-----------------------------

Use a different oversampling strategy


In [12]:
# Stronger oversampling: duplicate class 1 samples up to 148 (more than the
# 102 requested for class 0).
# NOTE: the result keeps the `_us` variable names even though this is oversampling.
oversample_targets = {0: 102, 1: 148}
sampler = RandomOverSampler(sampling_strategy=oversample_targets, random_state=1)
X_train_us, y_train_us = sampler.fit_resample(X_train, y_train)

# Show the class distribution after resampling.
resampled_counts = Counter(y_train_us)
print(resampled_counts)

Counter({1.0: 148, 0.0: 102})


  f"After over-sampling, the number of samples ({n_samples})"


In [13]:
for name, clf in zip(names, classifiers):
    # Underlined model name (ANSI escape codes) followed by a separator line.
    print("\033[4m" +  name + "\033[0m")
    print("------------------------------------------------------")

    # Train on the resampled training set, then predict the held-out test set.
    y_pred = clf.fit(X_train_us, y_train_us).predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['absence','presence'])
    print(report)

    conf_m = confusion_matrix(y_test, y_pred)
    print("\033[4m" + "Confusion matrix" + "\033[0m")
    print(conf_m , "\n")

    # Total cost: every confusion-matrix cell weighted by its cost_matrix entry.
    loss = (conf_m * cost_matrix).sum()
    print("loss: %d" %loss)
    print("------------------------------------------------------\n")

[4mRandom Forest[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.85      0.73      0.79        48
    presence       0.68      0.82      0.74        33

    accuracy                           0.77        81
   macro avg       0.76      0.77      0.76        81
weighted avg       0.78      0.77      0.77        81

[4mConfusion matrix[0m
[[35 13]
 [ 6 27]] 

loss: 43
------------------------------------------------------

[4mLinear SVM[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.87      0.71      0.78        48
    presence       0.67      0.85      0.75        33

    accuracy                           0.77        81
   macro avg       0.77      0.78      0.76        81
weighted avg       0.79      0.77      0.77        81

[4mConfusion matrix[0m
[[34 14]
 [ 5 28]] 

loss: 39
-----------------------------

**Combination**

In [14]:
# Step 1: undersample class 0 down to 90 samples (class 1 requested at 87).
sampler = RandomUnderSampler(sampling_strategy={0: 90, 1: 87}, random_state=1)
X_under, y_under = sampler.fit_resample(X_train, y_train)

# Step 2: oversample class 1 up to 110 samples on top of the undersampled set.
sampler = RandomOverSampler(sampling_strategy={0: 90, 1: 110}, random_state=1)
X_train_us, y_train_us = sampler.fit_resample(X_under, y_under)

# Show the class distribution after both steps.
resampled_counts = Counter(y_train_us)
print(resampled_counts)

Counter({1.0: 110, 0.0: 90})


  f"After over-sampling, the number of samples ({n_samples})"


In [15]:
for name, clf in zip(names, classifiers):
    # Underlined model name (ANSI escape codes) followed by a separator line.
    print("\033[4m" +  name + "\033[0m")
    print("------------------------------------------------------")

    # Train on the resampled training set, then predict the held-out test set.
    y_pred = clf.fit(X_train_us, y_train_us).predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['absence','presence'])
    print(report)

    conf_m = confusion_matrix(y_test, y_pred)
    print("\033[4m" + "Confusion matrix" + "\033[0m")
    print(conf_m , "\n")

    # Total cost: every confusion-matrix cell weighted by its cost_matrix entry.
    loss = (conf_m * cost_matrix).sum()
    print("loss: %d" %loss)
    print("------------------------------------------------------\n")

[4mRandom Forest[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.88      0.77      0.82        48
    presence       0.72      0.85      0.78        33

    accuracy                           0.80        81
   macro avg       0.80      0.81      0.80        81
weighted avg       0.81      0.80      0.80        81

[4mConfusion matrix[0m
[[37 11]
 [ 5 28]] 

loss: 36
------------------------------------------------------

[4mLinear SVM[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.91      0.60      0.72        48
    presence       0.61      0.91      0.73        33

    accuracy                           0.73        81
   macro avg       0.76      0.76      0.73        81
weighted avg       0.79      0.73      0.73        81

[4mConfusion matrix[0m
[[29 19]
 [ 3 30]] 

loss: 34
-----------------------------

As expected, the rebalancing methods did not work that well, because the data were already almost balanced.

**With weights**

In [16]:
# Cost-sensitive variants: Random Forest and the SVM take per-class weights
# through their constructors; GaussianNB is created with default settings.
forest_class_weight = {0: 10, 1: 13}
svm_class_weight = {0: 7, 1: 12}
classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=0,
                           class_weight=forest_class_weight),
    SVC(kernel='linear', C=10, class_weight=svm_class_weight),
    GaussianNB(),
]
names = ['Random Forest', 'Linear SVM','Naive Bayes']

# Per-sample weights for Naive Bayes on the original training set:
# 3 for class 0, 5 for class 1 (any other label would keep weight 0).
naiveBayesWeights = np.zeros(len(y_train))
naiveBayesWeights[y_train == 0] = 3
naiveBayesWeights[y_train == 1] = 5

In [17]:
for name, clf in zip(names, classifiers):
    # Underlined model name (ANSI escape codes) followed by a separator line.
    print("\033[4m" +  name + "\033[0m")
    print("------------------------------------------------------")

    # Only Naive Bayes is given per-sample weights at fit time; the other
    # models are fitted without extra arguments.
    fit_kwargs = {'sample_weight': naiveBayesWeights} if name == 'Naive Bayes' else {}
    clf.fit(X_train, y_train, **fit_kwargs)

    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['absence','presence'])
    print(report)

    conf_m = confusion_matrix(y_test, y_pred)
    print("\033[4m" + "Confusion matrix" + "\033[0m")
    print(conf_m , "\n")

    # Total cost: every confusion-matrix cell weighted by its cost_matrix entry.
    loss = (conf_m * cost_matrix).sum()
    print("loss: %d" %loss)
    print("------------------------------------------------------\n")

[4mRandom Forest[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.88      0.79      0.84        48
    presence       0.74      0.85      0.79        33

    accuracy                           0.81        81
   macro avg       0.81      0.82      0.81        81
weighted avg       0.82      0.81      0.82        81

[4mConfusion matrix[0m
[[38 10]
 [ 5 28]] 

loss: 35
------------------------------------------------------

[4mLinear SVM[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.90      0.58      0.71        48
    presence       0.60      0.91      0.72        33

    accuracy                           0.72        81
   macro avg       0.75      0.75      0.72        81
weighted avg       0.78      0.72      0.71        81

[4mConfusion matrix[0m
[[28 20]
 [ 3 30]] 

loss: 35
-----------------------------

Using combination rebalancing with sample weights

In [18]:
# Cost-sensitive variants to be trained on the resampled training set:
# Random Forest and the SVM take per-class weights through their constructors;
# GaussianNB is created with default settings.
forest_class_weight = {0: 15, 1: 1}
svm_class_weight = {0: 3, 1: 5}
classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=0,
                           class_weight=forest_class_weight),
    SVC(kernel='linear', C=10, class_weight=svm_class_weight),
    GaussianNB(),
]
names = ['Random Forest', 'Linear SVM','Naive Bayes']

# Per-sample weights for Naive Bayes on the resampled labels:
# 2 for class 0, 13 for class 1 (any other label would keep weight 0).
naiveBayesWeights = np.zeros(len(y_train_us))
naiveBayesWeights[y_train_us == 0] = 2
naiveBayesWeights[y_train_us == 1] = 13

In [19]:
for name, clf in zip(names, classifiers):
    # Underlined model name (ANSI escape codes) followed by a separator line.
    print("\033[4m" +  name + "\033[0m")
    print("------------------------------------------------------")

    # Only Naive Bayes is given per-sample weights at fit time; the other
    # models are fitted on the resampled data without extra arguments.
    fit_kwargs = {'sample_weight': naiveBayesWeights} if name == 'Naive Bayes' else {}
    clf.fit(X_train_us, y_train_us, **fit_kwargs)

    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['absence','presence'])
    print(report)

    conf_m = confusion_matrix(y_test, y_pred)
    print("\033[4m" + "Confusion matrix" + "\033[0m")
    print(conf_m , "\n")

    # Total cost: every confusion-matrix cell weighted by its cost_matrix entry.
    loss = (conf_m * cost_matrix).sum()
    print("loss: %d" %loss)
    print("------------------------------------------------------\n")

[4mRandom Forest[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.94      0.71      0.81        48
    presence       0.69      0.94      0.79        33

    accuracy                           0.80        81
   macro avg       0.82      0.82      0.80        81
weighted avg       0.84      0.80      0.80        81

[4mConfusion matrix[0m
[[34 14]
 [ 2 31]] 

loss: 24
------------------------------------------------------

[4mLinear SVM[0m
------------------------------------------------------
              precision    recall  f1-score   support

     absence       0.94      0.60      0.73        48
    presence       0.62      0.94      0.75        33

    accuracy                           0.74        81
   macro avg       0.78      0.77      0.74        81
weighted avg       0.81      0.74      0.74        81

[4mConfusion matrix[0m
[[29 19]
 [ 2 31]] 

loss: 29
-----------------------------

Although rebalancing on its own seems unnecessary for this dataset, combining rebalancing with class weights gives the best overall result (the lowest loss).