In [49]:
import import_ipynb
from spider import spider
import pandas as pd

In [50]:
def load_dataset(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        data_line_index = next((i for i, line in enumerate(lines) if line.startswith('@data')), None)
    df = pd.read_csv(file_path, delimiter=',', skiprows=range(data_line_index))
    df = df.reset_index()
    df['@data'] = df['@data'].map({' positive': 1, ' negative': 0})
    df.rename(columns={'@data': 'labels'}, inplace=True)
    X = df.drop('labels', axis=1).values
    y = df['labels'].values
    return X, y


In [51]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (recall_score, accuracy_score, balanced_accuracy_score, 
                             precision_score, f1_score, confusion_matrix)


def claculate_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    recall = recall_score(y_true, y_pred)
    specificity = tn / (tn+fp)
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    geometric_mean = np.sqrt(tp*tn)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    metrics = {
        'recall': recall,
        'specificity': specificity,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_accuracy,
        'geometric_mean': geometric_mean,
        'precision': precision,
        'f1': f1,
    }
    return metrics

In [52]:
def test_models(X, y):
    classifiers = [MultinomialNB(), KNeighborsClassifier(), DecisionTreeClassifier()]
    kf = KFold(n_splits=5)

    for classifier in classifiers:
        fold_metrics = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            classifier.fit(X_train, y_train)
            try: 
                y_pred = classifier.predict(X_test)
                fold_metrics.append(claculate_metrics(y_test, y_pred))
            except:
                continue
        print(f"{classifier.__class__.__name__}")
        avg_metrics = {metric: np.mean([fold[metric] for fold in fold_metrics]) for metric in fold_metrics[0]}
        for metric, value in avg_metrics.items():
            print(f"{metric}: {value}")
        print()
    print("-----------------------------------")

In [53]:
smote = SMOTE()
datasets = ['datasets/glass1.dat']
for dataset in datasets:
    X, y = load_dataset(dataset)
    print(dataset)
    print("Bez preprocessingu: ")
    test_models(X, y)
    print("Preprocessing SPIDER weak amplification:")
    new_X_spider, new_y_spider = spider(X, y, amplification_type='week_amplification')
    test_models(new_X_spider, new_y_spider)

    print("Preprocessing SPIDER weak amplification and relabeling:")
    new_X_spider, new_y_spider = spider(X, y, amplification_type='week_amplification_and_relabeling')
    test_models(new_X_spider, new_y_spider)

    print("Preprocessing SPIDER strong amplifiaction:")
    new_X_spider, new_y_spider = spider(X, y, amplification_type='strong_amplification')
    test_models(new_X_spider, new_y_spider)

    print("Preprocessing SMOTE")
    new_X_smote, new_y_smote = smote.fit_resample(X, y)
    test_models(new_X_smote, new_y_smote)



datasets/glass1.dat
Bez preprocessingu: 
MultinomialNB
recall: 0.0125
specificity: nan
accuracy: 0.2894795127353267
balanced_accuracy: 0.2471241990137339
geometric_mean: 1.0198039027185568
precision: 0.1
f1: 0.02222222222222222

KNeighborsClassifier
recall: 0.3006326949384405
specificity: nan
accuracy: 0.6016611295681064
balanced_accuracy: 0.5849994077336759
geometric_mean: 5.757021148285399
precision: 0.5022222222222222
f1: 0.36753278688524593

DecisionTreeClassifier
recall: 0.3403214774281806
specificity: nan
accuracy: 0.5310077519379844
balanced_accuracy: 0.5276211508256652
geometric_mean: 5.364490712499103
precision: 0.41637426900584795
f1: 0.3600081699346405

-----------------------------------
Preprocessing SPIDER weak amplification:
MultinomialNB
recall: 0.12240896358543418
specificity: 0.453234693877551
accuracy: 0.2226938775510204
balanced_accuracy: 0.3224748899559824
geometric_mean: 1.3843251395565515
precision: 0.21428571428571427
f1: 0.12503293807641633

KNeighborsClassifie

  _warn_prf(average, modifier, msg_start, len(result))
  specificity = tn / (tn+fp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  specificity = tn / (tn+fp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  specificity = tn / (tn+fp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  specificity = tn / (tn+fp)
  specificity = tn / (tn+fp)
  specificity = tn / (tn+fp)


MultinomialNB
recall: 0.9785164835164835
specificity: nan
accuracy: 0.632973005255614
balanced_accuracy: 0.6179212454212455
geometric_mean: 3.5
precision: 0.6462393803098451
f1: 0.7429788810410176

KNeighborsClassifier
recall: 0.8993859082094374
specificity: nan
accuracy: 0.7452938365981845
balanced_accuracy: 0.7830470999588648
geometric_mean: 15.884240319942368
precision: 0.7875
f1: 0.7985857323461192

DecisionTreeClassifier
recall: 1.0
specificity: 0.47420634920634913
accuracy: 0.6956521739130435
balanced_accuracy: 0.7371031746031745
geometric_mean: 27.661564342710676
precision: 0.6455429566787775
f1: 0.7526831227496862

-----------------------------------
Preprocessing SPIDER strong amplifiaction:
MultinomialNB
recall: 0.9785164835164835
specificity: nan
accuracy: 0.632973005255614
balanced_accuracy: 0.6179212454212455
geometric_mean: 3.5
precision: 0.6462393803098451
f1: 0.7429788810410176

KNeighborsClassifier
recall: 0.8993859082094374
specificity: nan
accuracy: 0.745293836598184

  specificity = tn / (tn+fp)
  specificity = tn / (tn+fp)
  specificity = tn / (tn+fp)
  _warn_prf(average, modifier, msg_start, len(result))
  specificity = tn / (tn+fp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  specificity = tn / (tn+fp)
  _warn_prf(average, modifier, msg_start, len(result))
  specificity = tn / (tn+fp)
