In [202]:
from curses import delay_output
import numpy as np
import pandas as pd
from feature_selection import select_features
from sklearn import tree
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from models import (get_RF, get_DT, get_knn, get_SVC, run_classifier)
from helper_functions import (get_cv_performance_metrics, hr, get_sampler, plot_confusion_matrices, k_fold_cv)
from sklearn.model_selection import GridSearchCV
import pprint 
import sys
import pickle
pp = pprint.PrettyPrinter(indent=4)



#Load the data
data = pd.read_csv("data/drug_consumption.data", header=None)
D_input_cols = data.iloc[:, 1:13]
# We are only concerned with 'Alcohol' Class
D_target = data.iloc[:, 13]
# D_input_cols and D_target together form the dataset D

#Convert C1 (CL0) and C2 (CL1) to non-user (0) and all other classes to user (1)
D_target.loc[(D_target == "CL1") | (D_target == "CL0")] = 0
D_target.loc[D_target != 0] = 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D_target.loc[(D_target == "CL1") | (D_target == "CL0")] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D_target.loc[D_target != 0] = 1


In [203]:
print("Num of + examples: ", np.count_nonzero(D_target))

Num of + examples:  1817


In [204]:
D_input_cols.shape

(1885, 12)

In [209]:
D_input_cols.corr()


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
1,1.0,0.110286,0.158811,0.354241,-0.069753,-0.136654,-0.033849,-0.226778,0.063504,0.183564,-0.190939,-0.332188
2,0.110286,1.0,0.196774,0.216271,-0.001213,0.074646,0.057864,-0.131021,0.219743,0.183831,-0.167492,-0.244277
3,0.158811,0.196774,1.0,0.225311,-0.036099,-0.100993,0.115645,0.057994,0.091088,0.240417,-0.132482,-0.131146
4,0.354241,0.216271,0.225311,1.0,-0.127946,-0.136191,0.109524,-0.341969,0.150921,0.214,-0.231572,-0.345415
5,-0.069753,-0.001213,-0.036099,-0.127946,1.0,0.047642,0.018402,0.084816,-0.038726,-0.029923,0.082411,0.100304
6,-0.136654,0.074646,-0.100993,-0.136191,0.047642,1.0,-0.431051,0.010177,-0.216964,-0.391088,0.174399,0.079988
7,-0.033849,0.057864,0.115645,0.109524,0.018402,-0.431051,1.0,0.245277,0.157336,0.308024,0.114151,0.21013
8,-0.226778,-0.131021,0.057994,-0.341969,0.084816,0.010177,0.245277,1.0,0.038516,-0.056811,0.277512,0.421709
9,0.063504,0.219743,0.091088,0.150921,-0.038726,-0.216964,0.157336,0.038516,1.0,0.247482,-0.22969,-0.208061
10,0.183564,0.183831,0.240417,0.214,-0.029923,-0.391088,0.308024,-0.056811,0.247482,1.0,-0.335133,-0.229038


In [210]:
from sklearn.feature_selection import mutual_info_classif, GenericUnivariateSelect

transformer = GenericUnivariateSelect(mutual_info_classif, mode='k_best', param=11)
cols = D_input_cols.columns
D_input_cols = transformer.fit_transform(D_input_cols, D_target.astype(int))
print("Retained Features: ", transformer.get_feature_names_out(cols))

Retained Features:  [1 3 4 5 6 7 8 9 10 11 12]


In [211]:
#Using Standardization to Scale input features
from sklearn.preprocessing import (StandardScaler, RobustScaler, power_transform, quantile_transform)

#D_input_cols = power_transform(D_input_cols)
D_input_cols = quantile_transform(D_input_cols)
#std_scaler = StandardScaler()
#std_scaler = RobustScaler()

#D_input_cols = std_scaler.fit_transform(D_input_cols)

In [212]:
### Convert to numpy.array
D_target = np.array(D_target).astype('int')
D_input_cols = np.array(D_input_cols)

In [213]:
#########################################################################################################
# TASK 0: Rerun all the algorithms and use 10-fold CV
#########################################################################################################

hr()
print("Running 10-fold CV using all 4 classifiers.")
hr()

param_dict = [
    {"min_samples_leaf": 2},
    {"max_depth": 5, "random_state": 0},
    {},
    {"n_neighbors": 3}]

classifiers = [get_DT, get_RF, get_SVC, get_knn]
names = ["DecisionTree", "RandomForest", "SVC", "k-NN"]


#initialise the object for CV sampler
#initialise the object for CV sampler
cv = KFold(n_splits=10, random_state=1, shuffle=True)
strat_cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
acc= []
strat_acc = []
for i in range(len(classifiers)):
    classifier = classifiers[i](param_dict[i])
    score = k_fold_cv(classifier, None, D_input_cols, D_target, cv, score_name="roc_auc")
    strat_score = k_fold_cv(classifier, None, D_input_cols, D_target, strat_cv, score_name="roc_auc")
    acc.append(score)
    strat_acc.append(strat_score)

for i in range(len(names)):
    hr()
    print(names[i])
    print(f"\nMean Balanced Accuracy for 10 fold CV: {acc[i]}")
    print(f"\nMean Balanced Accuracy for Stratified 10 fold CV: {strat_acc[i]}")
hr()



 #################################################################################################### 

Running 10-fold CV using all 4 classifiers.

 #################################################################################################### 


 #################################################################################################### 

DecisionTree

Mean Balanced Accuracy for 10 fold CV: 0.5572609963377438

Mean Balanced Accuracy for Stratified 10 fold CV: 0.5448292959342683

 #################################################################################################### 

RandomForest

Mean Balanced Accuracy for 10 fold CV: 0.5

Mean Balanced Accuracy for Stratified 10 fold CV: 0.5

 #################################################################################################### 

SVC

Mean Balanced Accuracy for 10 fold CV: 0.5

Mean Balanced Accuracy for Stratified 10 fold CV: 0.5

 ########################################################################

In [None]:
#########################################################################################################
# TASK 1&2: Oversampling and Training
#########################################################################################################
#We will use 4 samplers with 25 different configurations each to get metrics of the 4 algorithms

sampler_names = ["RandomOverSampler", "ADASYN", "SVMSMOTE", "SMOTE"]
s_metrics = {}

params = [
    {"sampling_strategy": [0.3, 0.5, 0.7, 0.9, 1]},
    {"sampling_strategy": [0.3, 0.5, 0.7, 0.9, 1], "n_neighbors": [2, 3, 5, 7, 9]},
    {"sampling_strategy": [0.3, 0.5, 0.7, 0.9, 1], "k_neighbors": [2, 3, 5, 7, 9]},
    {"sampling_strategy": [0.3, 0.5, 0.7, 0.9, 1], "k_neighbors": [2, 3, 5, 7, 9]}
]

# X_train, X_test, y_train, y_test = train_test_split(D_input_cols,\
#     D_target, test_size = 0.33, random_state = 5)

# Change this to introduce test set if needed.
X = D_input_cols
y = D_target

# Change this if stratified KFold is preferred
cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

def generate_metric_matrix():
    #Generate comparision matrix
    #Loop over all possible combinations of Models->Sampler->Parameters
    for i in range(len(names)):
        s_metrics[names[i]] = {}
        classifier = classifiers[i](param_dict[i])
        for name, param in zip(sampler_names, params):
            keys = list(param.keys())
            if name not in s_metrics[names[i]].keys():
                s_metrics[names[i]][name] = {"acc": [], "param": []}

            if len(keys) == 1:
                for val in param[keys[0]]:
                    sampler = get_sampler(name, {keys[0]: val})
                    X_sampled, y_sampled = sampler.fit_resample(X, y)
                    m = get_cv_performance_metrics(classifier, X_sampled, y_sampled, cv)
                    s_metrics[names[i]][name]["acc"].append(["%.4f"%m["Mean Accuracy"], "%.4f"%m["Std Dev"]])
                    s_metrics[names[i]][name]["param"].append({keys[0]: val})

            else:
                for v1 in param[keys[0]]:
                    for v2 in param[keys[1]]:
                        print("-", end=" ")
                        sys.stdout.flush()
                        sampler = get_sampler(name, {keys[0]: v1, keys[1]: v2})
                        X_sampled, y_sampled = sampler.fit_resample(X, y)
                        m = get_cv_performance_metrics(classifier, X_sampled, y_sampled, cv)
                        s_metrics[names[i]][name]["acc"].append(["%.4f"%m["Mean Accuracy"], "%.4f"%m["Std Dev"]])
                        s_metrics[names[i]][name]["param"].append({keys[0]: v1, keys[1]: v2})
                        
    # Save the dictionary to file
    with open('metric_dict.pkl', 'wb') as f:
        pickle.dump(s_metrics, f)

#Run the below command only when metric_matrix is to be generated
#generate_metric_matrix()


In [None]:
############################################## Construct DB1  ################################################

# We know from experiment that RandomOversampler with a sampling_strategy of 0.9 gives the best accuracy on D
sampler = get_sampler("RandomOverSampler", {"sampling_strategy": 0.9})

DB1_X, DB1_y = sampler.fit_resample(X, y)

# Retraining using DB1
hr()
print("Training Models using DB1")
hr()

X_train, X_test, y_train, y_test = train_test_split(DB1_X,\
     DB1_y.astype(int), test_size = 0.33, random_state = 5)

param_dict = [
    {
        "min_samples_leaf": [2, 3, 4, 5],
        "criterion": ["gini", "entropy", "log_loss"],
        "min_samples_split": [2, 3, 5],
        "min_weight_fraction_leaf": [0, 0.3, 0.5],
        "max_features": ["sqrt", "log2"]
        },
    {
        "max_depth": [3],
        "random_state": [0], 
        "criterion": ["gini", "entropy", "log_loss"],
        "min_weight_fraction_leaf": [0.0, 0.3, 0.5],
        "max_features": ["sqrt", "log2"],
        "bootstrap": [True, False]
        },
    {
        "kernel": ("linear", "rbf"),
        },
    {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "leaf_size": [10, 30, 50],
        "p": [1, 2, 3, 4]
        }]

classifiers = [get_DT, get_RF, get_SVC, get_knn]
names = ["DecisionTree", "RandomForest", "SVC", "k-NN"]






In [None]:

# #initialise the object for CV sampler
# acc= []
# std = []
# strat_acc = []
# strat_std = []
# for i in range(len(classifiers)):
#     # Using GridSearch to get the best combination of parameters
#     print("Classifier : ", classifiers[i], "\nParams: ", param_dict[i])
#     classifier = GridSearchCV(classifiers[i](), param_dict[i], verbose=1)
#     classifier.fit(X_train, y_train)
#     print(classifier.best_params_)
#     hr()

In [214]:
import pickle
with open('metric_dict.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

In [91]:
z = o = 0
for i in DB1_y:
    if i == 0:
        z+=1
    else:
        o+=1
print(z, o)

NameError: name 'DB1_y' is not defined

In [215]:
import pickle
pp = pprint.PrettyPrinter(indent=2)

pp.pprint(loaded_dict)
print(loaded_dict)

{ 'DecisionTree': { 'ADASYN': { 'Score': [ 0.5546250508061238,
                                           0.6433819943097141,
                                           0.5982556564151199,
                                           0.5820054870613738,
                                           0.5946186153637718,
                                           0.5862166373120173,
                                           0.6071182766562797,
                                           0.5975555480287225,
                                           0.582125389513616,
                                           0.5734094296165831,
                                           0.5991640699092263,
                                           0.5771765343449397,
                                           0.561319265682157,
                                           0.5888091044573905,
                                           0.5950931445603577,
                                           0.590895881316

In [238]:
top_5 = [0,0,0,0,0,0,0,0,0,0]
top_5_s = [0,0,0,0,0,0,0,0,0,0]

mini = 0
#k = "k-NN"
for k in loaded_dict.keys():
    top_5 = [0,0,0,0,0,0,0,0,0,0]
    top_5_s = [0,0,0,0,0,0,0,0,0,0]
    print(k, "\n")
    for j in loaded_dict[k].keys():

        for i in range(len( loaded_dict[k][j]["Score"])):
             if float(float(loaded_dict[k][j]["Score"][i])) > mini:
                idx = top_5.index(min(top_5))
                mini = top_5[idx]
                top_5[idx] = float(loaded_dict[k][j]["Score"][i])
                top_5_s[idx] = [k, j, loaded_dict[k][j]["param"][i]]
    for i in range(len(top_5)):
        print(top_5[i], top_5_s[i])

DecisionTree 

0.622156211895407 ['DecisionTree', 'SMOTE', {'sampling_strategy': 0.9, 'k_neighbors': 3}]
0.6043862620241159 ['DecisionTree', 'SMOTE', {'sampling_strategy': 1, 'k_neighbors': 9}]
0.6362748950006775 ['DecisionTree', 'SVMSMOTE', {'sampling_strategy': 1, 'k_neighbors': 7}]
0.6087474596938084 ['DecisionTree', 'SVMSMOTE', {'sampling_strategy': 0.9, 'k_neighbors': 3}]
0.6104176263378946 ['DecisionTree', 'SVMSMOTE', {'sampling_strategy': 0.3, 'k_neighbors': 7}]
0.6292497629047553 ['DecisionTree', 'SMOTE', {'sampling_strategy': 0.7, 'k_neighbors': 9}]
0.6433819943097141 ['DecisionTree', 'ADASYN', {'sampling_strategy': 0.3, 'n_neighbors': 3}]
0.6295535835252676 ['DecisionTree', 'SMOTE', {'sampling_strategy': 0.3, 'k_neighbors': 3}]
0.6154623357268663 ['DecisionTree', 'SMOTE', {'sampling_strategy': 0.7, 'k_neighbors': 5}]
0.6128492074244682 ['DecisionTree', 'SVMSMOTE', {'sampling_strategy': 0.5, 'k_neighbors': 5}]
RandomForest 

0.6152980625931446 ['RandomForest', 'RandomOverSampl

In [231]:
for i in range(len(top_5)):
    print(top_5[i], top_5_s[i])

0.6186011380571739 ['k-NN', 'SVMSMOTE', {'sampling_strategy': 0.7, 'k_neighbors': 9}]
0.6249444519712777 ['k-NN', 'SMOTE', {'sampling_strategy': 1, 'k_neighbors': 9}]
0.6181753827394662 ['k-NN', 'ADASYN', {'sampling_strategy': 0.9, 'n_neighbors': 7}]
0.6198658718330848 ['k-NN', 'SMOTE', {'sampling_strategy': 0.9, 'k_neighbors': 7}]
0.6218943910039291 ['k-NN', 'SMOTE', {'sampling_strategy': 0.9, 'k_neighbors': 9}]
0.6265485706543827 ['k-NN', 'SVMSMOTE', {'sampling_strategy': 1, 'k_neighbors': 9}]
0.6269584067199567 ['k-NN', 'SVMSMOTE', {'sampling_strategy': 1, 'k_neighbors': 7}]
0.6172314049586777 ['k-NN', 'SVMSMOTE', {'sampling_strategy': 0.5, 'k_neighbors': 7}]
0.6182119631486249 ['k-NN', 'ADASYN', {'sampling_strategy': 0.3, 'n_neighbors': 7}]
0.6331682698821297 ['k-NN', 'SMOTE', {'sampling_strategy': 0.7, 'k_neighbors': 9}]


In [None]:
# from helper_functions import (get_cv_performance_metrics, hr, get_sampler, plot_confusion_matrices)

# for i in range(len(classifiers)):
#     name = names[i]
#     classifier = classifiers[i](loaded_dict[names[i]])
#     met = run_classifier(classifier, DB1_X, DB1_y)
#     plot_confusion_matrices("algos_on_DB1", met, name)
    

In [None]:
d = run_classifier(classifiers[0](loaded_dict[names[0]]), DB1_X, DB1_y)

In [None]:
d["Accuracy"]

In [None]:
d["Confusion"][0]

In [None]:
import matplotlib.pyplot as plt


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
fig = ConfusionMatrixDisplay(d["Confusion"][0], display_labels=d["Classifier"].classes_)
fig.plot()
fig.ax_.set_title('Day v/s No of Questions on GFG', fontsize=15)

In [None]:
sampler_names = ["RandomUnderSampler", "NeighbourhoodCleaningRule", "NearMiss", "InstanceHardnessThreshold"]
sampler_metrics = {}

param_dict = [
    {"min_samples_leaf": 2},
    {"max_depth": 5, "random_state": 0},
    {},
    {"n_neighbors": 3}]

params = [
    {"sampling_strategy": [0.3, 0.5, 0.7, 0.9, 1], "replacement": [True, False]},
    {"n_neighbors": [2, 3, 5, 7], "kind_sel": ["mode", "all"]},
    {"sampling_strategy": [0.3, 0.5, 0.7, 0.9, 1], "n_neighbors": [2, 3, 5, 7, 9]},
    {"sampling_strategy": [0.3, 0.5, 0.7, 0.9, 1], "cv": [3, 5, 7]}
]

def generate_undersampling_metric_matrix():
    #Generate comparision matrix
    #Loop over all possible combinations of Models->Sampler->Parameters
    for i in range(len(names)):
        sampler_metrics[names[i]] = {}
        classifier = classifiers[i](param_dict[i])
        for name, param in zip(sampler_names, params):
            keys = list(param.keys())
            if name not in sampler_metrics[names[i]].keys():
                sampler_metrics[names[i]][name] = {"acc": [], "param": []}

            for v1 in param[keys[0]]:
                for v2 in param[keys[1]]:
                    print("-", end=" ")
                    sys.stdout.flush()
                    sampler = get_sampler(name, {keys[0]: v1, keys[1]: v2})
                    X_sampled, y_sampled = sampler.fit_resample(X, y)
                    m = get_cv_performance_metrics(classifier, X_sampled, y_sampled, cv)
                    sampler_metrics[names[i]][name]["acc"].append(["%.4f"%m["Mean Accuracy"], "%.4f"%m["Std Dev"]])
                    sampler_metrics[names[i]][name]["param"].append({keys[0]: v1, keys[1]: v2})
                    
    # Save the dictionary to file
    with open('undersampling_metric_dict.pkl', 'wb') as f:
        pickle.dump(sampler_metrics, f)

generate_undersampling_metric_matrix()

In [239]:
with open('undersampling_metric_dict.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)
pp.pprint(loaded_dict)
#print(loaded_dict)

{ 'DecisionTree': { 'InstanceHardnessThreshold': { 'Score': [ 0.6754508196721312,
                                                              0.6854921419861807,
                                                              0.6751270153095786,
                                                              0.6804409971548571,
                                                              0.6707173824685,
                                                              0.6380568351171927,
                                                              0.6608985909768325,
                                                              0.6637501693537461,
                                                              0.6533674298875491,
                                                              0.6859673485977511,
                                                              0.6559636228153367,
                                                              0.6688047012599918,
                   

In [258]:
acc = {}
i = 0
al = "InstanceHardnessThreshold"
for c in loaded_dict.keys():
    
    acc[c] = []
    for a in loaded_dict[c][al]['Score']:
        acc[c].append(a)

In [259]:
mean_scores = []
length = []
for name in acc.keys():
    for i in range(len(acc[name])):
        if len(mean_scores) <= i:
            mean_scores.append(0)
            length.append(0)
        mean_scores[i]+= float(acc[name][i])
        length[i]+=1

In [260]:
for i in range(len(mean_scores)):
    print(mean_scores[i]/length[i])

0.6324479237230728
0.6338363704105134
0.6258532888497493
0.6332259348326785
0.6345935510093483
0.6088789628776587
0.6338859910581223
0.6401768899878065
0.6237174840807479
0.6264007248340333
0.6330807986722666
0.6405440489093619
0.6490804091586506
0.6365480625931446
0.6289738856523507
0.6239944621325024
0.6312346734859775
0.6163389107167051
0.6367252066115703
0.6370166644086167


In [None]:
acc

In [None]:
for i in acc.keys():
    

In [None]:
input_variables = np.array(D_input_cols)

In [None]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0, sampling_strategy = 0.5)


In [None]:
X_resampled, y_resampled = c
from collections import Counter
print(sorted(Counter(y_resampled).items()))

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
classifier

In [None]:
from imblearn.over_sampling import (SMOTE, RandomOverSampler, ADASYN, KMeansSMOTE)


In [None]:
ros = SMOTE(sampling_strategy=0.1, k_neighbors=20)

In [None]:
X_resampled, y_resampled = ros.fit_resample(input_variables, D_target)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

In [None]:
clf = GridSearchCV(classifier, parameters)
clf.fit()

In [None]:
import pickle
with open('metric_dict.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)


In [None]:
pp.pprint(loaded_dict)

In [None]:
top_5 = [0,0,0,0,0]
top_5_s = [0,0,0,0,0]

mini = 0
k = "SVC"
#for k in loaded_dict.keys():
for j in loaded_dict[k].keys():
    for i in range(len( loaded_dict[k][j]["TNR"])):
         if float(float(loaded_dict[k][j]["TNR"][i][0])) > mini:
            idx = top_5.index(min(top_5))
            mini = top_5[idx]
            top_5[idx] = float(loaded_dict[k][j]["TNR"][i][0])
            top_5_s[idx] = [float(loaded_dict[k][j]["TNR"][i][1]), k, j, loaded_dict[k][j]["param"][i]]
            

In [None]:

for i in range(len(top_5)):
    print(top_5[i], " -- ", top_5_s[i])
    

In [None]:

for k in loaded_dict.keys():
    top_5 = [0,0,0,0,0]
    top_5_s = [0,0,0,0,0]
    mini = 0
    for j in loaded_dict[k].keys():
        for i in range(len( loaded_dict[k][j]["TNR"])):
             if float(float(loaded_dict[k][j]["TNR"][i][0])) > mini:
                idx = top_5.index(min(top_5))
                mini = top_5[idx]
                top_5[idx] = float(loaded_dict[k][j]["TNR"][i][0])
                top_5_s[idx] = [float(loaded_dict[k][j]["TNR"][i][1]), k, j, loaded_dict[k][j]["param"][i]]
    print("\n\n\t\t", k)
    for z in range(len(top_5)):
        print(top_5[z], " -- ", top_5_s[z]) 

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.datasets import make_classification
from sklearn.model_selection import KFold

def get_cv_performance_metrics(classifier, X, y, cv):
    """Returns the accuracy of a classifier after n-fold cross-validation

    Args:
        classifier (sklearn.model): Classifier being used for the task
        X (np.array): Input Variables
        y (np.array): Target Variable
        cv (sklearn.model_selection): Sampler
    """
    scores = cross_val_score(classifier, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return (mean(scores), std(scores))
