# Importing the libraries

In [126]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score
from yellowbrick import ROCAUC
from imblearn.over_sampling import SMOTE

# Importing the data 

In [127]:
data = pd.read_csv('Data/cleaned_data.csv')
data = data[['Imaginary Part: Min', 'Imaginary Part: Avg', 'Real Part: Min', 'Real Part: Avg', 'Gender', 'Age', 'Smoking','Diagnosis']]
data.head()

Unnamed: 0,Imaginary Part: Min,Imaginary Part: Avg,Real Part: Min,Real Part: Avg,Gender,Age,Smoking,Diagnosis
0,-320.61,-300.563531,-495.26,-464.171991,1.0,77.0,2.0,1
1,-325.39,-314.75036,-473.73,-469.26314,0.0,72.0,2.0,1
2,-323.0,-317.436056,-476.12,-471.897667,1.0,73.0,3.0,1
3,-327.78,-317.39967,-473.73,-468.856388,1.0,76.0,2.0,1
4,-325.39,-316.155785,-478.52,-472.869783,0.0,65.0,2.0,1


# Feature Scaling

In [128]:
features = ['Imaginary Part: Min', 'Imaginary Part: Avg', 'Real Part: Min', 'Real Part: Avg', 'Gender', 'Age', 'Smoking']
target = 'Diagnosis'
X = data[features]
y = data[target]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=features)
X.head()


Unnamed: 0,Imaginary Part: Min,Imaginary Part: Avg,Real Part: Min,Real Part: Avg,Gender,Age,Smoking
0,-0.200429,0.164022,-0.456438,-0.125737,0.79959,1.239927,0.373718
1,-0.369452,-0.387892,-0.014925,-0.242758,-1.250641,0.991345,0.373718
2,-0.284941,-0.492374,-0.063936,-0.303313,0.79959,1.041062,1.868588
3,-0.453963,-0.490958,-0.014925,-0.233409,0.79959,1.190211,0.373718
4,-0.369452,-0.442567,-0.113153,-0.325658,-1.250641,0.64333,0.373718


# Splling the data

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balancing the classes using smote


In [130]:
smoter = SMOTE()
X_smoted, y_smoted = smoter.fit_resample(X_train,y_train)
X_smoted.head()

Unnamed: 0,Imaginary Part: Min,Imaginary Part: Avg,Real Part: Min,Real Part: Avg,Gender,Age,Smoking
0,2.862128,2.753557,0.574237,0.314936,-1.250641,-1.29561,-1.121153
1,-0.200429,-0.275419,-0.014925,-0.280171,-1.250641,-0.59958,0.373718
2,3.07429,2.986976,0.635758,0.383892,0.79959,1.637658,0.373718
3,-0.200429,-0.044047,0.083098,-0.154806,-1.250641,-0.549863,0.373718
4,-0.538828,-0.568797,-0.063936,-0.240516,0.79959,-0.002983,1.868588


# Modelling

In [131]:
def data_modelling(model):
    print('Cleaned Orignal Data:')
    train_model(model,X_train, X_test, y_train, y_test)
    print('Cleaned Smoted Data:')
    train_model(model,X_smoted, X_test, y_smoted, y_test)

def train_model(model,X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    evaluate(model,y_test,y_pred)

def evaluate(model,y_test,y_pred):
    print(classification_report(y_test,y_pred))

# Logistic Regression

In [136]:
from sklearn.linear_model import LogisticRegression
data_modelling(LogisticRegression(C=10, penalty= 'l2', solver='newton-cg'))

Cleaned Orignal Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.90      0.90      0.90        10
           2       0.62      0.71      0.67         7
           3       0.50      1.00      0.67         1

    accuracy                           0.75        20
   macro avg       0.51      0.65      0.56        20
weighted avg       0.69      0.75      0.72        20

Cleaned Smoted Data:
              precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       0.89      0.80      0.84        10
           2       0.75      0.43      0.55         7
           3       0.25      1.00      0.40         1

    accuracy                           0.65        20
   macro avg       0.56      0.68      0.55        20
weighted avg       0.75      0.65      0.67        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [135]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# define dataset
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.716667 using {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.716667 (0.100277) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.716667 (0.100277) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.710000 (0.110604) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.716667 (0.118556) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.716667 (0.118556) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.686667 (0.099107) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.683333 (0.109798) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.683333 (0.109798) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.673333 (0.099778) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.683333 (0.077817) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.683333 (0.077817) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.683333 (0.077817) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.663333 (0.091226) with

# SVC

In [138]:
from sklearn.svm import SVC
data_modelling(SVC(C = 1 ,gamma = 'scale', kernel='rbf'))

Cleaned Orignal Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       1.00      0.90      0.95        10
           2       0.64      1.00      0.78         7
           3       0.00      0.00      0.00         1

    accuracy                           0.80        20
   macro avg       0.41      0.47      0.43        20
weighted avg       0.72      0.80      0.75        20

Cleaned Smoted Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.70      0.78        10
           2       1.00      0.43      0.60         7
           3       0.17      1.00      0.29         1

    accuracy                           0.55        20
   macro avg       0.51      0.53      0.42        20
weighted avg       0.80      0.55      0.61        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
# example of grid searching key hyperparametres for SVC
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# define dataset
# define model and parameters
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
# define grid search
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.756667 using {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
0.666667 (0.113529) with: {'C': 50, 'gamma': 'scale', 'kernel': 'poly'}
0.703333 (0.098263) with: {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
0.586667 (0.108730) with: {'C': 50, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.666667 (0.104350) with: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
0.673333 (0.103064) with: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.610000 (0.101160) with: {'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.650000 (0.080623) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}
0.756667 (0.071570) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
0.690000 (0.090738) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.433333 (0.090676) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
0.643333 (0.111604) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
0.676667 (0.076085) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.390000 (0.047258) with: {'C': 0.01, 'gamma': 'scale',

# KNN

In [142]:
from sklearn.neighbors import KNeighborsClassifier
# define models and parameters
model = KNeighborsClassifier()
data_modelling(KNeighborsClassifier(metric='manhattan',n_neighbors=17,weights='distance'))
# {'metric': 'manhattan', 'n_neighbors': 17, 'weights': 'distance'}

Cleaned Orignal Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.82      0.90      0.86        10
           2       0.62      0.71      0.67         7
           3       1.00      1.00      1.00         1

    accuracy                           0.75        20
   macro avg       0.61      0.65      0.63        20
weighted avg       0.68      0.75      0.71        20

Cleaned Smoted Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.70      0.78        10
           2       1.00      0.14      0.25         7
           3       0.14      1.00      0.25         1

    accuracy                           0.45        20
   macro avg       0.50      0.46      0.32        20
weighted avg       0.79      0.45      0.49        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [140]:
# example of grid searching key hyperparametres for KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# define models and parameters
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.726667 using {'metric': 'manhattan', 'n_neighbors': 17, 'weights': 'distance'}
0.700000 (0.141421) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.700000 (0.141421) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.663333 (0.116857) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.723333 (0.120231) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.673333 (0.133998) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.703333 (0.122429) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.616667 (0.096896) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.693333 (0.118134) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.660000 (0.105198) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.703333 (0.107961) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.620000

# Bagged Dtree

In [145]:
from sklearn.ensemble import BaggingClassifier
data_modelling(BaggingClassifier(n_estimators=1000))

Cleaned Orignal Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       1.00      0.90      0.95        10
           2       0.75      0.86      0.80         7
           3       0.00      0.00      0.00         1

    accuracy                           0.75        20
   macro avg       0.44      0.44      0.44        20
weighted avg       0.76      0.75      0.75        20

Cleaned Smoted Data:
              precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       1.00      0.80      0.89        10
           2       1.00      0.86      0.92         7
           3       0.33      1.00      0.50         1

    accuracy                           0.80        20
   macro avg       0.67      0.79      0.68        20
weighted avg       0.90      0.80      0.83        20



In [144]:
# example of grid searching key hyperparameters for BaggingClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# define dataset
# define models and parameters
model = BaggingClassifier()
n_estimators = [10, 100, 1000]
# define grid search
grid = dict(n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.756667 using {'n_estimators': 1000}
0.730000 (0.100499) with: {'n_estimators': 10}
0.740000 (0.091652) with: {'n_estimators': 100}
0.756667 (0.080346) with: {'n_estimators': 1000}
