# Importing the libraries

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score
from yellowbrick import ROCAUC
from imblearn.over_sampling import SMOTE

# Importing the data 

In [17]:
data = pd.read_csv('imbalanced_data.csv')
data.drop('ID',axis = 1, inplace=True)
data = data[['Imaginary Part: Min', 'Imaginary Part: Avg', 'Real Part: Min', 'Real Part: Avg', 'Gender', 'Age', 'Smoking','Diagnosis']]
data.head()

Unnamed: 0,Imaginary Part: Min,Imaginary Part: Avg,Real Part: Min,Real Part: Avg,Gender,Age,Smoking,Diagnosis
0,-320.61,-300.563531,-495.26,-464.171991,1.0,77.0,2.0,1
1,-325.39,-314.75036,-473.73,-469.26314,0.0,72.0,2.0,1
2,-323.0,-317.436056,-476.12,-471.897667,1.0,73.0,3.0,1
3,-327.78,-317.39967,-473.73,-468.856388,1.0,76.0,2.0,1
4,-325.39,-316.155785,-478.52,-472.869783,0.0,65.0,2.0,1


In [18]:
smoted_data = pd.read_csv('cleaned_data.csv')
smoted_data.head()

Unnamed: 0,Imaginary Part: Min,Imaginary Part: Avg,Real Part: Min,Real Part: Avg,Gender,Age,Smoking,Diagnosis
0,-320.61,-300.563531,-495.26,-464.171991,1.0,77.0,2.0,COPD
1,-325.39,-314.75036,-473.73,-469.26314,0.0,72.0,2.0,COPD
2,-323.0,-317.436056,-476.12,-471.897667,1.0,73.0,3.0,COPD
3,-327.78,-317.39967,-473.73,-468.856388,1.0,76.0,2.0,COPD
4,-325.39,-316.155785,-478.52,-472.869783,0.0,65.0,2.0,COPD


# Splling the data

In [19]:
features = ['Imaginary Part: Min', 'Imaginary Part: Avg', 'Real Part: Min', 'Real Part: Avg', 'Gender', 'Age', 'Smoking']
target = 'Diagnosis'

X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Xs = smoted_data[features]
# ys = smoted_data[target]
# Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, ys, test_size=0.2, random_state=42)

In [26]:
# Balancing the classes using smote
smoter = SMOTE()
X_smoted, y_smoted = smoter.fit_resample(X_train,y_train)
smoted_data = pd.concat([pd.DataFrame(X_smoted, columns=X.columns), pd.Series(y_smoted, name='Diagnosis')], axis=1)

NameError: name 'SMOTE' is not defined

# Feature Scaling

In [20]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X,y)
scaled_smoted_data = scaler.fit_transform(Xs,ys)

# Modelling

In [23]:
def data_modelling(model):
    print('Cleaned Orignal Data:')
    train_model(model,X_train, X_test, y_train, y_test)
    print('Cleaned Smoted Data:')
    train_model(model,Xs_train, Xs_test, ys_train, ys_test)

def train_model(model,X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    evaluate(model,y_test,y_pred)

def evaluate(model,y_test,y_pred):
    print(classification_report(y_test,y_pred))

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
data_modelling(LogisticRegression())

Cleaned Orignal Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.82      0.90      0.86        10
           2       0.43      0.43      0.43         7
           3       0.00      0.00      0.00         1

    accuracy                           0.60        20
   macro avg       0.31      0.33      0.32        20
weighted avg       0.56      0.60      0.58        20

Cleaned Smoted Data:
              precision    recall  f1-score   support

      Asthma       0.57      0.57      0.57         7
        COPD       0.83      1.00      0.91        10
          HC       1.00      0.17      0.29         6
    Infected       0.50      0.67      0.57         9

    accuracy                           0.66        32
   macro avg       0.73      0.60      0.58        32
weighted avg       0.71      0.66      0.62        32



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# SVC

In [25]:
from sklearn.svm import SVC
data_modelling(SVC())


Cleaned Orignal Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00        10
           2       0.35      1.00      0.52         7
           3       0.00      0.00      0.00         1

    accuracy                           0.35        20
   macro avg       0.09      0.25      0.13        20
weighted avg       0.12      0.35      0.18        20

Cleaned Smoted Data:
              precision    recall  f1-score   support

      Asthma       0.00      0.00      0.00         7
        COPD       0.00      0.00      0.00        10
          HC       0.19      1.00      0.32         6
    Infected       0.00      0.00      0.00         9

    accuracy                           0.19        32
   macro avg       0.05      0.25      0.08        32
weighted avg       0.04      0.19      0.06        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# KNN

# Gaussian Naive Bayes