In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression



### Dataset 1

In [96]:
path = "D:\Document\Programming\Software\Python\Projects\Project-3\Dataset\Diabetes_cleaned1.csv"
data = pd.read_csv(path)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8.0,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,3.845052,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [97]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [98]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [99]:
import pickle

def scaler_standar(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    file = open(r"D:\Document\Programming\Software\Python\Projects\Project-3\Model\Standarscaler.pkl",'wb')
    pickle.dump(scaler,file)
    file.close()
    
    return X_train_scaled ,X_test_scaled

In [100]:
X_train_scaled,X_test_scaled = scaler_standar(X_train,X_test)

In [101]:
### Model Initialization
log_reg = LogisticRegression()

In [102]:
### Hyperparameter Tuning
## GridSearchCV
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

parameters = {'penalty':['l1','l2'],
              'C': np.logspace(-3,3.7),
              'solver': ['newton-cg','lbfgs','liblinear']
              }
scoring = {
    'recall': 'recall',
    'accuracy': 'accuracy',
    'precision': 'precision',
    'f1': 'f1',
    'roc_auc': 'roc_auc' 
            }
clf = GridSearchCV(estimator=log_reg,
                   param_grid=parameters,
                   cv = 5,
                   scoring=scoring,
                   refit='roc_auc'
)

In [103]:
clf.fit(X_train_scaled,y_train)

In [104]:
clf.best_params_

{'C': 2.620398528858349, 'penalty': 'l2', 'solver': 'newton-cg'}

In [83]:
clf.best_score_

0.8478080071104859

In [105]:
log_reg = LogisticRegression(C=2.620398528858349,penalty='l2',solver='newton-cg')
log_reg.fit(X_train_scaled,y_train)

In [120]:
y_pred = log_reg.predict(X_test_scaled)

In [107]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cm = confusion_matrix(y_test,y_pred)
cm

array([[83, 16],
       [20, 35]], dtype=int64)

In [108]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7662337662337663

In [109]:
TP = cm[0][0]
TN = cm[1][1]
FP = cm[0][1]
FN = cm[1][0]

In [110]:
precision = TP/(TP+FP)
precision

0.8383838383838383

In [111]:
recall = TP/(TP+FN)
recall

0.8058252427184466

In [112]:
F1_SCORE = (2*precision*recall)/(precision+recall)
F1_SCORE

0.8217821782178217

In [121]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.7662337662337663
              precision    recall  f1-score   support

           0       0.81      0.84      0.82        99
           1       0.69      0.64      0.66        55

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.76      0.77      0.76       154



In [119]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=200,learning_rate=0.3)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[79 20]
 [19 36]]
0.7467532467532467
              precision    recall  f1-score   support

           0       0.81      0.80      0.80        99
           1       0.64      0.65      0.65        55

    accuracy                           0.75       154
   macro avg       0.72      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



In [114]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100,max_depth=20),
    'Logistic Regressor': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}
from sklearn.metrics import accuracy_score

def evaluation_model(X_train,X_test,y_train,y_test,models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(X_train,y_train)
        
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test,y_pred)
        
        report[list(models.keys())[i]] = accuracy
        
    return report
evaluation_model(X_train,X_test,y_train,y_test,models)


{'Random Forest': 0.7597402597402597,
 'Logistic Regressor': 0.7727272727272727,
 'Decision Tree': 0.7207792207792207}

In [116]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

pipeline = Pipeline([('classifier', RandomForestClassifier())])

param_grid = [
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 50, 100],
        'classifier__max_depth': [None, 10, 20],
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
    }
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Best model and parameters
print(f"Best Model: {grid_search.best_estimator_}")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-validation Accuracy: {grid_search.best_score_:.2f}")

# Test set performance
test_accuracy = grid_search.score(X_test, y_test)
print(f"Test Set Accuracy: {test_accuracy:.2f}")


Best Model: Pipeline(steps=[('classifier',
                 RandomForestClassifier(max_depth=10, n_estimators=50))])
Best Parameters: {'classifier': RandomForestClassifier(), 'classifier__max_depth': 10, 'classifier__n_estimators': 50}
Best Cross-validation Accuracy: 0.67
Test Set Accuracy: 0.65


In [117]:
import pickle
file = open(r"D:\Document\Programming\Software\Python\Projects\Project-3\Model\Classifier.pkl",'wb')
pickle.dump(log_reg,file)
file.close()

# Dataset 2: IQR + SMOTE

In [37]:
path2 = "D:\Document\Programming\Software\Python\Projects\Project-3\Dataset\Diabetes_cleaned2.csv"
data2 = pd.read_csv(path2)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8.0,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,3.845052,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [38]:
X = data2.iloc[:,:-1]
y = data2.iloc[:,-1]

In [39]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [40]:
### Model Initialization
log_reg = LogisticRegression()

In [41]:
### Hyperparameter Tuning
## GridSearchCV
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

parameters = {'penalty':['l1','l2'],
              'C': np.logspace(-3,3.7),
              'solver': ['newton-cg','lbfgs','liblinear']
              }
scoring = {
    'recall': 'recall',
    'accuracy': 'accuracy',
    'precision': 'precision',
    'f1': 'f1',
    'roc_auc': 'roc_auc' 
            }
clf = GridSearchCV(estimator=log_reg,
                   param_grid=parameters,
                   cv = 5,
                   scoring=scoring,
                   refit='f1'
)

In [42]:
clf.fit(X_train,y_train)
clf.best_params_

{'C': 1.912637440861444, 'penalty': 'l1', 'solver': 'liblinear'}

In [43]:
log_reg = LogisticRegression(C= 1.912637440861444,penalty='l1',solver='liblinear')
log_reg.fit(X_train,y_train)
y_pred = log_reg.predict(X_test)

In [44]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

cm = confusion_matrix(y_test,y_pred)
print(f"confusion matrix:\n{cm}\n")

accuracy = accuracy_score(y_test,y_pred)
print(f"accuracy score:{accuracy}")

TP = cm[0][0]
TN = cm[1][1]
FP = cm[0][1]
FN = cm[1][0]

precision = TP/(TP+FP)
print(f"Precision:{precision}")
recall = TP/(TP+FN)
print(f"Recall:{recall}")
F1_SCORE = (2*precision*recall)/(precision+recall)
print(f"F1_SCORE:{F1_SCORE}")

confusion matrix:
[[75 24]
 [30 71]]

accuracy score:0.73
Precision:0.7575757575757576
Recall:0.7142857142857143
F1_SCORE:0.7352941176470589


In [45]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=200,learning_rate=0.3)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[70 29]
 [21 80]]
0.75
              precision    recall  f1-score   support

           0       0.77      0.71      0.74        99
           1       0.73      0.79      0.76       101

    accuracy                           0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.75      0.75       200



In [46]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

pipeline = Pipeline([('classifier', RandomForestClassifier())])

param_grid = [
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 50, 100],
        'classifier__max_depth': [None, 10, 20],
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
    }
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Best model and parameters
print(f"Best Model: {grid_search.best_estimator_}")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-validation Accuracy: {grid_search.best_score_:.2f}")

# Test set performance
test_accuracy = grid_search.score(X_test, y_test)
print(f"Test Set Accuracy: {test_accuracy:.2f}")


Best Model: Pipeline(steps=[('classifier', RandomForestClassifier())])
Best Parameters: {'classifier': RandomForestClassifier(), 'classifier__max_depth': None, 'classifier__n_estimators': 100}
Best Cross-validation Accuracy: 0.83
Test Set Accuracy: 0.80


In [None]:
import pickle
file = open(r"D:\Document\Programming\Software\Python\Projects\Project-3\Model\Classifier.pkl",'wb')
pickle.dump(log_reg,file)
file.close()

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regressor': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}
from sklearn.metrics import accuracy_score

def evaluation_model(X_train,X_test,y_train,y_test,models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(X_train,y_train)
        
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test,y_pred)
        
        report[list(models.keys())[i]] = accuracy
        
    return report

In [48]:
evaluation_model(X_train,X_test,y_train,y_test,models)

{'Random Forest': 0.78, 'Logistic Regressor': 0.73, 'Decision Tree': 0.72}

# Dataset 3: IQR

In [36]:
path3 = "D:\Document\Programming\Software\Python\Projects\Project-3\Dataset\Diabetes_cleaned2.csv"
data3 = pd.read_csv(path3)

X = data3.iloc[:,:-1]
y = data3.iloc[:,-1]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)
### Model Initialization
log_reg = LogisticRegression()
### Hyperparameter Tuning
## GridSearchCV
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

parameters = {'penalty':['l1','l2'],
              'C': np.logspace(-3,3.7),
              'solver': ['newton-cg','lbfgs','liblinear']
              }
scoring = {
    'recall': 'recall',
    'accuracy': 'accuracy',
    'precision': 'precision',
    'f1': 'f1',
    'roc_auc': 'roc_auc' 
            }
clf = GridSearchCV(estimator=log_reg,
                   param_grid=parameters,
                   cv = 5,
                   scoring=scoring,
                   refit='f1'
)
clf.fit(X_train,y_train)
clf.best_params_
log_reg = LogisticRegression(C= 0.7437527275659046,penalty='l2',solver='newton-cg')
log_reg.fit(X_train,y_train)
y_pred = log_reg.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

cm = confusion_matrix(y_test,y_pred)
print(f"confusion matrix:\n{cm}\n")

accuracy = accuracy_score(y_test,y_pred)
print(f"accuracy score:{accuracy}")

TP = cm[0][0]
TN = cm[1][1]
FP = cm[0][1]
FN = cm[1][0]

precision = TP/(TP+FP)
print(f"Precision:{precision}")
recall = TP/(TP+FN)
print(f"Recall:{recall}")
F1_SCORE = (2*precision*recall)/(precision+recall)
print(f"F1_SCORE:{F1_SCORE}")


confusion matrix:
[[75 24]
 [30 71]]

accuracy score:0.73
Precision:0.7575757575757576
Recall:0.7142857142857143
F1_SCORE:0.7352941176470589


In [49]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=200,learning_rate=0.3)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[70 29]
 [21 80]]
0.75
              precision    recall  f1-score   support

           0       0.77      0.71      0.74        99
           1       0.73      0.79      0.76       101

    accuracy                           0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.75      0.75       200



In [139]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
models = {
    'Random Forest': RandomForestClassifier(max_depth=10,n_estimators=100),
    'Logistic Regressor': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}
from sklearn.metrics import accuracy_score

def evaluation_model(X_train,X_test,y_train,y_test,models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(X_train,y_train)
        
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test,y_pred)
        
        report[list(models.keys())[i]] = accuracy
        
    return report
evaluation_model(X_train,X_test,y_train,y_test,models)

{'Random Forest': 0.7662337662337663,
 'Logistic Regressor': 0.7727272727272727,
 'Decision Tree': 0.7337662337662337}

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

pipeline = Pipeline([('classifier', RandomForestClassifier())])

param_grid = [
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 50, 100],
        'classifier__max_depth': [None, 10, 20],
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
    }
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Best model and parameters
print(f"Best Model: {grid_search.best_estimator_}")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-validation Accuracy: {grid_search.best_score_:.2f}")

# Test set performance
test_accuracy = grid_search.score(X_test, y_test)
print(f"Test Set Accuracy: {test_accuracy:.2f}")


Best Model: Pipeline(steps=[('classifier', RandomForestClassifier())])
Best Parameters: {'classifier': RandomForestClassifier(), 'classifier__max_depth': None, 'classifier__n_estimators': 100}
Best Cross-validation Accuracy: 0.82
Test Set Accuracy: 0.80
