In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [6]:
diebitics_df = pd.read_csv("../datasets/diabetes.csv")

In [7]:
diebitics_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
diebitics_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
diebitics_df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [10]:
diebitics_df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [11]:
X = diebitics_df.drop(columns=['Outcome'],axis=1)
y = diebitics_df.Outcome

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=3)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}
lr = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
lr.fit(X_train_scaled, y_train)
prediction = lr.predict(X_test_scaled)

# Print results
print(f"Best params: {lr.best_params_} and best score: {lr.best_score_}")
print(f"Test Accuracy: {accuracy_score(y_test, prediction):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
print(f"Recall Score: {recall_score(y_test, prediction, average='weighted'):.4f}")
print(f"Precision Score: {precision_score(y_test, prediction, average='weighted'):.4f}")

# Print Classification Report for better understanding
print("Classification Report:\n", classification_report(y_test, prediction))

Best params: {'C': 1} and best score: 0.7801012928162068
Test Accuracy: 0.7338
Confusion Matrix:
 [[78 14]
 [27 35]]
Recall Score: 0.7338
Precision Score: 0.7314
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.85      0.79        92
           1       0.71      0.56      0.63        62

    accuracy                           0.73       154
   macro avg       0.73      0.71      0.71       154
weighted avg       0.73      0.73      0.73       154



In [76]:
param_grid = {
    "max_depth": [3, 5, 10, None], 
    "min_samples_split": [2, 5, 10], 
    "criterion": ["gini", "entropy"]
}

dt = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
dt.fit(X_train_scaled, y_train)
prediction = dt.predict(X_test_scaled)

# Print results
print(f"Best params: {dt.best_params_} and best score: {dt.best_score_}")
print(f"Test Accuracy: {accuracy_score(y_test, prediction):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
print(f"Recall Score: {recall_score(y_test, prediction, average='weighted'):.4f}")
print(f"Precision Score: {precision_score(y_test, prediction, average='weighted'):.4f}")
print("Classification Report:\n", classification_report(y_test, prediction))

Best params: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 5} and best score: 0.7491803278688525
Test Accuracy: 0.6688
Confusion Matrix:
 [[72 20]
 [31 31]]
Recall Score: 0.6688
Precision Score: 0.6623
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.78      0.74        92
           1       0.61      0.50      0.55        62

    accuracy                           0.67       154
   macro avg       0.65      0.64      0.64       154
weighted avg       0.66      0.67      0.66       154



In [77]:
param_grid = {
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 10, 20], 
    "min_samples_split": [2, 5]
}

rf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
rf.fit(X_train_scaled, y_train)
prediction = rf.predict(X_test_scaled)

# Print results
print(f"Best params: {rf.best_params_} and best score: {rf.best_score_}")
print(f"Test Accuracy: {accuracy_score(y_test, prediction):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
print(f"Recall Score: {recall_score(y_test, prediction, average='weighted'):.4f}")
print(f"Precision Score: {precision_score(y_test, prediction, average='weighted'):.4f}")
print("Classification Report:\n", classification_report(y_test, prediction))

Best params: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50} and best score: 0.7768892443022791
Test Accuracy: 0.7208
Confusion Matrix:
 [[80 12]
 [31 31]]
Recall Score: 0.7208
Precision Score: 0.7208
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.87      0.79        92
           1       0.72      0.50      0.59        62

    accuracy                           0.72       154
   macro avg       0.72      0.68      0.69       154
weighted avg       0.72      0.72      0.71       154



In [None]:
model = SVC(kernel='linear', class_weight='balanced')

model.fit(X_train_scaled,y_train)
prediction = model.predict(X_test_scaled)

print(f"Test Accuracy: {accuracy_score(y_test, prediction):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
print(f"Recall Score: {recall_score(y_test, prediction, average='weighted'):.4f}")
print(f"Precision Score: {precision_score(y_test, prediction, average='weighted'):.4f}")
# print("Classification Report:\n", classification_report(y_test, prediction))

NameError: name 'SVC' is not defined

In [232]:
param_grid = {
    "n_neighbors": [3, 5, 7, 10, 15],  
    "weights": ["uniform", "distance"],  
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],  
    "p": [1, 2]  
}


knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Train model
knn.fit(X_train_scaled, y_train)

# Predictions on test set
prediction = knn.predict(X_test_scaled)

# Print results
print(f"Best params: {knn.best_params_} and best score: {knn.best_score_}")
print(f"Test Accuracy: {accuracy_score(y_test, prediction):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
print(f"Recall Score: {recall_score(y_test, prediction, average='weighted'):.4f}")
print(f"Precision Score: {precision_score(y_test, prediction, average='weighted'):.4f}")
print("Classification Report:\n", classification_report(y_test, prediction))

Best params: {'algorithm': 'auto', 'n_neighbors': 15, 'p': 1, 'weights': 'distance'} and best score: 0.7622284419565506
Test Accuracy: 0.7338
Confusion Matrix:
 [[82 10]
 [31 31]]
Recall Score: 0.7338
Precision Score: 0.7379
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.89      0.80        92
           1       0.76      0.50      0.60        62

    accuracy                           0.73       154
   macro avg       0.74      0.70      0.70       154
weighted avg       0.74      0.73      0.72       154



In [81]:
pip install pickel

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickel (from versions: none)
ERROR: No matching distribution found for pickel


In [228]:
#Only saving SVM due to it's high accurecy
import pickle
filename =  'diebitics_model.sav'
pickle.dump(model,open(filename,'wb'))