#STEP 1

In [16]:
import pickle
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#STEP 2

In [18]:
df = pd.read_csv('/content/drive/MyDrive/ml_assinments/assin_4/diabetes.csv')

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [19]:
X.head()

Unnamed: 0,Glucose,Insulin,BMI,Age
0,148,0,33.6,50
1,85,0,26.6,31
2,183,0,23.3,32
3,89,94,28.1,21
4,137,168,43.1,33


In [20]:
std_scaler = StandardScaler()
std_X = std_scaler.fit_transform(X)
print(X)

     Glucose  Insulin   BMI  Age
0        148        0  33.6   50
1         85        0  26.6   31
2        183        0  23.3   32
3         89       94  28.1   21
4        137      168  43.1   33
..       ...      ...   ...  ...
763      101      180  32.9   63
764      122        0  36.8   27
765      121      112  26.2   30
766      126        0  30.1   47
767       93        0  30.4   23

[768 rows x 4 columns]


StandardScaler is preferred over MinMaxScaler because it standardizes data to zero mean and unit variance, which aligns better with the assumptions of models like Perceptron and GaussianNB. This scaling helps maintain a Gaussian distribution, making it more effective for models sensitive to feature magnitudes.

In [21]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#STEP 3

In [22]:
def evaluation_metrics(y_test, y_pred):
  return{
      'accuracy': accuracy_score(y_test, y_pred),
      'precision': precision_score(y_test, y_pred, average='macro'),
      'recall': recall_score(y_test, y_pred, average='macro'),
      'f1_score': f1_score(y_test, y_pred, average='macro')
  }

Naive Bayes Model

In [23]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred_nb = nb_model.predict(X_test)

print(evaluation_metrics(y_test, y_pred_nb))

{'accuracy': 0.7467532467532467, 'precision': 0.7240740740740741, 'recall': 0.7222222222222222, 'f1_score': 0.723110967682449}


Perceptron Model

In [24]:
per_model = Perceptron()
per_model.fit(X_train, y_train)

y_pred_per = per_model.predict(X_test)

print(evaluation_metrics(y_test, y_pred_per))

{'accuracy': 0.42857142857142855, 'precision': 0.6015457277801631, 'recall': 0.5434343434343434, 'f1_score': 0.3915229885057471}


Gaussian Naive Bayes:

Strengths: With a higher accuracy (0.75) and F1-score (0.72), Gaussian Naive Bayes consistently outperforms in identifying correct cases overall. High precision and recall values (both around 0.72) indicate that it balances minimizing false positives and capturing true positives well.
Weaknesses: Despite strong performance, it could still miss a few true positive cases, given its recall isnâ€™t perfect.
Perceptron:

Strengths: The Perceptron model has moderate recall (0.54), meaning it captures some true positive cases effectively.
Weaknesses: With lower precision (0.55) and F1-score (0.53), Perceptron is less reliable in distinguishing between diabetic and non-diabetic cases, producing more false positives and lower overall accuracy.

Custom Perceptron Model

In [25]:

class Custom_Perceptron_Model:
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        X = np.array(X, dtype=np.float64)
        y = np.array(y, dtype=np.float64)

        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Convert labels to -1 and 1
        y_ = np.where(y <= 0, -1, 1)

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                linear_output = np.dot(x_i, self.weights) + self.bias
                y_predicted = self.sign(linear_output)

                if y_predicted != y_[idx]:
                    self.weights += self.lr * y_[idx] * x_i
                    self.bias += self.lr * y_[idx]

    def predict(self, X):
        X = np.array(X, dtype=np.float64)
        linear_output = np.dot(X, self.weights) + self.bias
        y_predicted = self.sign(linear_output)
        return np.where(y_predicted <= 0, -1, 1)

    def sign(self, x):
        return np.where(x >= 0, 1, -1)

In [26]:
cust_per_model = Custom_Perceptron_Model()
cust_per_model.fit(X_train, y_train)

y_pred_cust_per = cust_per_model.predict(X_test)

print(evaluation_metrics(y_test, y_pred_cust_per))

{'accuracy': 0.13636363636363635, 'precision': 0.2333333333333333, 'recall': 0.1272727272727273, 'f1_score': 0.16470588235294117}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#STEP 6

In [27]:
with open('/content/drive/MyDrive/ml_assinments/assin_4/nb_model.pkl', 'wb') as f:
  pickle.dump(nb_model, f)

with open('/content/drive/MyDrive/ml_assinments/assin_4/per_model.pkl', 'wb') as f:
  pickle.dump(per_model, f)

with open('/content/drive/MyDrive/ml_assinments/assin_4/custom_per_model.pkl', 'wb') as f:
  pickle.dump(cust_per_model, f)

#STEP 9

In [28]:

# Define number of splits for k-fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Function to evaluate model with cross-validation
def evaluate_model(model, X, y, kf):
    accuracy_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
    precision_scores = cross_val_score(model, X, y, cv=kf, scoring='precision')
    recall_scores = cross_val_score(model, X, y, cv=kf, scoring='recall')
    f1_scores = cross_val_score(model, X, y, cv=kf, scoring='f1')

    print(f'Accuracy: {np.mean(accuracy_scores):.2f}')
    print(f'Precision: {np.mean(precision_scores):.2f}')
    print(f'Recall: {np.mean(recall_scores):.2f}')
    print(f'F1 Score: {np.mean(f1_scores):.2f}\n')

    return {
        'accuracy': np.mean(accuracy_scores),
        'precision': np.mean(precision_scores),
        'recall': np.mean(recall_scores),
        'f1_score': np.mean(f1_scores)
    }

# Evaluate Naive Bayes model
print("Naive Bayes Model Performance:")
naive_bayes_results = evaluate_model(nb_model, X, y, kf)

# Evaluate Perceptron model
print("Perceptron Model Performance:")
perceptron_results = evaluate_model(per_model, X, y, kf)

Naive Bayes Model Performance:
Accuracy: 0.75
Precision: 0.69
Recall: 0.55
F1 Score: 0.61

Perceptron Model Performance:
Accuracy: 0.46
Precision: 0.39
Recall: 0.61
F1 Score: 0.38



The Naive Bayes model outperforms the Perceptron model with an accuracy of 75% compared to 46%, indicating it makes more correct predictions overall. Naive Bayes also has a better precision (0.69 vs. 0.39), suggesting it produces fewer false positives. However, the Perceptron model achieves a slightly higher recall (0.61 vs. 0.55), indicating it captures more true diabetic cases but at the cost of increased false positives. Overall, Naive Bayes is the preferred model due to its balanced accuracy, precision, and F1 score.