In [None]:
import sklearn

In [None]:
import pandas as pd
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv("pima-indians-diabetes.csv")

In [None]:
df.head()

Unnamed: 0,Pregancies,Glucose,BloodPressure,Skin Thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## 1. Normal Decision Tree

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Positional split of the frame: the first eight columns are the features,
# the ninth column (index 8) is the target.
X_1, y_1 = df.iloc[:, :8], df.iloc[:, 8]

In [None]:
# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same class ratio. The split is seeded so the accuracies printed
# below are reproducible after a kernel restart.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1, y_1, test_size = 0.2, stratify = y_1, random_state = 10)

# Baseline: a single decision tree with no depth limit.
Decision_Tree = DecisionTreeClassifier(max_depth = None,
                                       random_state = 1)

Decision_Tree.fit(X_train_1, y_train_1)

# Score on both splits; the gap between the two numbers shows how much the
# tree overfits the training data.
y_pred_1 = Decision_Tree.predict(X_test_1)
y_pred_train_1 = Decision_Tree.predict(X_train_1)
train_accuracy = accuracy_score(y_train_1, y_pred_train_1)
test_accuracy = accuracy_score(y_test_1, y_pred_1)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 1.0
Test Accuracy: 0.6493506493506493


## 2. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the random forest classifier on the same split as the
# single decision tree. The forest is seeded so the bootstrap samples and
# feature subsets (and therefore the printed accuracies) are reproducible.
rf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0)
rf.fit(X_train_1, y_train_1)

# Predict on the test set and on the training set
y_prediction = rf.predict(X_test_1)
y_prediction_train = rf.predict(X_train_1)

# Check accuracy (note: this reuses the train_accuracy / test_accuracy names
# from the decision-tree cell)
train_accuracy = accuracy_score(y_train_1, y_prediction_train)
test_accuracy = accuracy_score(y_test_1, y_prediction)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Train Accuracy: 1.0
Test Accuracy: 0.7337662337662337


## 3. Neural Network

In [None]:
import torch
from torch import nn

In [None]:
# Convert the whole frame (features and label column together) to a float32
# tensor; it is sliced into X / y in a later cell.
np_array = df.to_numpy()
data_tensor = torch.tensor(np_array, dtype=torch.float32)

In [None]:
data_tensor.shape

torch.Size([768, 9])

In [None]:
# Columns 0-7 are the features, column 8 is the binary label.
X_tensor = data_tensor[:,0:8]
y_tensor = data_tensor[:, 8]
# Stratified 80/20 split, seeded so the network is trained and evaluated on
# the same rows every run.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_tensor, y_tensor, test_size = 0.2, stratify = y_tensor, random_state = 10)

In [None]:
class Linear_Regression_model(nn.Module):
  """Small fully connected binary classifier (despite the "regression" name).

  Maps 8 input features through two ReLU hidden layers of ``hidden_units``
  units each to a single sigmoid-activated output.
  """

  def __init__(self, hidden_units : int):
    super().__init__()
    # Layers are created in forward order and wrapped in one Sequential.
    layers = [
        nn.Linear(8, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
        nn.Sigmoid(),
    ]
    self.linear_layer = nn.Sequential(*layers)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Return one sigmoid output per input row, i.e. shape (N, 1) for (N, 8) input."""
    return self.linear_layer(x)


# Seed before construction so the initial weights are identical on every run.
torch.manual_seed(42)
model_1 = Linear_Regression_model(hidden_units = 32)

In [None]:
# model_1 already ends in nn.Sigmoid(), so its outputs are probabilities.
# nn.BCEWithLogitsLoss applies its own sigmoid and would squash those outputs
# a second time (consistent with the loss stalling at ~0.693 = ln 2 in the
# training log). nn.BCELoss is the variant that takes probabilities directly.
loss_fn = nn.BCELoss()
# Plain full-batch SGD over all model parameters.
optimizer = torch.optim.SGD(params = model_1.parameters(),
                            lr = 0.01)

In [None]:
epochs = 1000
log_every = 100  # report metrics once per this many epochs

for epoch in range(epochs):
  # --- one full-batch gradient step on the training split ---
  model_1.train()
  optimizer.zero_grad()
  y_pred = model_1(X_train_2)
  # Targets are reshaped from (N,) to (N, 1) to match the model output.
  loss = loss_fn(y_pred, y_train_2.unsqueeze(1))
  loss.backward()
  optimizer.step()

  # --- periodic evaluation ---
  model_1.eval()
  if epoch % log_every == 0:
    # Accuracy is only needed when it is reported, so it is computed here
    # rather than on every epoch. Model outputs are rounded to 0/1 labels.
    # The training accuracy uses the predictions made before this epoch's
    # optimizer step.
    train_accuracy = accuracy_score(y_train_2, torch.round(y_pred).detach().numpy())
    with torch.inference_mode():
      y_pred_test = model_1(X_test_2)
      test_accuracy = accuracy_score(y_test_2, torch.round(y_pred_test))
      test_loss = loss_fn(y_pred_test, y_test_2.unsqueeze(1))
    print(f"Epoch = {epoch} | Train loss = {loss} | Test loss = {test_loss} | Train Accuracy = {train_accuracy} | Test Accuracy = {test_accuracy}")

Epoch = 0 | Train loss = 0.7052377462387085 | Test loss = 0.7026256918907166 | Train Accuracy = 0.6514657980456026 | Test Accuracy = 0.6493506493506493
Epoch = 100 | Train loss = 0.6936607956886292 | Test loss = 0.6935781836509705 | Train Accuracy = 0.6514657980456026 | Test Accuracy = 0.6493506493506493
Epoch = 200 | Train loss = 0.6934172511100769 | Test loss = 0.6933648586273193 | Train Accuracy = 0.6514657980456026 | Test Accuracy = 0.6493506493506493
Epoch = 300 | Train loss = 0.693333089351654 | Test loss = 0.6932920813560486 | Train Accuracy = 0.6514657980456026 | Test Accuracy = 0.6493506493506493
Epoch = 400 | Train loss = 0.6932901740074158 | Test loss = 0.693255603313446 | Train Accuracy = 0.6514657980456026 | Test Accuracy = 0.6493506493506493
Epoch = 500 | Train loss = 0.6932641267776489 | Test loss = 0.6932335495948792 | Train Accuracy = 0.6514657980456026 | Test Accuracy = 0.6493506493506493
Epoch = 600 | Train loss = 0.6932465434074402 | Test loss = 0.6932188868522644 |

## 4. Feature Scaling, Cross-Validation and Bagging




In [None]:
df.describe()

Unnamed: 0,Pregancies,Glucose,BloodPressure,Skin Thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
df.Outcome.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [None]:
# Separate the label column from the feature columns, selecting by name.
y = df["Outcome"]
X = df.drop(columns = "Outcome")

In [None]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikit-learn-1.5.1


In [None]:
# Standardise every feature column to zero mean and unit standard deviation.
# NOTE(review): the mean and std are computed over all rows, i.e. before the
# train/test split, so the held-out rows influence the scaling.
X_scaled = X.sub(X.mean()).div(X.std())
X_scaled.shape

(768, 8)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified, seeded split of the standardised features. No test_size is
# given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify = y,
    random_state = 10,
)

In [None]:
X_train.shape

(576, 8)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of a single decision tree on the
# standardised features. The tree is seeded so the fold scores are
# reproducible from run to run.
scores = cross_val_score(DecisionTreeClassifier(random_state = 0), X_scaled, y, cv = 5)
scores

array([0.68181818, 0.69480519, 0.70779221, 0.78431373, 0.75163399])

In [None]:
scores.mean()

0.724072659366777

In [None]:
## Bagging: an ensemble of decision trees, each fit on a random 80% sample
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

bag_model = BaggingClassifier(estimator = DecisionTreeClassifier(), n_estimators = 200,
                              max_samples = 0.8, oob_score = True, random_state = 0)
bag_model.fit(X_train, y_train)

# Out-of-bag accuracy: each training row is scored only by the trees that did
# not see it during fitting.
bag_model.oob_score_

0.7673611111111112

In [None]:
# Held-out accuracy of the bagging ensemble. Arguments follow scikit-learn's
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
y_pred = bag_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7916666666666666

In [None]:
# 5-fold cross-validated accuracy of the bagging ensemble.
# Note: this call uses the unscaled X, whereas the single-tree
# cross-validation earlier in this section used X_scaled.
scores = cross_val_score(bag_model, X, y, cv = 5)

In [None]:
scores.mean()

0.7604957134368899

## 5. Trying to Implement XGBoost Classifier:

In [None]:
# Import the class rather than the module, so that binding the fitted model to
# the name `xgboost` below does not overwrite a module reference. The variable
# name is kept because the following cells call `xgboost.predict(...)` and pass
# `xgboost` to `cross_val_score`.
from xgboost import XGBClassifier

# Many shallow boosting rounds with a small learning rate.
xgboost = XGBClassifier(learning_rate = 0.01, n_estimators = 500)

xgboost.fit(X_train, y_train)

In [None]:
# Held-out accuracy of the boosted model, with arguments in scikit-learn's
# (y_true, y_pred) order; the result is the same either way round.
y_pred = xgboost.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred)
accuracy_test

0.78125

In [None]:
# 5-fold cross-validated accuracy of the XGBoost model on the unscaled
# features. `xgboost` here is the classifier instance bound in the cell above,
# not the module.
scores = cross_val_score(xgboost, X, y, cv = 5)
scores

array([0.75974026, 0.73376623, 0.74675325, 0.80392157, 0.75163399])

In [None]:
scores.mean()

0.7591630591630592

## 6. Trying To Implement a VotingClassifier:

In [None]:
from sklearn.model_selection import train_test_split

# Same stratified, seeded split as in section 4. This cell needs `X_scaled`
# and `y` from that section, so run those cells first on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify = y,
    random_state = 10,
)

NameError: name 'X_scaled' is not defined

In [None]:
#Trying to implement Voting/Stacking

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base learners for the voting ensemble. The two tree-based models are seeded
# so that the ensemble's accuracy is reproducible from run to run.
m1 = KNeighborsClassifier()
m2 = LogisticRegression()
m3 = RandomForestClassifier(n_estimators=1000, random_state=0)
m4 = GaussianNB()
m5 = DecisionTreeClassifier(random_state=0)

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the five base learners; no `voting` argument is passed, so the
# library default voting scheme is used.
model = VotingClassifier(
    estimators = [
        ("knn", m1),
        ("lr", m2),
        ("rf", m3),
        ("gn", m4),
        ("dt", m5),
    ]
)
model.fit(X_train, y_train)

In [None]:
# Accuracy of the voting ensemble on the training and held-out splits, with
# arguments in (y_true, y_pred) order; accuracy is symmetric in its arguments.
y_pred_2 = model.predict(X_train)
y_pred = model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_pred_2)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy}, Test Accuracy : {test_accuracy}")

Train Accuracy : 0.8871527777777778, Test Accuracy : 0.8020833333333334


In [None]:
#We seem to be getting the best accuracy from the voting classifier that was coded above

## 7. Trying to Implement Boosting:

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with the library's default base estimator and only 9 boosting
# rounds; seeded for reproducibility.
# NOTE(review): the name `abc` is also the name of a standard-library module.
abc = AdaBoostClassifier(n_estimators = 9, random_state = 0)
abc.fit(X_train, y_train)

In [None]:
# Accuracy of AdaBoost on the training and held-out splits, with arguments in
# (y_true, y_pred) order; accuracy is symmetric in its arguments.
y_pred_2 = abc.predict(X_train)
y_pred = abc.predict(X_test)
train_accuracy = accuracy_score(y_train, y_pred_2)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy}, Test Accuracy : {test_accuracy}")

Train Accuracy : 0.7795138888888888, Test Accuracy : 0.8020833333333334


In [None]:
#Adaboost and VotingClassifier are giving us the highest accuracies here.

In [None]:
pip install --upgrade xgboost



In [None]:
# Re-import the module: the name `xgboost` was bound to a fitted classifier
# in section 5.
import xgboost

# A deliberately small booster: 4 trees of depth at most 5.
xgboost_model = xgboost.XGBClassifier(n_estimators=4, max_depth=5)
xgboost_model.fit(X_train, y_train, verbose=True)


In [None]:
# Accuracy of the small XGBoost model on the training and held-out splits,
# with arguments in (y_true, y_pred) order; accuracy is symmetric.
y_pred_2 = xgboost_model.predict(X_train)
y_pred = xgboost_model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_pred_2)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Train Accuracy : {train_accuracy}, Test Accuracy : {test_accuracy}")

Train Accuracy : 0.8645833333333334, Test Accuracy : 0.7864583333333334


In [None]:
0.7864583333333334
# Test accuracy so far, on the single stratified split used in sections 6-7:
# xgboost (0.786) < AdaBoost (0.802) = VotingClassifier (0.802)

0.7864583333333334