## Diabetes Prediction Model


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm

In [4]:
df = pd.read_csv('diabetes.csv')
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [69]:
X=df.drop('Outcome',axis=1)
y = df['Outcome']
y.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [74]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

In [75]:
rds=RandomOverSampler(random_state=42)
X,y = rds.fit_resample(X,y)
print(Counter(y))

Counter({1: 500, 0: 500})


#### Standardization

In [177]:
scaler = StandardScaler()
Standard_data = scaler.fit_transform(X)
X = Standard_data

In [178]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [59]:
svc = svm.SVC(kernel='linear')
svc.fit(X_train,y_train)

In [60]:
y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)

In [61]:
print(accuracy_score(y_train,y_train_pred)*100)
print(accuracy_score(y_test,y_test_pred)*100)

75.6
74.8


### xgboost

In [179]:
from xgboost  import XGBClassifier,plot_importance

In [191]:
model = XGBClassifier(
    eval_metric='logloss',
    random_state=42,
    reg_alpha=0,           # L1 regularization
    reg_lambda=0.5,          # L2 regularization
    max_depth=3,           # Controls tree depth
    n_estimators=100,      # Number of trees
    learning_rate=0.1,     # Step size shrinkage
    subsample=0.8,         # Randomly sample training data (to prevent overfitting)
    colsample_bytree=0.6   # Randomly sample features (also prevents overfitting)
)
model.fit(X_train,y_train)

In [183]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Define the parameter grid
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.5, 1, 2, 5],
    'reg_lambda': [0.5, 1, 2, 3, 5]
}

# Create base model
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)

# Set up randomized search
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,           # Try 30 random combinations
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
    verbose=1,
    random_state=42
)

# Fit
random_search.fit(X, y)

# Best results
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_ * 100)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alpha': 0, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
Best Accuracy: 84.00000000000001


In [226]:


model = XGBClassifier(eval_metric='logloss',
    random_state=22,    
    reg_alpha=2.5,
    reg_lambda=4,
    max_depth=6,           # Controls tree depth
    n_estimators=100,      
    learning_rate=0.1,     
    subsample=0.8,         
    colsample_bytree=0.8)  
model.fit(X_train,y_train)


In [227]:
y_test_pred=model.predict(X_test)
y_train_pred=model.predict(X_train)
print(accuracy_score(y_train,y_train_pred)*100)
print(accuracy_score(y_test,y_test_pred)*100)

93.06666666666666
80.0


In [228]:
import pickle

pickle.dump(model,open ('Diabetesmodel.pkl','wb'))


In [230]:
model.predict([[5,116,74,0,0,25.6,0.201,30]])
model.predict([[10,115,0,0,0,35.3,0.134,29]])

array([1])