In [None]:
import os

import kagglehub
import pandas as pd

# Download the latest version of the dataset; `path` is the local directory
# that holds the downloaded files.
path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")

print("Path to dataset files:", path)

# Build the CSV location with os.path.join instead of string concatenation so
# the separator is handled correctly (no doubled "/" if `path` already ends
# with one). The file inside the dataset directory is expected to be named
# 'diabetes.csv'.
csv_path = os.path.join(path, "diabetes.csv")

# Read the CSV file
df = pd.read_csv(csv_path)

# Assign the column names positionally. This assumes the CSV has exactly these
# nine columns in this order; pandas raises if the number of columns differs.
df.columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
              "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]

# Display first 10 rows
print(df.head(10))


Path to dataset files: /kaggle/input/pima-indians-diabetes-database
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   
5            5      116             74              0        0  25.6   
6            3       78             50             32       88  31.0   
7           10      115              0              0        0  35.3   
8            2      197             70             45      543  30.5   
9            8      125             96              0        0   0.0   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2      

In [None]:
# Per-column data-quality report.
# `null_count` is the number of NaN entries. `zero_count` is shown next to it
# because NaN counts alone can hide placeholder values: a 0 in Glucose,
# BloodPressure, SkinThickness, Insulin or BMI is not a plausible measurement
# and most likely marks a missing reading. Zeros in Pregnancies and Outcome
# are legitimate values.
missing_report = pd.DataFrame({
    "null_count": df.isnull().sum(),
    "zero_count": df.eq(0).sum(),
})
missing_report

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Count how many people fall into each Outcome class (with vs. without diabetes).
df["Outcome"].value_counts()


Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [None]:
# Separate the feature matrix from the target column.
from sklearn.preprocessing import StandardScaler

X = df.drop(columns="Outcome")
y = df["Outcome"]

# Standardise every feature to zero mean and unit variance so that the columns
# end up on comparable scales.
# NOTE(review): the scaler is fit on all rows here, train and test alike. A
# model evaluated on a held-out split should use a scaler fit on the training
# rows only, otherwise test-set statistics leak into the training features.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Peek at the first five standardised rows.
X_scaled[:5]

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575,  0.53090156, -0.69289057,
        -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, -1.28821221, -0.69289057,
        -1.10325546,  0.60439732, -0.10558415],
       [-0.84488505, -0.99820778, -0.16054575,  0.15453319,  0.12330164,
        -0.49404308, -0.92076261, -1.04154944],
       [-1.14185152,  0.5040552 , -1.50468724,  0.90726993,  0.76583594,
         1.4097456 ,  5.4849091 , -0.0204964 ]])

In [None]:
# Split the raw features into train and test sets first, then standardise.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# stratify=y keeps the Outcome class ratio the same in both partitions;
# random_state pins the shuffle so the split is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=10
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows. Fitting on the full dataset before splitting would leak test-set
# means and variances into the training features.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)
X_train.shape

(576, 8)

In [None]:
# Size of the held-out test partition as (rows, features).
n_test_rows, n_test_features = X_test.shape
(n_test_rows, n_test_features)

(192, 8)

In [None]:
# Count how many people in the training labels fall into each Outcome class
# (with vs. without diabetes).
train_class_counts = y_train.value_counts()
train_class_counts


Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,375
1,201


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Baseline: 5-fold cross-validation of a single decision tree on the full
# dataset. The per-fold scores are kept in `scores`; their mean is taken in
# the next cell.
# random_state is fixed so the tree's internal randomness does not change the
# scores from one run to the next.
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=5)
scores

array([0.67532468, 0.66233766, 0.69480519, 0.77777778, 0.7254902 ])

In [None]:
# Average of the per-fold cross-validation scores currently held in `scores`.
mean_cv_score = scores.mean()
mean_cv_score


np.float64(0.7071471012647483)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagged ensemble of 100 decision trees. Each tree is trained on a sample
# drawn from 80% of the training rows; oob_score=True asks for an
# out-of-bag accuracy estimate to be computed during fitting.
base_tree = DecisionTreeClassifier()
bag_model = BaggingClassifier(
    estimator=base_tree,
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0,
)

# Fit on the training partition, then report the out-of-bag score.
bag_model.fit(X_train, y_train)
oob_accuracy = bag_model.oob_score_
print("OOB Score:", oob_accuracy)


OOB Score: 0.7534722222222222


In [None]:
# Mean accuracy of the fitted bagging ensemble on the held-out test partition.
test_accuracy = bag_model.score(X_test, y_test)
test_accuracy


0.7760416666666666

In [None]:
# 5-fold cross-validation of the bagged-tree ensemble on the full dataset.
# A separate name is used for this estimator: cross_val_score fits clones and
# leaves the object passed in unfitted, so rebinding `bag_model` here would
# replace the already-fitted model and break any later `bag_model.score(...)`
# or `bag_model.predict(...)` call.
# oob_score is not requested because cross_val_score never reads the
# out-of-bag estimate.
bag_cv_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    random_state=0,
)
scores = cross_val_score(bag_cv_model, X, y, cv=5)
scores

array([0.75324675, 0.72727273, 0.74675325, 0.82352941, 0.73856209])

In [None]:
# Average of the per-fold cross-validation scores currently held in `scores`.
mean_cv_score = scores.mean()
mean_cv_score


np.float64(0.7578728461081402)

In [None]:
# Train using Random Forest.
# Like bagging, a random forest trains each tree on a resampled set of rows;
# the difference is that it also picks a random subset of features when
# growing the trees.
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so the row/feature sampling, and therefore the
# reported mean score, is the same on every run.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=50, random_state=0), X, y, cv=5
)
scores.mean()

np.float64(0.7552669552669553)