In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler

In [65]:
# Read the diabetes dataset from a CSV file in the working directory.
data = pd.read_csv('diabetes.csv')

# Rule line printed above and below every section title.
banner = '#######################'

# Preview: the first five rows of the frame.
print(banner, 'First five rows', banner, sep='\n')
print(data.head())

# Per-column count of null entries.
print(banner, 'Columns and missing values', banner, sep='\n')
null_counts = data.isnull().sum()
print(null_counts)

# Three dotted lines as a visual separator between the two reports.
print('\n'.join(['.......'] * 3))

# Per-column count of entries that are exactly zero.
print(banner, 'Columns and zero values', banner, sep='\n')
is_zero = data == 0
zero_counts = is_zero.sum()
print(zero_counts)

#######################
First five rows
#######################
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
#######################
Columns and missing values
#######################
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI           

In [66]:
# Columns in which this notebook treats a 0 as a missing measurement.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Turn those zeros into NaN so they are picked up by the fill step below.
data[zero_as_missing] = data[zero_as_missing].replace(to_replace=0, value=np.nan)

# Fill every NaN with the median of its own column (modifies `data` in place).
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed later in the notebook.
column_medians = data.median()
data.fillna(column_medians, inplace=True)

In [67]:
# Separate the label from the predictors: `y` is the 'Outcome' column and
# `X` is every remaining column of `data`.
y = data['Outcome']
X = data.drop(columns='Outcome')

In [68]:
# Split the data into train and test sets *before* any scaling statistics are
# computed, so the held-out rows cannot influence the preprocessing.
# 20% of the rows go to the test set; the fixed random_state keeps the
# partition the same on every run.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize the features: learn the scaling parameters from the training rows
# only, then apply that same fitted transform to the test rows.
# `X` itself is intentionally left unscaled (and not overwritten), which keeps
# this cell safe to re-run; call `scaler.transform(X)` if a scaled copy of
# every row is needed.
# Any metrics recorded further down were produced before this change; re-run
# the following cells to refresh them.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [92]:
# Random Forest settings: 100 estimators, maximum depth 5, fixed random seed.
forest_params = dict(n_estimators=100, max_depth=5, random_state=42)

# Build the classifier and fit it on the training split. The fit call is the
# last expression of the cell, so the estimator is displayed as cell output.
rf = RandomForestClassifier(**forest_params)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, random_state=42)

In [93]:
# Predict labels for the test split with the fitted forest.
y_pred = rf.predict(X_test)

# Both metrics compare the same (true labels, predicted labels) pair.
truth_and_prediction = (y_test, y_pred)
confusion = confusion_matrix(*truth_and_prediction)
accuracy = accuracy_score(*truth_and_prediction)

# Report the overall accuracy followed by the confusion matrix.
print('Accuracy:', accuracy)
print("Confusion Matrix:\n", confusion)

Accuracy: 0.7532467532467533
Confusion Matrix:
 [[82 17]
 [21 34]]
