In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Read the raw dataset from the working directory
diabetes = pd.read_csv("diabetes.csv")

# In these measurement columns a 0 is treated as a missing reading, so it is
# swapped for NaN before imputation
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
diabetes[zero_as_missing] = diabetes[zero_as_missing].replace(0, np.nan)

# Impute every NaN with the median of its own column.
# NOTE(review): the medians are computed over the full frame, i.e. before the
# train/test split performed later in this notebook — confirm that is acceptable.
column_medians = diabetes.median()
diabetes = diabetes.fillna(column_medians)


In [9]:
# Preview the first five rows of the frame after zero-replacement and median fill
preview_rows = diabetes.head(n=5)
print(preview_rows)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0           72.0           35.0    125.0  33.6   
1            1     85.0           66.0           29.0    125.0  26.6   
2            8    183.0           64.0           29.0    125.0  23.3   
3            1     89.0           66.0           23.0     94.0  28.1   
4            0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [10]:
# Standardize the feature columns only. "Outcome" is the binary class label;
# running it through the scaler would turn 0/1 into arbitrary floats and make
# the column unusable as a classification target.
feature_cols = diabetes.columns.drop("Outcome")

# Create a StandardScaler object and fit it to the feature columns
scaler = StandardScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(diabetes[feature_cols]),
    columns=feature_cols,
    index=diabetes.index,  # keep row labels aligned with the source frame
)

# Re-attach the untouched label so diabetes_std keeps the same columns, in the
# same order, as the input frame ("Outcome" is the last column)
diabetes_std = pd.concat([scaled_features, diabetes[["Outcome"]]], axis=1)


In [11]:
# Median imputer: any NaN still present in a column is replaced by that
# column's median
imputer = SimpleImputer(strategy='median')

# Learn the medians and apply them in one pass; the ndarray that comes back is
# wrapped into a DataFrame again under the original column labels
column_labels = diabetes.columns
imputed_values = imputer.fit_transform(diabetes)
diabetes = pd.DataFrame(imputed_values, columns=column_labels)

# Show the leading rows to check the result
print(diabetes.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0    125.0  33.6   
1          1.0     85.0           66.0           29.0    125.0  26.6   
2          8.0    183.0           64.0           29.0    125.0  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          0.0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627  50.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  


In [12]:
# Separate the predictors (every column but the last) from the label (last column)
X = diabetes.iloc[:, :-1]
y = diabetes.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, so no statistics from the test
# rows enter the standardization
scaler = StandardScaler()
scaler.fit(X_train)

# scaler.transform returns a bare ndarray. Rebuild the frames with the original
# column names AND the original row index: without index=, the frames would get
# a fresh 0..n-1 index while y_train / y_test keep the shuffled original labels,
# so any label-based join or comparison between X and y would silently misalign.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Inspect the first few rows of the standardized training set
print(X_train.head())

   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0    -0.526397 -1.256881      -0.018995       0.034298 -0.175620 -0.007450   
1     1.588046 -0.326051       0.808174      -0.560583 -0.175620 -0.599092   
2    -0.828460  0.571536      -2.169636      -1.155463 -0.652193 -0.526941   
3    -1.130523  1.302903      -1.838768       0.034298 -0.175620 -1.508200   
4     0.681856  0.405316       0.642740       0.986106  2.604392  1.998360   

   DiabetesPedigreeFunction       Age  
0                 -0.490735 -1.035940  
1                  2.415030  1.487101  
2                  0.549161 -0.948939  
3                 -0.639291  2.792122  
4                 -0.686829  1.139095  


In [13]:
# Build a random forest with a fixed seed and train it on the training split
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows and compare them with the true labels
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the fraction of test rows classified correctly
print("Accuracy:", accuracy)


Accuracy: 0.7337662337662337
