In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('diabetes.csv')

# Replace zero values with median for relevant columns
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    df[col] = df[col].replace(0, df[col].median())

# Features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save preprocessed data
pd.DataFrame(X_scaled, columns=X.columns).to_csv('cleaned_diabetes.csv', index=False)

# Verify scaling
print("Mean of scaled features:", X_scaled.mean(axis=0))
print("Standard deviation of scaled features:", X_scaled.std(axis=0))
print("First 5 rows of scaled data:")
print(pd.DataFrame(X_scaled, columns=X.columns).head())

Mean of scaled features: [-6.47630098e-17  4.62592927e-18  5.78241159e-18 -1.27213055e-16
  2.60208521e-17  4.67218856e-16  2.45174251e-16  1.93132547e-16]
Standard deviation of scaled features: [1. 1. 1. 1. 1. 1. 1. 1.]
First 5 rows of scaled data:
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.639947  0.866045      -0.031990       0.831114 -0.608201  0.167240   
1    -0.844885 -1.205066      -0.528319       0.180566 -0.608201 -0.851551   
2     1.233880  2.016662      -0.693761      -0.469981 -0.608201 -1.331838   
3    -0.844885 -1.073567      -0.528319      -0.469981 -0.006185 -0.633239   
4    -1.141852  0.504422      -2.679076       0.831114  0.695378  1.549885   

   DiabetesPedigreeFunction       Age  
0                  0.468492  1.425995  
1                 -0.365061 -0.190672  
2                  0.604397 -0.105584  
3                 -0.920763 -1.041549  
4                  5.484909 -0.020496  


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")

# Save model
joblib.dump(model, 'diabetes_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

Model accuracy: 0.77


['scaler.pkl']

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")

# Save model
joblib.dump(model, 'diabetes_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Model and scaler saved successfully.")

Model accuracy: 0.77
Model and scaler saved successfully.
