# Diabetes Prediction using Machine Learning

This project trains a Random Forest classifier to predict whether a person has diabetes from the medical measurements recorded in the Pima Indians Diabetes Dataset. The dataset's `Outcome` column is the prediction target; all other columns are used as features.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
# Silences every warning for the rest of the session (no category or module
# filter is given). NOTE(review): this also hides deprecation and other
# library warnings; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset into the DataFrame `df` used by the following cells.
# NOTE(review): `load_diabetes` is imported here but never called in this
# cell; the data actually comes from the CSV read below.
from sklearn.datasets import load_diabetes
import pandas as pd

# Instead of sklearn's regression dataset, use the Pima Indians Diabetes Dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
# Read directly from the URL, so this cell needs network access when it runs.
df = pd.read_csv(url)
# Trailing expression: the notebook displays the first rows as the cell result.
df.head()

In [None]:
# Overview of the frame: column names, dtypes and non-null counts
# (info() prints its report itself).
df.info()

# Per-column count of missing (NaN/None) cells; as the trailing expression it
# is shown as the cell result.
# NOTE(review): this only detects real NaN/None values. If the CSV encodes
# missing measurements as 0 (commonly reported for this dataset - confirm),
# they will not be counted here.
df.isna().sum()

In [None]:
# Feature and target split: "Outcome" is the label, every other column is a
# feature.
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Train-test split on the raw features. The split is done BEFORE scaling so
# that the held-out rows cannot influence the scaler's statistics (fitting
# the scaler on the full dataset leaks test information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: learn the scaling parameters from the training
# rows only, then apply that same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so the name `X_scaled` stays defined for any later cell that uses it;
# it now holds all rows scaled with the training-set parameters.
X_scaled = scaler.transform(X)



In [None]:
# Fit a Random Forest on the training split (fixed seed for reproducibility);
# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)

# Compute the evaluation metrics against the true test labels
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_mat)
print("Classification Report:\n", report)

In [None]:
# Feature importance: pair each importance score from the fitted forest with
# its feature name and plot them from most to least important.
# The labels come from X.columns (the feature frame built by dropping
# "Outcome") instead of df.columns[:-1], so they stay correct even if
# "Outcome" is not the last column of the CSV.
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False).plot(kind='bar', title='Feature Importance')
plt.show()