<a href="https://colab.research.google.com/github/WhyAvya/vitalis/blob/main/Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diabetes


### Load Data

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

url1 = "https://raw.githubusercontent.com/WhyAvya/vitalis/f816162997dd8f385f10fd7a9dd6f92099bdca83/diabetes_data_no_heartRate.csv"
url2 = "https://raw.githubusercontent.com/WhyAvya/vitalis/refs/heads/main/diabetes.csv"

df1 = pd.read_csv(url1)
df2 = pd.read_csv(url2)

### Data Inspec


In [33]:
print("Dataset 1 columns:", df1.columns.tolist())
print("Dataset 2 columns:", df2.columns.tolist())

Dataset 1 columns: ['Age', 'Sex', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Stroke', 'HighBP', 'Diabetes']
Dataset 2 columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'diabetes_outcome']


### Data Prep


In [34]:
features = ['Age', 'BMI', 'Glucose', 'BloodPressure', 'PhysActivity', 'Pregnancies']

# Check if HighBP exists in df1 and BloodPressure in df2
if 'HighBP' in df1.columns and 'BloodPressure' in df2.columns:
    bp_median = df2['BloodPressure'].median()
    df1['BloodPressure'] = df1['HighBP'].map({1: bp_median + 10, 0: bp_median - 5})

# Check if PhysActivity exists
if 'PhysActivity' in df1.columns:
    df1['PhysActivity'] = df1['PhysActivity'].map({'Yes': 1, 'No': 0}).fillna(0)

# Select only relevant columns
df1 = df1[[col for col in features if col in df1.columns]]
df2 = df2[[col for col in features if col in df2.columns]]

## Model Train


In [36]:
# 1. Check for target column (case-insensitive)
possible_targets = ['diabetes_outcome', 'diabetes', 'target']
target_column = next((col for col in df2.columns if col.lower() in possible_targets), None)

if not target_column:
    # Show exactly what columns are available
    print("Error: Target column not found. Available columns:")
    print(df2.columns.tolist())
    raise ValueError("Could not find 'target' or similar target column")

# 2. Use the found target column name (keeping original case)
train_data = df2.copy()
train_data['Risk'] = train_data[target_column]  # Use actual column name

# 3. Select only features that exist in the dataframe
available_features = [col for col in features if col in df2.columns]

# 4. Original modeling code (now with validated columns)
imputer = SimpleImputer(strategy='median')
train_data[available_features] = imputer.fit_transform(train_data[available_features])

X = train_data[available_features]
y = train_data['Risk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

print(f"Model trained using target column: '{target_column}'")
print(f"Features used: {available_features}")

Model trained using target column: 'diabetes_outcome'
Features used: ['pregnancies', 'glucose', 'bloodpressure', 'bmi', 'age']


## Model Evaluation

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("\n=== Model Evaluation ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred):.2f}")
print(f"Recall: {recall_score(y_test, y_pred):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.2f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


=== Model Evaluation ===
Accuracy: 0.71
Precision: 0.59
Recall: 0.64
F1 Score: 0.61
ROC AUC: 0.81

Confusion Matrix:
[[75 24]
 [20 35]]


## Prediction


In [40]:
def predict_diabetes_risk(pregnancies, glucose, bloodpressure, bmi, age):
    input_data = [[pregnancies, glucose, bloodpressure, bmi, age]]
    input_imputed = imputer.transform(input_data)
    risk_prob = model.predict_proba(input_imputed)[0][1]

    return {
        'risk_percentage': f"{risk_prob*100:.1f}%",
        'risk_class': 'High Risk' if risk_prob > 0.5 else 'Low Risk',
        'features_used': ['pregnancies', 'glucose', 'bloodpressure', 'bmi', 'age']
    }

## Example


In [41]:
test_case = {
    'pregnancies': 2,
    'glucose': 148,
    'bloodpressure': 72,
    'bmi': 33.6,
    'age': 50
}

result = predict_diabetes_risk(**test_case)
print("\n=== Example Prediction ===")
print(f"Input Features: {test_case}")
print(f"Diabetes Risk: {result['risk_percentage']}")
print(f"Classification: {result['risk_class']}")
print(f"Features Used: {result['features_used']}")


=== Example Prediction ===
Input Features: {'pregnancies': 2, 'glucose': 148, 'bloodpressure': 72, 'bmi': 33.6, 'age': 50}
Diabetes Risk: 65.6%
Classification: High Risk
Features Used: ['pregnancies', 'glucose', 'bloodpressure', 'bmi', 'age']


