In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from snowflake.snowpark.context import get_active_session
# Obtain the Snowpark session via get_active_session(); the table reads, table
# writes and stage upload later in this notebook all go through it.
session = get_active_session()


In [None]:
# Reference the "diabetes_raw" table through the Snowpark session.
df_snow = session.table("diabetes_raw")
# Materialise it as a pandas DataFrame; all cleaning and modelling below use `df`.
df = df_snow.to_pandas()
# Preview the first rows.
df.head()

In [None]:
# Summary of column dtypes and non-null counts.
df.info()

In [None]:
# Show the exact column labels as a plain Python list.
print(list(df.columns))

#### The dataset has no NaN values, but numeric columns such as SkinThickness and Insulin contain zeros, which are not physiologically possible. I replace these zeros with the column median, since the median is robust to outliers.

In [None]:
# Remove leading/trailing whitespace from every column label so that the
# name-based lookups in later cells match.
df.columns = df.columns.str.strip()

In [None]:
# Columns whose zero entries get replaced by the column median.
cols = [
    'GLUCOSE',
    'BLOODPRESSURE',
    'SKINTHICKNESS',
    'INSULIN',
    'BMI',
    'DIABETESPEDIGREEFUNCTION',
]

In [None]:
# Replace the impossible zero readings in each listed column with that
# column's median.
# The median is taken over the non-zero readings only: the zeros stand in for
# missing measurements, so including them drags the median towards 0 and makes
# it exactly 0 (turning the replacement into a no-op) once half or more of a
# column is zero.
# NOTE(review): the medians are computed on the full frame, before the
# train/test split further down; consider estimating them on the training
# split only to avoid leaking test information.
for col in cols:
    valid = df.loc[df[col] != 0, col]
    if valid.empty:
        # No non-zero reading to estimate a median from; leave the column as is.
        continue
    median_val = valid.median()
    df[col] = df[col].replace(0, median_val)

In [None]:
def bmi_cat(bmi):
    """Map a BMI value to a weight-category label.

    Returns "Underweight" below 18.5, "Normal" below 25, "Overweight"
    below 30 and "Obese" for everything else.
    """
    # Upper bounds are exclusive and are checked in ascending order.
    for upper, label in ((18.5, "Underweight"), (25, "Normal"), (30, "Overweight")):
        if bmi < upper:
            return label
    return "Obese"

def age_group(age):
    """Bucket an age into "Young" (< 30), "Adult" (< 50) or "Senior" (otherwise)."""
    if age < 30:
        return "Young"
    return "Adult" if age < 50 else "Senior"

In [None]:
# Derive the two categorical helper columns by applying the bucketing
# functions element-wise to the numeric BMI and AGE columns.
df["BMI_Category"] = df["BMI"].map(bmi_cat)
df["Age_Group"] = df["AGE"].map(age_group)

In [None]:
# Number of rows with OUTCOME == 1 per category, ordered by category label.
_positive_rows = df['OUTCOME'] == 1
count_bmi = df.loc[_positive_rows, 'BMI_Category'].value_counts().sort_index()
count_age = df.loc[_positive_rows, 'Age_Group'].value_counts().sort_index()

In [None]:
# Bar chart of the OUTCOME == 1 counts per BMI category.
fig, ax = plt.subplots(figsize=(4, 3))
count_bmi.plot(kind='bar', color='yellow', ax=ax)
ax.set_title("Diabetes Cases by BMI Category")
ax.set_xlabel("BMI Category")
ax.set_ylabel("Number of Diabetes Cases")
fig.tight_layout()

plt.show()

In [None]:
# Bar chart of the OUTCOME == 1 counts per age group.
fig, ax = plt.subplots(figsize=(4, 3))
count_age.plot(kind='bar', color='yellow', ax=ax)
ax.set_title("Diabetes Cases by Age Group")
ax.set_xlabel("Age Group")
ax.set_ylabel("Number of Diabetes Cases")
fig.tight_layout()

plt.show()

## Insights
* Replaced invalid zero values with the median since these features cannot realistically be zero.

* Created two new features: BMI_Category and Age_Group to understand patterns better.

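* Analysis shows that Obese individuals account for the largest number of diabetes cases. (The charts show raw counts of positive cases per category, not rates, so they also reflect how many people fall into each category.)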

* Adults (30–49 years) are the most affected age group.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrix: every column except the target and the two derived
# categorical columns (missing names are ignored by drop). Target: OUTCOME.
_non_feature_cols = ['OUTCOME', 'BMI_Category', 'Age_Group']
X = df.drop(columns=_non_feature_cols, errors='ignore')
y = df['OUTCOME']

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# stratify=y keeps the share of each OUTCOME class the same in both splits, so
# the recall / false-negative figures computed on the test split are not
# distorted by an unlucky class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Unfitted standardiser for the feature matrix; it is fitted on the training
# split in the next cell.
scaler=StandardScaler()
# Trailing expression so the notebook displays the estimator.
scaler

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, recall_score , accuracy_score

In [None]:
# Candidate classifiers, keyed by the display name used when printing results.
# random_state is pinned on the randomised ensemble models so that a
# "Restart & Run All" reproduces the same metrics (the train/test split is
# already seeded with 42).
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases
    # and may only produce a warning there; confirm against the installed version.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [None]:
# Fit every candidate on the scaled training data and record, per model,
# [name, false negatives, recall, accuracy] measured on the scaled test data.
results = []
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    # Row 1 / column 0 of the confusion matrix: actual class 1 predicted as 0.
    fn = confusion_matrix(y_test, y_pred)[1][0]
    results.append([name, fn, recall_score(y_test, y_pred), accuracy_score(y_test, y_pred)])

for model_name, false_neg, rec, acc in results:
    print(f"{model_name} -> FN: {false_neg}, Recall: {rec:.4f} ,Accuracy: {acc:.4f}")

In [None]:
# Discard the shuffled row labels left over from the split so that features
# and target are both indexed 0..n-1.
X_train, y_train = (part.reset_index(drop=True) for part in (X_train, y_train))

In [None]:
# `train` was never created in any earlier cell, so this cell raised NameError
# on a fresh kernel. Build it here from the training features plus the target.
# NOTE(review): the target column is named "LABEL" to match `label_col`
# defined further down; confirm this is the intended feature-store schema.
train = X_train.copy()
# Assign positionally so the result does not depend on the two indexes matching.
train["LABEL"] = y_train.to_numpy()
# Upper-case every column name before the frame is uploaded to Snowflake.
train.columns = [c.upper() for c in train.columns]

In [None]:
# Turn the pandas training frame into a Snowpark DataFrame via the active session.
train_df = session.create_dataframe(train)

In [None]:
# Persist the training frame as table FEATURE_STORE_DIABETES, written in
# "overwrite" mode.
train_df.write.mode("overwrite").save_as_table("FEATURE_STORE_DIABETES")

In [None]:
# Name of the label column and the ordered list of model input columns.
label_col = "LABEL"

input_cols = [
    "PREGNANCIES",
    "GLUCOSE",
    "BLOODPRESSURE",
    "SKINTHICKNESS",
    "INSULIN",
    "BMI",
    "DIABETESPEDIGREEFUNCTION",
    "AGE",
]

In [None]:
# Read the table written above back through the session and print a sample of
# its rows as a sanity check.
feature_df = session.table("FEATURE_STORE_DIABETES")
feature_df.show()

In [None]:
# Pull the feature-store table into pandas.
# NOTE(review): `pdf` is not referenced by any later cell shown here; the
# pipeline below is fitted on X_train / y_train instead. Confirm which source
# is intended.
pdf = feature_df.to_pandas()

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Fresh, unfitted scaler for the pipeline below. Rebinding `scaler` replaces
# the instance that was fitted for the model comparison earlier.
scaler=StandardScaler()
# Trailing expression so the notebook displays the estimator.
scaler

In [None]:
# Unfitted XGBoost classifier used as the final pipeline step; configured with
# the same arguments as the "XGBoost" entry in `models`.
xgb=XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Trailing expression so the notebook displays the estimator.
xgb

In [None]:
# Chain standardisation and the classifier so both are fitted, and later
# pickled, as a single object.
pipeline = Pipeline(steps=[("scaler", scaler), ("xgb", xgb)])

In [None]:
# Fit scaler + classifier on the in-memory training split. The unscaled
# X_train is passed because the pipeline's own scaler step does the scaling.
pipeline.fit(X_train,y_train)

In [None]:
import pickle

# Serialise the fitted pipeline to a local file.
model_path = "diabetes_xgb.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
-- Create the stage that will hold the pickled model.
-- IF NOT EXISTS keeps the cell re-runnable: a plain CREATE STAGE fails once
-- the stage already exists.
CREATE STAGE IF NOT EXISTS MODEL_STAGE;

In [None]:
# Upload the local pickle file to the MODEL_STAGE stage.
session.file.put(
    "diabetes_xgb.pkl",
    "@MODEL_STAGE",
    auto_compress=False,  # upload the file without compressing it
    overwrite=True  # replace a previously uploaded file of the same name
)

In [None]:
-- List the files in MODEL_STAGE to confirm the upload.
LIST @MODEL_STAGE;