In [None]:
import pandas as pd
import numpy as np
import matplotlib
# Select the Agg backend before pyplot is imported so figures render off-screen.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from snowflake.snowpark.context import get_active_session
# Session handle used by the later cells for table reads/writes and stage file transfers.
session = get_active_session()


In [None]:
-- Preview all columns of the diabetes_raw table.
SELECT * FROM diabetes_raw 

In [None]:
# Pull the diabetes_raw table into a local pandas DataFrame for preprocessing.
df_snow = session.table("diabetes_raw")
df = df_snow.to_pandas()
# Show the first rows as a quick sanity check.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# List the column labels exactly as they were loaded.
print(df.columns.tolist())

#### The dataset has no NaN values, but numerical columns such as Skin Thickness and Insulin contain zeros, which is not physiologically possible. These zeros are replaced with the column median, because the median is robust to outliers.

In [None]:
# Remove leading/trailing whitespace from the column labels.
df.columns = df.columns.str.strip()

In [None]:
# Measurement columns in which a 0 is treated as a missing reading and imputed in the next cell.
cols = ['GLUCOSE', 'BLOODPRESSURE', 'SKINTHICKNESS', 'INSULIN', 'BMI', 'DIABETESPEDIGREEFUNCTION']

In [None]:
# A zero in these measurements stands for a missing reading, so the fill value
# is the median of the observed (non-zero) entries only. Taking the median over
# the whole column would let the placeholder zeros drag it down, and it would
# even be 0 (making the replacement a no-op) when half the column is zero.
for col in cols:
    observed = df.loc[df[col] != 0, col]
    if observed.empty:
        # Only zeros present: there is no valid reading to impute from.
        continue
    df[col] = df[col].replace(0, observed.median())

In [None]:
# Added new Feature to the dataframe
def bmi_cat(bmi):
    """Map a BMI value to a weight-category label.

    Returns "Underweight" below 18.5, "Normal" below 25, "Overweight" below 30
    and "Obese" for everything else (including values that compare False
    against every bound, such as NaN).
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = (
        (18.5, "Underweight"),
        (25, "Normal"),
        (30, "Overweight"),
    )
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label
    return "Obese"

def age_group(age):
    """Map an age to a coarse age-band label.

    Returns "Young" below 30, "Adult" from 30 up to (but excluding) 50, and
    "Senior" otherwise (including values that fail both comparisons, such as NaN).
    """
    if age < 30:
        return "Young"
    return "Adult" if age < 50 else "Senior"

In [None]:
# Derive the two categorical features by mapping each row's value through the
# labelling helpers defined in the previous cell.
df["BMI_Category"] = df["BMI"].map(bmi_cat)
df["Age_Group"] = df["AGE"].map(age_group)

In [None]:
# Hand the engineered pandas DataFrame to the session so it can be written back to Snowflake.
snowflake_df = session.create_dataframe(df)

In [None]:
# Persist the engineered features as table FEATURE_STORE_DIABETES; "overwrite"
# mode replaces an existing table of that name.
snowflake_df.write.mode("overwrite").save_as_table("FEATURE_STORE_DIABETES")

### Load the dataset back from the feature store as a pandas DataFrame

In [None]:
# Re-read the stored features from Snowflake (rebinds df_snow).
df_snow = session.table("FEATURE_STORE_DIABETES")

In [None]:
# Materialise the stored features locally as pandas (rebinds df).
df = df_snow.to_pandas()

In [None]:
# Per-group sum of OUTCOME. If OUTCOME is a 0/1 flag this is the number of
# positive cases in each group, not a rate -- TODO confirm the encoding.
count_bmi = df.groupby("BMI_Category")["OUTCOME"].sum()
count_age = df.groupby("Age_Group")["OUTCOME"].sum()

In [None]:
# Display both summaries as the cell output.
count_bmi , count_age

## Insights
* Replaced invalid zero values with the median since these features cannot realistically be zero.

* Created two new features: BMI_Category and Age_Group to understand patterns better.

* The per-group totals show that Obese individuals account for the largest number of diabetes cases (these are counts of positive outcomes per group, not rates).

* Adults (30–49 years) are the most affected age group.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Features: every column except the target and the two derived label columns
# (errors='ignore' silently skips any of these names that are absent).
X=df.drop(columns=['OUTCOME', 'BMI_Category', 'Age_Group'],errors='ignore')
# Target column.
y=df['OUTCOME']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Scaler used for the model comparison; it is fitted in the next cell.
scaler=StandardScaler()
# Bare name so the notebook displays the estimator.
scaler

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, recall_score , accuracy_score

In [None]:
# Candidate classifiers, keyed by the display name used when reporting metrics.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    # Random forests bootstrap rows and sample features at random; a fixed seed
    # (same value as the train/test split) keeps the comparison reproducible.
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [None]:
# Fit every candidate on the scaled training split and score it on the scaled
# test split. Each row of `results` is [name, false negatives, recall, accuracy].
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    cm = confusion_matrix(y_test, y_pred)
    # Row 1, column 0: true label 1 predicted as 0 (assumes labels are 0/1).
    fn = cm[1, 0]
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    results.append([name, fn, recall, accuracy])

# One summary line per model.
for r in results:
    label, missed, rec, acc = r
    print(f"{label} -> FN: {missed}, Recall: {rec:.4f} ,Accuracy: {acc:.4f}")

### I will use XGBoost because its recall is the highest, meaning it produces the fewest false negatives, which matter most in this problem.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Fresh, not-yet-fitted scaler; presumably meant for the pipeline (going by its name).
scaler_pipe=StandardScaler()
# Bare name so the notebook displays the estimator.
scaler_pipe

In [None]:
# Fresh XGBoost classifier with the same arguments as the one in the model comparison.
xgb=XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Bare name so the notebook displays the estimator.
xgb

In [None]:
# Scaling + classifier bundled into one estimator so raw (unscaled) features
# can be passed to fit/predict. The pipeline gets its own scaler instance,
# scaler_pipe, instead of reusing `scaler`, which was already fitted for the
# model comparison and would otherwise be refitted as a side effect.
pipeline=Pipeline([
    ('scaler',scaler_pipe),
    ('xgb',xgb)
])

In [None]:
# Fit on the unscaled training features; scaling happens inside the pipeline.
pipeline.fit(X_train,y_train)

In [None]:
import pickle

# Serialise the pipeline object to a local file so it can be uploaded to a stage.
with open("diabetes_xgb.pkl", "wb") as f:
    pickle.dump(pipeline, f)

In [None]:
-- Create the stage that will hold the model file, unless it already exists.
CREATE STAGE IF NOT EXISTS MODEL_STAGE;

In [None]:
# Upload the pickle to MODEL_STAGE, replacing any existing copy.
# NOTE(review): the download cell reads diabetes_xgb.pkl.gz, so put presumably
# gzip-compresses the file on upload -- confirm.
session.file.put("diabetes_xgb.pkl", "@MODEL_STAGE", overwrite=True)

In [None]:
-- List the files currently in the stage.
LIST @MODEL_STAGE;

#### Download the gzip-compressed model from the stage and decompress it

In [None]:
# Download the compressed model file from the stage into the local downloaded_model/ directory.
session.file.get("@MODEL_STAGE/diabetes_xgb.pkl.gz", "downloaded_model/")

In [None]:
import gzip

# Decompress and unpickle in one pass.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with gzip.open("downloaded_model/diabetes_xgb.pkl.gz","rb") as snow:
    model=pickle.load(snow)

In [None]:
# Predict on the unscaled test features with the reloaded model.
y_pred=model.predict(X_test)

In [None]:
# Scale to a percentage first and format once at the end: round(x, 2) * 100
# can print float artefacts such as 56.99999999999999. Labels make the two
# numbers distinguishable in the output.
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.0f}%")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.0f}%")