In [None]:
!pip install ucimlrepo

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from ucimlrepo import fetch_ucirepo

from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
# pd.options.plotting.backend = "plotly"

import plotly.express as px
# import plotly.graph_objects as go
# import plotly.figure_factory as ff

from helper import (generate_confusion_matrix, generate_classification_report, generate_statistical_summary,
                    generate_logistic_model_weights, generate_std_model_weights)

In [None]:
# Random seeds, kept separate so that the data split and the model fitting
# can be re-seeded independently of each other.
DATA_RS = 100   # passed as random_state to train_test_split
MODEL_RS = 100  # passed as random_state to the LogisticRegression models

In [None]:
from ucimlrepo import fetch_ucirepo
# Download dataset id 544 from the UCI ML Repository through ucimlrepo.
# NOTE(review): presumably the obesity-levels dataset (the NObeyesdad column is
# read further down) - confirm against the repository listing.
data = fetch_ucirepo(id=544)

# The feature table and the target table both hang off `data.data`
# (pandas DataFrames, per the original note in this cell).
uci_tables = data.data
X, y = uci_tables.features, uci_tables.targets


# x_std = zscore(X)

In [None]:
# Place features and target side by side, keeping only the row labels that
# are present in both tables.
df = pd.concat(objs=[X, y], join="inner", axis=1)

def binarize_y(target:str):
    """Collapse a weight-category label into a two-class risk label.

    Parameters
    ----------
    target : str
        Original class label (a value of the ``NObeyesdad`` column).

    Returns
    -------
    str
        ``"low risk"`` for ``"Insufficient_Weight"`` and ``"Normal_Weight"``;
        ``"high risk"`` for every other value.
    """
    low_risk_levels = ("Insufficient_Weight", "Normal_Weight")
    return "low risk" if target in low_risk_levels else "high risk"
    
# Derive the binary modelling label ("low risk" / "high risk") from the
# original multi-class NObeyesdad column.
df["target"] = df["NObeyesdad"].apply(binarize_y)

# Q-1

- Make a general statement about **TP**, **TN**, **FP**, and **FN** in the context of this problem. (Hint: Think in terms of normal and overweight.)

In [None]:
def plot_target(df,target_col):
    """Show a plotly histogram of one column, with bars coloured by its values.

    Parameters
    ----------
    df : DataFrame handed to ``px.histogram``.
    target_col : name of the column used both for the x axis and for colouring.

    The figure is displayed via ``fig.show()``; nothing is returned.
    """
    column_name = f"{target_col}"
    bar_colors = ["grey", "cyan"]
    histogram = px.histogram(
        df,
        x=column_name,
        color=column_name,
        color_discrete_sequence=bar_colors,
    )
    histogram.show()

In [None]:
# Class balance of the binarized label.
plot_target(target_col="target", df=df)

# Q-2

- Assume that the ML model you're building is deployed in a country that has an obesity crisis but where diagnosis is inexpensive. Which metric would you choose?
- Now, assume that another ML model needs to be deployed in a country with an obesity crisis but where diagnosis is very expensive. Which metric would you use in the latter case? Provide motivation.


# Q-3
There are two Logistic Regression models built based on this dataset: One model is trained using standardized data, and another without using the standardized data.


In [None]:
# Preprocessing
# Input columns used for modelling, in the order they are selected from `df`.
features = [
    'Gender', 'Age', 'Height', 'Weight',
    'family_history_with_overweight',
    'FAVC', 'FCVC', 'NCP', 'CAEC',
    'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
    'CALC', 'MTRANS',
]

# Kept as a one-element list, so `df[target]` yields a single-column DataFrame.
target = ["target"]

def deobjectify_df(X:pd.DataFrame):
    """One-hot encode the non-numeric columns of ``X`` and keep the numeric ones.

    Parameters
    ----------
    X : pd.DataFrame
        Feature table mixing numeric and categorical columns.

    Returns
    -------
    pd.DataFrame
        The dummy-encoded categorical columns (first level of each column
        dropped) followed by the untouched numeric columns.

    Notes
    -----
    The two column lists are printed as a side effect.
    """
    # Partition on "is the dtype numeric?" instead of "is the dtype object?".
    # Comparing against 'object' misses `category` and pandas string dtypes,
    # which would then be passed through unencoded as if they were numbers.
    cat_cols, num_cols = [], []
    for column, dtype in zip(X.columns, X.dtypes):
        if pd.api.types.is_numeric_dtype(dtype):
            num_cols.append(column)
        else:
            cat_cols.append(column)

    print("Categorical Columns: ",cat_cols)
    print("\nNumerical Columns:" ,num_cols)

    # Nothing to encode: the result is just the numeric columns.
    if not cat_cols:
        return X[num_cols]

    # One-hot encode; drop_first removes one level per column so that the
    # dummies of a column are not perfectly collinear.
    x_dummy = pd.get_dummies(X[cat_cols], drop_first=True)

    # Re-attach the numeric columns, aligned on the row index.
    x_new = pd.concat([x_dummy, X[num_cols]], axis=1, join='inner')
    return x_new

# Encode only the selected feature columns; the label columns stay in `df`.
x_new = deobjectify_df(X=df.loc[:, features])

def get_train_val_test(X,y):
    """Split ``X``/``y`` into stratified train, validation and test parts.

    The data is halved twice with ``train_test_split``: the first half becomes
    the training set and the second half is halved again into validation and
    test, i.e. roughly 50% / 25% / 25%. Both splits are stratified on the
    labels and seeded with ``DATA_RS``.

    Returns
    -------
    tuple
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``
    """
    halving = {"test_size": 0.5, "random_state": DATA_RS}

    # First cut: training half versus held-out half.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        X, y, stratify=y, **halving
    )
    # Second cut: held-out half into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, stratify=y_holdout, **halving
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

# Stratified train / validation / test split of the encoded features and the
# binary label (df[target] is a single-column DataFrame, hence np.ravel below).
x_train,x_val,x_test, y_train,y_val,y_test = get_train_val_test(X=x_new,y=df[target])

# Two unpenalized logistic regressions: one fitted on the raw features and one
# fitted behind a StandardScaler.
# `penalty=None` replaces the string "none", which scikit-learn deprecated in
# 1.2 and removed in 1.4 (newer releases reject it when the model is fitted).
lr = LogisticRegression(penalty=None, random_state=MODEL_RS, max_iter=2500)
lr_std= LogisticRegression(penalty=None, random_state=MODEL_RS, max_iter=1000)

# The scaler is fitted inside the pipeline, i.e. on the training data only.
std_steps = [("Scaling", StandardScaler()), ("Modeling", lr_std)]
std_pipeline = Pipeline(steps=std_steps)


# np.ravel turns the single-column label frame into the 1-D array that fit expects.
lr.fit(x_train, np.ravel(y_train))

std_pipeline.fit(x_train, np.ravel(y_train))

def get_model_pred(model, x_pred):
    """Return ``model.predict(x_pred)``.

    Parameters
    ----------
    model : any object exposing a ``predict`` method (estimator or pipeline).
    x_pred : the samples to predict on, passed through unchanged.
    """
    return model.predict(x_pred)

## 3.1
- Report **_Prediction_** error for both the models.

In [None]:
# Validation-set classification reports for both fitted models.
# The second label previously read "Regularized Model", but std_pipeline is the
# standardized, unpenalized model - nothing in this notebook is regularized.
# NOTE(review): `split` is presumably used by the helper as a heading - confirm in helper.py.
generate_classification_report(y_true=y_val, y_pred=get_model_pred(lr,x_val), target_names=lr.classes_.tolist(), split="Logistic Model: Validation")
generate_classification_report(y_true=y_val, y_pred=get_model_pred(std_pipeline,x_val), target_names=std_pipeline.classes_.tolist(), split="Standardized Model: Validation")

In [None]:
# Test-set predictions of the raw-feature model and of the standardized pipeline.
y_test_pred_raw = get_model_pred(lr, x_test)
y_test_pred_std = get_model_pred(std_pipeline, x_test)

# One confusion matrix per model; class labels come from each fitted estimator.
generate_confusion_matrix(
    y_true=y_test,
    y_pred=y_test_pred_raw,
    labels=lr.classes_.tolist(),
    title="-->Data: Non-Standardized, Set: Test",
)
generate_confusion_matrix(
    y_true=y_test,
    y_pred=y_test_pred_std,
    labels=std_pipeline.classes_.tolist(),
    title="-->Data: Standardized, Set: Test",
)

## 3.2
- Visually analyze the weight coefficients of both models. Provide your interpretation of the 3 most impactful features.
- The weights differ quite significantly between the two models. Can you suggest a reason for this?

In [None]:
# Weight plots for Q-3.2: the plain model goes to the logistic helper and the
# scaler+model pipeline goes to the separate "std" helper.
# NOTE(review): both helpers come from helper.py and are not visible here -
# presumably they plot the fitted coefficients; confirm.
generate_logistic_model_weights(lr,model_name="Logistic Model")
generate_std_model_weights(std_pipeline,model_name="Standardized Model")