<a href="https://colab.research.google.com/github/SumaiyaData/Metabolic-Syndrome-detector/blob/main/Metabolic_Syndrome_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
# 1. Load libraries + dataset + ydata-profiling report

from sklearn import set_config
set_config(transform_output="pandas")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
import pickle

# Read the raw CSV once and keep it untouched; all edits happen on a copy.
metabolic_syndrom_original = pd.read_csv("/content/Metabolic Syndrome.csv")
metabolic_syndrom = metabolic_syndrom_original.copy()
# Quick look at the first rows (only rendered when it is the last
# expression of a notebook cell).
metabolic_syndrom.head()
# Optional profiling report, left disabled:
# !pip install ydata-profiling
# from ydata_profiling import ProfileReport
# profile = ProfileReport(metabolic_syndrom, title="Profiling Report")
# profile.to_file("metabolic_syndrom_report.html")

# 2. Drop the columns that the correlation review marked for removal
#    (highly correlated columns carry the same information).
metabolic_syndrom = metabolic_syndrom.drop(
    columns=['seqn', 'Albuminuria', 'BMI']
)
metabolic_syndrom.head()

# 3. Separate target (y) and features (X)

y = metabolic_syndrom['MetabolicSyndrome']
X = metabolic_syndrom.drop(columns='MetabolicSyndrome')

# 4. Hold out 20% of the rows for testing; the fixed seed keeps the split
#    reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 5. Impute missing values: mode for the categorical 'Marital', median for
#    the numeric 'Income' and 'WaistCirc'. Every other column is passed
#    through unchanged and original column names are kept (no prefixes).

transformer1 = ColumnTransformer(
    transformers=[
        ('impute_marital', SimpleImputer(strategy='most_frequent'), ['Marital']),
        ('impute_income', SimpleImputer(strategy='median'), ['Income']),
        ('impute_waistcirc', SimpleImputer(strategy='median'), ['WaistCirc']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# 6. create outlier transformer
class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip columns to lower/upper bounds learned during ``fit``.

    Parameters
    ----------
    method : {'iqr', 'zscore'}, default='iqr'
        How the bounds are derived for each column:
        ``'iqr'``    -> ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``
        ``'zscore'`` -> ``[mean - multiplier * std, mean + multiplier * std]``
    multiplier : float, default=1.5
        Width factor applied to the IQR or to the standard deviation.
    columns : sequence of column labels or None, default=None
        Columns to cap. ``None`` means every column of the data given to
        ``fit``.

    Attributes
    ----------
    limits_ : dict
        ``{column: (lower, upper)}`` computed by the most recent ``fit``.
    """

    def __init__(self, method='iqr', multiplier=1.5, columns=None):
        # Only hyper-parameters are stored here; fitted state (``limits_``)
        # is created in ``fit`` so an unfitted instance is detectable.
        self.method = method
        self.multiplier = multiplier
        self.columns = columns

    def fit(self, X, y=None):
        """Learn per-column clipping bounds from ``X``.

        Parameters
        ----------
        X : DataFrame or anything ``pd.DataFrame`` accepts
            Training data.
        y : ignored
            Present for pipeline compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``method`` is neither ``'iqr'`` nor ``'zscore'``.
        """
        if self.method not in ('iqr', 'zscore'):
            raise ValueError("method must be 'iqr' or 'zscore'")

        X_df = pd.DataFrame(X)
        # Resolve the column selection locally instead of overwriting the
        # ``columns`` hyper-parameter, so a later refit on a frame with
        # different columns is not stuck with the first frame's columns.
        columns = X_df.columns if self.columns is None else self.columns

        # Build a fresh mapping on every fit so bounds from a previous fit
        # never leak into this one.
        limits = {}
        for col in columns:
            values = X_df[col]
            if self.method == 'iqr':
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                spread = self.multiplier * (q3 - q1)
                lower, upper = q1 - spread, q3 + spread
            else:  # 'zscore'
                mean = values.mean()
                spread = self.multiplier * values.std()
                lower, upper = mean - spread, mean + spread
            limits[col] = (lower, upper)

        self.limits_ = limits
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns clipped.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'limits_')

        X_df = pd.DataFrame(X).copy()
        for col, (lower, upper) in self.limits_.items():
            X_df[col] = np.clip(X_df[col], lower, upper)
        return X_df

# Columns capped with the IQR rule and with the z-score rule, respectively.
iqr_capped_columns = ['WaistCirc', 'UrAlbCr', 'BloodGlucose', 'HDL', 'Triglycerides']
zscore_capped_columns = ['UricAcid']

iqr_capper = Pipeline(
    steps=[('outlier', OutlierCapper(method='iqr', multiplier=1.5))]
)
zscore_capper = Pipeline(
    steps=[('outlier', OutlierCapper(method='zscore', multiplier=1.5))]
)

# Untouched columns are passed through and column names keep no prefix.
transformer2 = ColumnTransformer(
    transformers=[
        ('iqr_outlier', iqr_capper, iqr_capped_columns),
        ('zscore_outlier', zscore_capper, zscore_capped_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Income (no outliers), Age (no outliers)

# 7. Encoding: one-hot encode the categorical columns, dropping the first
#    level of each.
one_hot_encoder = OneHotEncoder(
    sparse_output=False,
    drop='first',
    handle_unknown='ignore',
)
transformer3 = ColumnTransformer(
    transformers=[
        ('ohe_columns', one_hot_encoder, ['Sex', 'Marital', 'Race']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)


# 8. Scaling
class FlexibleScaler(BaseEstimator, TransformerMixin):
    """Scale fixed groups of columns, skipping any column that is absent.

    Three scalers are applied, each to its own group of column names:

    * ``MinMaxScaler``                   -> 'Age', 'Income'
    * ``StandardScaler``                 -> 'UricAcid', 'HDL', 'WaistCirc'
    * ``PowerTransformer`` (yeo-johnson) -> 'UrAlbCr', 'BloodGlucose',
      'Triglycerides'

    Only the names actually present in the data passed to ``fit`` are used;
    all other columns are returned unchanged.
    """

    def __init__(self):
        self.minmax_scaler = MinMaxScaler()
        self.standard_scaler = StandardScaler()
        self.power_scaler = PowerTransformer(method='yeo-johnson')
        self.minmax_cols = []
        self.standard_cols = []
        self.power_cols = []

    def _scaler_groups(self):
        """Return (scaler, columns) pairs in the order they are applied."""
        return (
            (self.minmax_scaler, self.minmax_cols),
            (self.standard_scaler, self.standard_cols),
            (self.power_scaler, self.power_cols),
        )

    def fit(self, X, y=None):
        """Record which known columns exist in ``X`` and fit their scalers."""
        frame = pd.DataFrame(X)
        available = frame.columns

        # Keep only the names that are present in this frame.
        self.minmax_cols = [
            name for name in ('Age', 'Income') if name in available
        ]
        self.standard_cols = [
            name for name in ('UricAcid', 'HDL', 'WaistCirc') if name in available
        ]
        self.power_cols = [
            name
            for name in ('UrAlbCr', 'BloodGlucose', 'Triglycerides')
            if name in available
        ]

        # A scaler whose group is empty is left unfitted.
        for scaler, cols in self._scaler_groups():
            if cols:
                scaler.fit(frame[cols])

        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted group scaled in place."""
        scaled = pd.DataFrame(X).copy()

        for scaler, cols in self._scaler_groups():
            if cols:
                scaled[cols] = scaler.transform(scaled[cols])

        return scaled

# Use FlexibleScaler instead of ColumnTransformer
transformer4 = FlexibleScaler()

# 9. Combine preprocessing into one pipeline: impute -> cap outliers ->
#    one-hot encode -> scale.
preprocessor = Pipeline([
    ('transformer1', transformer1),
    ('transformer2', transformer2),
    ('transformer3', transformer3),
    ('transformer4', transformer4),
])

# 10. One pipeline per model: fit, evaluate on the hold-out set,
#     cross-validate on the training set, export.
#
# NOTE: both pipelines hold the same `preprocessor` object, so fitting
# pipe_rf refits the preprocessing steps that pipe_lr also references.
# That is harmless here because both are fitted on X_train.
# NOTE: the exported pickles reference OutlierCapper and FlexibleScaler by
# name, so those classes must be importable wherever the files are loaded.

# LogisticRegression
pipe_lr = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced')),
])
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
# Keep the cross-validation result instead of discarding it.
cv_accuracy_lr = cross_val_score(
    pipe_lr, X_train, y_train, cv=5, scoring='accuracy'
).mean()
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Cross-validation accuracy (5-fold mean):", cv_accuracy_lr)
# Export; the context manager closes the file handle once the dump is done.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe_lr, model_file)

# RandomForestClassifier
# NOTE(review): no random_state is set on the forest, so its scores can
# differ between runs.
pipe_rf = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced')),
])
pipe_rf.fit(X_train, y_train)
y_pred = pipe_rf.predict(X_test)
cv_accuracy_rf = cross_val_score(
    pipe_rf, X_train, y_train, cv=5, scoring='accuracy'
).mean()
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Cross-validation accuracy (5-fold mean):", cv_accuracy_rf)
# Export
with open('piperf.pkl', 'wb') as model_file:
    pickle.dump(pipe_rf, model_file)




Accuracy: 0.8503118503118503
Confusion Matrix:
 [[276  48]
 [ 24 133]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.85      0.88       324
           1       0.73      0.85      0.79       157

    accuracy                           0.85       481
   macro avg       0.83      0.85      0.84       481
weighted avg       0.86      0.85      0.85       481





Accuracy: 0.8814968814968815
Confusion Matrix:
 [[302  22]
 [ 35 122]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.91       324
           1       0.85      0.78      0.81       157

    accuracy                           0.88       481
   macro avg       0.87      0.85      0.86       481
weighted avg       0.88      0.88      0.88       481



