In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,PowerTransformer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import time
from sklearn.compose import ColumnTransformer
import warnings
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from itertools import combinations, permutations
import itertools
from sklearn.feature_selection import f_classif
import shap
import matplotlib.pyplot as plt

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Read the obesity dataset CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/s-obesity/ObesityDataSet_raw_and_data_sinthetic.csv'
)


In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Show the column labels of the loaded frame.
column_labels = data.columns
print(column_labels)

In [None]:
# Count the null entries in every column and report them.
missing_values = data.isna().sum()
print("Missing Values:\n", missing_values)

In [None]:
# Fill missing entries with each column's most frequent value.
# SimpleImputer returns a plain NumPy array; when `data` mixes numeric and
# string columns that array has object dtype, so wrapping it in a DataFrame
# would leave every column typed as object. Cast back to the dtypes of the
# loaded frame so numeric columns stay numeric downstream (otherwise
# pd.get_dummies would expand every distinct number into its own column).
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
).astype(data.dtypes.to_dict())

In [None]:
# Separate the target column ('NObeyesdad') from the predictor columns.
y = data_imputed['NObeyesdad']
X = data_imputed.drop(columns='NObeyesdad')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# One-hot encode the features, fitting the encoder on the training split only.
# The encoder is left on its default sparse output and densified with
# .toarray(): the `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so neither spelling works on every
# version, while the default + .toarray() does.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# NOTE: OneHotEncoder treats every input column as categorical, so numeric
# columns in X_train are expanded per distinct value as well.
encoder = OneHotEncoder(handle_unknown='ignore')
train_encoded_values = encoder.fit_transform(X_train).toarray()
encoded_columns = encoder.get_feature_names_out(X_train.columns)
X_train_encoded = pd.DataFrame(train_encoded_values, columns=encoded_columns)
X_test_encoded = pd.DataFrame(encoder.transform(X_test).toarray(), columns=encoded_columns)

In [None]:
# Balance the training classes by randomly duplicating minority-class rows.
# A fixed random_state makes the resampled set identical on every run (the
# same seed value is used for the train/test split).
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_encoded, y_train)

In [None]:
# Standard scaling: statistics are learned from the resampled training set
# and then reused unchanged for the encoded test set.
scaler_standard = StandardScaler()
scaler_standard.fit(X_resampled)
X_resampled_scaled_standard = scaler_standard.transform(X_resampled)
X_test_scaled_standard = scaler_standard.transform(X_test_encoded)


In [None]:
# Min-max scaling: the per-feature range is learned on the resampled training
# set and applied as-is to the encoded test set.
scaler2 = MinMaxScaler()
scaler2.fit(X_resampled)
X_resampled_scaled2 = scaler2.transform(X_resampled)
X_test_scaled2 = scaler2.transform(X_test_encoded)

In [None]:
# Robust scaling: fitted on the resampled training set, then applied to the
# encoded test set with the same fitted statistics.
scaler3 = RobustScaler()
scaler3.fit(X_resampled)
X_resampled_scaled3 = scaler3.transform(X_resampled)
X_test_scaled3 = scaler3.transform(X_test_encoded)

In [None]:
# Normalizer: applied to the resampled training set and to the encoded test
# set through the same fitted object.
scaler4 = Normalizer()
scaler4.fit(X_resampled)
X_resampled_scaled4 = scaler4.transform(X_resampled)
X_test_scaled4 = scaler4.transform(X_test_encoded)

In [None]:
# Yeo-Johnson power transform: fitted on the resampled training set (first
# tuple element, evaluated first), then applied to the encoded test set.
scaler5 = PowerTransformer(method='yeo-johnson')
X_resampled_scaled5, X_test_scaled5 = (
    scaler5.fit_transform(X_resampled),
    scaler5.transform(X_test_encoded),
)

In [None]:
# One-hot encode the categorical variables only.
# X is built from the imputer's NumPy output, so its columns can all be typed
# as object even when they hold numbers; pd.get_dummies would then create one
# dummy column per distinct numeric value. infer_objects() converts object
# columns that contain only numbers back to a numeric dtype (and leaves real
# string columns alone), so just the genuinely categorical columns are expanded.
X_encoded = pd.get_dummies(X.infer_objects())

In [None]:
# Split the dummy-encoded features 80/20 with a fixed seed.
# NOTE: this rebinds X_train, X_test, y_train and y_test, replacing the values
# assigned by the earlier split of the un-encoded X.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Min-max scale the features: the range is learned from the training split
# and the same fitted scaler is applied to the test split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Rank features with the chi-square statistic on the training split and keep
# the 10 best; the test split is reduced to the same columns.
# NOTE: the model cells visible in this notebook are fitted on X_train_scaled,
# not on X_train_selected.
selector = SelectKBest(score_func=chi2, k=10)
selector.fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

In [None]:
# Look up the column labels at the positions the selector kept.
selected_feature_names = X_train.columns[selector.get_support(indices=True)]

In [None]:
# Logistic Regression base estimator for the grid search.
# random_state pins the stochastic 'liblinear' and 'saga' solvers used in the
# grid so results repeat between runs; max_iter is raised from the default of
# 100 to give 'saga' more iterations to converge.
logreg = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Hyperparameter grid searched for logistic regression.
logreg_param_grid = dict(
    C=[0.01, 0.1, 1.0, 10.0, 100.0],  # regularization parameter
    penalty=['l1', 'l2'],  # regularization type
    solver=['liblinear', 'saga'],  # optimisation algorithm
)

In [None]:
# 5-fold grid search on the scaled training data, then score the best
# estimator on the held-out test split.
logreg_cv = GridSearchCV(estimator=logreg, param_grid=logreg_param_grid, cv=5).fit(
    X_train_scaled, y_train
)
logreg_best = logreg_cv.best_estimator_
logreg_predictions = logreg_best.predict(X_test_scaled)
logreg_accuracy = accuracy_score(y_true=y_test, y_pred=logreg_predictions)
print('Logistic Regression Accuracy:', logreg_accuracy)
logreg_report = classification_report(y_test, logreg_predictions)
print(logreg_report)

In [None]:
# Random Forest base estimator and its hyperparameter grid.
# random_state fixes the forest's bootstrap and feature sampling so the grid
# search and the reported scores repeat between runs.
rf = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 500, 700],  # Number of trees
    'max_depth': [5, 10, 15],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split a node
}

In [None]:
# 5-fold grid search on the scaled training data, then score the best
# estimator on the held-out test split.
rf_cv = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=5).fit(
    X_train_scaled, y_train
)
rf_best = rf_cv.best_estimator_
rf_predictions = rf_best.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
print('Random Forest Accuracy:', rf_accuracy)
rf_report = classification_report(y_test, rf_predictions)
print(rf_report)


In [None]:
# Support vector classifier and its hyperparameter grid.
# NOTE: assigning to `svm` rebinds the name that the import cell bound to the
# sklearn `svm` module (`from sklearn import svm`).
svm = SVC()
svm_param_grid = dict(
    C=[0.1, 1.0, 10.0],  # regularization parameter
    kernel=['linear', 'rbf'],  # kernel type
    gamma=['scale', 'auto'],  # kernel coefficient
)

In [None]:
# 5-fold grid search on the scaled training data, then score the best
# estimator on the held-out test split.
svm_cv = GridSearchCV(estimator=svm, param_grid=svm_param_grid, cv=5).fit(
    X_train_scaled, y_train
)
svm_best = svm_cv.best_estimator_
svm_predictions = svm_best.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
print('SVM Accuracy:', svm_accuracy)
svm_report = classification_report(y_test, svm_predictions)
print(svm_report)

In [None]:
# Decision Tree base estimator and its hyperparameter grid.
# random_state fixes the tree's internal randomness (e.g. tie-breaking between
# equally good splits) so the search result repeats between runs.
dt = DecisionTreeClassifier(random_state=42)
dt_param_grid = {
    'criterion': ['gini', 'entropy'],  # Split criterion
    'max_depth': [None, 5, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split an internal node
}

In [None]:
# 5-fold grid search on the scaled training data, then score the best
# estimator on the held-out test split.
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=5).fit(
    X_train_scaled, y_train
)
dt_best = dt_cv.best_estimator_
dt_predictions = dt_best.predict(X_test_scaled)
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_predictions)
print('Decision Tree Accuracy:', dt_accuracy)
dt_report = classification_report(y_test, dt_predictions)
print(dt_report)

In [None]:
# Extra Trees base estimator and its hyperparameter grid.
# random_state fixes the randomised split selection so the grid search and
# the reported scores repeat between runs.
et = ExtraTreesClassifier(random_state=42)
et_param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'criterion': ['gini', 'entropy'],  # Split criterion
    'max_depth': [None, 5, 10],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split an internal node
}

In [None]:
# 5-fold grid search on the scaled training data, then score the best
# estimator on the held-out test split.
et_cv = GridSearchCV(estimator=et, param_grid=et_param_grid, cv=5).fit(
    X_train_scaled, y_train
)
et_best = et_cv.best_estimator_
et_predictions = et_best.predict(X_test_scaled)
et_accuracy = accuracy_score(y_true=y_test, y_pred=et_predictions)
print('Extra Tree Classifier Accuracy:', et_accuracy)
et_report = classification_report(y_test, et_predictions)
print(et_report)


In [None]:
# Report the column labels kept by the chi-square selector (heading, then
# the labels on the following line).
print("Selected Features:", selected_feature_names, sep="\n")

In [None]:
# Print the winning hyperparameter combination of every grid search.
# best_params_ is used rather than the estimator repr: by default the repr
# only lists parameters that differ from the estimator's defaults, so a best
# value that equals a default would not be shown.
print("Best Parameters:")
for model_label, search in (
    ("Logistic Regression", logreg_cv),
    ("Random Forest", rf_cv),
    ("SVM", svm_cv),
    ("Decision Tree", dt_cv),
    ("Extra Trees", et_cv),
):
    print(f"{model_label}: {search.best_params_}")