# Approach 2

## Using large dataset for training and small dataset for testing. Without splitting 

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
LargeTrain = pd.read_csv('train.csv') # This is the original dataset
SmallTest = pd.read_csv('Original_ObesityDataSet.csv') # This is the train dataset but we are taking it for testing

In [4]:
LargeTrain = LargeTrain.drop(columns='id')

In [5]:
LargeTrain.shape

(20758, 17)

In [6]:
SmallTest.shape

(2111, 17)

In [7]:
LargeTrain.dtypes

Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object

In [8]:
numerical_features = LargeTrain.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = LargeTrain.select_dtypes(include=['object', 'bool']).columns.tolist()

In [9]:
LargeTrain[categorical_features].nunique()

Gender                            2
family_history_with_overweight    2
FAVC                              2
CAEC                              4
SMOKE                             2
SCC                               2
CALC                              3
MTRANS                            5
NObeyesdad                        7
dtype: int64

# Working on Categorical Features 

In [10]:
unique_values = {column: LargeTrain[column].unique().tolist() for column in categorical_features}

# Print unique values
for column, unique_vals in unique_values.items():
    print(f"Unique values in {column}: {unique_vals}")

Unique values in Gender: ['Male', 'Female']
Unique values in family_history_with_overweight: ['yes', 'no']
Unique values in FAVC: ['yes', 'no']
Unique values in CAEC: ['Sometimes', 'Frequently', 'no', 'Always']
Unique values in SMOKE: ['no', 'yes']
Unique values in SCC: ['no', 'yes']
Unique values in CALC: ['Sometimes', 'no', 'Frequently']
Unique values in MTRANS: ['Public_Transportation', 'Automobile', 'Walking', 'Motorbike', 'Bike']
Unique values in NObeyesdad: ['Overweight_Level_II', 'Normal_Weight', 'Insufficient_Weight', 'Obesity_Type_III', 'Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_I']


For features with binary unique values like Gender, family_history_with_Overweight, FAVC, SMOKE, SCC I will use binary mappings. For CAEC and CALC I will use custom mapping to preseve the order and lastly for MTRAN one-hot encoding will be used

# Working On Numerical Data 

# Checking whether Standardization is required for Numerical features

In [11]:
numeric_df = LargeTrain.select_dtypes(include=['float64', 'int64'])

# Calculate variance for each numeric feature
variances = numeric_df.var()

# Print variance values
print(variances)


Age        32.354163
Height      0.007623
Weight    695.875017
FCVC        0.284322
NCP         0.497553
CH2O        0.370232
FAF         0.702750
TUE         0.362541
dtype: float64


In [11]:
# As we can see Weight is the High Variance Feature So I will do Standardization to make sure that all features contribute
# equally to the model.

# Working on Target

In [12]:
LargeTrain['NObeyesdad']

0        Overweight_Level_II
1              Normal_Weight
2        Insufficient_Weight
3           Obesity_Type_III
4        Overweight_Level_II
                ...         
20753        Obesity_Type_II
20754    Insufficient_Weight
20755        Obesity_Type_II
20756    Overweight_Level_II
20757        Obesity_Type_II
Name: NObeyesdad, Length: 20758, dtype: object

## For Target I will use LabelEncoder__

### Separating Features and Target 

In [13]:
y_train = LargeTrain['NObeyesdad']
X_train = LargeTrain.drop(columns='NObeyesdad')
y_test = SmallTest['NObeyesdad']
X_test = SmallTest.drop(columns='NObeyesdad')

In [14]:
###Later

# Fit the encoder on the training set
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# Transform the test set with the same encoder
y_test = label_encoder.transform(y_test)


In [15]:
y_train

array([6, 1, 0, ..., 3, 6, 3])

In [16]:
from sklearn.preprocessing import StandardScaler
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [16]:
X_train

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,-0.522124,-0.875589,-0.862558,yes,no,-0.785019,0.404153,Sometimes,no,-0.013073,no,-1.188039,0.561997,no,Public_Transportation
1,Female,-0.522124,-1.947599,-1.168077,yes,no,1.088342,0.404153,Sometimes,yes,1.618759,yes,2.339750,-1.080625,Sometimes,Public_Transportation
2,Male,-0.206889,1.054029,-0.366090,yes,no,-0.785019,0.404153,Sometimes,no,-0.013073,no,1.163820,0.561997,Frequently,Public_Transportation
3,Male,0.423582,1.054029,0.015808,no,no,1.088342,0.404153,Sometimes,no,-0.013073,no,1.163820,-1.080625,Frequently,Walking
4,Male,-0.364507,0.839627,0.122740,no,no,-0.785019,-2.167023,Sometimes,no,-0.013073,no,-1.188039,-1.080625,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,-0.525774,0.097045,1.711763,yes,yes,1.088342,0.404153,Sometimes,no,-0.456705,no,0.783135,0.407996,Sometimes,Public_Transportation
2107,Female,-0.367195,0.502844,1.800914,yes,yes,1.088342,0.404153,Sometimes,no,-0.004702,no,0.389341,-0.096251,Sometimes,Public_Transportation
2108,Female,-0.281909,0.541672,1.798868,yes,yes,1.088342,0.404153,Sometimes,no,0.075361,no,0.474971,-0.019018,Sometimes,Public_Transportation
2109,Female,0.007776,0.404927,1.785780,yes,yes,1.088342,0.404153,Sometimes,no,1.377801,no,0.151471,-0.117991,Sometimes,Public_Transportation


In [17]:
from sklearn import preprocessing

def data_transformation(df):
    # Binary mapping for certain columns
    df['family_history_with_overweight'] = df['family_history_with_overweight'].map({'yes': 1, 'no': 0})
    df['FAVC'] = df['FAVC'].map({'yes': 1, 'no': 0})
    df['SMOKE'] = df['SMOKE'].map({'yes': 1, 'no': 0})
    df['SCC'] = df['SCC'].map({'yes': 1, 'no': 0})
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    
    # Custom mapping for CAEC and CALC to match with the Research paper
    custom_mapping = {'no': 1, 'Sometimes': 2, 'Frequently': 3, 'Always': 4}
    df['CAEC'] = df['CAEC'].map(custom_mapping)
    df['CALC'] = df['CALC'].map(custom_mapping)
    
    # One-hot encoding for MTRANS
    one_hot_encoder = preprocessing.OneHotEncoder()
    means_of_trns_encoded = pd.DataFrame(
        one_hot_encoder.fit_transform(df[['MTRANS']]).toarray(),
        columns=one_hot_encoder.get_feature_names_out(['MTRANS'])) 
    
    # Join the one-hot encoded columns and drop the original MTRANS column
    transformed_df = df.join(means_of_trns_encoded) 
    transformed_df = transformed_df.drop(["MTRANS"], axis=1)
    
    return transformed_df


In [18]:
X_train= data_transformation(X_train)

In [19]:
X_train

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0,-0.522124,-0.875589,-0.862558,1,0,-0.785019,0.404153,2,0,-0.013073,0,-1.188039,0.561997,1,0.0,0.0,0.0,1.0,0.0
1,0,-0.522124,-1.947599,-1.168077,1,0,1.088342,0.404153,2,1,1.618759,1,2.339750,-1.080625,2,0.0,0.0,0.0,1.0,0.0
2,1,-0.206889,1.054029,-0.366090,1,0,-0.785019,0.404153,2,0,-0.013073,0,1.163820,0.561997,3,0.0,0.0,0.0,1.0,0.0
3,1,0.423582,1.054029,0.015808,0,0,1.088342,0.404153,2,0,-0.013073,0,1.163820,-1.080625,3,0.0,0.0,0.0,0.0,1.0
4,1,-0.364507,0.839627,0.122740,0,0,-0.785019,-2.167023,2,0,-0.013073,0,-1.188039,-1.080625,2,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,-0.525774,0.097045,1.711763,1,1,1.088342,0.404153,2,0,-0.456705,0,0.783135,0.407996,2,0.0,0.0,0.0,1.0,0.0
2107,0,-0.367195,0.502844,1.800914,1,1,1.088342,0.404153,2,0,-0.004702,0,0.389341,-0.096251,2,0.0,0.0,0.0,1.0,0.0
2108,0,-0.281909,0.541672,1.798868,1,1,1.088342,0.404153,2,0,0.075361,0,0.474971,-0.019018,2,0.0,0.0,0.0,1.0,0.0
2109,0,0.007776,0.404927,1.785780,1,1,1.088342,0.404153,2,0,1.377801,0,0.151471,-0.117991,2,0.0,0.0,0.0,1.0,0.0


In [19]:
X_test= data_transformation(X_test)

In [20]:
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Generate classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)


Accuracy: 0.9071530080530554
Confusion Matrix:
[[270   2   0   0   0   0   0]
 [ 26 233   0   0   0  25   3]
 [  0   0 320  21   1   0   9]
 [  0   0   3 294   0   0   0]
 [  0   0   1   1 322   0   0]
 [  0   6   0   0   0 243  41]
 [  0   0  32   3   0  22 233]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       272
           1       0.97      0.81      0.88       287
           2       0.90      0.91      0.91       351
           3       0.92      0.99      0.95       297
           4       1.00      0.99      1.00       324
           5       0.84      0.84      0.84       290
           6       0.81      0.80      0.81       290

    accuracy                           0.91      2111
   macro avg       0.91      0.91      0.91      2111
weighted avg       0.91      0.91      0.91      2111



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
TP = np.diag(conf_matrix)
FP = conf_matrix.sum(axis=0) - TP
FN = conf_matrix.sum(axis=1) - TP
TN = conf_matrix.sum() - (FP + FN + TP)

# Print true positives, false positives, true negatives, and false negatives
print('True Positives (TP):', TP)
print('False Positives (FP):', FP)
print('True Negatives (TN):', TN)
print('False Negatives (FN):', FN)

# Optionally, print these metrics for each class
for i, class_name in enumerate(label_encoder.classes_):
    print(f'\nClass: {class_name}')
    print(f'TP: {TP[i]}')
    print(f'FP: {FP[i]}')
    print(f'TN: {TN[i]}')
    print(f'FN: {FN[i]}')



True Positives (TP): [270 233 320 294 322 243 233]
False Positives (FP): [26  8 36 25  1 47 53]
True Negatives (TN): [1813 1816 1724 1789 1786 1774 1768]
False Negatives (FN): [ 2 54 31  3  2 47 57]

Class: Insufficient_Weight
TP: 270
FP: 26
TN: 1813
FN: 2

Class: Normal_Weight
TP: 233
FP: 8
TN: 1816
FN: 54

Class: Obesity_Type_I
TP: 320
FP: 36
TN: 1724
FN: 31

Class: Obesity_Type_II
TP: 294
FP: 25
TN: 1789
FN: 3

Class: Obesity_Type_III
TP: 322
FP: 1
TN: 1786
FN: 2

Class: Overweight_Level_I
TP: 243
FP: 47
TN: 1774
FN: 47

Class: Overweight_Level_II
TP: 233
FP: 53
TN: 1768
FN: 57


# Approach 3

Approach 3 is in another file named as InitialCodeFile-3 LR,SVM,RF