In [1]:
import pandas as pd
df = pd.read_csv("Personalized_Diet_RecommendationsGC.csv")

In [2]:
# Step 8.1: Apply one-hot encoding to Gender
print("Before encoding:")
print("Gender value counts:\n", df['Gender'].value_counts())
# Create one-hot encoded columns, drop one to avoid multicollinearity
df = pd.get_dummies(df, columns=['Gender'], prefix='Gender')
df = df.drop(columns=['Gender_Other'])
# Convert boolean to Int64
for col in ['Gender_Female', 'Gender_Male']:
    df[col] = df[col].astype('Int64')
# Step 8.2: Verify encoding
print("\nAfter encoding:")
for col in ['Gender_Female', 'Gender_Male']:
    print(f"{col} value counts:\n", df[col].value_counts())
# Step 8.3: Verify data types
print("\nData Types After Encoding:")
print(df.dtypes)

Before encoding:
Gender value counts:
 Gender
Female    1695
Other     1653
Male      1652
Name: count, dtype: int64

After encoding:
Gender_Female value counts:
 Gender_Female
0    3305
1    1695
Name: count, dtype: Int64
Gender_Male value counts:
 Gender_Male
0    3348
1    1652
Name: count, dtype: Int64

Data Types After Encoding:
Patient_ID                   object
Age                           int64
Height_cm                     int64
Weight_kg                     int64
BMI                         float64
Chronic_Disease              object
Blood_Pressure_Systolic       int64
Blood_Pressure_Diastolic      int64
Cholesterol_Level             int64
Blood_Sugar_Level             int64
Genetic_Risk_Factor          object
Allergies                    object
Daily_Steps                   int64
Exercise_Frequency            int64
Sleep_Hours                 float64
Alcohol_Consumption          object
Smoking_Habit                object
Dietary_Habits               object
Caloric_Intake  

In [3]:
# Step 9.1: Apply one-hot encoding to Chronic_Disease
print("Before encoding:")
print("Chronic_Disease value counts:\n", df['Chronic_Disease'].value_counts())

# Create one-hot encoded columns, drop one to avoid multicollinearity
df = pd.get_dummies(df, columns=['Chronic_Disease'], prefix='Chronic_Disease')
df = df.drop(columns=['Chronic_Disease_No_Condition'])

# Convert boolean to Int64
chronic_disease_cols = [col for col in df.columns if col.startswith('Chronic_Disease_')]
for col in chronic_disease_cols:
    df[col] = df[col].astype('Int64')

# Step 9.2: Verify encoding
print("\nAfter encoding:")
for col in chronic_disease_cols:
    print(f"{col} value counts:\n", df[col].value_counts())

# Step 9.3: Verify data types
print("\nData Types After Encoding:")
print(df.dtypes)

Before encoding:
Chronic_Disease value counts:
 Chronic_Disease
No_Condition     2043
Diabetes         1019
Heart Disease     749
Hypertension      693
Obesity           496
Name: count, dtype: int64

After encoding:
Chronic_Disease_Diabetes value counts:
 Chronic_Disease_Diabetes
0    3981
1    1019
Name: count, dtype: Int64
Chronic_Disease_Heart Disease value counts:
 Chronic_Disease_Heart Disease
0    4251
1     749
Name: count, dtype: Int64
Chronic_Disease_Hypertension value counts:
 Chronic_Disease_Hypertension
0    4307
1     693
Name: count, dtype: Int64
Chronic_Disease_Obesity value counts:
 Chronic_Disease_Obesity
0    4504
1     496
Name: count, dtype: Int64

Data Types After Encoding:
Patient_ID                        object
Age                                int64
Height_cm                          int64
Weight_kg                          int64
BMI                              float64
Blood_Pressure_Systolic            int64
Blood_Pressure_Diastolic           int64
Choleste

In [4]:
# Step 10.1: Apply binary encoding to Genetic_Risk_Factor
print("Before encoding:")
print("Genetic_Risk_Factor value counts:\n", df['Genetic_Risk_Factor'].value_counts())
# Map values: No=0, Yes=1
df['Genetic_Risk_Factor'] = df['Genetic_Risk_Factor'].map({'No': 0, 'Yes': 1})
# Convert to Int64
df['Genetic_Risk_Factor'] = df['Genetic_Risk_Factor'].astype('Int64')
# Step 10.2: Verify encoding
print("\nAfter encoding:")
print("Genetic_Risk_Factor value counts:\n", df['Genetic_Risk_Factor'].value_counts())
# Step 10.3: Verify data types
print("\nData Types After Encoding:")
print(df.dtypes)

Before encoding:
Genetic_Risk_Factor value counts:
 Genetic_Risk_Factor
No     3472
Yes    1528
Name: count, dtype: int64

After encoding:
Genetic_Risk_Factor value counts:
 Genetic_Risk_Factor
0    3472
1    1528
Name: count, dtype: Int64

Data Types After Encoding:
Patient_ID                        object
Age                                int64
Height_cm                          int64
Weight_kg                          int64
BMI                              float64
Blood_Pressure_Systolic            int64
Blood_Pressure_Diastolic           int64
Cholesterol_Level                  int64
Blood_Sugar_Level                  int64
Genetic_Risk_Factor                Int64
Allergies                         object
Daily_Steps                        int64
Exercise_Frequency                 int64
Sleep_Hours                      float64
Alcohol_Consumption               object
Smoking_Habit                     object
Dietary_Habits                    object
Caloric_Intake                     

In [5]:
# Step 11.1: Apply one-hot encoding to Allergies
print("Before encoding:")
print("Allergies value counts:\n", df['Allergies'].value_counts())

# Create one-hot encoded columns, drop one to avoid multicollinearity
df = pd.get_dummies(df, columns=['Allergies'], prefix='Allergies')
df = df.drop(columns=['Allergies_No_Condition'])

# Rename columns to replace spaces with underscores
df.columns = [col.replace(' ', '_') for col in df.columns]

# Convert boolean to Int64
allergies_cols = [col for col in df.columns if col.startswith('Allergies_')]
for col in allergies_cols:
    df[col] = df[col].astype('Int64')

# Step 11.2: Verify encoding
print("\nAfter encoding:")
for col in allergies_cols:
    print(f"{col} value counts:\n", df[col].value_counts())

# Step 11.3: Verify data types
print("\nData Types After Encoding:")
print(df.dtypes)

Before encoding:
Allergies value counts:
 Allergies
No_Condition           3497
Lactose Intolerance     556
Nut Allergy             483
Gluten Intolerance      464
Name: count, dtype: int64

After encoding:
Allergies_Gluten_Intolerance value counts:
 Allergies_Gluten_Intolerance
0    4536
1     464
Name: count, dtype: Int64
Allergies_Lactose_Intolerance value counts:
 Allergies_Lactose_Intolerance
0    4444
1     556
Name: count, dtype: Int64
Allergies_Nut_Allergy value counts:
 Allergies_Nut_Allergy
0    4517
1     483
Name: count, dtype: Int64

Data Types After Encoding:
Patient_ID                        object
Age                                int64
Height_cm                          int64
Weight_kg                          int64
BMI                              float64
Blood_Pressure_Systolic            int64
Blood_Pressure_Diastolic           int64
Cholesterol_Level                  int64
Blood_Sugar_Level                  int64
Genetic_Risk_Factor                Int64
Daily_Step

In [6]:
# Step 12.1: Apply binary encoding to Alcohol_Consumption
print("Before encoding:")
print("Alcohol_Consumption value counts:\n", df['Alcohol_Consumption'].value_counts())

# Map values: No=0, Yes=1
df['Alcohol_Consumption'] = df['Alcohol_Consumption'].map({'No': 0, 'Yes': 1})

# Convert to Int64
df['Alcohol_Consumption'] = df['Alcohol_Consumption'].astype('Int64')

# Step 12.2: Verify encoding
print("\nAfter encoding:")
print("Alcohol_Consumption value counts:\n", df['Alcohol_Consumption'].value_counts())

# Step 12.3: Verify data types
print("\nData Types After Encoding:")
print(df.dtypes)

Before encoding:
Alcohol_Consumption value counts:
 Alcohol_Consumption
No     3509
Yes    1491
Name: count, dtype: int64

After encoding:
Alcohol_Consumption value counts:
 Alcohol_Consumption
0    3509
1    1491
Name: count, dtype: Int64

Data Types After Encoding:
Patient_ID                        object
Age                                int64
Height_cm                          int64
Weight_kg                          int64
BMI                              float64
Blood_Pressure_Systolic            int64
Blood_Pressure_Diastolic           int64
Cholesterol_Level                  int64
Blood_Sugar_Level                  int64
Genetic_Risk_Factor                Int64
Daily_Steps                        int64
Exercise_Frequency                 int64
Sleep_Hours                      float64
Alcohol_Consumption                Int64
Smoking_Habit                     object
Dietary_Habits                    object
Caloric_Intake                     int64
Protein_Intake                     

In [7]:
# Step 13.1: Apply binary encoding to Smoking_Habit
print("Before encoding:")
print("Smoking_Habit value counts:\n", df['Smoking_Habit'].value_counts())

# Map values: No=0, Yes=1
df['Smoking_Habit'] = df['Smoking_Habit'].map({'No': 0, 'Yes': 1})

# Convert to Int64
df['Smoking_Habit'] = df['Smoking_Habit'].astype('Int64')

# Step 13.2: Verify encoding
print("\nAfter encoding:")
print("Smoking_Habit value counts:\n", df['Smoking_Habit'].value_counts())

Before encoding:
Smoking_Habit value counts:
 Smoking_Habit
No     4025
Yes     975
Name: count, dtype: int64

After encoding:
Smoking_Habit value counts:
 Smoking_Habit
0    4025
1     975
Name: count, dtype: Int64


In [8]:
# Step 14.1: Apply one-hot encoding to Dietary_Habits
print("Before encoding:")
print("Dietary_Habits value counts:\n", df['Dietary_Habits'].value_counts())

# Create one-hot encoded columns, drop one to avoid multicollinearity
df = pd.get_dummies(df, columns=['Dietary_Habits'], prefix='Dietary_Habits')
df = df.drop(columns=['Dietary_Habits_Regular'])

# Replace spaces with underscores in column names
df.columns = [col.replace(' ', '_') for col in df.columns]

# Convert boolean to Int64
dietary_habits_cols = [col for col in df.columns if col.startswith('Dietary_Habits_')]
for col in dietary_habits_cols:
    df[col] = df[col].astype('Int64')

# Step 14.2: Verify encoding
print("\nAfter encoding:")
for col in dietary_habits_cols:
    print(f"{col} value counts:\n", df[col].value_counts())

Before encoding:
Dietary_Habits value counts:
 Dietary_Habits
Regular       2498
Keto          1006
Vegetarian    1001
Vegan          495
Name: count, dtype: int64

After encoding:
Dietary_Habits_Keto value counts:
 Dietary_Habits_Keto
0    3994
1    1006
Name: count, dtype: Int64
Dietary_Habits_Vegan value counts:
 Dietary_Habits_Vegan
0    4505
1     495
Name: count, dtype: Int64
Dietary_Habits_Vegetarian value counts:
 Dietary_Habits_Vegetarian
0    3999
1    1001
Name: count, dtype: Int64


In [9]:
# Step 15.1: Apply one-hot encoding to Preferred_Cuisine
print("Before encoding:")
print("Preferred_Cuisine value counts:\n", df['Preferred_Cuisine'].value_counts())

# Create one-hot encoded columns, drop one to avoid multicollinearity
df = pd.get_dummies(df, columns=['Preferred_Cuisine'], prefix='Preferred_Cuisine')

# Replace spaces with underscores in column names
df.columns = [col.replace(' ', '_') for col in df.columns]

# Convert boolean to Int64
preferred_cuisine_cols = [col for col in df.columns if col.startswith('Preferred_Cuisine_')]
for col in preferred_cuisine_cols:
    df[col] = df[col].astype('Int64')

# Step 15.2: Verify encoding
print("\nAfter encoding:")
for col in preferred_cuisine_cols:
    print(f"{col} value counts:\n", df[col].value_counts())

Before encoding:
Preferred_Cuisine value counts:
 Preferred_Cuisine
Indian           1259
Asian            1259
Western          1257
Mediterranean    1225
Name: count, dtype: int64

After encoding:
Preferred_Cuisine_Asian value counts:
 Preferred_Cuisine_Asian
0    3741
1    1259
Name: count, dtype: Int64
Preferred_Cuisine_Indian value counts:
 Preferred_Cuisine_Indian
0    3741
1    1259
Name: count, dtype: Int64
Preferred_Cuisine_Mediterranean value counts:
 Preferred_Cuisine_Mediterranean
0    3775
1    1225
Name: count, dtype: Int64
Preferred_Cuisine_Western value counts:
 Preferred_Cuisine_Western
0    3743
1    1257
Name: count, dtype: Int64


In [10]:
# Step 16.1: Apply one-hot encoding to Food_Aversions
print("Before encoding:")
print("Food_Aversions value counts:\n", df['Food_Aversions'].value_counts())

# Create one-hot encoded columns, drop one to avoid multicollinearity
df = pd.get_dummies(df, columns=['Food_Aversions'], prefix='Food_Aversions')


# Replace spaces with underscores in column names
df.columns = [col.replace(' ', '_') for col in df.columns]
df = df.drop(columns=['Food_Aversions_No_Condition'])
# Convert boolean to Int64
food_aversion_cols = [col for col in df.columns if col.startswith('Food_Aversions_')]
for col in food_aversion_cols:
    df[col] = df[col].astype('Int64')

# Step 16.2: Verify encoding
print("\nAfter encoding:")
for col in food_aversion_cols:
    print(f"{col} value counts:\n", df[col].value_counts())

Before encoding:
Food_Aversions value counts:
 Food_Aversions
Spicy           1263
Sweet           1262
Salty           1250
No_Condition    1225
Name: count, dtype: int64

After encoding:
Food_Aversions_Salty value counts:
 Food_Aversions_Salty
0    3750
1    1250
Name: count, dtype: Int64
Food_Aversions_Spicy value counts:
 Food_Aversions_Spicy
0    3737
1    1263
Name: count, dtype: Int64
Food_Aversions_Sweet value counts:
 Food_Aversions_Sweet
0    3738
1    1262
Name: count, dtype: Int64


In [11]:
# Step 17.1: Verify columns before removal
print("Before removal:")
print("Number of columns:", len(df.columns))
print("Columns:\n", df.columns.tolist())

# Step 17.2: Remove specified columns
columns_to_remove = ['Recommended_Calories', 'Recommended_Protein', 'Recommended_Carbs', 'Recommended_Fats']
df = df.drop(columns=columns_to_remove)

# Step 17.3: Verify columns after removal
print("\nAfter removal:")
print("Number of columns:", len(df.columns))
print("Columns:\n", df.columns.tolist())

Before removal:
Number of columns: 43
Columns:
 ['Patient_ID', 'Age', 'Height_cm', 'Weight_kg', 'BMI', 'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic', 'Cholesterol_Level', 'Blood_Sugar_Level', 'Genetic_Risk_Factor', 'Daily_Steps', 'Exercise_Frequency', 'Sleep_Hours', 'Alcohol_Consumption', 'Smoking_Habit', 'Caloric_Intake', 'Protein_Intake', 'Carbohydrate_Intake', 'Fat_Intake', 'Recommended_Calories', 'Recommended_Protein', 'Recommended_Carbs', 'Recommended_Fats', 'Recommended_Meal_Plan', 'Gender_Female', 'Gender_Male', 'Chronic_Disease_Diabetes', 'Chronic_Disease_Heart_Disease', 'Chronic_Disease_Hypertension', 'Chronic_Disease_Obesity', 'Allergies_Gluten_Intolerance', 'Allergies_Lactose_Intolerance', 'Allergies_Nut_Allergy', 'Dietary_Habits_Keto', 'Dietary_Habits_Vegan', 'Dietary_Habits_Vegetarian', 'Preferred_Cuisine_Asian', 'Preferred_Cuisine_Indian', 'Preferred_Cuisine_Mediterranean', 'Preferred_Cuisine_Western', 'Food_Aversions_Salty', 'Food_Aversions_Spicy', 'Food_Aversion

In [23]:
#Step 18.1: Verify values before encoding
print("Before encoding:")
print("Recommended_Meal_Plan value counts:\n", df['Recommended_Meal_Plan'].value_counts())

# Step 18.2: Apply label encoding to Recommended_Meal_Plan
# Create a mapping dictionary
label_mapping = {label: idx for idx, label in enumerate(df['Recommended_Meal_Plan'].unique())}
# Apply encoding
df['Recommended_Meal_Plan'] = df['Recommended_Meal_Plan'].map(label_mapping)
# Convert to Int64
df['Recommended_Meal_Plan'] = df['Recommended_Meal_Plan'].astype('Int64')

# Step 18.3: Verify encoding
print("\nAfter encoding:")
print("Recommended_Meal_Plan value counts:\n", df['Recommended_Meal_Plan'].value_counts())

# Step 18.4: Create and save mapping file
mapping_df = pd.DataFrame({
    'Recommended_Meal_Plan': label_mapping.keys(),
    'Encoded_Meal_Plan': label_mapping.values()
})
mapping_df.to_csv("Meal_Plan_Mapping.csv", index=False)
print("\nMapping saved to 'Meal_Plan_Mapping.csv':")
print(mapping_df)

# Step 18.5: Verify data types
print("\nData Types After Encoding:")
print(df.dtypes)

# Step 18.6: Verify column count
print("\nNumber of columns:", len(df.columns))
print("Columns:\n", df.columns.tolist())

# Save the updated dataset
df.to_csv("Personalized_Diet_RecommendationsDC.csv", index=False)

Before encoding:
Recommended_Meal_Plan value counts:
 Recommended_Meal_Plan
Low-Fat Diet         1313
High-Protein Diet    1255
Balanced Diet        1250
Low-Carb Diet        1182
Name: count, dtype: int64

After encoding:
Recommended_Meal_Plan value counts:
 Recommended_Meal_Plan
2    1313
0    1255
1    1250
3    1182
Name: count, dtype: Int64

Mapping saved to 'Meal_Plan_Mapping.csv':
  Recommended_Meal_Plan  Encoded_Meal_Plan
0     High-Protein Diet                  0
1         Balanced Diet                  1
2          Low-Fat Diet                  2
3         Low-Carb Diet                  3

Data Types After Encoding:
Patient_ID                          object
Age                                  int64
Height_cm                            int64
Weight_kg                            int64
BMI                                float64
Blood_Pressure_Systolic              int64
Blood_Pressure_Diastolic             int64
Cholesterol_Level                    int64
Blood_Sugar_Level     