In [None]:
import pandas as pd

# Read the raw diet-recommendation records from the CSV file
df = pd.read_csv("Personalized_Diet_Recommendations.csv")

# Report how many null entries each column holds (header line, then the per-column counts)
print("Missing Values per Column:", df.isna().sum(), sep="\n")

In [None]:
# Replace "None" / missing entries with "No_Condition" in the condition-style columns.
# Whether read_csv parses the text "None" as NaN depends on the default NA strings of
# the installed pandas version, so both NaN cells and literal "None" cells are handled.
cols_to_replace = ['Chronic_Disease', 'Allergies', 'Food_Aversions']
df[cols_to_replace] = (
    df[cols_to_replace]
    .fillna("No_Condition")              # cells that were parsed as missing
    .replace("None", "No_Condition")     # cells that kept the literal text "None"
)

# Verify no nulls remain
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

In [None]:
# Define new column types
str_cols = ['Patient_ID', 'Gender', 'Chronic_Disease', 'Genetic_Risk_Factor', 'Allergies','Alcohol_Consumption',
            'Smoking_Habit','Dietary_Habits', 'Preferred_Cuisine','Food_Aversions', 'Recommended_Meal_Plan']
int_cols = ['Age', 'Height_cm', 'Weight_kg', 'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic',
            'Cholesterol_Level', 'Blood_Sugar_Level', 'Daily_Steps', 'Exercise_Frequency',
            'Caloric_Intake', 'Protein_Intake', 'Carbohydrate_Intake', 'Fat_Intake',
            'Recommended_Calories', 'Recommended_Protein', 'Recommended_Carbs', 'Recommended_Fats']
float_cols = ['BMI', 'Sleep_Hours']

# Convert string columns.
# A bare astype(str) also converts missing entries into the literal text "nan", which
# hides them from isnull() and writes "nan" to the exported CSV. The .where(...) call
# puts the missing marker back for every cell that was missing before the conversion.
for col in str_cols:
    df[col] = df[col].astype(str).where(df[col].notna())

# Convert integer columns (handle non-numeric values)
for col in int_cols:
    # Non-numeric text becomes NaN, values are rounded, and the nullable Int64 dtype
    # lets the column hold integers and missing values at the same time.
    df[col] = pd.to_numeric(df[col], errors='coerce').round().astype('Int64')

# Convert float columns (non-numeric text becomes NaN)
for col in float_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

# Verify data types
print("\nData Types After Conversion:")
print(df.dtypes)


In [4]:
# Step 3.1: Validate BMI
# Recompute BMI from Weight_kg and Height_cm (divided by 100), then measure how far
# the stored BMI column deviates from the recomputed value.
height_scaled = df['Height_cm'] / 100
df['Calculated_BMI'] = df['Weight_kg'] / height_scaled ** 2
df['BMI_Error'] = (df['BMI'] - df['Calculated_BMI']).abs()
bmi_mismatch = df['BMI_Error'] > 1
print("Rows with BMI discrepancy (>1 unit):", len(df[bmi_mismatch]))
# The two helper columns were only needed for the check above
df = df.drop(columns=['Calculated_BMI', 'BMI_Error'])

# Step 3.2: Check for invalid values
# A row is flagged when any intake / recommendation column is zero or negative.
nutrient_cols = ['Caloric_Intake', 'Recommended_Calories',
                 'Protein_Intake', 'Carbohydrate_Intake',
                 'Fat_Intake', 'Recommended_Protein',
                 'Recommended_Carbs', 'Recommended_Fats']
non_positive = df[nutrient_cols[0]] <= 0
for nutrient in nutrient_cols[1:]:
    non_positive = non_positive | (df[nutrient] <= 0)
invalid_caloric = df[non_positive]
print("Rows with invalid caloric/nutrient values:", len(invalid_caloric))

Rows with BMI discrepancy (>1 unit): 0
Rows with invalid caloric/nutrient values: 0


In [None]:
# Columns handled as integers, in the order they are processed below
int_cols = [
    'Height_cm', 'Age', 'Weight_kg',
    'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic',
    'Cholesterol_Level', 'Blood_Sugar_Level',
    'Daily_Steps', 'Exercise_Frequency',
    'Caloric_Intake', 'Protein_Intake', 'Carbohydrate_Intake', 'Fat_Intake',
    'Recommended_Calories', 'Recommended_Protein', 'Recommended_Carbs', 'Recommended_Fats',
]
# Columns handled as floats
float_cols = ['BMI', 'Sleep_Hours']

# Step 3.3: Cap outliers using IQR method
def cap_outliers_iqr(df, col):
    """Cap the values of ``df[col]`` to the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR.

    Prints how many values lie outside the fences, then clips the column to them.
    The column is overwritten on the frame that was passed in, and that same
    frame is returned.

    For integer columns (including nullable ``Int64``) the fences are rounded
    inward to whole numbers before clipping. Clipping an integer column to a
    fractional bound would otherwise either fail or put non-integer values into
    the column. Rounding inward does not change which integer values count as
    outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to cap.
    col : str
        Name of a numeric column in ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``col`` clipped.
    """
    import math  # local import: keeps this cell independent of the notebook's import cell

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count with the exact (unrounded) fences so the reported number is unchanged.
    outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
    print(f"Outliers in {col} (IQR method):", 
          len(df[outlier_mask]))

    if pd.api.types.is_integer_dtype(df[col].dtype):
        # Bounds are NA when the column has no usable values; leave them untouched
        # in that case and let clip() ignore them.
        if not pd.isna(lower_bound):
            lower_bound = math.ceil(lower_bound)
        if not pd.isna(upper_bound):
            upper_bound = math.floor(upper_bound)

    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    return df

# Cap every numeric column in turn: the integer columns first, then the float columns
for col in [*int_cols, *float_cols]:
    df = cap_outliers_iqr(df, col)

In [6]:
# Check for duplicate Patient_ID: keep=False marks every row whose ID occurs more than once
repeated_id_mask = df.duplicated(subset=['Patient_ID'], keep=False)
duplicate_ids = df.loc[repeated_id_mask]
print("Number of duplicate Patient_IDs:", duplicate_ids.shape[0])

# Check for full row duplicates: rows that match another row in every column
repeated_row_mask = df.duplicated(keep=False)
full_duplicates = df.loc[repeated_row_mask]
print("Number of full row duplicates:", full_duplicates.shape[0])

Number of duplicate Patient_IDs: 0
Number of full row duplicates: 0


In [None]:
# Categorical columns to inspect
str_cols = [
    'Gender', 'Chronic_Disease', 'Genetic_Risk_Factor', 'Allergies',
    'Alcohol_Consumption', 'Smoking_Habit', 'Dietary_Habits',
    'Preferred_Cuisine', 'Food_Aversions', 'Recommended_Meal_Plan',
]

# Step 5.1: For each categorical column, show its distinct values and their frequencies
print("Unique Values and Value Counts in Categorical Columns:")
for col, values in df[str_cols].items():
    print(f"\n{col}:")
    print("Unique Values:", values.unique())
    print("Value Counts:\n", values.value_counts())

In [None]:
# Numeric column groups
int_cols = [
    'Age', 'Height_cm', 'Weight_kg',
    'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic',
    'Cholesterol_Level', 'Blood_Sugar_Level',
    'Daily_Steps', 'Exercise_Frequency',
    'Caloric_Intake', 'Protein_Intake', 'Carbohydrate_Intake', 'Fat_Intake',
    'Recommended_Calories', 'Recommended_Protein', 'Recommended_Carbs', 'Recommended_Fats',
]
float_cols = ['BMI', 'Sleep_Hours']

# Step 6.1: Check BMI consistency
# Compare the stored BMI with the value recomputed from Weight_kg and Height_cm.
print("Total rows:", df.shape[0])
recomputed_bmi = df['Weight_kg'] / (df['Height_cm'] / 100) ** 2
df['Calculated_BMI'] = recomputed_bmi
df['BMI_Error'] = (df['BMI'] - df['Calculated_BMI']).abs()
invalid_bmi = df.loc[df['BMI_Error'] > 1]
print("Rows with BMI discrepancy (>1 unit):", len(invalid_bmi))
# Remove the helper columns again so the frame keeps its original set of columns
df = df.drop(columns=['Calculated_BMI', 'BMI_Error'])

# Step 6.2: Count zero-or-negative values in columns that are expected to be positive.
# Both column groups get the same <= 0 test and the same report line, so they are
# walked in a single pass (positive_cols first, then neg_cols).
positive_cols = [
    'Age', 'Height_cm', 'Weight_kg',
    'Caloric_Intake', 'Protein_Intake', 'Carbohydrate_Intake', 'Fat_Intake',
    'Recommended_Calories', 'Recommended_Protein', 'Recommended_Carbs', 'Recommended_Fats',
    'Sleep_Hours',
]
neg_cols = ['Cholesterol_Level', 'Blood_Sugar_Level']
for col in positive_cols + neg_cols:
    invalid_rows = df.loc[df[col] <= 0]
    print(f"Rows with invalid (≤0) values in {col}:", len(invalid_rows))

# Step 6.3: Blood pressure, flagging rows where systolic is not above diastolic
systolic = df['Blood_Pressure_Systolic']
diastolic = df['Blood_Pressure_Diastolic']
invalid_bp = df.loc[systolic <= diastolic]
print("Rows with invalid blood pressure (Systolic ≤ Diastolic):", len(invalid_bp))

# Step 6.4: Exercise_Frequency, flagging values below 0 or above 7
exercise = df['Exercise_Frequency']
invalid_exercise = df.loc[(exercise < 0) | (exercise > 7)]
print("Rows with invalid Exercise_Frequency (<0 or >7):", len(invalid_exercise))

# Step 6.5: Daily_Steps, flagging negative values
invalid_steps = df.loc[df['Daily_Steps'] < 0]
print("Rows with invalid Daily_Steps (<0):", len(invalid_steps))

In [None]:
# Write the processed frame to a new CSV file, leaving out the row index
df.to_csv(
    path_or_buf="Personalized_Diet_RecommendationsGC.csv",
    index=False,
)