In [21]:
import re
import pandas as pd
from PyPDF2 import PdfReader


def extract_dishes_from_pdf(file_path):
    # Load the PDF
    reader = PdfReader(file_path)
    text = ""

    # Extract text from all pages
    for page in reader.pages:
        text += page.extract_text()

    # Merge fragmented text
    text = text.replace("\n", " ")

    # Refined regex for dish names
    dish_pattern = r"\b([A-Za-z]+(?: [A-Za-z]+)+)\b"

    # Find matches
    all_matches = re.findall(dish_pattern, text)

    # Filter irrelevant items
    ignore_keywords = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", 
                       "Per Plate", "Served", "Calories", "Breakfast", "Lunch", "Dinner", "Snack"}

    filtered_dishes = [
        match for match in all_matches
        if len(match.split()) > 1 and match not in ignore_keywords
    ]

    return sorted(set(filtered_dishes))  # Return unique and sorted dishes


def match_dishes_with_calories(dishes, dataset_path):
    # Load the dataset
    df = pd.read_csv(dataset_path)

    # Ensure consistent casing for matching
    df["Dish"] = df["Dish"].str.lower()
    dishes = [dish.lower() for dish in dishes]

    # Match dishes with their calorie data
    matched_dishes = df[df["Dish"].isin(dishes)]

    print(f"Matched Dishes:\n{matched_dishes}")  # Debugging step to check matched dishes
    return matched_dishes



def recommend_menu(user_preference, matched_dishes):
    if "Diet" not in matched_dishes.columns:
        raise ValueError("The 'Diet' column is missing in the dataset.")
    
    if "Calories" not in matched_dishes.columns or "Protein" not in matched_dishes.columns or \
       "Fats" not in matched_dishes.columns or "Sugars" not in matched_dishes.columns:
        raise ValueError("Required nutritional columns (Calories, Protein, Fat, Sugar) are missing in the dataset.")
    
    # Filter dishes based on user preference
    filtered_dishes = matched_dishes[matched_dishes["Diet"].str.contains(user_preference, case=False, na=False)]

    # Generate recommendations for breakfast, lunch, snacks, and dinner
    menu = {
        "Breakfast": [],
        "Lunch": [],
        "Snacks": [],
        "Dinner": []
    }

    nutrition_info = {
        "Breakfast": {"Calories": 0, "Protein": 0, "Fat": 0, "Sugar": 0},
        "Lunch": {"Calories": 0, "Protein": 0, "Fat": 0, "Sugar": 0},
        "Snacks": {"Calories": 0, "Protein": 0, "Fat": 0, "Sugar": 0},
        "Dinner": {"Calories": 0, "Protein": 0, "Fat": 0, "Sugar": 0}
    }

    # Sample dishes for each meal type
    for meal in menu.keys():
        meal_dishes = filtered_dishes[filtered_dishes["Category"].str.contains(meal, case=False, na=False)]
        
        # Ensure that we don't try to sample more dishes than available
        sample_size = min(len(meal_dishes), 7)  # Sample as many as possible, but no more than 7

        if sample_size > 0:
            selected_dishes = meal_dishes.sample(n=sample_size, replace=False)  # Avoid duplicates

            for _, dish_row in selected_dishes.iterrows():
                menu[meal].append(dish_row["Dish"])

                # Convert values to numeric (handling possible non-numeric characters like 'g' or missing values)
                calories = pd.to_numeric(dish_row["Calories"], errors='coerce')  # Convert to number, replace errors with NaN
                
                # Use .replace() for single values (no regex support for `str.replace()`)
                protein = dish_row["Protein"].replace('g', '') if isinstance(dish_row["Protein"], str) else str(dish_row["Protein"])
                fat = dish_row["Fats"].replace('g', '') if isinstance(dish_row["Fats"], str) else str(dish_row["Fats"])
                sugar = dish_row["Sugars"].replace('g', '') if isinstance(dish_row["Sugars"], str) else str(dish_row["Sugars"])

                # Convert to numeric after replacing 'g' and handle missing values
                protein = pd.to_numeric(protein, errors='coerce')
                fat = pd.to_numeric(fat, errors='coerce')
                sugar = pd.to_numeric(sugar, errors='coerce')

                # Default NaN to 0 if the conversion failed
                calories = calories if not pd.isna(calories) else 0
                protein = protein if not pd.isna(protein) else 0
                fat = fat if not pd.isna(fat) else 0
                sugar = sugar if not pd.isna(sugar) else 0

                # Add nutritional info
                nutrition_info[meal]["Calories"] += calories
                nutrition_info[meal]["Protein"] += protein
                nutrition_info[meal]["Fat"] += fat
                nutrition_info[meal]["Sugar"] += sugar

    return menu, nutrition_info



# Example usage
pdf_path = r"C:\Users\LENOVO\Downloads\Mess_Menu_June.pdf"
dataset_path = r"C:\Users\LENOVO\Downloads\expanded_hostel_menufinal.csv"

# Read the dataset
df = pd.read_csv(dataset_path)

# Ensure the correct column names
df.columns = df.columns.str.strip()

expected_columns = ['Dish', 'Calories', 'Category', 'Protein', 'Fats', 'Sugars', 'Diet']
if not all(col in df.columns for col in expected_columns):
    print("Missing columns!")
else:
    print("All required columns are present.")

# Convert 'Protein', 'Fats', 'Sugars' to integers by removing 'g' and then converting to integer
df['Protein'] = df['Protein'].apply(lambda x: int(x.replace('g', '')) if isinstance(x, str) else 0)
df['Fats'] = df['Fats'].apply(lambda x: int(x.replace('g', '')) if isinstance(x, str) else 0)
df['Sugars'] = df['Sugars'].apply(lambda x: int(x.replace('g', '')) if isinstance(x, str) else 0)
df['Calories'] = df['Calories'].astype(int)

# Check if the columns are in the correct format
print(df.head())

# Extract dishes from the PDF
dishes = extract_dishes_from_pdf(pdf_path)

# Match dishes with calorie data
matched_dishes = match_dishes_with_calories(dishes, dataset_path)

if not matched_dishes.empty:
    print("Available Diets: Diabetic, Weight Loss, Muscle Gain, General")
    user_preference = input("Enter your diet preference (Diabetic, Weight Loss, Muscle Gain, General): ").strip()

    # Recommend a menu based on user preference
    weekly_menu, nutrition_info = recommend_menu(user_preference, matched_dishes)

# Display the weekly menu with nutritional information
print("\nRecommended Weekly Menu:")
for day in range(7):  # Loop over 7 days
    print(f"\nDay {day + 1}:")
    for meal, dishes in weekly_menu.items():
        if len(dishes) > 0:  # Ensure there are dishes available for this meal
            # Ensure that if there are fewer dishes than days, we cycle through the list
            dish_for_the_day = dishes[day % len(dishes)]  # This ensures no IndexError if there are fewer than 7 dishes
            print(f"  {meal}: {dish_for_the_day}")
        else:
            print(f"  {meal}: No dishes available")

    # Display nutritional info for the day
    total_calories = sum(nutrition_info[meal]["Calories"] for meal in nutrition_info)
    total_protein = sum(nutrition_info[meal]["Protein"] for meal in nutrition_info)
    total_fat = sum(nutrition_info[meal]["Fat"] for meal in nutrition_info)
    total_sugar = sum(nutrition_info[meal]["Sugar"] for meal in nutrition_info)

    print(f"\nTotal Nutritional Information for Day {day + 1}:")
    print(f"  Calories: {total_calories}")
    print(f"  Protein: {total_protein}")
    print(f"  Fat: {total_fat}")
    print(f"  Sugar: {total_sugar}")


else:
    print("No matching dishes found in the dataset.")


All required columns are present.
             Dish  Calories   Category  Protein  Fats  Sugars         Diet
0      Aloo Tikki       150  Breakfast        3     5      10      General
1  Badam Milk Tea       150  Breakfast        3     4      12      General
2  Beetroot Pulya       150      Lunch        4     5       8      General
3      Boost Milk       150  Breakfast        4     5      11      General
4         Chapati        70  Breakfast        2     1       2  Weight Loss
Matched Dishes:
                         Dish  Calories   Category Protein Fats Sugars  \
5                 corn flakes       100  Breakfast      2g   1g    12g   
6                 crispy corn       250  Breakfast      4g  14g     8g   
7                  cut fruits        50  Breakfast      1g   0g     8g   
8                   dal tadka       250      Lunch      8g   7g     4g   
10                 egg bhurji       200  Breakfast     12g  12g     2g   
17                 lemon rice       300      Lunch      

Enter your diet preference (Diabetic, Weight Loss, Muscle Gain, General):  General



Recommended Weekly Menu:

Day 1:
  Breakfast: cut fruits
  Lunch: jeera rice
  Snacks: No dishes available
  Dinner: sabudana vada

Total Nutritional Information for Day 1:
  Calories: 2540
  Protein: 64
  Fat: 105
  Sugar: 102

Day 2:
  Breakfast: crispy corn
  Lunch: tomato bath
  Snacks: No dishes available
  Dinner: veg hyderabadi

Total Nutritional Information for Day 2:
  Calories: 2540
  Protein: 64
  Fat: 105
  Sugar: 102

Day 3:
  Breakfast: corn flakes
  Lunch: black dal
  Snacks: No dishes available
  Dinner: sabudana vada

Total Nutritional Information for Day 3:
  Calories: 2540
  Protein: 64
  Fat: 105
  Sugar: 102

Day 4:
  Breakfast: cut fruits
  Lunch: onion rings with lemon
  Snacks: No dishes available
  Dinner: veg hyderabadi

Total Nutritional Information for Day 4:
  Calories: 2540
  Protein: 64
  Fat: 105
  Sugar: 102

Day 5:
  Breakfast: crispy corn
  Lunch: coconut chutney
  Snacks: No dishes available
  Dinner: sabudana vada

Total Nutritional Information for

In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json

def preprocess_data(dataset_path):
    # Load dataset
    data = pd.read_csv(dataset_path)

    # Ensure required columns are present
    required_columns = ["Dish", "Calories", "Category", "Protein", "Fats", "Sugars", "Diet"]
    for col in required_columns:
        if col not in data.columns:
            raise ValueError(f"Missing required column: {col}")

    # Clean numeric columns
    numeric_columns = ["Calories", "Protein", "Fats", "Sugars"]
    for col in numeric_columns:
        data[col] = data[col].replace(r"[^\d.]", "", regex=True)  # Remove non-numeric characters
        data[col] = pd.to_numeric(data[col], errors="coerce")  # Convert to numeric, NaNs for invalid

    # Drop rows with missing or invalid numeric values
    data = data.dropna(subset=numeric_columns)

    # Encode categorical columns
    data_encoded = pd.get_dummies(data, columns=["Category", "Diet"], drop_first=True)
    return data, data_encoded


def train_model(data_encoded, target_column="Suitable"):
    # Example: Add a suitable column if not present (Dummy binary target for illustration)
    if target_column not in data_encoded.columns:
        import numpy as np
        data_encoded[target_column] = np.random.randint(0, 2, size=len(data_encoded))

    # Features and target
    X = data_encoded.drop(columns=["Dish", target_column], errors="ignore")
    y = data_encoded[target_column]

    # Scale numeric features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train the model
    model = RandomForestClassifier()
    model.fit(X_scaled, y)
   
    
    return model, scaler, X.columns

    return model, scaler, X.columns
   

def recommend_dishes(user_preferences, model, scaler, feature_columns, dataset_path):
    # Load the dataset
    data = pd.read_csv(dataset_path)

    # Check if the dish exists in the dataset
    if user_preferences['Dish'] not in data['Dish'].values:
        raise ValueError(f"Dish '{user_preferences['Dish']}' is not found in the dataset.")

    # Prepare user input as a dataframe
    user_data = pd.DataFrame([user_preferences])
    user_data_encoded = pd.get_dummies(user_data, columns=["Category", "Diet"], drop_first=True)

    # Align with feature columns
    for col in feature_columns:
        if col not in user_data_encoded.columns:
            user_data_encoded[col] = 0

    # Scale the input
    user_data_scaled = scaler.transform(user_data_encoded[feature_columns])

    # Predict suitability
    prediction = model.predict(user_data_scaled)
    prediction_proba = model.predict_proba(user_data_scaled)[:, 1]

    # Return result
    result = {
        "Dish": user_preferences['Dish'],
        "Diet Preference": user_preferences["Diet"],
        "Suitable": bool(prediction[0]),
        "Probability": round(prediction_proba[0], 2)
    }
    return result

# Example usage
dataset_path = r"C:\Users\LENOVO\Downloads\expanded_hostel_menufinal.csv"

# Preprocess the data and train the model
data, data_encoded = preprocess_data(dataset_path)
model, scaler, feature_columns = train_model(data_encoded)

# Get user input
print("Available Diet Preferences: Diabetic, Weight Loss, Muscle Gain, General")
dish_name=input("Enter dish name")
meal_category=input("Enter the meal time(Brekfast, Lunch, Dinner)")
diet_preference = input("Enter your dietary preference (Diabetic, Weight Loss, Muscle Gain, General): ").strip()

# Example user preferences
user_preference = {
    "Dish": dish_name,  # Ensure this matches a dish in the dataset
    "Calories": 150,
    "Protein": 5,
    "Fats": 2,
    "Sugars": 3,
    "Category": meal_category,  # Should align with categories in the dataset
    "Diet": diet_preference  # User-selected diet
}

# Recommend a dish
try:
    recommendation = recommend_dishes(user_preference, model, scaler, feature_columns, dataset_path)
    print(json.dumps(recommendation, indent=4))
except ValueError as e:
    print(e)


Available Diet Preferences: Diabetic, Weight Loss, Muscle Gain, General


Enter dish name Egg Bhurji
Enter the meal time(Brekfast, Lunch, Dinner) Breakfast
Enter your dietary preference (Diabetic, Weight Loss, Muscle Gain, General):  Muscle Gain


{
    "Dish": "Egg Bhurji",
    "Diet Preference": "Muscle Gain",
    "Suitable": false,
    "Probability": 0.38
}
