# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

****

## Machine Learning
### NLP Approach

***

In [2]:
import pandas as pd
import os

In [None]:
# RUN ONLY ONCE: the path below is relative, so every additional run of this
# cell moves the working directory up another two levels.
# os.chdir changes the current working directory to the specified path.
os.chdir("../../") # Sets the working directory to the repo folder, which is two levels above where this file lives
path = os.getcwd()  # absolute path of the working directory after the change
print(path)  # shown so the resulting location can be checked by eye

In [4]:
# Step 1: Read the CSV files
# Every path is relative, so it is resolved against the current working
# directory at the time this cell runs.
# Columns read from each frame by the cells below:
#   ingredients_df   -> 'Recipe', 'IngredientId'
#   items_list_df    -> 'ItemId', 'Description'
#   preps_list_df    -> 'PrepId', 'Description'
#   product_list_df  -> 'ProdId', 'Description'
#   data_labelled_df -> 'ProdId', 'Combined Label'
ingredients_df = pd.read_csv('data/preprocessed/Ingredients_List.csv')
items_list_df = pd.read_csv('data/preprocessed/Items_List.csv')
preps_list_df = pd.read_csv('data/preprocessed/Preps_List.csv')
product_list_df = pd.read_csv('data/preprocessed/Products_List.csv')
data_labelled_df = pd.read_csv('data/final/2023_2024_CFFS_Outcomes/Data_Labelled_OK23-24_with_name.csv')

In [1]:
# Build one ID -> description lookup dict per source table.
# If an ID occurs more than once in a table, the last row's description wins.
item_desc_map = dict(zip(items_list_df['ItemId'], items_list_df['Description']))
prep_desc_map = dict(zip(preps_list_df['PrepId'], preps_list_df['Description']))
product_desc_map = dict(zip(product_list_df['ProdId'], product_list_df['Description']))

# Function to map IDs to descriptions
def get_description(id):
    """Return the human-readable description for an ingredient ID.

    IDs starting with 'I-' are looked up in ``item_desc_map`` and IDs starting
    with 'P-' in ``prep_desc_map``. Any other value -- an unrecognised prefix,
    an ID missing from its map, or a non-string such as the NaN pandas uses
    for an empty CSV cell -- is returned unchanged.

    Args:
        id: Ingredient identifier. The name shadows the builtin ``id``; it is
            kept as-is so existing callers are unaffected.

    Returns:
        The mapped description, or ``id`` itself when no mapping applies.
    """
    # Non-string values (NaN, None, numbers) have no .startswith(); pass them
    # through instead of raising AttributeError.
    if not isinstance(id, str):
        return id
    if id.startswith('I-'):
        return item_desc_map.get(id, id)
    if id.startswith('P-'):
        return prep_desc_map.get(id, id)
    return id

# Aggregate ingredients by recipe and map IDs to descriptions
ingredients_df['IngredientName'] = ingredients_df['IngredientId'].apply(get_description)
# One row per recipe, with its ingredient names joined into a single
# comma-separated string.
recipe_ingredients = ingredients_df.groupby('Recipe')['IngredientName'].apply(', '.join).reset_index()

# Resolve recipe names and include ProdId
# The 'Recipe' value is used directly as the ProdId key.
recipe_ingredients['ProdId'] = recipe_ingredients['Recipe']
# Recipes without an entry in product_desc_map get an empty name.
recipe_ingredients['RecipeName'] = recipe_ingredients['ProdId'].apply(lambda x: product_desc_map.get(x, ''))

# Filter out recipes with ProdIds not found in the labelled data
valid_prod_ids = set(data_labelled_df['ProdId'])
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of recipe_ingredients (avoids SettingWithCopyWarning).
recipe_ingredients_filtered = recipe_ingredients[recipe_ingredients['ProdId'].isin(valid_prod_ids)].copy()

# Map "Combined Label" for valid ProdIds
# The ProdId -> label lookup is built once rather than once per row.
# NOTE(review): this assumes ProdId is unique in data_labelled_df; with
# duplicates Series.get returns several rows for one ID -- confirm.
combined_label_by_prod = data_labelled_df.set_index('ProdId')['Combined Label']
recipe_ingredients_filtered['Combined Label'] = recipe_ingredients_filtered['ProdId'].apply(
    lambda x: combined_label_by_prod.get(x, '')
)

# Exclude products not in the final list (if any)
if recipe_ingredients_filtered.empty:
    print("No matching ProdId found.")
else:
    final_df = recipe_ingredients_filtered[['ProdId', 'RecipeName', 'IngredientName', 'Combined Label']].rename(columns={'IngredientName': 'Ingredients'})
    # Single source of truth for the destination, so the message below always
    # names the file that was actually written.
    output_path = 'NLP__ubc_fs_data/data/recipes_with_ingredients.xlsx'
    final_df.to_excel(output_path, index=False)
    print(f"Filtered products with their names and combined labels are saved to '{output_path}'.")