In [26]:
import os
from dotenv import load_dotenv

# Location of the raw registry workbook read later in the notebook.
file_path = "s3://datasets-development/diet/dataset_alignment/Bahrain/RawData/Bahrain Food Registry.xlsx"

# Runs before the key lookup below (presumably so that a local .env file can
# supply OPENAI_API_KEY -- see the python-dotenv docs).
load_dotenv()

# None when OPENAI_API_KEY is not present in the environment.
api_key = os.environ.get("OPENAI_API_KEY")

In [27]:
import pandas as pd
from pydantic import BaseModel, field_validator, Field, model_validator, ValidationInfo
from typing import Annotated
class FoodItem:
    """
    A class representing a food item in the Bahrain Food Registry.

    Built from one registry row. The row must provide the columns 'id',
    'food', 'food_category', 'food_subcategory', 'local_name',
    'scientific_name' and 'arabic_name_1' .. 'arabic_name_4'.

    Attributes:
        id: value of the 'id' column.
        name: value of the 'food' column.
        category: value of the 'food_category' column.
        sub_category: value of the 'food_subcategory' column.
        local_name: value of the 'local_name' column.
        scientific_name: value of the 'scientific_name' column.
        arabic_names: the non-blank candidate names, in column order, with
            surrounding whitespace stripped.
    """

    def __init__(self, row: pd.Series):
        self.id: int = row['id']
        self.name: str = row['food']
        self.category: str = row['food_category']
        self.sub_category: str = row['food_subcategory']
        self.local_name: str = row['local_name']
        self.scientific_name: str = row['scientific_name']
        candidates = [row[f'arabic_name_{position}'] for position in range(1, 5)]
        # Drop empty cells (None, NaN/NA, '' or whitespace only) and strip the
        # rest, so that a stray space in the sheet neither becomes a candidate
        # of its own nor prevents an exact match against a candidate later on.
        self.arabic_names: list[str] = [
            name.strip() if isinstance(name, str) else name
            for name in candidates
            if not self._is_blank(name)
        ]

    @staticmethod
    def _is_blank(value) -> bool:
        """
        Returns True for None, NaN/NA, blank strings and empty lists/tuples.

        Unlike a plain truthiness check, numeric zero is NOT considered blank.
        """
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        if isinstance(value, (list, tuple)):
            return len(value) == 0
        return bool(pd.isna(value))

    def __str__(self):
        """
        Returns a string representation of the food item.

        One 'property: value' line per property, in a fixed order; properties
        whose value is blank (see `_is_blank`) are left out. An id of 0 is a
        real value and is kept.
        """
        include = ['id', 'name', 'category', 'sub_category', 'local_name', 'scientific_name', 'arabic_names']
        return '\n'.join(
            f'{prop}: {getattr(self, prop)}'
            for prop in include
            if not self._is_blank(getattr(self, prop))
        )

# Structured answer requested from the model for a single food item.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic puts a model's docstring into its JSON schema description, so a
# docstring here would change the schema generated for this response model.
class FoodItemNameEvaluation(BaseModel):
    id: int = Field(..., description='The id of the food item in the Bahrain Food Registry.')
    best_arabic_name: str = Field(..., description='The best Arabic name for the food item from the given options.')
    gpt_arabic_name: Annotated[str, None] = Field(..., description='Your own Arabic name for the food item.')

    @model_validator(mode="after")
    def validate_arabic_names(self, info: ValidationInfo):
        # Checks `best_arabic_name` against the candidate names passed in the
        # validation context under the key 'arabic_names'.
        # Raises ValueError when candidates were supplied and the chosen name
        # is not one of them.
        context = info.context
        if not context:
            # Validated without a context: nothing to compare against.
            return self

        # .get() so that a context without this key does not raise KeyError.
        names = context.get('arabic_names')
        # An item without any candidate name cannot satisfy "must be one of
        # []"; enforcing the check there would fail on every attempt, so the
        # membership test only applies when there is something to choose from.
        if names and self.best_arabic_name not in names:
            raise ValueError(f'The best Arabic name must be one of the following: {names}')
        return self



In [28]:

def df2FoodItems(df: pd.DataFrame) -> list[FoodItem]:
    """
    Builds one FoodItem per row of the given DataFrame, in row order.
    """
    food_items: list[FoodItem] = []
    for _index, record in df.iterrows():
        food_items.append(FoodItem(record))
    return food_items


In [29]:
import instructor
from openai import AsyncOpenAI

# Initialize the instructor client: an AsyncOpenAI client (authenticated with
# `api_key`) wrapped by instructor. The request helper below calls it with
# `response_model=`, `validation_context=` and `max_retries=`.
client = instructor.from_openai(AsyncOpenAI(api_key=api_key))

async def find_best_arabic_name_single_request(food_item: FoodItem) -> FoodItemNameEvaluation:
    """
    Asks the model to evaluate the Arabic names of a single food item.

    The item's string representation is sent as the user message, and the
    item's `arabic_names` are passed as validation context so that the
    response model can check `best_arabic_name` against them.

    Args:
        food_item: the registry item to evaluate.

    Returns:
        The parsed FoodItemNameEvaluation, with `id` set to `food_item.id`.

    Raises:
        Whatever the client raises when the request fails or when no valid
        response is obtained (`max_retries=3` is passed to the client;
        presumably it re-asks on validation errors -- see instructor docs).
    """
    # Define the prompt for the system
    system_prompt = """You are given a list of food items from the Bahrain Food Registry.
For each item, you need to find the best Arabic name from the list of Arabic names provided.
The best Arabic name is the one that is most likely to be used by Arabic speakers in Bahrain and best matches the description of the food item (name, category...).
You can also add a new Arabic name if you think it is more suitable than the ones provided.
"""
    # Define the prompt for the user
    user_prompt = f"""Please choose the best Arabic name for the following food item:
    {str(food_item)}"""

    evaluation = await client.chat.completions.create(
        model="gpt-4o",
        response_model=FoodItemNameEvaluation,
        validation_context={"arabic_names": food_item.arabic_names},
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": user_prompt
            }
        ]
    )

    # The id in the response is only what the model echoed back. It is used
    # later to match the evaluation to its registry row, so pin it to the id
    # of the item that was actually sent instead of trusting the echo.
    evaluation.id = food_item.id
    return evaluation


In [30]:

import asyncio

async def run_parallel(items: list[FoodItem], max_concurrency: "int | None" = 10) -> list[FoodItemNameEvaluation]:
    """
    Run the system in parallel for multiple food items.

    Args:
        items: the food items to evaluate.
        max_concurrency: maximum number of requests in flight at the same
            time. Pass None to start every request at once (no limit).

    Returns:
        One evaluation per item, in the same order as `items`.

    Raises:
        ValueError: if `max_concurrency` is not None and is smaller than 1.
        Any exception raised by a single request propagates to the caller.
    """
    if max_concurrency is None:
        return await asyncio.gather(
            *[find_best_arabic_name_single_request(item) for item in items]
        )
    if max_concurrency < 1:
        raise ValueError('max_concurrency must be a positive integer or None')

    # Firing one request per registry row simultaneously is an easy way to
    # run into API rate limits; the semaphore caps the number of requests in
    # flight while gather() still returns the results in input order.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def _bounded_request(item: FoodItem) -> FoodItemNameEvaluation:
        async with semaphore:
            return await find_best_arabic_name_single_request(item)

    return await asyncio.gather(*[_bounded_request(item) for item in items])

    

In [31]:
bahrain_df = pd.read_excel(file_path, keep_default_na=False)

food_registry = df2FoodItems(bahrain_df)

evaluations = await run_parallel(food_registry)

# Set the gpt arabic name for each of the food items in the df
for evaluation in evaluations:
    bahrain_df.loc[bahrain_df['id'] == evaluation.id, 'arabic_name'] = evaluation.gpt_arabic_name


In [32]:
# Save the updated df (including the 'arabic_name' column set above) to a new
# workbook under the same S3 prefix as the raw file, so the raw file is not
# overwritten. index=False is passed so the DataFrame index is not written.
bahrain_df.to_excel("s3://datasets-development/diet/dataset_alignment/Bahrain/RawData/bahrain_food_registry_with_arabic_name.xlsx", index=False)