In [1]:
import os
# Move the working directory up one level; every relative path used by the
# following cells is resolved from there.
# NOTE: a relative chdir is not idempotent - re-running this cell moves up one
# more level each time.
os.chdir('../')

In [2]:
import pandas as pd
import openai
import os
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [3]:
from dotenv import load_dotenv

# Pull the variables defined in a local .env file into the process
# environment, then hand the OPENAI_API_KEY value (None when it is not set)
# to the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
# Read the two input tables: the Kenya FCT, and the table whose
# Food_Name / Food_Category rows are matched against it.
fct_df, img_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/kenya/Kenya FCT.csv',
        '../data/kenya/kenya_food_codes_table.csv',
    )
)

In [5]:
# Step 1: Compute or load FCT embeddings
def get_embeddings(texts, model="text-embedding-3-large", batch_size=90):
    """Embed a list of strings with the OpenAI embeddings endpoint.

    The texts are sent in consecutive batches and the returned vectors are
    stacked into a single 2-D array.

    Parameters
    ----------
    texts : list of str
        Strings to embed.
    model : str
        Name of the embedding model passed to the API.
    batch_size : int, default 90
        Number of texts sent per request. Must be positive.

    Returns
    -------
    numpy.ndarray
        One row per embedding returned by the API, batches in input order
        (rows are assumed to come back in the order the texts were sent).
        An empty ``texts`` yields an empty array of shape (0, 0) and makes
        no API call.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(texts) == 0:
        # np.vstack refuses an empty list, so short-circuit instead of crashing.
        return np.empty((0, 0))

    batch_arrays = []
    for start in tqdm(range(0, len(texts), batch_size)):
        chunk = texts[start:start + batch_size]
        resp = openai.embeddings.create(input=chunk, model=model)
        batch_arrays.append(np.array([item.embedding for item in resp.data]))
    return np.vstack(batch_arrays)

In [6]:
# Compose the string to embed for each FCT row: "<food name> (<food group>)".
# Missing names/groups are treated as empty strings.
fct_df['embed_text'] = (
    fct_df['FCT food name'].fillna('') + " (" +
    fct_df['User defined food group'].fillna('') + ")"
)

# Embedding the whole table needs API calls, so the result is cached on disk
# and only recomputed when the cache is missing or no longer fits the table.
FCT_EMBEDDINGS_PATH = '../data/kenya/kenya_fct_embeddings.npy'

fct_embeddings = None
if os.path.exists(FCT_EMBEDDINGS_PATH):
    fct_embeddings = np.load(FCT_EMBEDDINGS_PATH)
    if fct_embeddings.shape[0] != len(fct_df):
        # Row i of the embeddings must describe row i of fct_df; a cache built
        # from a different version of the table would silently misalign them.
        print(
            f"Cached FCT embeddings have {fct_embeddings.shape[0]} rows but the "
            f"FCT has {len(fct_df)}; recomputing."
        )
        fct_embeddings = None

if fct_embeddings is None:
    fct_embeddings = get_embeddings(fct_df['embed_text'].tolist())
    np.save(FCT_EMBEDDINGS_PATH, fct_embeddings)

In [7]:
# Step 2: embed one query food and shortlist the most similar FCT rows
def get_top_candidates(food_name, food_category, n_cand=30, model="text-embedding-3-large"):
    """Return the FCT rows whose embeddings are closest to the query food.

    The query is embedded as "<food_name> (<food_category>)", compared by
    cosine similarity against the module-level ``fct_embeddings``, and the
    ``n_cand`` highest-scoring rows of the module-level ``fct_df`` are
    returned, best match first.
    """
    query_text = f"{food_name} ({food_category})"
    response = openai.embeddings.create(input=[query_text], model=model)

    # cosine_similarity expects 2-D inputs, hence the single-row reshape.
    query_vec = np.array(response.data[0].embedding).reshape(1, -1)
    scores = cosine_similarity(query_vec, fct_embeddings)[0]

    # argsort is ascending; reverse it so the best score comes first.
    best_first = scores.argsort()[::-1]
    return fct_df.iloc[best_first[:n_cand]]

def prompt_gpt(food_name, food_category, candidates, model="gpt-4.1"):
    """Ask a chat model to pick the best FCT food code(s) for a query food.

    Parameters
    ----------
    food_name : str
        Name of the food to match.
    food_category : str
        Category of the food to match (the prompt tells the model it may not
        match the FCT groups exactly).
    candidates : pandas.DataFrame
        Shortlisted FCT rows; must provide the columns 'Food code*',
        'FCT food name' and 'User defined food group'.
    model : str, default "gpt-4.1"
        Chat model used for the request.

    Returns
    -------
    str or None
        The model's reply with surrounding whitespace removed (per the prompt:
        one code, several comma-separated codes, or "None"). Returns None when
        the request fails or the reply has no text; failures are printed, not
        raised, so a long batch run keeps going.
    """
    food_list = [
        f"[{code}] {name} (Group: {group})"
        for code, name, group in zip(
            candidates['Food code*'],
            candidates['FCT food name'],
            candidates['User defined food group'],
        )
    ]
    prompt = f"""
You are an expert at food matching for nutrition databases.

Given the following candidate foods from the Kenya FCT:
{chr(10).join(food_list)}

For the following query:
- Food Name: "{food_name}"
- Food Category (may not exactly match): "{food_category}"

Your task:
- Return ONLY the Food code(s) of the best match.
- If one code is the most appropriate, return ONLY that code (e.g., 1031).
- If there are two or more equally good matches, return all their codes, separated by commas (e.g., 1063, 1064).
- If none are appropriate, return "None".

Return ONLY the code(s) or "None"—no other explanation.
"""
    try:
        # Use the module-level client, like the embedding calls do, instead of
        # constructing a fresh openai.OpenAI() client for every single food.
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500,
            temperature=0.0,
        )
        content = response.choices[0].message.content
        # Strip stray leading/trailing whitespace so downstream code parsing
        # sees just the code list.
        return content.strip() if isinstance(content, str) else None
    except Exception as e:
        print("GPT Error:", e)
        return None

In [8]:
# Run the two-stage match for every row of img_df: shortlist 30 FCT rows by
# embedding similarity, let GPT pick among them, echo the answer, and keep
# one record per food.
results = []
for food_name, food_category in zip(img_df['Food_Name'], img_df['Food_Category']):
    shortlist = get_top_candidates(food_name, food_category, n_cand=30)
    gpt_result = prompt_gpt(food_name, food_category, shortlist)
    print(f"{food_name} ({food_category}):\n{gpt_result}\n{'='*50}")
    results.append(
        dict(Food_Name=food_name, Food_Category=food_category, GPT_Match=gpt_result)
    )

results_df = pd.DataFrame(results)

Maize Ugali (Cereals and Cereal Products):
15009
Mixed Flour Ugali/Millet/Sorghum (Cereals and Cereal Products):
15007
Maize Porridge (Cereals and Cereal Products):
15001
Mixed porridge/Millet Porridge/Sorghum Porridge (Cereals and Cereal Products):
15014, 15119
Boiled White Rice (Cereals and Cereal Products):
1059
Spaghetti (Cereals and Cereal Products):
1031
Macaroni (Cereals and Cereal Products):
1030
Fusilli (Cereals and Cereal Products):
1031
Noodles (Cereals and Cereal Products):
1030, 1031
Sesame Buns (Cereals and Cereal Products):
1011
Weetabix (Cereals and Cereal Products):
1010
Corn Flakes (Cereals and Cereal Products):
1009, 1008
Muesli (Cereals and Cereal Products):
None
Whole Grain Cereal Biscuits (Cereals and Cereal Products):
1010
Oat Porridge (Cereals and Cereal Products):
15123
White Bread/Sweet Yellow Bread - 600g Slice (Cereals and Cereal Products):
1006, 1007
White Bread/ Sweet Yellow Bread - 400g Slice (Cereals and Cereal Products):
1007
White Bread/Sweet Yellow Br

In [9]:
# Persist the collected matches as CSV, without the DataFrame index column.
# NOTE(review): depending on the pandas version, a literal "None" answer may
# be parsed back as a missing value by a default pd.read_csv - confirm.
results_df.to_csv('../data/kenya/kenya_food_codes_gpt_matches.csv', index=False)

In [1]:
import copy
import pandas as pd

In [2]:
# Reload the saved GPT matches together with the Kenya FCT table that the
# food codes are looked up in.
results_df, fct_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/kenya/kenya_food_codes_gpt_matches.csv',
        '../../data/kenya/Kenya FCT.csv',
    )
)

In [3]:
def _normalize_gpt_match(raw):
    """Return a GPT answer as a ';'-separated code string without whitespace.

    Missing values are returned unchanged (the original condition let them
    through and turned them into the string 'nan'). The literal "None" has no
    commas or whitespace, so it passes through unchanged as well.
    """
    if pd.isna(raw):
        return raw
    if isinstance(raw, float) and raw.is_integer():
        # A numeric column holds a code like 15009 as 15009.0; drop the ".0"
        # so the code can later be parsed with int().
        raw = int(raw)
    return ''.join(str(raw).split()).replace(',', ';')


# Normalise the whole column at once; this does not depend on the frame
# having a 0..n-1 index.
results_df['GPT_Match'] = results_df['GPT_Match'].apply(_normalize_gpt_match)

In [5]:
# Largest number of ';'-separated codes found in any GPT_Match value; this is
# how many description columns are needed. Missing values are skipped and a
# literal "None" counts as zero codes.
_code_counts = (
    results_df['GPT_Match']
    .dropna()
    .astype(str)  # tolerate a numeric column, where values are not strings
    .map(lambda match: 0 if match == "None" else len(match.split(';')))
)
# .max() of an empty series is NaN, which a later int(max_codes) cannot
# convert; fall back to 0 columns instead.
max_codes = int(_code_counts.max()) if not _code_counts.empty else 0

def get_descriptions(code_list, fct_df):
    """Look up the FCT food name for each food code.

    Parameters
    ----------
    code_list : iterable
        Food codes, typically strings such as "1031".
    fct_df : pandas.DataFrame
        Table providing the columns 'Food code*' and 'FCT food name'.

    Returns
    -------
    list
        One entry per input code, in order: the 'FCT food name' of the first
        row whose 'Food code*' equals the code, or None when the code cannot
        be parsed as an integer or is not present in the table.

    Raises
    ------
    KeyError
        If ``fct_df`` lacks one of the required columns. Only code-parsing
        failures are mapped to None; a mis-shaped table is not silently hidden.
    """
    descs = []
    for code in code_list:
        try:
            code_int = int(code)
        except (TypeError, ValueError):
            # Unparseable code (e.g. 'abc', None): no description available.
            descs.append(None)
            continue
        hits = fct_df.loc[fct_df['Food code*'] == code_int, 'FCT food name']
        descs.append(hits.iloc[0] if not hits.empty else None)
    return descs

In [6]:
# Number of description columns to create (none when max_codes is missing).
n_desc_cols = 0 if pd.isna(max_codes) else int(max_codes)

# First collect one list of FCT food names per result row, each exactly
# n_desc_cols long.
desc_rows = []
for match_str in results_df['GPT_Match']:
    if pd.isna(match_str) or match_str == "None":
        descs = []
    else:
        codes = [code.strip() for code in str(match_str).split(';')]
        descs = get_descriptions(codes, fct_df)
    # Pad with None when the row has fewer codes than the widest row; the
    # slice is a guard in case max_codes is smaller than this row's count.
    desc_rows.append((descs + [None] * n_desc_cols)[:n_desc_cols])

# Then assign whole columns at once, instead of growing the frame one cell at
# a time while iterating over it.
for col_pos in range(n_desc_cols):
    results_df[f'FCT_Food_Description_{col_pos + 1}'] = [
        row_descs[col_pos] for row_descs in desc_rows
    ]

In [7]:
# Write results_df, including the FCT_Food_Description_* columns, to a new
# CSV without the DataFrame index column.
results_df.to_csv('../../data/kenya/kenya_food_codes_gpt_matches_with_descriptions.csv', index=False)