In [7]:
!pip install pandas openpyxl



In [1]:
import os, glob
import pandas as pd

def parse_recipe_csv(file_path: str):
    with open(file_path, 'r') as f:
        all_lines = f.readlines()

    header_row_index = -1
    for i, line in enumerate(all_lines):
        if line.strip().startswith('ingredients:'):
            header_row_index = i
            break

    if header_row_index == -1:
        raise ValueError("Could not find the 'ingredients:' header row in the file.")

    recipe_row_starting = -1
    for i, line in enumerate(all_lines):
        if line.strip().startswith('cooking instructions:'):
            recipe_row_starting = i
            break
    
    comment_row_index = -1
    for i, line in enumerate(all_lines):
        stripped = line.strip().lower()
        if stripped.startswith('comment:') or stripped.startswith('comments:'):
            comment_row_index = i
            break

    if recipe_row_starting == -1:
        raise ValueError("Could not find the 'cooking instructions:' row in the file.")


    metadata_lines = all_lines[:header_row_index]
    metadata = {}

    target_name_val = None
    english_name_val = None
    
    for line in metadata_lines:
        parts = line.split(',')
        try:
            if 'Unnamed: 5' in line:
                candidate = parts[6].strip()
                if candidate and candidate.lower() != 'unnamed: 6':
                    target_name_val = candidate

            if 'english name:' in line:
                label_index = -1
                for i, part in enumerate(parts):
                    if 'english name:' in part:
                        label_index = i
                        break
                if label_index != -1:
                    for i in range(label_index + 1, len(parts)):
                        candidate = parts[i].strip()
                        if candidate:
                            english_name_val = candidate
                            break
            
            if 'source:' in line:
                metadata['source'] = parts[16].strip()
            if 'region:' in line:
                metadata['region'] = int(parts[16].strip())    
        except (IndexError, ValueError):
            continue
    
    if comment_row_index != -1:
        comment_text = None
        for offset in (1, 2):
            idx = comment_row_index + offset
            if idx >= len(all_lines):
                break
            candidate_line = all_lines[idx].strip()
            if candidate_line:
                comment_text = candidate_line.split(',')[0].strip()
                if comment_text:
                    break

        if comment_text:
            metadata['comments'] = comment_text
            
    metadata['target_name'] = target_name_val if target_name_val else english_name_val
    if not metadata.get('target_name'):
        raise ValueError("File skipped. No name.")
        

    ingredients_df = pd.read_csv(
        file_path,
        header=None,
        skiprows=header_row_index + 1
    )
    # instruction_idx = recipe_row_starting + 1
    # while instruction_idx < len(all_lines):
    #     candidate_line = all_lines[instruction_idx]
    #     if candidate_line.strip() and not all(part.strip() == '' for part in candidate_line.strip().split(',')):
    #         instruction_line = candidate_line
    #         next_idx = instruction_idx + 1
    #         if next_idx < len(all_lines):
    #             next_line = all_lines[next_idx]
    #             if next_line.strip() and not all(part.strip() == '' for part in next_line.strip().split(',')):
    #                 instruction_line = instruction_line.rstrip().rstrip(',') + ' ' + next_line.lstrip().lstrip(',')
    #         break
    #     instruction_idx += 1
    # else:
    #     raise ValueError("can't find non-empty line after 'cooking instructions:'.")

    instruction_idx = recipe_row_starting + 1
    stop_idx = comment_row_index if comment_row_index != -1 else len(all_lines)
    
    instruction_parts = []
    while instruction_idx < stop_idx:
        candidate_line = all_lines[instruction_idx]
        if candidate_line.strip() and not all(part.strip() == '' for part in candidate_line.strip().split(',')):
            cleaned = candidate_line.strip().strip(',')
            if cleaned: instruction_parts.append(cleaned)
        instruction_idx +=1
    
    if not instruction_parts:
        raise ValueError("cant find instruction lines between cooking instructions and comments")

    instruction_line = ' '.join(instruction_parts)

    if not ingredients_df.empty and max([2, 9, 11]) < len(ingredients_df.columns):
        df_useful = ingredients_df[[2, 9, 11]]
        df_useful.columns = ['ingredient', 'quantity', 'unit']
        ingredients_df = df_useful.dropna(subset=['ingredient']).reset_index(drop=True)
        numeric_cols = ['quantity']
        ingredients_df[numeric_cols] = ingredients_df[numeric_cols].apply(pd.to_numeric, errors='coerce')
        ingredients_df['unit'] = ingredients_df['unit'].fillna('item')
    else:
        ingredients_df = pd.DataFrame(columns=['ingredient', 'quantity', 'unit'])

    if not ingredients_df.empty:
        ingredients_df = ingredients_df.groupby('ingredient', as_index=False).agg({
            'quantity': 'sum',
            'unit': 'first'
        })

    try:
        recipe_text = instruction_line.split(',')[0].strip()
    except IndexError:
        recipe_text = "Recipe text not foun in the expected column."

    df_recipe = pd.DataFrame({'recipe': [recipe_text]})

    print("Extracted Data")
    for key, value in metadata.items():
        if key == "target_name":
            print(f"Dish Name: {value}")

    return {
            "file_path": file_path,
            "metadata": metadata,
            "ingredients_df": ingredients_df,
            "instruction_line": instruction_line.strip().rstrip(",")
        }

CSV_DIR = 'files/csvs'
all_csvs = sorted(glob.glob(os.path.join(CSV_DIR, '*.csv')))

parsed_files = []
for fp in all_csvs:
    try:
        parsed_files.append(parse_recipe_csv(fp))
    except Exception as e:
        print(f"Skipping {os.path.basename(fp)} due to error: {e}")

print(f"Ready: parsed {len(parsed_files)} CSV file(s).")



Extracted Data
Dish Name: Anzac Biscuits
Extracted Data
Dish Name: Asparagus Bundles
Extracted Data
Dish Name: Baguettes
Extracted Data
Dish Name: Baked Mac and Cheese
Extracted Data
Dish Name: Baked Walleye
Extracted Data
Dish Name: Banana Fritters
Extracted Data
Dish Name: Beef Stroganoff
Extracted Data
Dish Name: Biscuits and Sausage Gravy
Extracted Data
Dish Name: Brian's Marriot Blueberry Muffins
Extracted Data
Dish Name: Bolognese Meat Sauce
Extracted Data
Dish Name: Breakfast Burrito
Extracted Data
Dish Name: Breakfast Scramble with Italian Sausage
Extracted Data
Dish Name: British Mushy Peas
Extracted Data
Dish Name: Broccoli Bacon Salad
Skipping Brocolli with Ginger_sheet0.csv due to error: cant find instruction lines between cooking instructions and comments
Extracted Data
Dish Name: Brown Sugar Fudge
Extracted Data
Dish Name: Buttermilk Biscuits
Extracted Data
Dish Name: Caribbean Slaw
Extracted Data
Dish Name: Carrot Souffle
Extracted Data
Dish Name: Cheese Soup
Extracted D

In [2]:
import psycopg2
import pandas as pd

db_params = {
    'dbname': 'ristorante',
    'user': 'headchef',
    'password': '123',
    'host': '127.0.0.1',
    'port': '5432'
}

def get_or_create_id(cur, table, id_col, name_col, name_value):
    cur.execute(f"SELECT {id_col} FROM {table} WHERE {name_col} = %s", (name_value,))
    result = cur.fetchone()

    if result:
        return result[0]
    else:
        print(f"'{name_value}' not found in '{table}'. Creating new entry...")
        cur.execute(f"INSERT INTO {table} ({name_col}) VALUES (%s) RETURNING {id_col}", (name_value,))
        new_id = cur.fetchone()[0]
        return new_id


# recipe_name = metadata.get('target_name', 'Unnamed Recipe')
# instructions = ' '.join(instruction_line.strip().rstrip(',').split())

print("Connecting to the database...")
conn = psycopg2.connect(**db_params)

try:
    with conn.cursor() as cur:
        for item in parsed_files:
            metadata = item["metadata"]
            recipe_name = metadata.get("target_name", "Unnamed Recipe")
            instructions = ' '.join(item["instruction_line"].split())
            comments = metadata.get("comments")
            ingredients_df = item["ingredients_df"]

            cur.execute(
                "SELECT EXISTS(SELECT 1 FROM recipes WHERE recipe_name = %s)",
                (recipe_name,)
            )
            if cur.fetchone()[0]:
                print(f"'{recipe_name}' exists already. Skipping.")
                continue

            cur.execute(
                "INSERT INTO recipes (recipe_name, instructions, comments) VALUES (%s, %s, %s) RETURNING recipe_id;",
                (recipe_name, instructions, comments)
            )
            new_recipe_id = cur.fetchone()[0]

            if not ingredients_df.empty:
                for _, row in ingredients_df.iterrows():
                    ingredient_id = get_or_create_id(cur, 'ingredients', 'ingredient_id', 'ingredient_name', row['ingredient'])
                    unit_id = get_or_create_id(cur, 'units', 'unit_id', 'unit_name', row['unit'])
                    cur.execute(
                        "INSERT INTO recipe_ingredients (recipe_id, ingredient_id, quantity, unit_id) VALUES (%s, %s, %s, %s);",
                        (new_recipe_id, ingredient_id, row['quantity'], unit_id)
                    )

            print(f"Inserted '{recipe_name}' (ID {new_recipe_id}) with {0 if ingredients_df.empty else len(ingredients_df)} ingredient rows.")

        conn.commit()
        print("Transaction successful!")

except (Exception, psycopg2.DatabaseError) as error:
    print(f"An error occurred: {error}")
    conn.rollback()
finally:
    conn.close()
    print("Database connection closed.")

Connecting to the database...
' can sub Light corn if cane ot available.' not found in 'ingredients'. Creating new entry...
'item' not found in 'units'. Creating new entry...
'AP flour' not found in 'ingredients'. Creating new entry...
'c' not found in 'units'. Creating new entry...
'Baking soda' not found in 'ingredients'. Creating new entry...
't' not found in 'units'. Creating new entry...
'Coconut -shredded sweetened' not found in 'ingredients'. Creating new entry...
'Golden cane syrup  ' not found in 'ingredients'. Creating new entry...
'T' not found in 'units'. Creating new entry...
'Rolled oats' not found in 'ingredients'. Creating new entry...
'brown sugar -packed' not found in 'ingredients'. Creating new entry...
'butter -melted' not found in 'ingredients'. Creating new entry...
'water' not found in 'ingredients'. Creating new entry...
Inserted 'Anzac Biscuits' (ID 1) with 9 ingredient rows.
'Asparagus' not found in 'ingredients'. Creating new entry...
'#' not found in 'units'