# In [None]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np

# Cell 2: Read the raw scraped-ingredients CSV and the eco-impact reference CSV
raw_products_path = "../data/raw/ingredients_scraped.csv"
eco_reference_path = "../data/external/eco_impact_reference.csv"
df, ref = (pd.read_csv(path) for path in (raw_products_path, eco_reference_path))

# Cell 3: Preprocess ingredients
def compute_ingredient_score(ingredients, reference=None, default_score=5):
    """Return the mean eco rating of a comma-separated ingredient string.

    Each ingredient name is stripped and matched case-insensitively against
    the ``chemical_name`` column of the reference table; the matching row's
    ``eco_rating`` is used. When several reference rows share a name, the
    first one wins. Ingredients with no match get ``default_score``.

    Args:
        ingredients: Comma-separated ingredient names. Any non-string value
            (for example the NaN pandas uses for a missing cell) is treated
            as "no ingredients known".
        reference: DataFrame with ``chemical_name`` and ``eco_rating``
            columns. Defaults to the module-level ``ref`` table.
        default_score: Neutral rating for unknown ingredients, and the
            result when there are no ingredients to score.

    Returns:
        The mean rating rounded to 2 decimals, as a float.
    """
    if reference is None:
        reference = ref  # module-level table loaded in Cell 2

    # Missing cells arrive as float NaN / None, which have no .split().
    if not isinstance(ingredients, str):
        return float(default_score)

    # Build the lowercase-name -> rating lookup once per call instead of
    # filtering the whole reference frame for every ingredient.
    # setdefault keeps the first row for a duplicated name.
    ratings = {}
    for name, rating in zip(reference['chemical_name'], reference['eco_rating']):
        if isinstance(name, str):
            ratings.setdefault(name.lower(), rating)

    # Drop empty tokens produced by trailing/doubled commas or an empty
    # string so they are not scored as unknown ingredients.
    names = [token.strip() for token in ingredients.split(',')]
    scores = [ratings.get(name.lower(), default_score) for name in names if name]

    if not scores:
        return float(default_score)
    return round(float(np.mean(scores)), 2)

# Score each row's ingredient string and store the result as a new column.
ingredient_lists = df['ingredients_list']
df['ingredient_score'] = ingredient_lists.apply(compute_ingredient_score)

# Cell 4: Packaging scoring logic
# Score per known packaging material; lookups are exact string matches.
packaging_score_map = {
    'Recyclable Plastic': 9,
    'Glass Jar': 8,
    'Plastic Tube': 5,
    'Recycled PET Bottle': 8,
    'Biodegradable Pouch': 9,
}
# Materials absent from the map (or missing) come back as NaN and get 6.
unlisted_packaging_score = 6
mapped_packaging = df['packaging_material'].map(packaging_score_map)
df['packaging_score'] = mapped_packaging.fillna(unlisted_packaging_score)

# Cell 5: Sustainability Score
# Unweighted average of the ingredient and packaging scores, to 2 decimals.
component_total = df['ingredient_score'] + df['packaging_score']
df['sustainability_score'] = (component_total / 2).round(2)

# Cell 6: Save processed data
# Keep only the identifying columns and the three computed scores.
output_columns = [
    'product_id',
    'product_name',
    'ingredient_score',
    'packaging_score',
    'sustainability_score',
]
processed = df[output_columns]
processed.to_csv("../data/processed/cleaned_products.csv", index=False)
processed.head()
