In [16]:
import requests
import pandas as pd
import json


## Fetching Discounted Products from the Salling Group Food Waste API

To retrieve real-time data on discounted food items, we use the Salling Group's public **Food Waste API**. This API provides information on products nearing their expiration date and available at a reduced price in various stores such as Netto, Føtex, and Bilka.




In [40]:
# API token
token = "SG_APIM_CM1M3GXGSA98V8PJ19BQDJPM238NHTTS5TVT7JM7Y2E2VEDBFQQ0"
headers = {
    "Authorization": f"Bearer {token}"
}


In [41]:
# Function that fetches from the food waste API

def fetch_food_waste_data(zip_code):
    url = f"https://api.sallinggroup.com/v1/food-waste/?zip={zip_code}"
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            api_data = response.json()
            if isinstance(api_data, list) and api_data:
                return api_data
            elif isinstance(api_data, dict):
                return [api_data]
            else:
                print("⚠️ Empty or unrecognized format received from API")
                return []
        else:
            print(f" API request failed with status code {response.status_code}")
            return []
    except Exception as e:
        print(f" Error making API request: {e}")
        return []


## Structure of raw data

In [45]:
# Print out raw data for zip code 2400:

api_data = fetch_food_waste_data("2400")

# Pretty-print with Danish characters
print(json.dumps(api_data[:5], indent=2, ensure_ascii=False))


[
  {
    "clearances": [
      {
        "offer": {
          "currency": "DKK",
          "discount": 9.95,
          "ean": "5712580948638",
          "endTime": "2025-05-01T21:59:59.000Z",
          "lastUpdate": "2025-05-01T07:50:45.000Z",
          "newPrice": 10,
          "originalPrice": 19.95,
          "percentDiscount": 49.87,
          "startTime": "2025-04-28T05:50:06.000Z",
          "stock": 5,
          "stockUnit": "each"
        },
        "product": {
          "categories": {
            "da": "Mejeri & køl>Pålæg>Pålægssalater>Fiske- & skaldyrssalat",
            "en": "Dairy And Cold Storage>Lunch Meats>Mayo Salads>Fish Mayo Salads"
          },
          "description": "KRABBESALAT K-SALAT",
          "ean": "5704000491602",
          "image": "https://digitalassets.sallinggroup.com/image/upload/e_trim/c_limit,e_sharpen:80,f_auto,q_auto,w_400,h_400/82389156-1d55-4953-a5ba-21301dc42148"
        }
      },
      {
        "offer": {
          "currency": "DKK",
   

## Structuring Product Data from the API

After retrieving raw food clearance data from the Salling Group API, it is necessary to convert this nested JSON structure into a clean, usable DataFrame. The function `create_products_dataframe()` performs this transformation.

This function iterates through the list of stores and their clearance products, extracting:
- Store details (name and street)
- Product metadata (description, pricing, discounts)
- Hierarchical product categories (split into `category1` through `categoryN`) since products have different number of sub categories.

To support category-based filtering and semantic similarity tasks, the full category path is split into components. The most specific category is assigned as `final_category`, which becomes key for embedding and recommendation later on.


In [20]:
#function that creates a dataframe from the API

def create_products_dataframe(api_data_list, zip_code, max_splits=10):
    products = []

    if not api_data_list:
        print(f" No data returned for ZIP {zip_code}")
        return pd.DataFrame()

    for store_data in api_data_list:
        clearances = store_data.get("clearances", [])
        store_info = store_data.get("store", {})
        store_name = store_info.get("name", "Unknown Store")
        store_address = store_info.get("address", {})
        store_street = store_address.get("street", "Unknown Street")

        if not clearances:
            continue

        for item in clearances:
            try:
                product = item.get("product", {})
                offer = item.get("offer", {})

                categories = product.get("categories", {})
                category_full = categories.get("en") or categories.get("da") or ""

                # Extract category splits (and pad)
                parts = [p.strip() for p in category_full.split(">")]
                parts += [""] * (max_splits - len(parts))

                row_data = {
                    "zip_code": zip_code,
                    "store_name": store_name,
                    "store_street": store_street,
                    "description": product.get("description", ""),
                    "original_price": offer.get("originalPrice"),
                    "new_price": offer.get("newPrice"),
                    "discount": offer.get("discount"),
                }

                # Assign category1 through categoryN
                for i in range(max_splits):
                    row_data[f"category{i+1}"] = parts[i]

                # Set final_category as the last non-"Unknown" category from the parts
                final = next((cat for cat in reversed(parts) if cat != ""), "")
                row_data["final_category"] = final

                products.append(row_data)

            except Exception as e:
                print(f"Skipping item due to error: {e}")
                continue

    return pd.DataFrame(products)


In [21]:
zip_codes = [2400]

all_products = [] 

for zip_code in zip_codes:
    api_data = fetch_food_waste_data(zip_code)
    df = create_products_dataframe(api_data, zip_code)
    all_products.append(df)

df_all_products = pd.concat(all_products, ignore_index=True)


print(df_all_products.head())


   zip_code       store_name   store_street                description  \
0      2400  Netto Emdrupvej  Emdrupvej 107        KRABBESALAT K-SALAT   
1      2400  Netto Emdrupvej  Emdrupvej 107   ITALIENSK SALAT GRAASTEN   
2      2400  Netto Emdrupvej  Emdrupvej 107  RØGET MEDISTER 3-STJERNET   
3      2400  Netto Emdrupvej  Emdrupvej 107    OKSE STICKS HANEGAL ØKO   
4      2400  Netto Emdrupvej  Emdrupvej 107      HAMBURGERRYG PÅLÆKKER   

   original_price  new_price  discount               category1    category2  \
0           19.95         10      9.95  Dairy And Cold Storage  Lunch Meats   
1           12.00          9      3.00  Dairy And Cold Storage  Lunch Meats   
2           17.95          9      8.95  Dairy And Cold Storage  Lunch Meats   
3            8.00          4      4.00  Dairy And Cold Storage  Lunch Meats   
4           15.95          8      7.95  Dairy And Cold Storage  Lunch Meats   

     category3              category4 category5 category6 category7 category8  \

In [22]:
df_all_products.to_csv("food_waste_ingredients.csv", index=False)
print("CSV file saved")


CSV file saved


# Recommendation b)  ----- Write more aboutthe experience for the user as well
## Semantic Discount-Based Product Recommender
To enhance the user experience and reduce food waste, we developed a semantic product recommender. This module uses text embeddings to recommend similar products that are not only close in meaning (e.g., other types of milk or pasta) but also prioritize items with the highest discounts.

We use the pretrained model 'all-MiniLM-L6-v2' from Sentence Transformers, which has been shown in class (Text Analysis - Part 5) to provide strong performance on semantic similarity tasks. The model converts product descriptions and category information into vector representations, which allows us to compute cosine similarity between a user’s input product and all available items in our dataset.

1. Accept a product name (e.g., "mælk").

2. Find the matching embedded vector for that product.

3. Calculate cosine similarity to all other products in the dataset.

4. Filter out dissimilar items (based on a threshold).

5. Sort results by similarity score and discount amount, giving priority to the most relevant and cost-effective alternatives.

This allows users to find smart, context-aware replacements — particularly useful when specific products are unavailable, too expensive, or undesired.

In [23]:
## 

In [24]:
# Load dataset
data = pd.read_csv('food_waste_ingredients.csv')

# Remove 'Unknown' fine categories
data = data[data['final_category'] != ''].reset_index(drop=True)

# Remove Ready meals
data = data[data['category2'] != 'Ready To Eat Meals'].reset_index(drop=True)

In [25]:
# Combine columns to create richer text for embedding
data['text_for_embedding'] = (
    data['category1'].astype(str) + " " +
    data['category2'].astype(str) + " " +
    data['category3'].astype(str) + " " +
    data['category4'].astype(str) + " " +
    data['category5'].astype(str) + " " +
    data['category6'].astype(str) + " " +
    data['category7'].astype(str) + " " +
    data['category8'].astype(str) + " " +
    data['category9'].astype(str) + " " +
    data['category10'].astype(str) + " " +
    data['final_category'].astype(str)
)

In [26]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [27]:
# Load the MiniLM model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
product_texts = data['text_for_embedding'].tolist()
embeddings = model.encode(product_texts, show_progress_bar=True)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [28]:
def recommend_semantic_discounted(product_name, top_n=5, similarity_threshold=0.4):
    """
    Recommend products that are semantically similar (based on embeddings)
    and have a good discount.
    """

    # Step 1: Find product(s) in dataset that match the name
    matches = data[data['text_for_embedding'].str.contains(product_name, case=False, na=False)]

    if matches.empty:
        print(f"No product found with name containing '{product_name}'.")
        print("\nHere are some valid examples you can try:")
        print(data['final_category'].drop_duplicates().sample(5, random_state=42).tolist())
        return None

    # Step 2: Use the first match to get the embedding
    index = matches.index[0]
    input_vector = embeddings[index]

    # Step 3: Compute cosine similarity with all products
    scores = cosine_similarity([input_vector], embeddings)[0]
    data['similarity'] = scores

    # Step 4: Filter products with sufficient similarity (not the same one)
    similar_items = data[
        (data.index != index) &
        (data['similarity'] >= similarity_threshold)
    ].copy()

    # Step 5: Sort by similarity and discount, drop duplicates by category
    similar_items = (
        similar_items
        .sort_values(by=['similarity', 'discount'], ascending=[False, False])
        .drop_duplicates(subset=['final_category'])
        .head(top_n)
    )

    # Step 6: Rename column headers for display
    similar_items = similar_items.rename(columns={
        'zip_code': 'ZIP Code',
        'description': 'Product Description',
        'store_name': 'Store',
        'store_street': 'Street',
        'final_category': 'Ingredient',
        'category1': 'Top Category',
        'original_price': 'Original Price (DKK)',
        'new_price': 'Discounted Price (DKK)',
        'discount': 'Discount (DKK)',
        'similarity': 'Semantic Similarity Score'
    })

    # Step 7: Show matched input info
    print(" Found product info:")
    print(data.loc[index, ['final_category', 'zip_code', 'store_name', 'store_street']])

    # Step 8: Return the selected and renamed columns
    return similar_items[[
        'ZIP Code',
        'Product Description',
        'Store',
        'Street',
        'Ingredient',
        'Top Category',
        'Original Price (DKK)',
        'Discounted Price (DKK)',
        'Discount (DKK)',
        'Semantic Similarity Score'
    ]]


In [29]:
recommend_semantic_discounted("cheese")

 Found product info:
final_category          Cream Cheese
zip_code                        2400
store_name        Netto Utterslevvej
store_street         Utterslevvej 11
Name: 42, dtype: object


Unnamed: 0,ZIP Code,Product Description,Store,Street,Ingredient,Top Category,Original Price (DKK),Discounted Price (DKK),Discount (DKK),Semantic Similarity Score
153,2400,HVID/PURLØG ØKO NATURLI,Netto Tuborgvej,Tuborgvej 239,Cream Cheese,Dairy And Cold Storage,20.0,14,6.0,1.0
118,2400,HYTTEOST PROTEIN LAB,Netto Stærevej,Stærevej 74,Cottage Cheese,Dairy And Cold Storage,12.95,8,4.95,0.939558
163,2400,RØD I SKIVER PRIMA DONNA,Netto Tuborgvej,Tuborgvej 239,Sliced Cheese,Dairy And Cold Storage,34.95,15,19.95,0.922762
49,2400,LAGRET HAVARTI CASTELLO,Netto Utterslevvej,Utterslevvej 11,Cheese Specialities,Dairy And Cold Storage,45.0,35,10.0,0.895133
62,2400,BRIE PRÉSIDENT,Netto Tomsgårdsvej,Tomsgårdsvej 24,Blue Cheese Brie,Dairy And Cold Storage,35.95,19,16.95,0.882537


## CONNECT THIS PART TO LLM///

What needs to be done is the following:

a) If possible to make continous, then make LLM create a matrix with missing ingredients(in danish without measures) and runs that through the function to fetch cheapest prices

b) Add the price of the food waste ingredients to the missing ingredients to find the total price of the recipe. Maybe show the cost saved as well.

c) Would be cool to add the name of the recipe above the table below (* Chicken curry *)

In [30]:
# Another sailinggroup API, which find relevant products

token = "SG_APIM_HMARAVTQFBDNMSBC5XGVGQFP395APFF4MAV2C99FBAAZV6Q6TZ6G"
headers = {
    "Authorization": f"Bearer {token}"
}

def get_relevant_products(query):
    url = f"https://api.sallinggroup.com/v1/product-suggestions/relevant-products?query={query}"
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()
        print(f"Found {len(data)} relevant products for query: '{query}'")
        return data
    else:
        print(f"Error {response.status_code}: {response.text}")
        return []

relevant_products = get_relevant_products("kyllingbryst")

# Loop through the first 5 suggested products
for p in relevant_products.get("products", [])[:5]:
    print(f"- {p.get('name')} (EAN: {p.get('ean')})")



Found 1 relevant products for query: 'kyllingbryst'


In [31]:
#print(json.dumps(relevant_products, indent=2, ensure_ascii=False))

In [32]:
# BilkaToGo suggestion search
def get_relevant_bilka_products(query):
    url = f"https://api.sallinggroup.com/v1/product-suggestions/relevant-products?query={query}"
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        return response.json().get("suggestions", [])
    else:
        print(f"Error {response.status_code}: Failed to fetch for '{query}'")
        return []

# Main function to find cheapest product per ingredient
def find_cheapest_bilka_options(ingredient_list):
    results = []

    for ingredient in ingredient_list:
        suggestions = get_relevant_bilka_products(ingredient)

        if not suggestions:
            results.append({
                "prod_id": None,
                "ingredient": ingredient,
                "product_name": "Not found",
                "price": None,
                "link": None
            })
            continue

        # Find the product with the lowest price
        cheapest = min(suggestions, key=lambda x: x.get("price", float('inf')))

        results.append({
            "prod_id": cheapest.get("prod_id"),
            "ingredient": ingredient,
            "product_name": cheapest.get("title"),
            "price": cheapest.get("price"),
            "link": cheapest.get("link")
        })

    return pd.DataFrame(results)


In [33]:
# Example: Missing ingredients from LLM-generated recipe
missing_ingredients = ["mælk", "kyllingebryst", "gulerødder", "løg"]
recipe_name = "Curry chicken"  # Replace dynamically if needed

# Step 1: Get cheapest options
df_cheapest = find_cheapest_bilka_options(missing_ingredients)

# Step 2: Rename columns for readability
df_cheapest = df_cheapest.rename(columns={
    'ingredient': 'Ingredient',
    'product_name': 'Suggested Product',
    'price': 'Price (DKK)',
    'link': 'Link'
})

# Step 3: Calculate total price
total_price = df_cheapest["Price (DKK)"].sum()

# Step 4: Display heading
print(f"To make *{recipe_name}*, you also need to buy the following ingredients:")

# Step 5: Show styled table (bold headers + price formatting)
df_cheapest_display = df_cheapest[['Ingredient', 'Suggested Product', 'Price (DKK)', 'Link']].style\
    .set_table_styles([{'selector': 'th', 'props': [('font-weight', 'bold')]}])\
    .format({'Price (DKK)': '{:.2f}'})

# Step 6: Show the styled DataFrame
display(df_cheapest_display)

# Step 7: Print total price
print(f"\n Total price to make the recipe: {total_price:.2f} DKK")


To make *Curry chicken*, you also need to buy the following ingredients:


Unnamed: 0,Ingredient,Suggested Product,Price (DKK),Link
0,mælk,"Letmælk 1,5% fedt",10.0,https://www.bilkatogo.dk/p/144745
1,kyllingebryst,Kyllingebrystfilet,38.95,https://www.bilkatogo.dk/p/115245
2,gulerødder,Snackgulerødder,9.0,https://www.bilkatogo.dk/p/39764
3,løg,Rødløg,8.0,https://www.bilkatogo.dk/p/138030



 Total price to make the recipe: 65.95 DKK
