In [6]:
# make sure we are in WineRecommender so the paths work
import os

# Only step up when the notebook was started from inside the 'evaluation'
# folder itself. A substring test on the whole path would also fire for any
# unrelated parent directory whose name merely contains "evaluation" and would
# then move the working directory out of the project root.
if os.path.basename(os.getcwd()) == 'evaluation':
    os.chdir("..")
print(os.getcwd())

/home/iris/Documents/Uni/AIR/WineRecommender


## Data structures

In [7]:
import re
import pandas as pd

# how many results are requested per query
RESULTS_PER_QUERY = 10

df_cleaned = pd.read_csv('models/data/cleaned_wine_data.csv')


def _unique_values(column):
    """Return the distinct values of one df_cleaned column as a plain list."""
    return df_cleaned[column].unique().tolist()


def _with_underscores(names):
    """Return the names with every space and hyphen replaced by an underscore."""
    return [re.sub(r'[ -]', '_', name) for name in names]


unique_countries = _unique_values('country')

unique_varieties = _unique_values('variety')
normalized_unique_varieties = _with_underscores(unique_varieties)

unique_regions = _unique_values('region_1')
normalized_unique_regions = _with_underscores(unique_regions)

unique_wineries = _unique_values('winery')
normalized_unique_wineries = _with_underscores(unique_wineries)

# price category -> query keywords that select it
price_keywords = dict(
    budget=['budget', 'inexpensive', 'cheap'],
    mid_range=['mid_range', 'mid_priced', 'affordable'],
    premium=['premium', 'luxury', 'fine', 'expensive'],
)


# copy paste from data processing
# flavor family -> query keywords that select it
flavor_keywords = dict(
    fruit=['berry', 'cherry', 'apple', 'citrus', 'tropical', 'fruit', 'blackberry', 'raspberry', 'fruity'],
    dry=['dry', 'crisp', 'tannic', 'tannins', 'tannin'],
    sweet=['sweet', 'honey', 'ripe', 'jam'],
    oak=['oak', 'vanilla', 'toast', 'cedar', 'oaky', 'toasty'],
    spice=['spice', 'pepper', 'cinnamon', 'clove'],
    herbal=['herbal', 'grass', 'mineral', 'earth', 'earthy', 'herbs', 'grassy'],
)

# dictionary created with ChatGPT
# Adjectives and alternative spellings per country. The country name itself is
# not repeated here; it is put in front of these when country_keywords is
# assembled below.
_country_aliases = {
    'italy': ['italian'],
    'portugal': ['portuguese', 'portugese'],
    'us': ['usa', 'united states', 'united states of america'],
    'spain': ['spanish'],
    'france': ['french'],
    'germany': ['german'],
    'argentina': ['argentinian'],
    'chile': ['chilean'],
    'australia': ['australian'],
    'austria': ['austrian'],
    'south africa': ['south african'],
    'new zealand': ['kiwi'],
    'israel': ['israeli'],
    'hungary': ['hungarian'],
    'greece': ['greek'],
    'romania': ['romanian'],
    'mexico': ['mexican'],
    'canada': ['canadian'],
    'turkey': ['turkish', 'turkiye'],
    'czech republic': ['czech', 'czechia'],
    'slovenia': ['slovenian'],
    'luxembourg': ['luxembourger', 'luxembourgian'],
    'croatia': ['croatian'],
    'georgia': ['georgian'],
    'uruguay': ['uruguayan'],
    'england': ['english', 'uk', 'united kingdom', 'britain', 'british'],
    'lebanon': ['lebanese'],
    'serbia': ['serbian'],
    'brazil': ['brazilian'],
    'moldova': ['moldovan'],
    'morocco': ['moroccan'],
    'peru': ['peruvian'],
    'india': ['indian'],
    'bulgaria': ['bulgarian'],
    'cyprus': ['cypriot'],
    'armenia': ['armenian'],
    'switzerland': ['swiss'],
    'bosnia and herzegovina': ['bosnia', 'bosnian'],
    'ukraine': ['ukrainian'],
    'slovakia': ['slovak'],
    'macedonia': ['north macedonia', 'macedonian'],
    'china': ['chinese'],
    'egypt': ['egyptian'],
}
# end of content created with ChatGPT

# lowercase country name -> all keywords for it, the name itself first
country_keywords = {
    country: [country] + aliases
    for country, aliases in _country_aliases.items()
}

print("Data Structures created")


Data Structures created


## Baseline 

In [13]:
import pickle
import os
from sklearn.metrics.pairwise import cosine_similarity

# baseline: tf-idf and cosine similarity


# find known words in user query and transform to features
def process_query(query):
    """Translate a free-text query into feature tokens.

    The query is lowercased and hyphens are treated as spaces. The resulting
    words, together with their underscore-joined two- and three-word
    sequences, are looked up in the module-level vocabularies:

    * ``country_keywords``  -> ``'country_' + country``
    * ``flavor_keywords``   -> ``'flavor_' + flavor`` (single words only)
    * ``price_keywords``    -> ``'pricecat_' + price`` (single words and pairs)
    * ``normalized_unique_varieties`` / ``normalized_unique_regions`` /
      ``normalized_unique_wineries`` -> ``'variety_'`` / ``'region_'`` /
      ``'winery_'`` + the lowercased, underscore-joined name

    Every match adds one token, so a keyword that occurs twice yields two
    tokens. Country keywords may span several words ("south africa",
    "united states of america"); they are matched against consecutive query
    words, longest keyword first and without overlap, so "czech republic"
    counts once and not a second time for the contained "czech".

    Args:
        query: The search text entered by the user.

    Returns:
        The list of tokens found for this query. The same tokens are also
        appended to the module-level list ``query_features``, which is how
        existing callers receive them.

    Raises:
        NameError: If ``query_features`` or one of the vocabularies has not
            been defined at module level.
    """
    from collections import Counter

    words = query.replace('-', ' ').lower().split()
    two_words = [words[i] + '_' + words[i + 1] for i in range(len(words) - 1)]
    three_words = [words[i] + '_' + words[i + 1] + '_' + words[i + 2] for i in range(len(words) - 2)]

    # Occurrence counts replace the nested "compare everything with
    # everything" loops; a missing key simply counts as 0.
    word_counts = Counter(words)
    word_and_pair_counts = Counter(words + two_words)
    ngram_counts = Counter(words + two_words + three_words)

    def count_keyword_hits(keywords):
        """Count non-overlapping matches of the keywords in the query words."""
        # Longest phrase first, so a multi-word keyword wins over a shorter
        # keyword that starts at the same word.
        phrases = sorted((keyword.split() for keyword in keywords), key=len, reverse=True)
        phrases = [phrase for phrase in phrases if phrase]
        hits = 0
        position = 0
        while position < len(words):
            for phrase in phrases:
                if words[position:position + len(phrase)] == phrase:
                    hits += 1
                    position += len(phrase)
                    break
            else:
                position += 1
        return hits

    features = []

    for country, keywords in country_keywords.items():
        features.extend(['country_' + country] * count_keyword_hits(keywords))

    for flavor, keywords in flavor_keywords.items():
        hits = sum(word_counts[keyword] for keyword in keywords)
        features.extend(['flavor_' + flavor] * hits)

    for price, keywords in price_keywords.items():
        hits = sum(word_and_pair_counts[keyword] for keyword in keywords)
        features.extend(['pricecat_' + price] * hits)

    for prefix, vocabulary in (
        ('variety_', normalized_unique_varieties),
        ('region_', normalized_unique_regions),
        ('winery_', normalized_unique_wineries),
    ):
        for entry in vocabulary:
            name = entry.lower()
            features.extend([prefix + name] * ngram_counts[name])

    # Existing callers read the result from the module-level list.
    query_features.extend(features)
    return features


query = "pinot grigio" # enter your query here
# process_query appends to this module-level list, so it has to be reset
# before every call; otherwise features of an earlier query would remain in it.
query_features = []
process_query(query)
print(f"Query: {query}")
# print(f"Query Features: {query_features}\n")


# tf-idf of query
# The recognised features are joined into one space-separated string and
# transformed with the previously fitted vectorizer.
# Security: pickle.load can execute arbitrary code, only load files from a trusted source.
with open('models/data/vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)
# NOTE(review): when no keyword was recognised, query_string is empty; presumably
# all similarities are then 0 and the ranking below is arbitrary - verify.
query_string = " ".join(query_features)
query_tfidf = vectorizer.transform([query_string])

# cosine similarity
# Security: same pickle caveat as above.
with open('models/data/feature_vectors.pkl', 'rb') as file:
    feature_vectors = pickle.load(file)
# [0] selects the similarity row of the single query
cosine_similarities = cosine_similarity(query_tfidf, feature_vectors)[0]
# positions within feature_vectors, ordered from most to least similar
recommended_indices = cosine_similarities.argsort()[::-1]


# list to collect results from model + baseline(s)
result_list = []

# NOTE(review): nothing is displayed inside this context manager, so the
# max_colwidth option appears to have no effect here.
with pd.option_context('display.max_colwidth', None):
    for i in range(RESULTS_PER_QUERY):
        # NOTE(review): this looks the wine up by the value of the 'Unnamed: 0' column,
        # while recommended_indices holds positions within feature_vectors. That
        # assumes 'Unnamed: 0' equals the row position used when the vectors were
        # built - TODO confirm. An id without a matching row makes .values[0]
        # raise IndexError.
        wine = df_cleaned[df_cleaned['Unnamed: 0'] == recommended_indices[i]]
        result_list.append({"title": wine['title'].values[0],
                            "model": "tf-idf_baseline",
                            "rank": i + 1, "score": None,
                            "description": wine['description'].values[0][:200] + "...", # first 200 characters
                            "country": wine['country'].values[0],
                            # NOTE(review): int() raises on a missing (NaN) price - presumably the cleaned data has none; verify
                            "price": f"${int(wine['price'].values[0])}",
                            "variety": wine['variety'].values[0],
                            "region": wine['region_1'].values[0]
                           })

print(f"Total of {len(result_list)} Results generated!")
# print(result_list)


Query: pinot grigio
Total of 10 Results generated!


## Model

In [14]:
# get n best results from model and add to result_list
# note: every time this ran into an error about the machine not having a gpu, it worked on the second try

from models.wine_recommender import get_wine_recommendations

results_model = get_wine_recommendations(query, top_n=RESULTS_PER_QUERY)

# Make the cell safe to re-run: drop the model results a previous run of this
# cell appended, so they are not stacked a second time. The list is changed
# in place so that it stays the same object.
result_list[:] = [result for result in result_list if result["model"] != "neural_network"]

# Stop at the number of results that were actually returned, so a shorter
# answer does not raise IndexError; the count check below reports the shortfall.
for i in range(min(RESULTS_PER_QUERY, len(results_model))):
    recommendation = results_model[i]
    result_list.append({"title": recommendation['name'],
                        "model": "neural_network",
                        "rank": i + 1,
                        "score": None,
                        "description": recommendation['description'],
                        "country": recommendation['country'],
                        "price": recommendation['price'],
                        # .get(): these two keys may be absent from a recommendation
                        "variety": recommendation.get("variety"),
                        "region": recommendation.get("region")
                       })

# baseline results + model results are expected at this point
if len(result_list) == RESULTS_PER_QUERY * 2:
    print(f"Total of {len(result_list)} Results generated!")
else:
    print(f"Total of {len(result_list)} Results generated! {RESULTS_PER_QUERY * 2} Results should have been created instead, consider rerunning sections 'Baseline' and 'Model'")

print(result_list)


Total of 20 Results generated!
[{'title': 'Domaine Marcel Deiss 2012 Pinot Gris (Alsace)', 'model': 'tf-idf_baseline', 'rank': 1, 'score': None, 'description': 'A dry style of Pinot Gris, this is crisp with some acidity. It also has weight and a solid, powerful core of spice and baked apple flavors. With its structure still developing, the wine needs to age. ...', 'country': 'France', 'price': '$32', 'variety': 'Pinot Gris', 'region': 'Alsace'}, {'title': 'Domaine Gresser 2013 Kritt Gewurztraminer (Alsace)', 'model': 'tf-idf_baseline', 'rank': 2, 'score': None, 'description': 'Well-drained gravel soil gives this wine its crisp and dry character. It is ripe and fruity, although the spice is subdued in favor of a more serious structure. This is a wine to age for a couple of y...', 'country': 'France', 'price': '$30', 'variety': 'Gewürztraminer', 'region': 'Alsace'}, {'title': 'Citation 2004 Pinot Noir (Oregon)', 'model': 'tf-idf_baseline', 'rank': 3, 'score': None, 'description': 'Citati

## Scoring

In [20]:
import json
import os
import random

# format of each stored result: {"title": wine title, "model": "tf-idf_baseline" or "neural_network", "rank": i + 1, "score": entered score}

def write_json(data, json_file_path='evaluation/scores.json'):
    """Append one query record to the scores file, creating the file if needed.

    The file holds a single JSON object of the form ``{"queries": [...]}``;
    ``data`` is added as the last element of that list.

    The complete document is serialised first and then written with mode
    ``'w'``, which truncates the file. Rewriting in place through ``'r+'``
    without truncating would leave the tail of the old content behind
    whenever the new text is shorter (for example after the file had been
    reformatted with a wider indent), producing invalid JSON.

    Args:
        data: JSON-serialisable record to store.
        json_file_path: Location of the scores file. Defaults to
            ``'evaluation/scores.json'``, relative to the working directory.

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid JSON.
        KeyError: If the existing file has no ``"queries"`` entry.
        TypeError: If ``data`` cannot be serialised; the file is left
            untouched in that case.
    """
    if os.path.exists(json_file_path):
        with open(json_file_path, 'r') as file:
            file_data = json.load(file)
        file_data["queries"].append(data)
    else:
        # if file doesn't exist, create list
        file_data = {"queries": [data]}

    # Serialise before opening for writing so that a failure cannot leave a
    # half-written file behind.
    serialized = json.dumps(file_data, indent = 4)
    with open(json_file_path, 'w') as file:
        file.write(serialized)


def _ask_for_score():
    """Prompt until the rater enters 0, 1, 2 or 3, or types "exit".

    Returns:
        The accepted score exactly as typed ("0", "1", "2" or "3"), or None
        when the rater typed "exit" (in any capitalisation).
    """
    while True:
        answer = input("Score: ").strip()
        if answer.lower() == "exit":
            return None
        # Compare against the allowed answers instead of isdigit() + int():
        # isdigit() also accepts characters such as "²" that int() cannot
        # convert, which would abort the whole scoring run with a ValueError.
        if answer in ("0", "1", "2", "3"):
            return answer


def score():
    """Interactively collect a score for every entry of ``result_list_randomized``.

    Each wine is printed together with the current ``query`` and the rater is
    asked for a score from 0 to 3. A wine with the same title and description
    as an earlier entry is not shown again; it receives the score of that
    earlier entry. Variety and region are only printed when every entry has
    both values.

    Scores are stored under ``"score"`` in the result dicts as the string
    that was typed. Entries rated before an "exit" keep their score in the
    dicts; only the return value signals that nothing should be written.

    Returns:
        True when all entries were scored, False when the rater typed "exit".
    """
    hide_variety_and_region = any(
        result['variety'] is None or result['region'] is None
        for result in result_list_randomized
    )

    # (title, description) -> score given at the first occurrence of that wine
    scores_by_wine = {}

    for result in result_list_randomized:
        wine_key = (result["title"], result["description"])

        # check if this wine was already checked, if yes copy score and skip
        if wine_key in scores_by_wine:
            result["score"] = scores_by_wine[wine_key]
            continue

        print(
            "\n=============================================================\n"
            f"Query: {query}\n\n"
            f"Wine Name: {result['title']}\n"
            f"Description: {result['description']}\n"
            f"Country: {result['country']}\n"
            f"Price: {result['price']}"
        )
        if not hide_variety_and_region:
            print(
                f"Variety: {result['variety']}\n"
                f"Region: {result['region']}\n"
            )

        answer = _ask_for_score()
        if answer is None:
            print("Scoring canceled, no scores will be written to file")
            return False

        result["score"] = answer
        scores_by_wine[wine_key] = answer
    return True

def sort():
    """Split ``result_list_randomized`` by model and order each part by rank.

    Entries whose ``"model"`` is neither ``"neural_network"`` nor
    ``"tf-idf_baseline"`` are reported on stdout and left out.

    Returns:
        A tuple ``(neural_network_results, tf_idf_baseline_results)``, each
        sorted by ascending ``"rank"``.
    """
    by_model = {"neural_network": [], "tf-idf_baseline": []}

    for entry in result_list_randomized:
        origin = entry["model"]
        bucket = by_model.get(origin)
        if bucket is None:
            print(f"Model not found: '{origin}'")
            continue
        bucket.append(entry)

    def rank_of(entry):
        return entry["rank"]

    return (
        sorted(by_model["neural_network"], key=rank_of),
        sorted(by_model["tf-idf_baseline"], key=rank_of),
    )

def filter_fields(result_list):
    """Return copies of the results reduced to title, model, rank and score.

    The input dicts are not modified; keys keep the order they have in the
    original result.
    """
    kept_keys = ("title", "model", "rank", "score")
    return [
        {key: value for key, value in result.items() if key in kept_keys}
        for result in result_list
    ]
        

# Score the wines in shuffled order so the presentation order does not bias the
# scores; a copy is shuffled, result_list itself keeps its order.
result_list_randomized = list(result_list)
random.shuffle(result_list_randomized)

scoring_completed = score()
if scoring_completed:
    sorted_neural_network, sorted_tf_idf_baseline = sort()

    print("Writing results to 'scores.json'")
    # one record per query: the query text plus the reduced results of both models
    query_record = {
        "query": query,
        "baseline": filter_fields(sorted_tf_idf_baseline),
        "model": filter_fields(sorted_neural_network),
    }
    write_json(query_record)



Query: pinot grigio

Wine Name: Chamlija 2015 Blanc de Noirs Papaskarasi (Thrace)
Description: This lightly pressed white version of an indigenous red grape is pale-straw in color, with a nose of lemon. It is slightly stiff on the palate, with flavors of lemon and peach and a refreshing, almost...
Country: Turkey
Price: $24


Score:  0



Query: pinot grigio

Wine Name: Turasan 2014 Emir (Turkey)
Description: A bouquet of peach, melon and freesia sets the scene for flavors of white peach, apricot, watermelon and rose petal with a touch of spearmint. It is bright on entry and soft in the mouth, and the spea...
Country: Turkey
Price: $15


Score:  0



Query: pinot grigio

Wine Name: Domaine Gresser 2013 Kritt Gewurztraminer (Alsace)
Description: Well-drained gravel soil gives this wine its crisp and dry character. It is ripe and fruity, although the spice is subdued in favor of a more serious structure. This is a wine to age for a couple of y...
Country: France
Price: $30


Score:  0



Query: pinot grigio

Wine Name: Domaine Ehrhart 2013 Domaine Saint-Rémy Herrenweg Gewurztraminer (Alsace)
Description: Initially quite muted, this wine slowly develops impressive richness and spice. It's not sweet, more medium dry, with the spice forming a core of dryness that contrasts with the honeyed texture. It ca...
Country: France
Price: $28


Score:  0



Query: pinot grigio

Wine Name: Domaine Rieflé-Landmann 2013 Seppi Landmann Vallée Noble Pinot Gris (Alsace)
Description: While it's rich, this beautiful dry wine also offers considerable freshness. Acidity cuts easily through the ripe white fruit, pear and red apples, allowing room for spice that provides a contrasting ...
Country: France
Price: $28


Score:  3



Query: pinot grigio

Wine Name: Turasan 2015 Emir (Cappadocia)
Description: Aromas of white peach and apricot set the scene for flavors of white peach, freesia and thyme. This wine is full in the mouth, with a floral flourish on the bright finish....
Country: Turkey
Price: $19


Score:  0



Query: pinot grigio

Wine Name: Budureasca 2016 Vine in Flames Feteascǎ Regalǎ (Dealu Mare)
Description: This straw colored wine made from Feteasca Regala has aromas of lemon pith and Bartlett pear. In the mouth there is bracing acidity, with flavors of lemon, melon and white peach....
Country: Romania
Price: $9


Score:  0



Query: pinot grigio

Wine Name: Pheasant's Tears 2009 Dry Unfiltered Amber Wine Rkatsiteli (Kakheti)
Description: Aromatic, with notes of black tea leaves, sandalwood and incense, this smoky Georgian amber wine boasts delicate orange rind and blossom flavors, swathed in bristling, chestnut-skin-like tannins. Ther...
Country: Georgia
Price: $18


Score:  0



Query: pinot grigio

Wine Name: COS 2013 Frappato (Sicilia)
Description: Intense aromas of wild cherry, baking spice, tilled soil and savory herb lead the nose on this soulful, silky red. The round, smooth palate doles out juicy red cherry, strawberry jelly, mineral, white...
Country: Italy
Price: $30


Score:  0



Query: pinot grigio

Wine Name: Cusumano 2012 Sàgana Tenuta San Giacomo Nero d'Avola (Sicilia)
Description: Blackberry, cassis, grilled herb and toasted aromas come together in the glass. On the palate, espresso, mint and black pepper add depth to the core of black cherry and blackberry flavors. It finishes...
Country: Italy
Price: $40


Score:  0



Query: pinot grigio

Wine Name: Dalton 2012 Oak Aged Cabernet Sauvignon (Galilee)
Description: A bouquet of black cherry, tart cranberry and clove opens into flavors of cherry, anisette, espresso bean and mint, with a hint of tart cranberry. The minty notes can almost seem overly strong for a m...
Country: Israel
Price: $20


Score:  0



Query: pinot grigio

Wine Name: Kindzmarauli Marani 2014 Kisi (Kakheti)
Description: Medium-gold in the glass, this wine has aromas of apricot, melon and orange blossom. There is a soft sense of coolness on the palate, with flavors of apricot, marzipan, orange zest and guava....
Country: Georgia
Price: $15


Score:  0



Query: pinot grigio

Wine Name: Zlatan Otok 2012 Vrhunsko Bijelo Vino Posip (Hvar)
Description: This has aromas of Bartlett pear and black Mission figs. In the mouth, there are flavors of pear, lemon and fresh picked apricots....
Country: Croatia
Price: $20


Score:  0



Query: pinot grigio

Wine Name: Vino z Czech 2011 Welschriesling (Moravia)
Description: Smoke and struck-flint notes add a dark tone to this savory, herb-inflected white wine. The palate is round and rich, and finishes with a reverberating hint of salt and bitter lemon rind....
Country: Czech Republic
Price: $25


Score:  0



Query: pinot grigio

Wine Name: Dr. H. Thanisch (Erben Müller-Burggraef) 2013 Brauneberger Juffer-Sonnenuhr Spätlese Riesling (Mosel)
Description: Notes of honeysuckle and cantaloupe sweeten this deliciously feather-light spätlese. It's intensely juicy, quenching the palate with streams of tart tangerine and grapefruit acidity, yet wraps up with...
Country: Germany
Price: $28


Score:  0



Query: pinot grigio

Wine Name: Domaine Marcel Deiss 2012 Pinot Gris (Alsace)
Description: A dry style of Pinot Gris, this is crisp with some acidity. It also has weight and a solid, powerful core of spice and baked apple flavors. With its structure still developing, the wine needs to age. ...
Country: France
Price: $32


Score:  3



Query: pinot grigio

Wine Name: Citation 2004 Pinot Noir (Oregon)
Description: Citation is given as much as a decade of bottle age prior to release, which means it is pre-cellared and drinking at its peak. Baked cherry, cocoa and coconut flavors combine gracefully, with soft, se...
Country: US
Price: $75


Score:  2



Query: pinot grigio

Wine Name: Caves Transmontanas 2006 Vértice Pinot Noir (Douro)
Description: Fresh and fruity, this is full of red cherry flavors and crisp acidity. It has some age, hinting at a more toasty future. For the moment, it's at its best after eight years aging on the lees in the bo...
Country: Portugal
Price: $48


Score:  2



Query: pinot grigio

Wine Name: Telavi 2009 Satrapezo 10 Qvevri Rkatsiteli (Kakheti)
Description: Notes of rose and waxy flower mingle with savory nut and nut skin on this traditionally-made Georgian wine. Long maceration on the skins in clay vessels gives it a deep, golden hue, and a concentrated...
Country: Georgia
Price: $30


Score:  0


Writing results to 'scores.json'
