## Data structures

In [5]:
import re
import pandas as pd

# Load the cleaned wine table used by the cells below.
df_cleaned = pd.read_csv('WineRecommender/models/data/cleaned_wine_data.csv')


def _distinct(column):
    """Return the distinct values of one df_cleaned column as a plain list."""
    return df_cleaned[column].unique().tolist()


def _underscored(names):
    """Return the names with every space and hyphen replaced by '_'."""
    return [re.sub(r'[ -]', '_', name) for name in names]


unique_countries = _distinct('country')
print(unique_countries)

# Varieties, regions and wineries are additionally kept in an underscore
# form so that multi-word names can be compared with '_'-joined query words.
unique_varieties = _distinct('variety')
normalized_unique_varieties = _underscored(unique_varieties)

unique_regions = _distinct('region_1')
normalized_unique_regions = _underscored(unique_regions)

unique_wineries = _distinct('winery')
normalized_unique_wineries = _underscored(unique_wineries)

# Query keywords for each price category. process_query() compares every
# keyword with the single query words and with adjacent word pairs joined by
# '_' (hyphens in the query are turned into spaces first) and emits
# 'pricecat_<category>' on a match.
price_keywords = {
    'budget': ['budget', 'inexpensive', 'cheap'],
    'mid_range': ['mid_range', 'mid_priced', 'affordable'],
    'premium': ['premium', 'luxury', 'fine', 'expensive']
}

# Price thresholds for the categories above, kept for reference only
# (presumably the rule applied during data processing - TODO confirm).
# def price_category(price):
#      if price < 25: return 'budget'
#      elif price < 50: return 'mid_range'
#      else: return 'premium'


# copy paste from data processing
# Query keywords for each flavor group. process_query() compares them with
# single query words only and emits 'flavor_<group>' on a match.
flavor_keywords = {
    'fruit': ['berry', 'cherry', 'apple', 'citrus', 'tropical', 'fruit', 'blackberry', 'raspberry', 'fruity'],
    'dry': ['dry', 'crisp', 'tannic', 'tannins', 'tannin'],
    'sweet': ['sweet', 'honey', 'ripe', 'jam'],
    'oak': ['oak', 'vanilla', 'toast', 'cedar', 'oaky', 'toasty'],
    'spice': ['spice', 'pepper', 'cinnamon', 'clove'],
    'herbal': ['herbal', 'grass', 'mineral', 'earth', 'earthy', 'herbs', 'grassy']
}

# dictionary created with ChatGPT
# Maps a lower-case country name to the query keywords (names, demonyms and
# spelling variants) that refer to it. process_query() emits 'country_<key>'
# when one of the keywords is found in the query. The dataset value
# 'Unknown_Country' has no entry here.
country_keywords = {
    'italy': ['italy', 'italian'],
    'portugal': ['portugal', 'portuguese', 'portugese'],
    'us': ['us', 'usa', 'united states', 'united states of america'],
    'spain': ['spain', 'spanish'],
    'france': ['france', 'french'],
    'germany': ['germany', 'german'],
    'argentina': ['argentina', 'argentinian'],
    'chile': ['chile', 'chilean'],
    'australia': ['australia', 'australian'],
    'austria': ['austria', 'austrian'],
    'south africa': ['south africa', 'south african'],
    'new zealand': ['new zealand', 'kiwi'],
    'israel': ['israel', 'israeli'],
    'hungary': ['hungary', 'hungarian'],
    'greece': ['greece', 'greek'],
    'romania': ['romania', 'romanian'],
    'mexico': ['mexico', 'mexican'],
    'canada': ['canada', 'canadian'],
    'turkey': ['turkey', 'turkish', 'turkiye'],
    'czech republic': ['czech republic', 'czech', 'czechia'],
    'slovenia': ['slovenia', 'slovenian'],
    'luxembourg': ['luxembourg', 'luxembourger', 'luxembourgian'],
    'croatia': ['croatia', 'croatian'],
    'georgia': ['georgia', 'georgian'],
    'uruguay': ['uruguay', 'uruguayan'],
    'england': ['england', 'english', 'uk', 'united kingdom', 'britain', 'british'],
    'lebanon': ['lebanon', 'lebanese'],
    'serbia': ['serbia', 'serbian'],
    'brazil': ['brazil', 'brazilian'],
    'moldova': ['moldova', 'moldovan'],
    'morocco': ['morocco', 'moroccan'],
    'peru': ['peru', 'peruvian'],
    'india': ['india', 'indian'],
    'bulgaria': ['bulgaria', 'bulgarian'],
    'cyprus': ['cyprus', 'cypriot'],
    'armenia': ['armenia', 'armenian'],
    'switzerland': ['switzerland', 'swiss'],
    'bosnia and herzegovina': ['bosnia and herzegovina', 'bosnia', 'bosnian'],
    'ukraine': ['ukraine', 'ukrainian'],
    'slovakia': ['slovakia', 'slovak'],
    'macedonia': ['macedonia', 'north macedonia', 'macedonian'],
    'china': ['china', 'chinese'],
    'egypt': ['egypt', 'egyptian']
}
# end of content created with ChatGPT


['Italy', 'Portugal', 'US', 'Spain', 'France', 'Germany', 'Argentina', 'Chile', 'Australia', 'Austria', 'South Africa', 'New Zealand', 'Israel', 'Hungary', 'Greece', 'Romania', 'Mexico', 'Canada', 'Unknown_Country', 'Turkey', 'Czech Republic', 'Slovenia', 'Luxembourg', 'Croatia', 'Georgia', 'Uruguay', 'England', 'Lebanon', 'Serbia', 'Brazil', 'Moldova', 'Morocco', 'Peru', 'India', 'Bulgaria', 'Cyprus', 'Armenia', 'Switzerland', 'Bosnia and Herzegovina', 'Ukraine', 'Slovakia', 'Macedonia', 'China', 'Egypt']


## Baseline 

In [17]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# baseline: tf-idf and cosine similarity

# TODO: support numbers for price input

# find known words in user query and transform to features
def process_query(query):
    """Translate a free-text query into known feature tokens.

    The query is lower-cased and hyphens become spaces. The resulting words,
    together with adjacent 2- and 3-word combinations joined by '_', are
    compared with the keyword tables (country, flavor, price) and with the
    normalized variety, region and winery names.

    Every match is appended to the module-level ``query_features`` list, which
    the caller has to create before calling. The matches of this call are
    also returned.

    Args:
        query: The user's query string.

    Returns:
        The feature strings found in this call (e.g. 'flavor_fruit'), one
        entry per occurrence in the query.
    """
    words = query.replace('-', ' ').lower().split()
    two_words = ['_'.join(words[i:i + 2]) for i in range(len(words) - 1)]
    three_words = ['_'.join(words[i:i + 3]) for i in range(len(words) - 2)]
    found = []

    def add_keyword_hits(prefix, table, grams):
        # One feature per occurrence of any keyword of a label in `grams`.
        for label, keywords in table.items():
            for keyword in keywords:
                found.extend([prefix + label] * grams.count(keyword))

    def add_name_hits(prefix, names):
        # Names are matched against 1-, 2- and 3-word combinations.
        grams = words + two_words + three_words
        for name in names:
            target = name.lower()
            found.extend([prefix + target] * grams.count(target))

    # Country keywords are written with spaces ('united states'), so they are
    # matched as word sequences; comparing them with single words only would
    # never find the multi-word ones.
    # NOTE(review): multi-word keys yield features containing a space
    # (e.g. 'country_south africa') - confirm this matches how the
    # vectorizer's training features were written.
    for country, keywords in country_keywords.items():
        for keyword in keywords:
            parts = keyword.split()
            if not parts:
                continue
            for start in range(len(words) - len(parts) + 1):
                if words[start:start + len(parts)] == parts:
                    found.append('country_' + country)

    add_keyword_hits('flavor_', flavor_keywords, words)
    # Price keywords such as 'mid_range' span two query words.
    add_keyword_hits('pricecat_', price_keywords, words + two_words)

    add_name_hits('variety_', normalized_unique_varieties)
    add_name_hits('region_', normalized_unique_regions)
    add_name_hits('winery_', normalized_unique_wineries)

    query_features.extend(found)
    return found

                

# process_query() appends its matches to the module-level query_features
# list, so the list has to exist before the call.
query = "premium cherry wine"
query_features = []
process_query(query)
print(f"Query: {query}")
# print(f"Query Features: {query_features}\n")


# tf-idf of query
# NOTE(review): pickle.load runs arbitrary code stored in the file; only load
# artifacts that were produced by this project.
with open('WineRecommender/models/data/vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)
query_string = " ".join(query_features)
query_tfidf = vectorizer.transform([query_string])

# cosine similarity
with open('WineRecommender/models/data/feature_vectors.pkl', 'rb') as file:
    feature_vectors = pickle.load(file)
cosine_similarities = cosine_similarity(query_tfidf, feature_vectors)[0]
# argsort is ascending; reversing it puts the most similar wine first
recommended_indices = cosine_similarities.argsort()[::-1]


# list to collect results from model + baseline(s)
result_list = []

# keep the 10 best results (fewer if there are fewer than 10 vectors)
# NOTE(review): the row position in feature_vectors is stored as the wine id
# and later compared with the 'Unnamed: 0' column - assumes both numberings
# agree, TODO confirm.
for rank, index in enumerate(recommended_indices[:10], start=1):
    result_list.append({"id": int(index), "model": "tf-idf_baseline", "rank": rank, "score": None})


print(f"Total of {len(result_list)} Results generated!")


Query: premium cherry wine
Total of 10 Results generated!


## Random Baseline (temporary)

In [18]:
# baseline 2 - random
import random

wine_rows = df_cleaned.shape[0]
# Draw from the ids that actually exist: random.randint(0, wine_rows) includes
# its upper bound and ignores gaps in 'Unnamed: 0', so it can return an id
# that has no row.
wine_ids = df_cleaned['Unnamed: 0'].tolist()

print(f"Query: {query}")
# print(f"Query Features: {query_features}\n")

# print 10 random results
with pd.option_context('display.max_colwidth', None):
    for i in range(10):
        random_index = random.choice(wine_ids)
        # only needed by the debug print below
        wine = df_cleaned[df_cleaned['Unnamed: 0'] == random_index]
        result_list.append({"id": random_index, "model": "random_baseline", "rank": i + 1, "score": None})
        #print(
        #     f"Wine #{i+1}:\n"
        #     f"Country: {wine['country'].values[0]}\n"
        #     f"Description: {wine['description'].values[0]}\n"
        #     f"Points: {wine['points'].values[0]}\n"
        #     f"Price: {wine['price'].values[0]}\n"
        #     f"Province: {wine['province'].values[0]}\n"
        #     f"Region: {wine['region_1'].values[0]}\n"
        #     f"Variety: {wine['variety'].values[0]}\n"
        #     f"Winery: {wine['winery'].values[0]}\n"
        #     f"Wine number in documents: {random_index}\n" # debug info
        #     f"Wine features: {wine['enhanced_features'].values[0]}\n" # debug info
        #     "\n"
        # )

print(f"Total of {len(result_list)} Results generated!")

Query: premium cherry wine
Total of 20 Results generated!


## Scoring

In [19]:
import json
import os

# result entry format: {"id": <wine id>, "model": <model name>, "rank": <1-based rank>, "score": <None until scored>}

def write_json(data, json_file_path='WineRecommender/evaluation/scores.json'):
    """Append one query record to the JSON score file.

    The file holds a single object of the form ``{"queries": [...]}``. It is
    created, together with its parent directory, when it does not exist yet.

    Args:
        data: JSON-serializable record for one query.
        json_file_path: Location of the score file.

    Raises:
        TypeError: If ``data`` cannot be serialized; the file is left
            untouched in that case.
        KeyError: If an existing file has no "queries" entry.
    """
    if os.path.exists(json_file_path):
        with open(json_file_path, 'r') as file:
            file_data = json.load(file)
    else:
        file_data = {"queries": []}
        parent_dir = os.path.dirname(json_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    file_data["queries"].append(data)

    # Serialize before opening the file for writing, so a serialization error
    # cannot leave a half-written file behind.
    serialized = json.dumps(file_data, indent=4)
    with open(json_file_path, 'w') as file:
        file.write(serialized)


def score():
    """Interactively collect a relevance score (0-3) for every shuffled result.

    Walks over the module-level ``result_list_randomized``, prints the wine
    behind each entry and stores the entered score in the entry's "score"
    field. A wine that occurs more than once is only asked about the first
    time; later occurrences reuse that score. An entry whose id has no row in
    ``df_cleaned`` is reported and skipped, keeping its current score.
    """
    valid_scores = {"0", "1", "2", "3"}
    scores_by_id = {}  # wine id -> score already assigned in this run

    for result in result_list_randomized:
        result_id = result["id"]

        # check if this wine was already checked, if yes copy score and skip
        if result_id in scores_by_id:
            result["score"] = scores_by_id[result_id]
            continue

        wine = df_cleaned[df_cleaned['Unnamed: 0'] == result_id]
        if wine.empty:
            # Nothing to show for this id; skip it instead of aborting the
            # session and losing the scores entered so far.
            print(f"No wine with id {result_id} found, skipping")
            scores_by_id[result_id] = result["score"]
            continue

        print(
            "=============================================================\n"
            f"Query: {query}\n\n"
            f"Wine:\n"
            f"Country: {wine['country'].values[0]}\n"
            f"Description: {wine['description'].values[0]}\n"
            f"Points: {wine['points'].values[0]}\n"
            f"Price: {wine['price'].values[0]}\n"
            f"Province: {wine['province'].values[0]}\n"
            f"Region: {wine['region_1'].values[0]}\n"
            f"Variety: {wine['variety'].values[0]}\n"
            f"Winery: {wine['winery'].values[0]}\n"
            f"Wine number in documents: {result_id}\n" # debug
        )

        # Compare with the allowed answers directly: str.isdigit() also
        # accepts characters such as '²', on which int() raises ValueError.
        answer = input("Score: ").strip()
        while answer not in valid_scores:
            answer = input("Score: ").strip()

        # NOTE(review): the score is stored as the entered string, not as an
        # int - confirm what the evaluation code expects before changing it.
        result["score"] = answer
        scores_by_id[result_id] = answer

def sort():
    """Split the shuffled results by model and order each group by rank.

    Reads the module-level ``result_list_randomized``. Entries whose model is
    neither "random_baseline" nor "tf-idf_baseline" are reported and dropped.

    Returns:
        Tuple ``(random baseline results, tf-idf baseline results)``, each
        sorted by ascending "rank".
    """
    per_model = {"random_baseline": [], "tf-idf_baseline": []}

    for result in result_list_randomized:
        model = result["model"]
        if model in per_model:
            per_model[model].append(result)
            continue
        print(f"Model not found: '{model}'")

    def by_rank(entry):
        return entry["rank"]

    per_model["random_baseline"].sort(key=by_rank)
    per_model["tf-idf_baseline"].sort(key=by_rank)
    return per_model["random_baseline"], per_model["tf-idf_baseline"]
    

# Score a shuffled copy so the order of presentation does not bias the scores.
# The copy is shallow: the entries are the same dicts as in result_list.
result_list_randomized = list(result_list)
random.shuffle(result_list_randomized)

score()

sorted_random_baseline, sorted_tf_idf_baseline = sort()

print("Writing results to 'scores.json'")
# TODO: replace with model once available
# Until then the tf-idf baseline is stored under "model".
query_scores = {
    "query": query,
    "baseline": sorted_random_baseline,
    "model": sorted_tf_idf_baseline,
}
write_json(query_scores)



Query: premium cherry wine

Wine:
Country: France
Description: A fine example of white Hermitage, Chapoutier's 2007 Chante-Alouette boasts attractive aromas of crushed stone, honey and pineapple, while in the mouth the wine goes down easily, flowing along like a gentle, honeyed stream. Harmonious. Drink now–2014, or after 2019 if you prefer aged Hermitage.
Points: 92
Price: 92.0
Province: Rhône Valley
Region: Hermitage
Variety: Marsanne
Winery: M. Chapoutier
Wine number in documents: 49274



Score:  1


Query: premium cherry wine

Wine:
Country: US
Description: So rich, ripe and succulent, you can hardly keep from drinking it now. But don't. This is one for the cellar. The alcohol is a refreshingly balanced. The wine is already throwing some sediment, and more than four years of bottle age are mellowing the wine's edges, with the primary blackberry and cherry flavors developing secondary characteristics. Should begin to peak around 2016 and drink well for another decade.
Points: 94
Price: 75.0
Province: California
Region: Diamond Mountain District
Variety: Cabernet Sauvignon
Winery: Dyer
Wine number in documents: 109538



Score:  3


Query: premium cherry wine

Wine:
Country: Italy
Description: This wine's fragrance evokes underbrush, toast, chopped celery and prune. The tight, angular palate offers dried cherry, star anise, espresso and sage notes alongside bracing tannins that clench the finish.
Points: 85
Price: 25.0
Province: Piedmont
Region: Barbaresco
Variety: Nebbiolo
Winery: Cantina Terre del Barolo
Wine number in documents: 10812



Score:  2


Query: premium cherry wine

Wine:
Country: Argentina
Description: This might be the only commercial Trousseau from Argentina, and it's rusty in color, with translucence. Savory challenging aromas of orange peel and burnt orange include notes of dried cherry and fallen leaves. A tightly knit palate is home to flavors of dried cherry and plum as well as tomato sauce. A hint of maple vies with dried red-fruit flavors on a light finish that shows some elegance.
Points: 88
Price: 60.0
Province: Other
Region: Patagonia
Variety: Trousseau
Winery: Aniello
Wine number in documents: 80158



Score:  3


Query: premium cherry wine

Wine:
Country: Croatia
Description: A kaleidescope of scents and flavors, Korta Katarina's premiere bottling is a fascinating study of Plavac Mali at its best. Sweet on the nose and palate with hints of dark chocolate, café au lait, dried herbs and preserved fruit, it's a richly textured wine with a beautifully feminine profile. Big, bold tannins on the finish smooth out considerably with aeration.
Points: 91
Price: 57.0
Province: Peljesac
Region: Peljesac_Region
Variety: Plavac Mali
Winery: Korta Katarina
Wine number in documents: 77733



Score:  1


Query: premium cherry wine

Wine:
Country: Italy
Description: Powerful and intense in delivery, this Pecorino-based white delivers immediate aromas of mature fruit, maple syrup and chestnut honey. It has firm texture and medium density and would wash down well with white meat or pork.
Points: 88
Price: 55.0
Province: Central Italy
Region: Abruzzo
Variety: Pecorino
Winery: Cataldi Madonna
Wine number in documents: 47932



Score:  1


Query: premium cherry wine

Wine:
Country: Italy
Description: This salmon-colored wine opens with aromas of red berry and a whiff of Mediterranean herb. The easygoing palate offers wild cherry and a hint of cooking spice alongside bright acidity.
Points: 85
Price: 20.0
Province: Southern Italy
Region: Salento
Variety: Negroamaro
Winery: Mocavero
Wine number in documents: 9633



Score:  2


Query: premium cherry wine

Wine:
Country: Austria
Description: From the far western vineyards of the Wachau, this crisp, light and fruity wine is a delicious expression of Grüner's pepper and apple notes. With great freshness under the spice, it has refreshing crispness. Drink from 2014. Screwcap.
Points: 87
Price: 22.5
Province: Wachau
Region: Wachau_Region
Variety: Grüner Veltliner
Winery: Gritsch Mauritiushof
Wine number in documents: 54457



Score:  0


Query: premium cherry wine

Wine:
Country: France
Description: A rounded wine, this has an attractive red-jelly flavor and light acidity. It is full and rich, with fine acids on the finish. Keep for six months.
Points: 86
Price: 20.0
Province: Beaujolais
Region: Côte de Brouilly
Variety: Gamay
Winery: Trenel Fils
Wine number in documents: 12479



Score:  0


Query: premium cherry wine

Wine:
Country: France
Description: Full bodied and thickly concentrated, this is almost too much of a good thing, with nearly overblown orange, honey and truffle notes coating the palate. Shows touches of warmth and bitterness on the finish. Drink over the next few years.
Points: 89
Price: 92.0
Province: Rhône Valley
Region: Hermitage
Variety: Marsanne
Winery: M. Chapoutier
Wine number in documents: 86158



Score:  1


Query: premium cherry wine

Wine:
Country: England
Description: A touch of blossom honey enriches the tart but ripe apple notes of the nose. On the palate that honeyed touch turns into lemony, golden shortbread while the tart apple notes are joined by fresh, ripe lemon. Thousands of tiny, pin-prick bubbles make for a creamy, elegant mousse. The contrast between honeyed, autolytic richness and bright, pure citrus is a joy. This wine is exciting and elegant, drawing you to its mellower, richer core. The purity on the finish is luminous and long.
Points: 94
Price: 56.0
Province: England
Region: England_Region
Variety: Chardonnay
Winery: Hoffmann & Rathbone
Wine number in documents: 84241



Score:  1


Query: premium cherry wine

Wine:
Country: Italy
Description: Made with Malvasia grapes from the beautiful and rustic island of Salina (off the northern coast of Sicily), Capofaro's luscious dessert wine is richly redolent of honey, dried banana, apricot and dried flowers. This is a gorgeous wine with syrupy but smooth density and tangy candied fruit flavors.
Points: 90
Price: 50.0
Province: Sicily & Sardinia
Region: Salina
Variety: Malvasia
Winery: Tasca d'Almerita
Wine number in documents: 23874



Score:  0


Query: premium cherry wine

Wine:
Country: US
Description: The aromas leap out of the glass, with notes of potpourri, dried orange peel and honey. It drinks sweet, with viscous-feeling flower and marmalade flavors, with white flowers lingering on the finish.
Points: 89
Price: 14.0
Province: Washington
Region: Yakima Valley
Variety: Riesling
Winery: Frost Bitten
Wine number in documents: 101848



Score:  0


Query: premium cherry wine

Wine:
Country: Portugal
Description: This wood-aged wine has juicy black fruits that are given shape by the firm tannins and dense, concentrated structure. Initially austere and packed with acidity, it needs time to develop. Drink this powerful wine from 2016.
Points: 91
Price: 34.0
Province: Alentejano
Region: Alentejano_Region
Variety: Portuguese Red
Winery: Herdade Grande
Wine number in documents: 80452



Score:  0


Query: premium cherry wine

Wine:
Country: Croatia
Description: Cranberry, pomegranate and a touch of bramble are found in the nose. The palate reveals fresh fruit flavors of sour red cherry, cranberry and red plum with a pleasant acidic backbone. Drink now or hold through 2016.
Points: 90
Price: 57.0
Province: Peljesac
Region: Peljesac_Region
Variety: Plavac Mali
Winery: Miloš
Wine number in documents: 24208



Score:  3


Query: premium cherry wine

Wine:
Country: Italy
Description: There's no other wine like this in the world, and it won't appeal to everyone. Anfora is matured in ceramic amphorae for seven months in a throwback to ancient winemaking techniques to achieve a deep golden color and penetrating aromas of resin, caramel, chestnut honey and cola. The firmstructure could match that of a mature red and the wine imparts resinous, almost syrupy density.
Points: 93
Price: 120.0
Province: Northeastern Italy
Region: Venezia Giulia
Variety: Ribolla Gialla
Winery: Gravner
Wine number in documents: 58235



Score:  1


Query: premium cherry wine

Wine:
Country: US
Description: Although plummy fruit dominates the nose, if you take an extra moment or two you can ferret out a lot more: mint, leather and vanilla, for example. The flavors feature a wintergreen component that adds a welcome degree of individuality to this offering, before fading into a medium-length finish.
Points: 87
Price: 35.0
Province: California
Region: Carneros
Variety: Pinot Noir
Winery: Mayo
Wine number in documents: 10195



Score:  0


Query: premium cherry wine

Wine:
Country: Germany
Description: A stunner amidst an increasingly competitive set of German Pinot Noir, this lavishly berried, rich wine is full of nuance and complexity. Deep, ripe black-fruit flavors are accented by rose petals and game. Bristling acidity and taut, grippy tannins lend structure and depth. It drinks beautifully now, but is bound to improve through 2021 and beyond.
Points: 95
Price: 50.0
Province: Pfalz
Region: Pfalz_Region
Variety: Spätburgunder
Winery: Ökonomierat Rebholz
Wine number in documents: 109445



Score:  1


Query: premium cherry wine

Wine:
Country: US
Description: Extremely dark fruit tones come from the nose of this wine, including blackberry and black cherry. It also offers touches of red fruit alongside snapped dark chocolate, licorice and a tarragon mintiness. Density and darkness consume the palate too, with blueberry fruit laced by dried basil, oregano, lavender and tar.
Points: 93
Price: 45.0
Province: California
Region: Paso Robles
Variety: Syrah
Winery: Brian Benson
Wine number in documents: 47853



Score:  2


Query: premium cherry wine

Wine:
Country: France
Description: Gorgeous, rich Pinot flavors float over soft, ripe tannins. Strawberry-fruit flavors offer freshness and bright acidity. The dense texture is very evident, layered with wonderfully perfumed flavors from wood and fruit.
Points: 94
Price: 50.0
Province: Burgundy
Region: Nuits-St.-Georges
Variety: Pinot Noir
Winery: Domaine Henri Gouges
Wine number in documents: 6734



Score:  1


Writing results to 'scores.json'
