## Data structures

In [26]:
import re
import pandas as pd

# Cleaned wine dataset; every lookup list below is derived from its columns.
df_cleaned = pd.read_csv('../models/data/cleaned_wine_data.csv')


def _distinct(column):
    """Return the distinct values of one df_cleaned column as a plain list."""
    return df_cleaned[column].unique().tolist()


def _underscored(names):
    """Replace every space and hyphen in each name with an underscore."""
    return [re.sub(r'[ -]', '_', name) for name in names]


# Countries are printed so the keyword table further down can be checked against them.
unique_countries = _distinct('country')
print(unique_countries)

# For varieties, regions and wineries keep both the raw names and the
# underscore-normalized form that query n-grams are compared with.
unique_varieties = _distinct('variety')
normalized_unique_varieties = _underscored(unique_varieties)

unique_regions = _distinct('region_1')
normalized_unique_regions = _underscored(unique_regions)

unique_wineries = _distinct('winery')
normalized_unique_wineries = _underscored(unique_wineries)

# Price category -> query keywords that select it.
# Two-word keywords are written with an underscore (e.g. 'mid_range').
price_keywords = {
    'budget': ['budget', 'inexpensive', 'cheap'],
    'mid_range': ['mid_range', 'mid_priced', 'affordable'],
    'premium': ['premium', 'luxury', 'fine', 'expensive']
}

# Commented-out reference: how a numeric price maps onto the categories above.
# def price_category(price):
#      if price < 25: return 'budget'
#      elif price < 50: return 'mid_range' 
#      else: return 'premium'


# Flavor category -> query keywords that select it.
# Copied from the data-processing step.
flavor_keywords = {
    'fruit': ['berry', 'cherry', 'apple', 'citrus', 'tropical', 'fruit', 'blackberry', 'raspberry', 'fruity'],
    'dry': ['dry', 'crisp', 'tannic', 'tannins', 'tannin'],
    'sweet': ['sweet', 'honey', 'ripe', 'jam'],
    'oak': ['oak', 'vanilla', 'toast', 'cedar', 'oaky', 'toasty'],
    'spice': ['spice', 'pepper', 'cinnamon', 'clove'],
    'herbal': ['herbal', 'grass', 'mineral', 'earth', 'earthy', 'herbs', 'grassy']
}

# dictionary created with ChatGPT
# Country -> query keywords (country name, demonyms, common aliases and misspellings).
# Keys are the lower-cased country names found in the dataset; 'Unknown_Country'
# deliberately has no entry. Multi-word keys and keywords are written with spaces.
country_keywords = {
    'italy': ['italy', 'italian'],
    'portugal': ['portugal', 'portuguese', 'portugese'],
    'us': ['us', 'usa', 'united states', 'united states of america'],
    'spain': ['spain', 'spanish'],
    'france': ['france', 'french'],
    'germany': ['germany', 'german'],
    'argentina': ['argentina', 'argentinian'],
    'chile': ['chile', 'chilean'],
    'australia': ['australia', 'australian'],
    'austria': ['austria', 'austrian'],
    'south africa': ['south africa', 'south african'],
    'new zealand': ['new zealand', 'kiwi'],
    'israel': ['israel', 'israeli'],
    'hungary': ['hungary', 'hungarian'],
    'greece': ['greece', 'greek'],
    'romania': ['romania', 'romanian'],
    'mexico': ['mexico', 'mexican'],
    'canada': ['canada', 'canadian'],
    'turkey': ['turkey', 'turkish', 'turkiye'],
    'czech republic': ['czech republic', 'czech', 'czechia'],
    'slovenia': ['slovenia', 'slovenian'],
    'luxembourg': ['luxembourg', 'luxembourger', 'luxembourgian'],
    'croatia': ['croatia', 'croatian'],
    'georgia': ['georgia', 'georgian'],
    'uruguay': ['uruguay', 'uruguayan'],
    'england': ['england', 'english', 'uk', 'united kingdom', 'britain', 'british'],
    'lebanon': ['lebanon', 'lebanese'],
    'serbia': ['serbia', 'serbian'],
    'brazil': ['brazil', 'brazilian'],
    'moldova': ['moldova', 'moldovan'],
    'morocco': ['morocco', 'moroccan'],
    'peru': ['peru', 'peruvian'],
    'india': ['india', 'indian'],
    'bulgaria': ['bulgaria', 'bulgarian'],
    'cyprus': ['cyprus', 'cypriot'],
    'armenia': ['armenia', 'armenian'],
    'switzerland': ['switzerland', 'swiss'],
    'bosnia and herzegovina': ['bosnia and herzegovina', 'bosnia', 'bosnian'],
    'ukraine': ['ukraine', 'ukrainian'],
    'slovakia': ['slovakia', 'slovak'],
    'macedonia': ['macedonia', 'north macedonia', 'macedonian'],
    'china': ['china', 'chinese'],
    'egypt': ['egypt', 'egyptian']
}
# end of content created with ChatGPT


['Italy', 'Portugal', 'US', 'Spain', 'France', 'Germany', 'Argentina', 'Chile', 'Australia', 'Austria', 'South Africa', 'New Zealand', 'Israel', 'Hungary', 'Greece', 'Romania', 'Mexico', 'Canada', 'Unknown_Country', 'Turkey', 'Czech Republic', 'Slovenia', 'Luxembourg', 'Croatia', 'Georgia', 'Uruguay', 'England', 'Lebanon', 'Serbia', 'Brazil', 'Moldova', 'Morocco', 'Peru', 'India', 'Bulgaria', 'Cyprus', 'Armenia', 'Switzerland', 'Bosnia and Herzegovina', 'Ukraine', 'Slovakia', 'Macedonia', 'China', 'Egypt']


## Baseline 

In [28]:
import pickle
import os
from sklearn.metrics.pairwise import cosine_similarity

# baseline: tf-idf and cosine similarity

# TODO: support numbers for price input

# find known words in user query and transform to features
def process_query(query):
    """Turn a free-text query into feature tokens appended to ``query_features``.

    The query is lower-cased, hyphens are treated as spaces, and every
    contiguous run of query words is joined with ``_`` so that it can be
    compared with the keyword tables and with the normalized
    variety/region/winery names.

    One token is appended per occurrence in the query:
      * ``country_<key>``  for keywords in ``country_keywords``
      * ``flavor_<key>``   for keywords in ``flavor_keywords``
      * ``pricecat_<key>`` for keywords in ``price_keywords``
      * ``variety_<name>``, ``region_<name>``, ``winery_<name>`` for the
        lower-cased entries of the ``normalized_unique_*`` lists

    Args:
        query: The user's query string.

    Returns:
        None. Tokens are appended to the module-level list
        ``query_features``, which must exist before this function is called.
    """
    words = query.replace('-', ' ').lower().split()

    # Count every contiguous word n-gram of the query (all lengths), joined
    # with '_'. Looking names up in this table replaces the separate
    # word / pair / triple scans and also reaches names longer than 3 words.
    ngram_counts = {}
    for size in range(1, len(words) + 1):
        for start in range(len(words) - size + 1):
            ngram = '_'.join(words[start:start + size])
            ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1

    def add_matches(feature, target):
        # One token per occurrence of `target` in the query.
        query_features.extend([feature] * ngram_counts.get(target, 0))

    for country, keywords in country_keywords.items():
        for keyword in keywords:
            # Multi-word keywords ('new zealand', 'south africa', ...) are
            # stored with spaces; compared against single words they could
            # never match, so bring them into the '_'-joined n-gram form.
            # NOTE(review): the emitted name keeps the dictionary key as is
            # (it may contain a space) - confirm against the vectorizer vocabulary.
            add_matches('country_' + country, keyword.replace(' ', '_'))

    for flavor, keywords in flavor_keywords.items():
        for keyword in keywords:
            add_matches('flavor_' + flavor, keyword)

    for price, keywords in price_keywords.items():
        for keyword in keywords:
            # Price keywords already use '_' for two-word phrases.
            add_matches('pricecat_' + price, keyword)

    name_tables = (
        ('variety_', normalized_unique_varieties),
        ('region_', normalized_unique_regions),
        ('winery_', normalized_unique_wineries),
    )
    for prefix, names in name_tables:
        for name in names:
            target = name.lower()
            add_matches(prefix + target, target)


# Query under evaluation (copy in query from file).
query = "new zealand Sauvignon Blanc"

# process_query() appends to this module-level list, so reset it first.
query_features = []
process_query(query)
print(f"Query: {query}")
# print(f"Query Features: {query_features}\n")


# Load the fitted vectorizer and the document feature matrix.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open('../models/data/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open('../models/data/feature_vectors.pkl', 'rb') as vectors_file:
    feature_vectors = pickle.load(vectors_file)

# tf-idf of query: the recognised features are joined into one "document".
query_string = " ".join(query_features)
query_tfidf = vectorizer.transform([query_string])

# cosine similarity of the query against every document, best match first
similarity_matrix = cosine_similarity(query_tfidf, feature_vectors)
cosine_similarities = similarity_matrix[0]
recommended_indices = cosine_similarities.argsort()[::-1]


# list to collect results from model + baseline(s)
result_list = []

# Keep the 10 best-ranked documents (fewer if the corpus is smaller).
# "score" stays None until a human rates the wine in the scoring step.
# NOTE(review): the id is the row position in feature_vectors; this assumes it
# coincides with df_cleaned['Unnamed: 0'] - confirm against the data-processing step.
for rank, wine_id in enumerate(recommended_indices[:10], start=1):
    result_list.append({"id": int(wine_id), "model": "tf-idf_baseline", "rank": rank, "score": None})


print(f"Total of {len(result_list)} Results generated!")


Query: new zealand Sauvignon Blanc
Total of 10 Results generated!


## Random Baseline (temporary)

In [29]:
# baseline 2 - random
import random

wine_rows = df_cleaned.shape[0]

# Draw from the ids that actually exist in the dataset. random.randint(0, wine_rows)
# includes its upper bound and assumes ids are contiguous from 0, so it could
# return an id with no matching row, which crashes the scoring step later.
wine_ids = df_cleaned['Unnamed: 0'].tolist()

print(f"Query: {query}")
# print(f"Query Features: {query_features}\n")

# pick 10 random results (with replacement; duplicates are handled during scoring)
with pd.option_context('display.max_colwidth', None):
    for i in range(10):
        random_index = int(random.choice(wine_ids))
        # only needed by the debug print below
        wine = df_cleaned[df_cleaned['Unnamed: 0'] == random_index]
        result_list.append({"id": random_index, "model": "random_baseline", "rank": i + 1, "score": None})
        #print(
        #     f"Wine #{i+1}:\n"
        #     f"Country: {wine['country'].values[0]}\n"
        #     f"Description: {wine['description'].values[0]}\n"
        #     f"Points: {wine['points'].values[0]}\n"
        #     f"Price: {wine['price'].values[0]}\n"
        #     f"Province: {wine['province'].values[0]}\n"
        #     f"Region: {wine['region_1'].values[0]}\n"
        #     f"Variety: {wine['variety'].values[0]}\n"
        #     f"Winery: {wine['winery'].values[0]}\n"
        #     f"Wine number in documents: {random_index}\n" # debug info
        #     f"Wine features: {wine['enhanced_features'].values[0]}\n" # debug info
        #     "\n"
        # )

print(f"Total of {len(result_list)} Results generated!")

Query: new zealand Sauvignon Blanc
Total of 20 Results generated!


## Scoring

In [30]:
import json
import os

# format: {"id": random_index, "model": "random_baseline", "rank": i + 1, "score": None})

def write_json(data, json_file_path='scores.json'):
    """Append one query record to the JSON score file, creating it if needed.

    The file holds a single object of the form ``{"queries": [...]}``.

    Args:
        data: JSON-serializable record for one scored query.
        json_file_path: Target file; defaults to ``'scores.json'`` in the
            current working directory.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON. The
            file is left untouched in that case.
        TypeError: If ``data`` cannot be serialized. The file is left
            untouched in that case.
    """
    # Read what is already there; a missing file simply starts a new list.
    try:
        with open(json_file_path, 'r') as file:
            file_data = json.load(file)
    except FileNotFoundError:
        file_data = {"queries": []}

    file_data.setdefault("queries", []).append(data)

    # Serialize before opening for writing, so a serialization error cannot
    # leave a truncated or half-overwritten file behind.
    serialized = json.dumps(file_data, indent=4)
    with open(json_file_path, 'w') as file:
        file.write(serialized)


def score():
    """Interactively collect a 0-3 relevance score for each shuffled result.

    Iterates over the module-level ``result_list_randomized``, prints the
    wine behind each entry and stores the entered score in the entry's
    ``"score"`` field. A wine that occurs more than once is shown only
    once; later occurrences reuse the score already given.

    Returns:
        True when every entry has been processed, False when the user typed
        "exit" (the caller must then not persist anything).
    """
    scores_by_id = {}  # wine id -> score already assigned during this run

    for result in result_list_randomized:
        result_id = result["id"]

        # check if this wine was already checked, if yes copy score and skip
        if result_id in scores_by_id:
            result["score"] = scores_by_id[result_id]
            continue

        wine = df_cleaned[df_cleaned['Unnamed: 0'] == result_id]
        if wine.empty:
            # An unknown id would otherwise raise IndexError below and throw
            # away every score entered so far; leave it unscored instead.
            print(f"Wine {result_id} not found in dataset, leaving it unscored")
            result["score"] = None
            scores_by_id[result_id] = None
            continue

        print(
            "=============================================================\n"
            f"Query: {query}\n\n"
            f"Wine:\n"
            f"Country: {wine['country'].values[0]}\n"
            f"Description: {wine['description'].values[0]}\n"
            f"Points: {wine['points'].values[0]}\n"
            f"Price: {wine['price'].values[0]}\n"
            f"Province: {wine['province'].values[0]}\n"
            f"Region: {wine['region_1'].values[0]}\n"
            f"Variety: {wine['variety'].values[0]}\n"
            f"Winery: {wine['winery'].values[0]}\n"
            f"Wine number in documents: {result_id}\n" # debug
        )

        # Ask until the answer is a whole number from 0 to 3, or "exit".
        while True:
            answer = input("Score: ").strip()
            if answer.lower() == "exit":
                print("Scoring canceled, no scores will be written to file")
                return False
            # isdecimal() (unlike isdigit()) only accepts characters that
            # int() can parse, so input such as '²' re-prompts instead of crashing.
            if answer.isdecimal() and int(answer) <= 3:
                break

        # NOTE(review): the score is stored as the entered string, as before,
        # to keep the scores.json format stable - confirm whether consumers want an int.
        result["score"] = answer
        scores_by_id[result_id] = answer
    return True

def sort():
    """Split the shuffled results by model and order each group by rank.

    Reads the module-level ``result_list_randomized``. Entries whose model
    is neither ``"random_baseline"`` nor ``"tf-idf_baseline"`` are reported
    on stdout and left out.

    Returns:
        A tuple ``(random_baseline_results, tf_idf_baseline_results)``, each
        a new list sorted by ascending ``"rank"``.
    """
    per_model = {"random_baseline": [], "tf-idf_baseline": []}

    # sort by model
    for entry in result_list_randomized:
        model = entry["model"]
        bucket = per_model.get(model)
        if bucket is None:
            print(f"Model not found: '{model}'")
            continue
        bucket.append(entry)

    def by_rank(entry):
        return entry["rank"]

    return (
        sorted(per_model["random_baseline"], key=by_rank),
        sorted(per_model["tf-idf_baseline"], key=by_rank),
    )
    

# Shuffle a copy of the results so the presentation order does not bias the scores.
result_list_randomized = list(result_list)
random.shuffle(result_list_randomized)

# score() returns False when the user aborts; nothing is persisted in that case.
scoring_completed = score()
if scoring_completed:
    sorted_random_baseline, sorted_tf_idf_baseline = sort()

    print("Writing results to 'scores.json'")
    # TODO: replace with model once available
    # (until then the tf-idf baseline is stored under "model")
    query_record = {
        "query": query,
        "baseline": sorted_random_baseline,
        "model": sorted_tf_idf_baseline,
    }
    write_json(query_record)



Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: Good price for a zesty, sweet dessert wine. It's a nice way to finish dinner over something with white and yellow fruit, vanilla and custard ingredients. Shows rich, honeyed orange, apricot and honey flavors, with tingly acidity.
Points: 88
Price: 16.0
Province: California
Region: Lake County
Variety: Muscat
Winery: Bonterra
Wine number in documents: 76485



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: Juicy and fresh, this deeply colored wine offers lots of grapey, berry-like aromas and equally fruity and vivid flavors. It has a touch of sweetness and a soothing, smooth texture. The name refers to the winery's pledge to give 100% of profits to charity.
Points: 89
Price: 18.0
Province: California
Region: California
Variety: Red Blend
Winery: 100 Percent Wine
Wine number in documents: 122898



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: Uruguay
Description: Prickly aromas are short on fruit. This is a streamlined SB with spritz on the palate. Lean citrus flavors are briny and slightly bitter on the finish.
Points: 84
Price: 16.0
Province: Uruguay
Region: Uruguay_Region
Variety: Sauvignon Blanc
Winery: Garzón
Wine number in documents: 18989



Score:  2


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: This delightful, off-dry white is a blend of Pinot Gris, Gewürztraminer, Müller Thurgau, Sauvignon Blanc and Riesling,  perfectly matched to deliver crisp, fruity aromatics and flavors of apples, oranges and pears. It's a terrific anytime quaffer.
Points: 88
Price: 10.0
Province: Oregon
Region: Willamette Valley
Variety: White Blend
Winery: Oak Knoll
Wine number in documents: 93890



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: Intricate aromas and spicy, complex flavors make this medium-bodied wine an adventure to drink. It smells like cherries and cloves and wood smoke, tastes nicely ripe but dry, and has an appetizing, tannic grip in the texture.
Points: 89
Price: 16.0
Province: California
Region: Amador County
Variety: Barbera
Winery: Terra d'Oro
Wine number in documents: 41787



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: Italy
Description: Made from Merlot, Cabernet and Syrah, it has a soft but elegant palate. Fleshy black cherry, blackberry and spicy blueberry are accented with notes of white pepper and clove alongside smooth, silky tannins. Enjoy this soon.
Points: 91
Price: 30.0
Province: Tuscany
Region: Bolgheri
Variety: Red Blend
Winery: Le Macchiole
Wine number in documents: 36285



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: From a winery named after a summit train stop between Los Gatos and Santa Cruz comes aromas of red cherry, rhubarb and clove spice. Flavors of vanilla cookies, dried strawberries, black cherries and a spicy dollop of licorice power the tasty palate.
Points: 92
Price: 49.0
Province: California
Region: Santa Cruz Mountains
Variety: Pinot Noir
Winery: Wrights Station
Wine number in documents: 94170



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: Italy
Description: Aromas suggest coffee, vanilla, oak, savory herb and menthol. The palate offers espresso, roasted hazelnut, dried sage and sour cherry alongside green, astringent tannins. Drink 2018–2025.
Points: 88
Price: 75.0
Province: Piedmont
Region: Barolo
Variety: Nebbiolo
Winery: Amalia
Wine number in documents: 104955



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: The flavors run as deep as the dark garnet color in this full-bodied but sophisticated wine. Aromas trigger black cherry, subtle spicy oak and a walk in the forest, while the fruit flavors are ripe and delicious. The texture is like velvet—smooth but not slick. The complexity and inherent richness of flavor seem to increase with each sip, and it has firm acidity and tannins to cleanse the palate.
Points: 94
Price: 72.0
Province: California
Region: Anderson Valley
Variety: Pinot Noir
Winery: Donum
Wine number in documents: 118975



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: Slovenia
Description: Intensely green and herbal on the nose, this has penetrating flavors of gooseberry, lavender, mint and grass. This Sauvignon Blanc, which was aged for two years in French barrique, is textural, with its hints of green-tea-like tannin.
Points: 88
Price: 30.0
Province: Brda
Region: Brda_Region
Variety: Sauvignon
Winery: Movia
Wine number in documents: 62395



Score:  1


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: A huge wine, almost old-fashioned in its tannins. They're so strong, they freeze the palate with astringency, like a shot of Novacaine. It's also very dry, despite sweetly ripe blackberry essence, licorice and dark chocolate. All indications are that this is an ageworthy wine. Should begin to throw sediment and mellow in seven or eight years, and could go the distance to 2020 and beyond.
Points: 94
Price: 150.0
Province: California
Region: Howell Mountain
Variety: Cabernet Sauvignon
Winery: Notre Vin
Wine number in documents: 106857



Score:  1


Query: new zealand Sauvignon Blanc

Wine:
Country: Italy
Description: Prunotto's straight Barolo shows juicy, ripe cherry, white pepper and balsamic sensations, and just a hint of espresso. It's nicely balanced, although there's not a ton of complexity. Enjoy it now and over the next few years.
Points: 90
Price: 48.0
Province: Piedmont
Region: Barolo
Variety: Nebbiolo
Winery: Prunotto
Wine number in documents: 51271



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: Romania
Description: Aromas of banana, mango, and pineapple open to flavors of tropical fruit salad, mango and banana and a soft finish.
Points: 86
Price: 7.0
Province: Romania
Region: Romania_Region
Variety: Sauvignon Blanc
Winery: Cramele Recas
Wine number in documents: 62126



Score:  2


Query: new zealand Sauvignon Blanc

Wine:
Country: Chile
Description: Light and generally clean smelling, with standard green apple, melon and lime aromas. Feels a touch flat and pithy for Casablanca, with generic citrus and mild bitterness for flavors. Pithy and just juicy enough on the finish.
Points: 86
Price: 12.0
Province: Casablanca Valley
Region: Casablanca Valley_Region
Variety: Sauvignon Blanc
Winery: Chilensis
Wine number in documents: 35146



Score:  2


Query: new zealand Sauvignon Blanc

Wine:
Country: Czech Republic
Description: Aromas of caramel, toffee and lemon zest set the scene for flavors of peach and white pears with a touch of toffee. It is round in the mouth with a nice balance of fruit and caramel flavors on the finish.
Points: 87
Price: 15.0
Province: Moravia
Region: Moravia_Region
Variety: Sauvignon Blanc
Winery: Vino z Czech
Wine number in documents: 6114



Score:  2


Query: new zealand Sauvignon Blanc

Wine:
Country: Macedonia
Description: Snappy lemon, grapefruit and apples aromas lead this fresh white from Bovin. On the palate, grassy flavors with fresh white fruits are backed by a lively minerality. Clean and flavorful—a perky go-to white with food pairing potential.
Points: 88
Price: 15.0
Province: Tikves
Region: Tikves_Region
Variety: Sauvignon
Winery: Bovin
Wine number in documents: 25466



Score:  1


Query: new zealand Sauvignon Blanc

Wine:
Country: Peru
Description: Forest-floor aromas of mushroom and herbal berry and plum open this Peruvian Malbec that's tight and drawing on the palate. Baked plum, chocolate and pounding oak flavors finish heavy, with notes of prune and carob.
Points: 84
Price: 14.0
Province: Ica
Region: Ica_Region
Variety: Malbec
Winery: Intipalka
Wine number in documents: 110081



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: This is a beatifully balanced, not-too-full-bodied wine from vines grown at 2,400 feet in the Sierra range. It has classic black cherry, black olive and anise aromas, harmonious fruit flavors accented with light oak spiciness and a firm, fine-grained tannic structure.
Points: 90
Price: 28.0
Province: California
Region: Nevada County
Variety: Cabernet Franc
Winery: Sierra Starr
Wine number in documents: 68441



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: France
Description: A deliciously creamy wine, shining with apple and grapefruit flavors. The wine is light, bright, softly textured and hinting at wood aging, while displaying a more steely line of tight acidity at the end.
Points: 88
Price: 23.0
Province: Burgundy
Region: Bourgogne Hautes Côtes de Nuits
Variety: Chardonnay
Winery: Manuel Olivier
Wine number in documents: 97499



Score:  0


Query: new zealand Sauvignon Blanc

Wine:
Country: US
Description: Smooth and delicious, this has vibrant cherry and raspberry aromas, a mouthfilling, supple texture and generous fruit flavors. Aged in 35% new French oak, the wine seems to build and intensify on the palate and linger on the finish.
Points: 90
Price: 55.0
Province: California
Region: Sonoma Coast
Variety: Pinot Noir
Winery: Senses
Wine number in documents: 105476



Score:  0


Writing results to 'scores.json'
