In [13]:
import pandas as pd
import numpy as np

In [19]:
# Read every column as text, then discard the first column of the file.
wine_df = pd.read_csv(
    '../data/processed/wine_dataset_all.csv',
    dtype=str,
).iloc[:, 1:]

In [20]:
# Keep only the first row seen for each distinct description text.
wine_df = wine_df.drop_duplicates(subset='description', keep='first')

In [22]:
# Re-read the full dataset as text, drop rows that have no description and
# rows whose description repeats an earlier one, then sample a few to inspect.
df = (
    pd.read_csv('../data/processed/wine_dataset_all.csv', dtype=str)
    .dropna(subset=["description"])
    .drop_duplicates(subset=['description'])
)
# A fixed seed makes the sampled rows the same on every Restart & Run All.
wine_descriptions = df.sample(12, random_state=42)
wine_descriptions

Unnamed: 0.1,Unnamed: 0,country,description,wine_name,province,region,sub_region,grape,vineyard,title
9781,9781,Argentina,This overly ripe Malbec is packed with rooty s...,Padrillos,Mendoza Province,Mendoza,,Malbec,Siesta Wines,
66958,66958,US,It's all in your face in this modern-style Cha...,Sangiacomo Vineyard West Rows,California,Carneros,Napa-Sonoma,Chardonnay,Signaterra Benziger,
216122,65187,US,"A rich pink color and tangy, concentrated rasp...",Vintner's Collection,California,California,California Other,Rosé,Sterling,Sterling 2016 Vintner's Collection Rosé (Calif...
80388,80388,France,"Light colored and fruity, this is all about re...",,Burgundy,Gevrey-Chambertin,,Pinot Noir,Joseph Faiveley,
207632,56697,US,Subtle aromatics ride a steely frame of straig...,,California,Russian River Valley,Sonoma,Chardonnay,Bravium,Bravium 2015 Chardonnay (Russian River Valley)
194563,43628,Portugal,"Produced from the Espadeiro, a grape widely us...",Espadeiro Colheita Seleccionada,Vinho Verde,,,Rosé,Quinta de Gomariz,Quinta de Gomariz 2015 Espadeiro Colheita Sele...
121418,121418,US,"The Andretti style is fruit-driven and easy, a...",,California,Napa Valley,Napa,Cabernet Sauvignon,Andretti,
9148,9148,France,Earthy notes mix in with fresh Mirabelle plum ...,Collection,Alsace,Alsace,,Pinot Blanc,Cave de Ribeauvillé,
76431,76431,US,"A consistent winner, this single-vineyard Char...",The Big Sissy Conner Lee Vineyard,Washington,Columbia Valley (WA),Columbia Valley,Chardonnay,Gorman,
139283,139283,Chile,"Leather, earth and bold fruit on the nose, and...",Quatro Reserva,Colchagua Valley,,,Red Blend,MontGras,


In [193]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser

import string

# Load the saved gensim Phrases model from the file named 'ngrams'.
ngrams = Phrases.load('ngrams')
# Load the descriptor lookup table and index it by its 'raw descriptor' column,
# so rows can be looked up directly by raw token.
descriptor_map = pd.read_csv('descriptor_mapping.csv').set_index('raw descriptor')

def preprocess_description(description, ngram, descriptor_map, level=3):
    """Reduce a free-text description to a space-separated string of descriptors.

    The text is tokenized with ``tokenize_description``, the token list is
    passed through ``ngram`` (by indexing it), and every resulting item is
    looked up with ``map_descriptor`` at the requested ``level``. Items whose
    lookup returns ``None`` are dropped; the remaining values are converted
    with ``str`` and joined with single spaces.
    """
    merged_tokens = ngram[tokenize_description(description)]

    kept = []
    for token in merged_tokens:
        mapped = map_descriptor(token, descriptor_map, level)
        if mapped is not None:
            kept.append(str(mapped))

    return ' '.join(kept)

def tokenize_description(description):
    """Split a description into normalized tokens.

    Each token from ``word_tokenize`` is lower-cased, stemmed with the English
    Snowball stemmer, and stripped of ``string.punctuation`` characters. A
    token is kept only if, after those steps, it is longer than one character
    and is not in NLTK's English stop-word list (the stop-word check is applied
    to the stemmed form).

    Returns the kept tokens as a list, in their original order.
    """
    english_stops = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)
    snowball = SnowballStemmer('english')

    cleaned_tokens = (
        snowball.stem(str(raw).lower()).translate(strip_punctuation)
        for raw in word_tokenize(description)
    )
    return [
        token
        for token in cleaned_tokens
        if len(token) > 1 and token not in english_stops
    ]

def map_descriptor(word, mapping, level=3):
    """Look up ``word`` in the descriptor table and return its mapped value.

    Parameters
    ----------
    word
        Token to look up; it is matched against ``mapping.index``.
    mapping
        pandas DataFrame indexed by raw descriptor, with ``level_<n>`` columns.
    level
        Which ``level_<n>`` column to read (default 3).

    Returns
    -------
    The value stored in column ``level_<level>`` for ``word``, or ``None`` when
    ``word`` is not present in the index.
    """
    # Test membership on the index itself. Building list(mapping.index) for
    # every token copied the whole index and scanned it linearly each time.
    if word in mapping.index:
        return mapping[f'level_{level}'][word]
    return None

In [194]:
# Take the description stored under row label 0 and print it in full.
test_desc = wine_df.loc[0, 'description']
print(test_desc)

This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.


In [197]:
# Run the sample description through the pipeline using the default level (3).
preprocess_description(test_desc, ngrams, descriptor_map)

'oak juicy cherry fruit caramel elegant mint'

In [11]:
import pickle

def load_tf_idf_weights(pkl):
    """Load a pickled TF-IDF vectorizer and return a ``{term: idf weight}`` dict.

    Parameters
    ----------
    pkl
        Path of the pickle file to read.

    Returns
    -------
    dict
        Maps each feature name reported by the unpickled object to the entry
        at the same position in its ``idf_`` attribute.
    """
    # NOTE: unpickling can execute code stored in the file; only load pickles
    # that come from a trusted source.
    # The context manager closes the file; the bare open() call never did.
    with open(pkl, "rb") as handle:
        tf_idf = pickle.load(handle)

    # scikit-learn 1.2 removed get_feature_names() in favour of
    # get_feature_names_out(); prefer the new name and fall back to the old
    # one so vectorizers from older installations still work.
    feature_names = getattr(tf_idf, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = tf_idf.get_feature_names

    return dict(zip(feature_names(), tf_idf.idf_))

# Build the term -> IDF weight lookup from the pickled vectorizer file.
tfidf_weightings = load_tf_idf_weights("vectorizer.pickle")

In [12]:
from gensim.models import Word2Vec

def load_model(model):
    """Return the object produced by ``Word2Vec.load`` for the path ``model``."""
    loaded = Word2Vec.load(model)
    return loaded

# Load the saved embedding model; order_12_wines reads this notebook-level name.
word2vec = load_model("wine_word2vec_model.bin")

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def order_12_wines(list_of_descriptors, wine_df, tfidf_weightings, descriptor_mapping=None):
    """Score each wine in ``wine_df`` by cosine similarity to the requested descriptors.

    Every usable descriptor is turned into its word2vec vector scaled by its
    TF-IDF weight, and the scaled vectors are summed into one query vector.
    Each row's ``normalized_descriptors`` text is split into terms; the terms
    present in ``tfidf_weightings`` are embedded the same way and averaged,
    and that average is compared with the query vector.

    Despite the name, the rows are neither sorted nor truncated.

    Parameters
    ----------
    list_of_descriptors
        Iterable of descriptor strings supplied by the caller.
    wine_df
        pandas DataFrame with a ``normalized_descriptors`` column holding the
        text form of a list, e.g. ``"['oak', 'cherry']"``.
    tfidf_weightings
        Mapping from term to TF-IDF weight.
    descriptor_mapping
        Optional DataFrame indexed by raw descriptor, used to translate a
        descriptor that has no TF-IDF weight. When omitted, the notebook-level
        ``descriptor_map`` table is used.

    Returns
    -------
    The same ``wine_df`` object, modified in place with two added columns:
    ``review_vector`` (a 1 x N array, or ``None`` when no term of the row could
    be weighted) and ``cosine_similarities`` (a float, NaN for such rows).

    Raises
    ------
    ValueError
        If none of the requested descriptors could be weighted.
    """

    def _weighted_vector(term):
        # reshape(1, -1) yields a single row whatever the embedding width is,
        # instead of assuming exactly 300 dimensions.
        return tfidf_weightings[term] * word2vec.wv.get_vector(term).reshape(1, -1)

    query_vectors = []
    for term in list_of_descriptors:
        if term not in tfidf_weightings:
            # The original body read an undefined global `descriptor_mapping`.
            # The table is resolved lazily, only when a translation is needed.
            # NOTE(review): presumably `descriptor_map` is the intended table - confirm.
            mapping = descriptor_map if descriptor_mapping is None else descriptor_mapping
            if term not in mapping.index:
                print('choose a different descriptor from', term)
                continue
            # NOTE(review): assumes the table has a 'normalized' column - confirm.
            term = mapping['normalized'][term]
            if term not in tfidf_weightings:
                # The translated term may still have no TF-IDF weight.
                print('choose a different descriptor from', term)
                continue
        query_vectors.append(_weighted_vector(term))

    if not query_vectors:
        # sum([]) would be the integer 0, which cannot be compared with a vector.
        raise ValueError('none of the requested descriptors could be weighted')
    descriptors_vector = sum(query_vectors)

    wine_review_vectors = []
    cosine_similarities = []
    for terms in wine_df['normalized_descriptors']:
        # Trim the leading "['" and trailing "']" and split on "', '".
        # Non-string cells (e.g. missing values) contribute no terms.
        review_terms = terms[2:-2].split("', '") if isinstance(terms, str) else []
        weighted = [_weighted_vector(t) for t in review_terms if t in tfidf_weightings]

        if weighted:
            review_vector = sum(weighted) / len(weighted)
            # cosine_similarity returns a 1 x 1 array; keep just the number.
            similarity = float(cosine_similarity(descriptors_vector, review_vector)[0, 0])
        else:
            # A row with no weighted terms has no vector to compare against.
            review_vector = None
            similarity = float('nan')

        wine_review_vectors.append(review_vector)
        cosine_similarities.append(similarity)

    wine_df['review_vector'] = wine_review_vectors
    wine_df['cosine_similarities'] = cosine_similarities

    return wine_df
    
# Read the sample wines with every column as text.
example_df = pd.read_csv('wine_sample.csv', dtype=str)
descriptors = [
    'savory', 'rich', 'round', 'shimmer',
    'spice', 'blackberry', 'oak', 'fruit',
]
example_df.head()

# Score the sample against the first two descriptors only.
order_12_wines(descriptors[:2], example_df, tfidf_weightings)

NameError: name 'pd' is not defined

In [6]:
# Show the sample DataFrame as this cell's output.
example_df

Unnamed: 0,country,description,wine_name,province,region,sub_region,grape,vineyard,title,normalized_descriptors,description_vector,descriptor_count
0,US,"Exuberant in red fruit, this is a softly textu...",Nance's Vineyard,California,Alexander Valley,Sonoma,Zinfandel,Ousterhout,,"['exuberant', 'fruit', 'soft', 'complex', 'ber...",[[ 0.44267905 -4.394822 -4.396351 -0.08030...,11
1,US,Subdued red fruit is kept fresh by moderate ac...,Benchland Series,California,Napa Valley,Napa,Malbec,William Hill Estate,William Hill Estate 2013 Benchland Series Malb...,"['fruit', 'fresh', 'full_bodied', 'dense', 'gr...",[[ 7.56087363e-01 -5.52146387e+00 -2.48302293e...,13
2,Austria,Beautifully poised notes of ripe red and black...,Burggarten Reserve,Niederösterreich,,,Zweigelt,R&A; Pfaffl,R&A; Pfaffl 2013 Burggarten Reserve Zweigelt (...,"['ripe', 'cherry', 'pepper', 'spice', 'shimmer...",[[-1.2163753e-01 -5.8665137e+00 -3.2175047e+00...,9
3,Portugal,"Tightly coiled, the firm tannins hold down the...",Duas Quintas Reserva,Douro,,,Portuguese Red,Ramos-Pinto,Ramos-Pinto 2004 Duas Quintas Reserva Red (Douro),"['firm', 'exuberant', 'berry', 'fruit', 'fresh...",[[ 8.3381760e-01 -2.7187488e+00 -2.3070815e+00...,10
4,South Africa,"This wine has an exuberant, juicy quality to i...",,Stellenbosch,,,Syrah,Rudi Schultz,Rudi Schultz 2004 Syrah (Stellenbosch),"['exuberant', 'juicy', 'bright', 'berry', 'spi...",[[ 0.38180396 -3.9987466 -4.896322 0.85400...,9
5,Spain,"Solid on the nose, with pure berry aromas matc...",Crianza,Northern Spain,Ribera del Duero,,Tinto del Pais,Viña Arnáiz,,"['berry', 'vanilla', 'fresh', 'medium_bodied',...",[[-1.03987205e+00 -4.75814629e+00 -3.54387379e...,11
6,Austria,Sonorous oak notes in vain seek to calm down t...,Kalkofen,Burgenland,,,Blaufränkisch,Weninger,Weninger 2013 Kalkofen Blaufränkisch (Burgenland),"['oak', 'pepper', 'exuberant', 'blueberry', 's...",[[ 1.4493425e+00 -5.0995746e+00 -3.5631576e+00...,10
7,South Africa,Savory aromas of balsamic vinegar paired with ...,,Western Cape,,,Shiraz,Allée Bleue,,"['savory', 'pepper', 'cherry', 'exuberant', 'j...",[[ 9.16855931e-01 -4.30654907e+00 -5.27289248e...,8
8,Australia,"Smells dry and dusty, like a midwestern countr...",Thomas,New South Wales,Hunter Valley,,Shiraz,Macquariedale,Macquariedale 1999 Thomas Shiraz (Hunter Valley),"['dry', 'dust', 'fruit', 'bright', 'berry', 'c...",[[ 2.57434011e-01 -5.61576271e+00 -3.47763491e...,9
9,Australia,"From a single vineyard in the Ebenezer region,...",Amon-Ra Unfiltered,South Australia,Barossa Valley,,Shiraz,Glaetzer,,"['rich', 'blackberry', 'fruit', 'fresh', 'juic...",[[ 3.62765074e-01 -4.60845900e+00 -3.48230505e...,11


In [27]:

import requests
import json

# Loopback address, port 8000.
url = "http://127.0.0.1:8000/"

# JSON-encoded request body.
payload = json.dumps({
  "data": "hello"
})
headers = {
  'Content-Type': 'application/json'
}

# requests applies no timeout by default, so an unresponsive server would hang
# this cell indefinitely; give up after 10 seconds instead.
response = requests.request("POST", url, headers=headers, data=payload, timeout=10)

print(response.text)


{
  "descriptor_list": [
    "cedar", 
    "plum", 
    "tart", 
    "smooth", 
    "fizz", 
    "gravel", 
    "depth", 
    "stone", 
    "leafy", 
    "herb", 
    "pear", 
    "refreshing", 
    "strawberry", 
    "fruit", 
    "lime", 
    "thick", 
    "buoyant", 
    "flower", 
    "supple", 
    "white", 
    "melon", 
    "tight", 
    "peach", 
    "lemon", 
    "graphite", 
    "rustic", 
    "hot", 
    "minerality", 
    "full_bodied", 
    "wood", 
    "pungent", 
    "juicy", 
    "dense", 
    "warm", 
    "clean", 
    "earth", 
    "bright", 
    "heavy", 
    "eucalyptus", 
    "rich", 
    "ripe", 
    "light_bodied", 
    "modest", 
    "coffee", 
    "french_oak", 
    "fresh", 
    "firm", 
    "cherry", 
    "apple", 
    "spice", 
    "dry", 
    "leather", 
    "complex", 
    "cardamom", 
    "crisp", 
    "raspberry", 
    "dark", 
    "currant"
  ], 
  "descriptor_vectors": [
    [
      [
        -1.237648606300354, 
        -1.5445647239685059, 
        -