In [8]:
import os
import pandas as pd
import numpy as np
import string
from operator import itemgetter
from collections import Counter, OrderedDict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

## Dataset

In [9]:
DATA = 'dataset/'

# # preprocess wine dataset 1
# cols = ['country', 'description', 'Wine name', 'province', 'Region', 'Sub region', 'Grape', 'Winery/ Vineyard']
# wine_df1 = pd.read_csv(DATA + 'Vyno - Wine dataset 1.csv', dtype=str, usecols=cols)
# wine_df1.columns = [title.lower() for title in wine_df1.columns]
# wine_df1 = wine_df1.rename(columns={
#     'wine name': 'wine_name',
#     'sub region': 'sub_region', 
#     'winery/ vineyard': 'vineyard'
# })

# # preprocess wine dataset 2
# wine_df2 = pd.read_csv(DATA + 'Vyno - Wine dataset 2.csv', dtype=str).iloc[: , 1:]
# wine_df2.columns = [title.lower() for title in wine_df2.columns]
# wine_df2 = wine_df2.rename(columns={
#     'wine name': 'wine_name',
#     'sub region': 'sub_region', 
#     'winery/ vineyard': 'vineyard'
# })
# cols2 = ['country', 'description', 'wine_name', 'province', 'region', 'sub_region', 'grape', 'vineyard', 'title']
# wine_df2 = wine_df2[cols2]

# # combine datasets into 1 dataframe
# wine_df = pd.concat([wine_df1, wine_df2], axis=0)

In [10]:
wine_df = pd.read_csv(DATA + 'wine_dataset_all.csv', dtype=str).iloc[: , 1:]

In [11]:
wine_df.head(5)

Unnamed: 0,country,description,wine_name,province,region,sub_region,grape,vineyard,title
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,Provence,Bandol,,Provence red blend,Domaine de la Bégude,


In [44]:
wine_df.shape

(280910, 13)

## Wine descriptions

In [13]:
descriptions_list = list(wine_df['description'])
descriptions_list = [str(r) for r in descriptions_list]
full_corpus = ' '.join(descriptions_list)
sentences_tokenized = sent_tokenize(full_corpus)

print(sentences_tokenized[:5])

['This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.', 'Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background.', 'Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance.', 'Enjoy 2022–2030.', 'Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla.']


In [14]:
print(sentences_tokenized[0])

This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.


In [15]:
stop_words = set(stopwords.words('english')) 

punctuation_table = str.maketrans({key: None for key in string.punctuation})
sno = SnowballStemmer('english')


def normalize_text(raw_text):
    try:
        word_list = word_tokenize(raw_text)
        normalized_sentence = []
        for w in word_list:
            try:
                w = str(w)
                lower_case_word = str.lower(w)
                stemmed_word = sno.stem(lower_case_word)
                no_punctuation = stemmed_word.translate(punctuation_table)
                if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                    normalized_sentence.append(no_punctuation)
            except:
                continue
        return normalized_sentence
    except:
        return ''

normalized_sentences = []
for s in sentences_tokenized:
    normalized_text = normalize_text(s)
    normalized_sentences.append(normalized_text)

In [16]:
normalized_sentences[0]

['tremend',
 '100',
 'variet',
 'wine',
 'hail',
 'oakvill',
 'age',
 'three',
 'year',
 'oak']

In [17]:
phrases = Phrases(normalized_sentences)
phrases = Phrases(phrases[normalized_sentences])

ngrams = Phraser(phrases)

phrased_sentences = []
for sent in normalized_sentences:
    phrased_sentence = ngrams[sent]
    phrased_sentences.append(phrased_sentence)

full_list_words = [item for sublist in phrased_sentences for item in sublist]

In [18]:
phrased_sentences[0]

['tremend',
 '100_variet',
 'wine',
 'hail',
 'oakvill',
 'age',
 'three_year',
 'oak']

In [19]:
word_counts = Counter(full_list_words)
sorted_counts = OrderedDict(word_counts.most_common(5000))
counter_df = pd.DataFrame.from_dict(sorted_counts, orient='index')
counter_df.to_csv('top_5000_descriptors.csv')

In [20]:
counter_df

Unnamed: 0,0
wine,177973
flavor,156421
fruit,127581
finish,85879
acid,79412
...,...
expos,60
tenuta,60
acrid,60
previous_year,60


In [21]:
descriptor_mapping = pd.read_csv('descriptor_mapping.csv').set_index('raw descriptor')
descriptor_mapping.head(10)

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abras,abrasive,high_tannin,tannin
acacia,acacia,flowery,flower
acacia_flower,acacia,flowery,flower
aciddriven,acid_driven,high_acid,acid
aggress,aggressive,high_acid,acid
airi,airy,light_bodied,body
allspic,allspice,baking_spices,spice
almond,almond,nutty,nutty
almond_past,almond,nutty,nutty
marzipan,almond,nutty,nutty


In [45]:
descriptor_mapping

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abras,abrasive,high_tannin,tannin
acacia,acacia,flowery,flower
acacia_flower,acacia,flowery,flower
aciddriven,acid_driven,high_acid,acid
aggress,aggressive,high_acid,acid
...,...,...,...
zest,zest,citrus_fruit,fruit
zesti,zesty,high_acid,acid
zing,zingy,high_acid,acid
zingi,zingy,high_acid,acid


In [22]:
def return_mapped_descriptor(word):
    if word in list(descriptor_mapping.index):
        normalized_word = descriptor_mapping['level_3'][word]
        return normalized_word
    else:
        return word

normalized_sentences = []
for sent in phrased_sentences:
    normalized_sentence = []
    for word in sent:
        normalized_word = return_mapped_descriptor(word)
        normalized_sentence.append(str(normalized_word))
    normalized_sentences.append(normalized_sentence)

In [23]:
normalized_sentences[1]

['juicy',
 'cherry',
 'fruit',
 'compel',
 'hint',
 'caramel',
 'greet',
 'palat',
 'frame',
 'elegant',
 'fine',
 'tannin',
 'subtl',
 'mint',
 'tone',
 'background']

In [22]:
# wine_word2vec_model = Word2Vec(normalized_sentences, vector_size=300, min_count=5, epochs=15)
# print(wine_word2vec_model)

# wine_word2vec_model.save('wine_word2vec_model.bin')

Word2Vec(vocab=25421, vector_size=300, alpha=0.025)


In [24]:
wine_word2vec_model = Word2Vec.load("wine_word2vec_model.bin")

In [25]:
wine_word2vec_model.wv.most_similar(positive='peach', topn=10)

[('peach_nectar', 0.6729379892349243),
 ('pear', 0.6664803624153137),
 ('honeydew', 0.6348575353622437),
 ('grapefruit', 0.6208798289299011),
 ('stone_fruit', 0.6191845536231995),
 ('apple', 0.6178895831108093),
 ('citrus', 0.6098344922065735),
 ('lime', 0.6090511083602905),
 ('kiwi', 0.6007109880447388),
 ('lemon', 0.5999903082847595)]

## From Word Embeddings to Wine Review Embeddings

In [26]:
wine_descriptions = list(wine_df['description'])

def return_descriptor_from_mapping(word):
    if word in list(descriptor_mapping.index):
        descriptor_to_return = descriptor_mapping['level_3'][word]
        return descriptor_to_return

descriptorized_descriptions = []
for desc in wine_descriptions:
    normalized_desc = normalize_text(desc)
    phrased_desc = ngrams[normalized_desc]
    descriptors_only = [return_descriptor_from_mapping(word) for word in phrased_desc]
    no_nones = [str(d) for d in descriptors_only if d is not None]
    descriptorized_desc = ' '.join(no_nones)
    descriptorized_descriptions.append(descriptorized_desc)

In [29]:
descriptorized_descriptions[1]

'ripe fig blackberry cassis oak chocolate vanilla rich chocolate fruit baked spice toast heady'

In [31]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit(descriptorized_descriptions)

dict_of_tfidf_weightings = dict(zip(X.get_feature_names(), X.idf_))

wine_description_vectors = []
for d in descriptorized_descriptions:
    descriptor_count = 0
    weighted_description_terms = []
    terms = d.split(' ')
    for term in terms:
        if term in dict_of_tfidf_weightings.keys():
            tfidf_weighting = dict_of_tfidf_weightings[term]
            word_vector = wine_word2vec_model.wv.get_vector(term).reshape(1, 300)
            weighted_word_vector = tfidf_weighting * word_vector
            weighted_description_terms.append(weighted_word_vector)
            descriptor_count += 1
        else:
            continue
    try:
        description_vector = sum(weighted_description_terms)/len(weighted_description_terms)
    except:
        description_vector = []
    vector_and_count = [terms, description_vector, descriptor_count]
    wine_description_vectors.append(vector_and_count)

wine_df['normalized_descriptors'] = list(map(itemgetter(0), wine_description_vectors))
wine_df['description_vector'] = list(map(itemgetter(1), wine_description_vectors))
wine_df['descriptor_count'] = list(map(itemgetter(2), wine_description_vectors))

wine_df.reset_index(inplace=True)
wine_df.head()

Unnamed: 0,index,country,description,wine_name,province,region,sub_region,grape,vineyard,title,normalized_descriptors,description_vector,descriptor_count
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,,"[oak, juicy, cherry, fruit, caramel, elegant, ...","[[-2.0711305, -4.7418947, -2.138353, -0.596845...",7
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,,"[ripe, fig, blackberry, cassis, oak, chocolate...","[[-1.8445625, -5.050164, -4.2829804, 0.1221863...",14
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,,"[complex, white, dark, gold, toasted_hazelnut,...","[[-2.3649457, -4.712655, -3.1457627, 0.8866671...",10
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,,"[oak, fruit, dense, toast, toast, cigar_box, b...","[[-1.6315883, -5.0111055, -3.8726304, -2.18882...",14
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,Provence,Bandol,,Provence red blend,Domaine de la Bégude,,"[dense, wood, rich, concentrated]","[[0.7936336, -5.4881744, 1.7294126, 2.6966877,...",4


In [32]:
wine_df.to_csv('wine_df_descriptors_3.csv')

In [34]:
type(wine_df['description_vector'][0])

numpy.ndarray

In [35]:
wine_df.to_pickle("./wine_df.pkl")

In [42]:
wine_reviews_mincount = wine_df.loc[wine_df['descriptor_count'] > 8]

In [43]:
wine_reviews_mincount

Unnamed: 0,index,country,description,wine_name,province,region,sub_region,grape,vineyard,title,normalized_descriptors,description_vector,descriptor_count
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,,"[ripe, fig, blackberry, cassis, oak, chocolate...","[[-1.8445625, -5.050164, -4.2829804, 0.1221863...",14
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,,"[complex, white, dark, gold, toasted_hazelnut,...","[[-2.3649457, -4.712655, -3.1457627, 0.8866671...",10
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,,"[oak, fruit, dense, toast, toast, cigar_box, b...","[[-1.6315883, -5.0111055, -3.8726304, -2.18882...",14
5,5,Spain,"Deep, dense and pure from the opening bell, th...",Numanthia,Northern Spain,Toro,,Tinta de Toro,Numanthia,,"[depth, dense, dark, ripe, fruit, oak, blackbe...","[[-0.91695696, -4.525911, -2.2801418, -0.40843...",14
7,7,Spain,Lush cedary black-fruit aromas are luxe and of...,Carodorum Único Crianza,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,,"[lush, cedar, black_fruit, almond, vanilla, ta...","[[-2.334717, -5.676419, -3.870332, -2.1135352,...",14
...,...,...,...,...,...,...,...,...,...,...,...,...,...
280900,280900,Italy,"Intense aromas of wild cherry, baking spice, t...",,Sicily & Sardinia,Sicilia,,Frappato,COS,COS 2013 Frappato (Sicilia),"[cherry, baked, spice, savory, herb, silky, ro...","[[-0.29395303, -4.318718, -2.8238916, -0.30541...",18
280901,280901,Italy,"Blackberry, cassis, grilled herb and toasted a...",Sàgana Tenuta San Giacomo,Sicily & Sardinia,Sicilia,,Nero d'Avola,Cusumano,Cusumano 2012 Sàgana Tenuta San Giacomo Nero d...,"[blackberry, cassis, grilled_herbs, toast, cof...","[[-1.1284274, -5.352018, -6.0093055, -1.208133...",10
280902,280902,Israel,"A bouquet of black cherry, tart cranberry and ...",Oak Aged,Galilee,,,Cabernet Sauvignon,Dalton,Dalton 2012 Oak Aged Cabernet Sauvignon (Galilee),"[flower, cherry, tart, cranberry, clove, cherr...","[[-0.9823631, -5.9883256, -4.2795463, -2.35708...",16
280904,280904,France,"While it's rich, this beautiful dry wine also ...",Seppi Landmann Vallée Noble,Alsace,Alsace,,Pinot Gris,Domaine Rieflé-Landmann,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,"[rich, dry, fresh, ripe, white, fruit, pear, a...","[[-0.9369676, -4.214651, -1.8300135, 1.3751801...",9


## Recommender

In [34]:
# first, let's eliminate any review with fewer than 5 descriptors from our dataset
wine_reviews_mincount = wine_df.loc[wine_df['descriptor_count'] > 5]
wine_reviews_mincount.reset_index(inplace=True)

input_vectors = list(wine_reviews_mincount['description_vector'])
input_vectors_listed = [a.tolist() for a in input_vectors]
input_vectors_listed = [a[0] for a in input_vectors_listed]

knn = NearestNeighbors(n_neighbors=10, algorithm= 'brute', metric='cosine')
model_knn = knn.fit(input_vectors_listed)

In [46]:
input_vectors_listed[0]

NameError: name 'input_vectors_listed' is not defined

In [37]:
wine_reviews_mincount

Unnamed: 0,level_0,index,country,description,wine_name,province,region,sub_region,grape,vineyard,title,normalized_descriptors,description_vector,descriptor_count
0,0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,,"[oak, juicy, cherry, fruit, caramel, elegant, ...","[[-2.0711305, -4.7418947, -2.138353, -0.596845...",7
1,1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,,"[ripe, fig, blackberry, cassis, oak, chocolate...","[[-1.8445625, -5.050164, -4.2829804, 0.1221863...",14
2,2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,,"[complex, white, dark, gold, toasted_hazelnut,...","[[-2.3649457, -4.712655, -3.1457627, 0.8866671...",10
3,3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,,"[oak, fruit, dense, toast, toast, cigar_box, b...","[[-1.6315883, -5.0111055, -3.8726304, -2.18882...",14
4,5,5,Spain,"Deep, dense and pure from the opening bell, th...",Numanthia,Northern Spain,Toro,,Tinta de Toro,Numanthia,,"[depth, dense, dark, ripe, fruit, oak, blackbe...","[[-0.91695696, -4.525911, -2.2801418, -0.40843...",14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234940,280904,129969,France,"While it's rich, this beautiful dry wine also ...",Seppi Landmann Vallée Noble,Alsace,Alsace,,Pinot Gris,Domaine Rieflé-Landmann,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,"[rich, dry, fresh, ripe, white, fruit, pear, a...","[[-0.9369676, -4.214651, -1.8300135, 1.3751801...",9
234941,280905,129970,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,Mosel,,,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef),Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,"[honeysuckle, cataloupe, sweet, juicy, tart, t...","[[-2.0164046, -3.3920963, -3.2081838, -0.17288...",9
234942,280906,129971,US,Citation is given as much as a decade of bottl...,,Oregon,Oregon,Oregon Other,Pinot Noir,Citation,Citation 2004 Pinot Noir (Oregon),"[baked, cherry, cocoa, coconut, soft, fruit, c...","[[-3.085924, -5.2862296, -2.7884707, 0.3871322...",7
234943,280908,129973,France,"A dry style of Pinot Gris, this is crisp with ...",,Alsace,Alsace,,Pinot Gris,Domaine Marcel Deiss,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),"[dry, crisp, weight, powerful, spice, baked, a...","[[-1.3344327, -2.0043838, -1.1969368, 0.530952...",7


In [39]:
name_test = "Seppi Landmann Vallée Noble"

wine_test_vector = wine_reviews_mincount.loc[wine_reviews_mincount['wine_name'] == name_test]['description_vector'].tolist()[0]
distance, indice = model_knn.kneighbors(wine_test_vector, n_neighbors=9)
distance_list = distance[0].tolist()[1:]
indice_list = indice[0].tolist()[1:]

main_wine = wine_reviews_mincount.loc[wine_reviews_mincount['wine_name'] == name_test]

print('Wine to match:', name_test)
print('The original wine has the following descriptors:', list(main_wine['normalized_descriptors'])[0])
print('_________')

n = 1
for d, i in zip(distance_list, indice_list):
    wine_name = wine_reviews_mincount['wine_name'][i]
    wine_descriptors = wine_reviews_mincount['normalized_descriptors'][i]
    print('Suggestion', str(n), ':', wine_name, 'with a cosine distance of', "{:.3f}".format(d))
    print('This wine has the following descriptors:', wine_descriptors)
    print('')
    n+=1

Wine to match: Seppi Landmann Vallée Noble
The original wine has the following descriptors: ['ripe', 'powerful', 'rich', 'perfumed', 'white', 'fruit', 'flower', 'minerality']
_________
Suggestion 1 : Seppi Landmann Vallée Noble with a cosine distance of 0.000
This wine has the following descriptors: ['ripe', 'powerful', 'rich', 'perfumed', 'white', 'fruit', 'flower', 'minerality']

Suggestion 2 : Katharinas Reserve with a cosine distance of 0.052
This wine has the following descriptors: ['perfumed', 'rich', 'minerality', 'white', 'fruit', 'fruit']

Suggestion 3 : Marquis de Laguiche Morgeot Premier Cru with a cosine distance of 0.080
This wine has the following descriptors: ['perfumed', 'flower', 'weight', 'rich', 'concentrated', 'rich', 'yellow', 'white', 'fruit', 'minerality']

Suggestion 4 : Barrel sample with a cosine distance of 0.080
This wine has the following descriptors: ['powerful', 'perfumed', 'fruit', 'rich', 'fruit', 'minerality']

Suggestion 5 : Barrel sample with a cosin

zsh:1: command not found: pandas
