In [2]:
import os
import pandas as pd
import numpy as np
import string
from operator import itemgetter
from collections import Counter, OrderedDict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt



In [13]:
# preprocess wine dataset 1
cols = ['country', 'description', 'Wine name', 'province', 'Region', 'Sub region', 'Grape', 'Winery/ Vineyard']
wine_df1 = pd.read_csv('Vyno - Wine dataset 1.csv', dtype=str, usecols=cols)
wine_df1.columns = [title.lower() for title in wine_df1.columns]
wine_df1 = wine_df1.rename(columns={
    'wine name': 'wine_name',
    'sub region': 'sub_region', 
    'winery/ vineyard': 'vineyard'
})

# preprocess wine dataset 2
wine_df2 = pd.read_csv('Vyno - Wine dataset 2.csv', dtype=str).iloc[: , 1:]
wine_df2.columns = [title.lower() for title in wine_df2.columns]
wine_df2 = wine_df2.rename(columns={
    'wine name': 'wine_name',
    'sub region': 'sub_region', 
    'winery/ vineyard': 'vineyard'
})
cols2 = ['country', 'description', 'wine_name', 'province', 'region', 'sub_region', 'grape', 'vineyard', 'title']
wine_df2 = wine_df2[cols2]

In [3]:
DATA = 'dataset/'

wine_df = pd.read_pickle("./wine_df.pkl")
wine_df = wine_df.drop_duplicates(subset=['description'])
wine_df.head()

Unnamed: 0,index,country,description,wine_name,province,region,sub_region,grape,vineyard,title,normalized_descriptors,description_vector,descriptor_count
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,,"[oak, juicy, cherry, fruit, caramel, elegant, ...","[[-2.0711305, -4.7418947, -2.138353, -0.596845...",7
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,,"[ripe, fig, blackberry, cassis, oak, chocolate...","[[-1.8445625, -5.050164, -4.2829804, 0.1221863...",14
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,,"[complex, white, dark, gold, toasted_hazelnut,...","[[-2.3649457, -4.712655, -3.1457627, 0.8866671...",10
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,,"[oak, fruit, dense, toast, toast, cigar_box, b...","[[-1.6315883, -5.0111055, -3.8726304, -2.18882...",14
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,Provence,Bandol,,Provence red blend,Domaine de la Bégude,,"[dense, wood, rich, concentrated]","[[0.7936336, -5.4881744, 1.7294126, 2.6966877,...",4


In [None]:
wine_df.to_csv('descriptors.csv')

In [5]:
# Load embeddings
wine_word2vec_model = Word2Vec.load("wine_word2vec_model.bin")

## Clustering

In [6]:
# Remove wines with fewer than 5 descriptors
wine_description_mincount = wine_df.loc[wine_df['descriptor_count'] > 5]
wine_description_mincount.reset_index(inplace=True)

input_vectors = list(wine_description_mincount['description_vector'])
input_vectors_listed = [a.tolist() for a in input_vectors]
input_vectors_listed = [a[0] for a in input_vectors_listed]

In [7]:
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
model_knn = knn.fit(input_vectors_listed)

## Recommending

In [8]:
import random

random.seed(10)

idx = random.randint(0, wine_description_mincount.shape[0])
wine_desc = wine_description_mincount.loc[idx]

print(f'Wine name: {wine_desc["wine_name"]} \n')
print(f'Wine description: {wine_desc["description"]} \n')
print(f'Wine characteristics: {wine_desc["normalized_descriptors"]}')

Wine name: Rosenberg 

Wine description: Lifted notes of wild, aromatic raspberry hover above a far earthier base of chestnut. But this wine is mutable: sniffed at another moment its red cherry and peony petal flavors come to the fore. The palate is light bodied and has a firm acidic backbone, destining it for the table. There are mystical edges of woodsmoke and pepper but what remains and lingers is pure cherry and raspberry fruit. 

Wine characteristics: ['raspberry', 'earthy', 'cherry', 'light_bodied', 'firm', 'edgy', 'pepper', 'cherry', 'raspberry', 'fruit']


In [64]:
name_test = 'Markowitsch 2013 Rosenberg Red (Carnuntum)'

wine_test_vector = wine_description_mincount.loc[wine_description_mincount['title'] == name_test]['description_vector'].tolist()[0]
distance, indice = model_knn.kneighbors(wine_test_vector, n_neighbors=12)
distance_list = distance[0].tolist()[1:]
indice_list = indice[0].tolist()[1:]

wine_idxs = []

n = 1
for d, i in zip(distance_list, indice_list): 
    wine_idxs.append(wine_description_mincount.iloc[i])

In [67]:
df = pd.DataFrame(wine_idxs)
df = df.drop(columns=['level_0', 'index'])
df

Unnamed: 0,country,description,wine_name,province,region,sub_region,grape,vineyard,title,normalized_descriptors,description_vector,descriptor_count
10463,US,"Exuberant in red fruit, this is a softly textu...",Nance's Vineyard,California,Alexander Valley,Sonoma,Zinfandel,Ousterhout,,"[exuberant, fruit, soft, complex, berry, bramb...","[[0.44267905, -4.394822, -4.396351, -0.0803094...",11
87171,US,Subdued red fruit is kept fresh by moderate ac...,Benchland Series,California,Napa Valley,Napa,Malbec,William Hill Estate,William Hill Estate 2013 Benchland Series Malb...,"[fruit, fresh, full_bodied, dense, grippy, ric...","[[0.75608736, -5.521464, -2.483023, -0.2261784...",13
128292,Austria,Beautifully poised notes of ripe red and black...,Burggarten Reserve,Niederösterreich,,,Zweigelt,R&A; Pfaffl,R&A; Pfaffl 2013 Burggarten Reserve Zweigelt (...,"[ripe, cherry, pepper, spice, shimmer, grippy,...","[[-0.12163753, -5.8665137, -3.2175047, 0.04346...",9
109584,Portugal,"Tightly coiled, the firm tannins hold down the...",Duas Quintas Reserva,Douro,,,Portuguese Red,Ramos-Pinto,Ramos-Pinto 2004 Duas Quintas Reserva Red (Douro),"[firm, exuberant, berry, fruit, fresh, elegant...","[[0.8338176, -2.7187488, -2.3070815, 0.6810064...",10
104215,South Africa,"This wine has an exuberant, juicy quality to i...",,Stellenbosch,,,Syrah,Rudi Schultz,Rudi Schultz 2004 Syrah (Stellenbosch),"[exuberant, juicy, bright, berry, spice, round...","[[0.38180396, -3.9987466, -4.896322, 0.8540057...",9
20734,Spain,"Solid on the nose, with pure berry aromas matc...",Crianza,Northern Spain,Ribera del Duero,,Tinto del Pais,Viña Arnáiz,,"[berry, vanilla, fresh, medium_bodied, juicy, ...","[[-1.039872, -4.7581463, -3.5438738, 0.7693479...",11
120909,Austria,Sonorous oak notes in vain seek to calm down t...,Kalkofen,Burgenland,,,Blaufränkisch,Weninger,Weninger 2013 Kalkofen Blaufränkisch (Burgenland),"[oak, pepper, exuberant, blueberry, spice, rip...","[[1.4493425, -5.0995746, -3.5631576, -0.034575...",10
11920,South Africa,Savory aromas of balsamic vinegar paired with ...,,Western Cape,,,Shiraz,Allée Bleue,,"[savory, pepper, cherry, exuberant, juicy, ber...","[[0.91685593, -4.306549, -5.2728925, 0.2585835...",8
102799,Australia,"Smells dry and dusty, like a midwestern countr...",Thomas,New South Wales,Hunter Valley,,Shiraz,Macquariedale,Macquariedale 1999 Thomas Shiraz (Hunter Valley),"[dry, dust, fruit, bright, berry, cherry, pepp...","[[0.257434, -5.6157627, -3.477635, 0.01581449,...",9
41922,Australia,"From a single vineyard in the Ebenezer region,...",Amon-Ra Unfiltered,South Australia,Barossa Valley,,Shiraz,Glaetzer,,"[rich, blackberry, fruit, fresh, juicy, fresh,...","[[0.36276507, -4.608459, -3.482305, 1.8123993,...",11


In [70]:
descriptors = df['normalized_descriptors'].tolist()

### Make one big descriptor list

In [72]:
import itertools

descriptor_list_all = list(itertools.chain.from_iterable(descriptors))
descriptor_list = list(set(descriptor_list_all))
print(descriptor_list)
print(len(descriptor_list))

['elegant', 'oak', 'complex', 'spice', 'grippy', 'cherry', 'lean', 'firm', 'wood', 'tobacco', 'round', 'blueberry', 'juicy', 'plum', 'exuberant', 'bramble', 'tart', 'fresh', 'ripe', 'soft', 'herb', 'fruit', 'blackberry', 'violet', 'berry', 'rich', 'dry', 'vanilla', 'medium_bodied', 'full_bodied', 'dense', 'bright', 'shimmer', 'dust', 'pepper', 'savory']
36


### Get Word2Vec embeddings of descriptors

In [73]:
descriptor_vectors = []
for term in set(descriptor_list):
    word_vector = wine_word2vec_model.wv.get_vector(term).reshape(1, 300)
    descriptor_vectors.append(word_vector)
    
input_vectors_listed = [a.tolist() for a in descriptor_vectors]
input_vectors_listed = [a[0] for a in input_vectors_listed]

In [74]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

kmeans = KMeans(n_clusters=8, random_state=0).fit(input_vectors_listed)
labels = kmeans.predict(input_vectors_listed)
centroids = kmeans.cluster_centers_

In [75]:
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, input_vectors_listed)
sampled_descriptors = np.array(descriptor_list)[closest]

In [76]:
sampled_descriptors

array(['savory', 'rich', 'round', 'shimmer', 'spice', 'blackberry', 'oak',
       'fruit'], dtype='<U13')