In [1]:
import os
import pandas as pd
import numpy as np
import string
from operator import itemgetter
from collections import Counter, OrderedDict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

import hdbscan



## Load Data

In [40]:
DATA = 'dataset/'

wine_df = pd.read_pickle("./wine_df.pkl")
wine_df = wine_df.drop_duplicates(subset=['description'])
wine_df.head()

Unnamed: 0,index,country,description,wine_name,province,region,sub_region,grape,vineyard,title,normalized_descriptors,description_vector,descriptor_count
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,,"[oak, juicy, cherry, fruit, caramel, elegant, ...","[[-2.0711305, -4.7418947, -2.138353, -0.596845...",7
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,,"[ripe, fig, blackberry, cassis, oak, chocolate...","[[-1.8445625, -5.050164, -4.2829804, 0.1221863...",14
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,,"[complex, white, dark, gold, toasted_hazelnut,...","[[-2.3649457, -4.712655, -3.1457627, 0.8866671...",10
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,,"[oak, fruit, dense, toast, toast, cigar_box, b...","[[-1.6315883, -5.0111055, -3.8726304, -2.18882...",14
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,Provence,Bandol,,Provence red blend,Domaine de la Bégude,,"[dense, wood, rich, concentrated]","[[0.7936336, -5.4881744, 1.7294126, 2.6966877,...",4


In [41]:
wine_df.shape

(169437, 13)

In [4]:
# Load embeddings
wine_word2vec_model = Word2Vec.load("wine_word2vec_model.bin")

## Clustering

In [42]:
# Remove wines with fewer than 5 descriptors
wine_description_mincount = wine_df.loc[wine_df['descriptor_count'] > 5]
wine_description_mincount.reset_index(inplace=True)

input_vectors = list(wine_description_mincount['description_vector'])
input_vectors_listed = [a.tolist() for a in input_vectors]
input_vectors_listed = [a[0] for a in input_vectors_listed]

In [45]:
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
model_knn = knn.fit(input_vectors_listed)

## Recommending

In [48]:
import random

random.seed(10)

idx = random.randint(0, wine_description_mincount.shape[0])
wine_desc = wine_description_mincount.loc[idx]

print(f'Wine name: {wine_desc["wine_name"]} \n')
print(f'Wine description: {wine_desc["description"]} \n')
print(f'Wine characteristics: {wine_desc["normalized_descriptors"]}')

Wine name: Rosenberg 

Wine description: Lifted notes of wild, aromatic raspberry hover above a far earthier base of chestnut. But this wine is mutable: sniffed at another moment its red cherry and peony petal flavors come to the fore. The palate is light bodied and has a firm acidic backbone, destining it for the table. There are mystical edges of woodsmoke and pepper but what remains and lingers is pure cherry and raspberry fruit. 

Wine characteristics: ['raspberry', 'earthy', 'cherry', 'light_bodied', 'firm', 'edgy', 'pepper', 'cherry', 'raspberry', 'fruit']


In [125]:
name_test = wine_desc["wine_name"]

wine_test_vector = wine_description_mincount.loc[wine_description_mincount['wine_name'] == name_test]['description_vector'].tolist()[0]
distance, indice = model_knn.kneighbors(wine_test_vector, n_neighbors=13)
distance_list = distance[0].tolist()[1:]
indice_list = indice[0].tolist()[1:]

main_wine = wine_description_mincount.loc[wine_description_mincount['wine_name'] == name_test]

print('Wine to match:', name_test)
print('The original wine has the following descriptors:', list(main_wine['normalized_descriptors'])[0], '\n')


descriptors = []
n = 1
for d, i in zip(distance_list, indice_list):
    wine_name = wine_description_mincount['wine_name'][i]
    wine_descriptors = wine_description_mincount['normalized_descriptors'][i]
    descriptors.append(wine_descriptors)
    print('Suggestion', str(n), ':', wine_name, 'with a cosine distance of', "{:.3f}".format(d))
    print('This wine has the following descriptors:', wine_descriptors)
    print('')
    n+=1

Wine to match: Rosenberg
The original wine has the following descriptors: ['soft', 'stone', 'melon', 'fresh', 'clean', 'soft', 'stone', 'dry', 'fresh', 'apple'] 

Suggestion 1 : Extra Dry with a cosine distance of 0.049
This wine has the following descriptors: ['fresh', 'clean', 'stone', 'fruit', 'citrus', 'soft']

Suggestion 2 : Solutré with a cosine distance of 0.079
This wine has the following descriptors: ['white', 'fruit', 'soft', 'rich', 'fresh', 'apple', 'stone']

Suggestion 3 : Extra Dry with a cosine distance of 0.080
This wine has the following descriptors: ['fresh', 'dry', 'flower', 'stone', 'fruit', 'citrus', 'soft']

Suggestion 4 : Ried Loibenberg Smaragd with a cosine distance of 0.084
This wine has the following descriptors: ['fresh', 'green', 'apple', 'ripe', 'citrus', 'light_bodied', 'apple', 'stone', 'fruit', 'citrus', 'stone']

Suggestion 5 : Heissenberg with a cosine distance of 0.086
This wine has the following descriptors: ['round', 'ripe', 'peach', 'fruit', 'brig

### Make one big descriptor list

In [127]:
descriptor_list_all = list(itertools.chain.from_iterable(descriptors))
descriptor_list = list(set(descriptor_list_all))
print(descriptor_list)
print(len(descriptor_list))

['clean', 'complex', 'depth', 'ripe', 'plump', 'stone', 'citrus', 'green', 'white', 'rich', 'crisp', 'lime', 'melon', 'apple', 'bright', 'flower', 'minerality', 'firm', 'pineapple', 'fresh', 'lemon', 'cream', 'round', 'soft', 'peach', 'dry', 'juicy', 'fruit', 'pepper', 'light_bodied']
30


### Get Word2Vec embeddings of descriptors

In [128]:
descriptor_vectors = []
for term in set(descriptor_list):
    word_vector = wine_word2vec_model.wv.get_vector(term).reshape(1, 300)
    descriptor_vectors.append(word_vector)
    
input_vectors_listed = [a.tolist() for a in descriptor_vectors]
input_vectors_listed = [a[0] for a in input_vectors_listed]

In [157]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

kmeans = KMeans(n_clusters=8, random_state=0).fit(input_vectors_listed)
labels = kmeans.predict(input_vectors_listed)
centroids = kmeans.cluster_centers_

In [158]:
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, input_vectors_listed)
sampled_descriptors = np.array(descriptor_list)[closest]

In [159]:
sampled_descriptors

array(['depth', 'green', 'round', 'plump', 'fresh', 'bright',
       'light_bodied', 'rich'], dtype='<U12')