In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import numpy as np
import string
# from operator import itemgetter
from collections import Counter, OrderedDict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

[nltk_data] Downloading package punkt to /Users/tayfun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tayfun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
wine = pd.read_csv('Data/winemag-data-130k-v2.csv.zip', compression='zip', low_memory=False)

In [4]:
wine.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
food_review_dataset = pd.read_csv('Reviews.csv')
print(food_review_dataset.shape)

(568454, 10)


In [6]:
food_review_dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# 1. Training Word Embeddings for Wine Dataset

In [7]:
wine_reviews_list = list(wine['description'])

In [8]:
wine_reviews_list[0]

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [9]:
full_wine_reviews_list = [str(r) for r in wine_reviews_list] # macht aus jedem Wort ein String
full_wine_corpus = ' '.join(full_wine_reviews_list) # full_wine_reviews_list als EIN ZUSAMMENHÄNGENDER String dargestellt 
wine_sentences_tokenized = sent_tokenize(full_wine_corpus) # Eine Liste mit ganzen Sätzen als Einträge.

In [10]:
print(wine_sentences_tokenized[:2]) # Erste zwei Sätze, also Listeneinträge [0,2)

['Aromas include tropical fruit, broom, brimstone and dried herb.', "The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."]


In [11]:
wine_sentences_tokenized[0]

'Aromas include tropical fruit, broom, brimstone and dried herb.'

In [12]:
len(wine_reviews_list)

129971

In [13]:
full_wine_reviews_list[0]

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [14]:
stop_words = set(stopwords.words('english'))

punctuation_table = str.maketrans({key: None for key in string.punctuation})
sno = SnowballStemmer('english') # SnowballStemmer bildet den Stamm eines Wortes. 'english' weil engl. Text

def normalize_text(raw_text):
    try:
        word_list = word_tokenize(raw_text)
        normalized_sentence = []
        for w in word_list:
            try:
                w = str(w)
                lower_case_word = str.lower(w)
                stemmed_word = sno.stem(lower_case_word)
                no_punctuation = stemmed_word.translate(punctuation_table)
                if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                    normalized_sentence.append(no_punctuation)
            except:
                continue
        return normalized_sentence
    except:
        return ''

Eine Liste, wo jeder Listeneintrag aus den Wortstämmen eines Satzes besteht:

In [20]:
normalized_wine_sentences = []
for s in wine_sentences_tokenized:
    normalized_text = normalize_text(s)
    normalized_wine_sentences.append(normalized_text)

In [26]:
normalized_wine_sentences[:5]

[['aroma', 'includ', 'tropic', 'fruit', 'broom', 'brimston', 'dri', 'herb'],
 ['palat',
  'nt',
  'express',
  'offer',
  'unripen',
  'appl',
  'citrus',
  'dri',
  'sage',
  'alongsid',
  'brisk',
  'acid'],
 ['ripe', 'fruiti', 'wine', 'smooth', 'still', 'structur'],
 ['firm',
  'tannin',
  'fill',
  'juici',
  'red',
  'berri',
  'fruit',
  'freshen',
  'acid'],
 ['alreadi', 'drinkabl', 'although', 'certain', 'better', '2016']]

In [23]:
# first, take care of the wine trigrams
wine_bigram_model = Phrases(normalized_wine_sentences, min_count=100)
wine_bigrams = [wine_bigram_model[line] for line in normalized_wine_sentences]
wine_trigram_model = Phrases(wine_bigrams, min_count=50)
phrased_wine_sentences = [wine_trigram_model[line] for line in wine_bigrams]
wine_trigram_model.save('wine_trigrams.pkl')

In [24]:
wine_trigram_model = Phraser.load('wine_trigrams.pkl')

In [25]:
wine_trigram_model

<gensim.models.phrases.Phrases at 0x7fdf3b20a850>

Nun zum wichtigsten Teil: Mit Hilfe der bestehenden Weintheorie, der Arbeit anderer wie Bernard Chen, der Zuordnung von Weindeskriptoren und des Weinrades der UC Davis wurden die 5000 häufigsten Weinbegriffe überprüft, um (i) festzustellen, ob es sich um einen Deskriptor handelt, der durch Blindverkostung abgeleitet werden kann, und (ii) ob sie informativ sind (Urteile wie "lecker" und "großartig" werden nicht als informativ angesehen). Die verbleibenden rund 1000 Deskriptoren wurden dann auf einen normalisierten Deskriptor, eine Kategorie und eine Klasse abgebildet:

In [27]:
descriptor_mapping = pd.read_csv('descriptor_mapping.csv', encoding='latin1').set_index('raw descriptor')

In [28]:
descriptor_mapping.shape

(1015, 4)

In [29]:
descriptor_mapping.head(5)

Unnamed: 0_level_0,level_3,level_2,level_1,type
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abras,abrasive,high_tannin,tannin,nonaroma
acacia,acacia,flowery,flower,aroma
acacia_flower,acacia,flowery,flower,aroma
aciddriven,acid_driven,high_acid,acid,nonaroma
aggress,aggressive,high_acid,acid,nonaroma


In [30]:
# index() is an inbuilt function in Python, which searches for a given element from the
# start of the list and returns the lowest index where the element appears
def return_mapped_descriptor(word, mapping):
    if word in list(mapping.index):
        normalized_word = mapping.at[word, 'level_3']
        return normalized_word
    else:
        return word
    
normalized_wine_sentences = []
for sent in phrased_wine_sentences:
    normalized_wine_sentence = []
    for word in sent:
        normalized_word = return_mapped_descriptor(word, descriptor_mapping) # Funktionsaufruf (s.o.)
        normalized_wine_sentence.append(str(normalized_word))
    normalized_wine_sentences.append(normalized_wine_sentence) # append: fügt Element hinzu

## Preprocessing Wine Dataset

produce recommendations at the grape variety & subregion level

In [31]:
variety_mapping = {'Shiraz': 'Syrah', 'Pinot Gris': 'Pinot Grigio', 'Pinot Grigio/Gris': 'Pinot Grigio', 
                   'Garnacha, Grenache': 'Grenache', 'Garnacha': 'Grenache', 'Carmenère': 'Carmenere',
                    'Grüner Veltliner': 'Gruner Veltliner', 'Torrontés': 'Torrontes', 
                   'Rhône-style Red Blend': 'Rhone-style Red Blend', 'Albariño': 'Albarino',
                  'Gewürztraminer': 'Gewurztraminer', 'Rhône-style White Blend': 'Rhone-style White Blend',
                  'Spätburgunder, Pinot Noir': 'Pinot Noir', 'Sauvignon, Sauvignon Blanc': 'Sauvignon Blanc',
                  'Pinot Nero, Pinot Noir': 'Pinot Noir', 'Malbec-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                  'Meritage, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Garnacha, Grenache': 'Grenache',
                   'Fumé Blanc': 'Sauvignon Blanc', 'Cabernet Sauvignon-Cabernet Franc, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Cabernet Sauvignon-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Blend, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Malbec-Cabernet Sauvignon, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Merlot-Cabernet Franc, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Merlot-Cabernet Sauvignon, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Franc-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Merlot-Malbec, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Primitivo, Zinfandel': 'Zinfandel',
                   'Aragonês, Tempranillo': 'Aragonez, Tempranillo'
                  }

def consolidate_varieties(variety_name):
    if variety_name in variety_mapping:
        return variety_mapping[variety_name]
    else:
        return variety_name

In [39]:
# copy dataframe and rename the columns:
wine_df_clean = wine.copy()

wine_df_clean = wine_df_clean.rename({'country': 'Country', 'province': 'Province', 'region_1': 'Region', 'region_2': 'Subregion',
                     'variety': 'Variety'}, axis=1)

In [41]:
wine_df_clean['Variety'] = wine_df_clean['Variety'].apply(consolidate_varieties)

In [42]:
order_of_geographies = ['Subregion', 'Region', 'Province', 'Country']

# replace any nan values in the geography columns with the word none
def replace_nan_for_zero(value):
    if str(value) == '0' or str(value) == 'nan':
        return 'none'
    else:
        return value

for o in order_of_geographies:
    wine_df_clean[o] = wine_df_clean[o].apply(replace_nan_for_zero)

wine_df_clean.loc[:, order_of_geographies].fillna('none', inplace=True)

In [43]:
variety_geo = wine_df_clean.groupby(['Variety', 'Country', 'Province', 'Region', 'Subregion']).size().reset_index().rename(columns={0:'count'})
variety_geo_sliced = variety_geo.loc[variety_geo['count'] > 1]

vgeos_df = pd.DataFrame(variety_geo_sliced, columns=['Variety', 'Country', 'Province', 'Region', 'Subregion', 'count']) 
vgeos_df.to_csv('varieties_all_geos.csv')

In [44]:
wine_df_clean.head()

Unnamed: 0.1,Unnamed: 0,Country,description,designation,points,price,Province,Region,Subregion,taster_name,taster_twitter_handle,title,Variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,none,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,none,none,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Grigio,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,none,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [46]:
wine_df_clean.drop(['Unnamed: 0','designation', 'points', 'price', 'taster_name', 'taster_twitter_handle', 
                     'winery'], axis=1, inplace=True)

In [47]:
wine_df_clean.head()

Unnamed: 0,Country,description,Province,Region,Subregion,title,Variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",Sicily & Sardinia,Etna,none,Nicosia 2013 Vulkà Bianco (Etna),White Blend
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Douro,none,none,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red
2,US,"Tart and snappy, the flavors of lime flesh and...",Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Grigio
3,US,"Pineapple rind, lemon pith and orange blossom ...",Michigan,Lake Michigan Shore,none,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling
4,US,"Much like the regular bottling from 2012, this...",Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir


In [48]:
wine_df_clean.shape

(129971, 7)

In [51]:
variety_geo_df = pd.read_csv('varieties_all_geos_normalized.csv', index_col=0)

wine_df_merged = pd.merge(left=wine_df_clean, right=variety_geo_df, left_on=['Variety', 'Country', 'Province', 'Region', 'Subregion'],
                         right_on=['Variety', 'Country', 'Province', 'Region', 'Subregion'])

In [55]:
wine_df_merged.head

<bound method NDFrame.head of             Country                                        description  \
0             Italy  Aromas include tropical fruit, broom, brimston...   
1             Italy  This is a mineral-driven wine from the blacken...   
2             Italy  Delicate scents of white flower, flint and whi...   
3             Italy  Subdued aromas of Spanish broom and brimstone ...   
4             Italy  Cavanera is a white blend produced by one of t...   
...             ...                                                ...   
36859     Australia  Dry woody notes reminiscent of sawdust join ap...   
36860   New Zealand  A rare Marlborough Merlot, this is a superfici...   
36861  South Africa  Light and easy with a great lift to the green ...   
36862  South Africa  Fresh, clean and easy with ripe aromas of red ...   
36863       Germany  Dusty mineral complexities juxtapose luscious ...   

                Province            Region Subregion  \
0      Sicily & Sardinia 

In [57]:
wine_df_merged.drop(['count'], axis=1, inplace=True)

In [58]:
wine_df_merged.head

<bound method NDFrame.head of             Country                                        description  \
0             Italy  Aromas include tropical fruit, broom, brimston...   
1             Italy  This is a mineral-driven wine from the blacken...   
2             Italy  Delicate scents of white flower, flint and whi...   
3             Italy  Subdued aromas of Spanish broom and brimstone ...   
4             Italy  Cavanera is a white blend produced by one of t...   
...             ...                                                ...   
36859     Australia  Dry woody notes reminiscent of sawdust join ap...   
36860   New Zealand  A rare Marlborough Merlot, this is a superfici...   
36861  South Africa  Light and easy with a great lift to the green ...   
36862  South Africa  Fresh, clean and easy with ripe aromas of red ...   
36863       Germany  Dusty mineral complexities juxtapose luscious ...   

                Province            Region Subregion  \
0      Sicily & Sardinia 

In [59]:
variety_geos = wine_df_merged.groupby(['Variety', 'geo_normalized']).size()
at_least_n_types = variety_geos[variety_geos > 30].reset_index()

In [64]:
at_least_n_types

Unnamed: 0,Variety,geo_normalized,0
0,Bordeaux-style Red Blend,"Coastal Region, South Africa",84
1,Bordeaux-style Red Blend,"Graves, Bordeaux, France",69
2,Bordeaux-style Red Blend,"Libournais, Bordeaux, France",300
3,Bordeaux-style Red Blend,"Medoc, Bordeaux, France",644
4,Bordeaux-style Red Blend,"Mendoza, Argentina",75
5,Bordeaux-style Red Blend,"Southwest France, France",53
6,Bordeaux-style Red Blend,"Virginia, USA",73
7,Cabernet Franc,"Anjou-Saumar, Loire Valley, France",44
8,Cabernet Franc,"Finger Lakes, New York, USA",125
9,Cabernet Franc,"Mendoza, Argentina",53


In [65]:
wine_df_merged_filtered = pd.merge(wine_df_merged, at_least_n_types, left_on=['Variety', 'geo_normalized'], right_on=['Variety', 'geo_normalized'])

In [68]:
wine_df_merged_filtered = wine_df_merged_filtered[['title', 'Variety', 'geo_normalized', 'description']]

In [70]:
wine_df_merged_filtered.head()

Unnamed: 0,title,Variety,geo_normalized,description
0,Nicosia 2013 Vulkà Bianco (Etna),White Blend,"Sicily, Southern Italy, Italy","Aromas include tropical fruit, broom, brimston..."
1,Tenuta delle Terre Nere 2007 Etna,White Blend,"Sicily, Southern Italy, Italy",This is a mineral-driven wine from the blacken...
2,Cantine Valenti 2015 Enrico IV Bianco (Etna),White Blend,"Sicily, Southern Italy, Italy","Delicate scents of white flower, flint and whi..."
3,Contrada Santo Spirito di Passopisciaro 2012 A...,White Blend,"Sicily, Southern Italy, Italy",Subdued aromas of Spanish broom and brimstone ...
4,Firriato 2009 Cavanera Ripa di Scorciavacca (...,White Blend,"Sicily, Southern Italy, Italy",Cavanera is a white blend produced by one of t...


The dataframe wine_df_merged_filtered now contains the title of the wine, the variety (which describes the type of grapes used to make the wine (ie White Blend), geographic data regarding where the data is from and a description regarding the wine. Wine varities with a count smaller than 30 were filtered, as well as datasets with incomplete geographic data.

# Data preprocessing food dataset

In [85]:
food_review_dataset = pd.read_csv('Reviews.csv')
print(food_review_dataset.shape)

(568454, 10)


In [86]:
food_reviews_list = list(food_review_dataset['Text'])

In [87]:
food_reviews_list[:5]

['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
 'If you are looking f

In [88]:
full_food_reviews_list = [str(r) for r in food_reviews_list]
full_food_corpus = ' '.join(full_food_reviews_list)
food_sentences_tokenized = sent_tokenize(full_food_corpus)

In [89]:
print(food_sentences_tokenized[:2])

['I have bought several of the Vitality canned dog food products and have found them all to be of good quality.', 'The product looks more like a stew than a processed meat and it smells better.']


In [90]:
normalized_food_sentences = []
for s in food_sentences_tokenized:
    normalized_text = normalize_text(s)
    normalized_food_sentences.append(normalized_text)

In [91]:
food_bigram_model = Phrases(normalized_food_sentences, min_count=100)
food_bigrams = [food_bigram_model[sent] for sent in normalized_food_sentences]
food_trigram_model = Phrases(food_bigrams, min_count=50)
phrased_food_sentences = [food_trigram_model[sent] for sent in food_bigrams]
food_trigram_model.save('food_trigrams.pkl')

In [92]:
food_trigram_model = Phraser.load('food_trigrams.pkl')

In [93]:
aroma_descriptor_mapping = descriptor_mapping.loc[descriptor_mapping['type'] == 'aroma']
normalized_food_sentences = []
for sent in phrased_food_sentences:
    normalized_food_sentence = []
    for word in sent:
        normalized_word = return_mapped_descriptor(word, aroma_descriptor_mapping)
        normalized_food_sentence.append(str(normalized_word))
    normalized_food_sentences.append(normalized_food_sentence)

In [94]:
normalized_sentences = normalized_wine_sentences + normalized_food_sentences

In [96]:
wine_word2vec_model = Word2Vec(normalized_sentences, vector_size=300, min_count=8, epochs=15)
print(wine_word2vec_model)

wine_word2vec_model.save('food_word2vec_model.bin')

Word2Vec(vocab=36695, vector_size=300, alpha=0.025)


In [97]:
wine_word2vec_model = Word2Vec.load("food_word2vec_model.bin")

# Create vectors for wine

In [98]:
wine_reviews = list(wine_df_merged_filtered['description'])

descriptor_mapping = pd.read_csv('descriptor_mapping_tastes.csv', encoding='latin1').set_index('raw descriptor')

core_tastes = ['aroma', 'weight', 'sweet', 'acid', 'salt', 'piquant', 'fat', 'bitter']
descriptor_mappings = dict()
for c in core_tastes:
    if c=='aroma':
        descriptor_mapping_filtered=descriptor_mapping.loc[descriptor_mapping['type']=='aroma']
    else:
        descriptor_mapping_filtered=descriptor_mapping.loc[descriptor_mapping['primary taste']==c]
    descriptor_mappings[c] = descriptor_mapping_filtered                                                   
    

def return_descriptor_from_mapping(descriptor_mapping, word, core_taste):
    if word in list(descriptor_mapping.index):
        descriptor_to_return = descriptor_mapping['combined'][word]
        return descriptor_to_return
    else:
        return None

review_descriptors = []
for review in wine_reviews:
    taste_descriptors = []
    normalized_review = normalize_text(review)
    phrased_review = wine_trigram_model[normalized_review]
#     print(phrased_review)
    
    for c in core_tastes:                                                      
        descriptors_only = [return_descriptor_from_mapping(descriptor_mappings[c], word, c) for word in phrased_review]
        no_nones = [str(d).strip() for d in descriptors_only if d is not None]
        descriptorized_review = ' '.join(no_nones)
        taste_descriptors.append(descriptorized_review)
    review_descriptors.append(taste_descriptors)

In [99]:
print(phrased_review)

['initi', 'doe_nt', 'distinguish', 'nose', 'deal', 'typic', 'chilean', 'chardonnay', 'aroma', 'corn', 'stalk', 'along', 'warmweath', 'fruit', 'mouth', 'howev', 'miner', 'intens', 'drive', 'acid', 'push', 'flavor', 'oak', 'cinnamon', 'toast', 'bake', 'appl', 'finish', 'fruit', 'fade', 'rather', 'quick', 'vanilla', 'resini', 'oak', 'flavor', 'settl', 'take']


Now we will take the list of descriptors for each wine and its aroma/nonaroma vectors and compute a TF-IDF weighted embedding for each. We will store the results in a dataframe.

In [100]:
taste_descriptors = []
taste_vectors = []

for n, taste in enumerate(core_tastes):
    print(taste)
    taste_words = [r[n] for r in review_descriptors]
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(taste_words)
    dict_of_tfidf_weightings = dict(zip(X.get_feature_names(), X.idf_))
        
    wine_review_descriptors = []
    wine_review_vectors = []
    
    for d in taste_words:
        descriptor_count = 0
        weighted_review_terms = []
        terms = d.split(' ')
        for term in terms:
            if term in dict_of_tfidf_weightings.keys():
                tfidf_weighting = dict_of_tfidf_weightings[term]
                try:
                    word_vector = wine_word2vec_model.wv.get_vector(term).reshape(1, 300)
                    weighted_word_vector = tfidf_weighting * word_vector
                    weighted_review_terms.append(weighted_word_vector)
                    descriptor_count += 1
                except:
                    continue
            else:
                continue
        try:
            review_vector = sum(weighted_review_terms)/len(weighted_review_terms)
            review_vector = review_vector[0]
        except:
            review_vector = np.nan
#         terms_and_vec = [terms, review_vector]
        wine_review_vectors.append(review_vector)
        wine_review_descriptors.append(terms)
    
    taste_vectors.append(wine_review_vectors)
    taste_descriptors.append(wine_review_descriptors)
    

taste_vectors_t = list(map(list, zip(*taste_vectors)))
taste_descriptors_t = list(map(list, zip(*taste_descriptors)))

review_vecs_df = pd.DataFrame(taste_vectors_t, columns=core_tastes)

columns_taste_descriptors = [a + '_descriptors' for a in core_tastes]
review_descriptors_df = pd.DataFrame(taste_descriptors_t, columns=columns_taste_descriptors)

wine_df_vecs = pd.concat([wine_df_merged_filtered, review_descriptors_df, review_vecs_df], axis=1)
wine_df_vecs.head(5)

aroma
weight
sweet
acid
salt
piquant
fat
bitter


Unnamed: 0,title,Variety,geo_normalized,description,aroma_descriptors,weight_descriptors,sweet_descriptors,acid_descriptors,salt_descriptors,piquant_descriptors,fat_descriptors,bitter_descriptors,aroma,weight,sweet,acid,salt,piquant,fat,bitter
0,Nicosia 2013 Vulkà Bianco (Etna),White Blend,"Sicily, Southern Italy, Italy","Aromas include tropical fruit, broom, brimston...","[tropical_fruit, fruit, herb, apple, citrus, s...",[],"[dry, dry]",[high_acid],[],[],[],[],"[2.0234935, -0.26817894, 2.7913525, -7.105846,...",,"[0.13942882, -1.8551763, 1.710772, -3.8352144,...","[-0.06676228, -0.87576926, -1.0497628, -0.4488...",,,,
1,Tenuta delle Terre Nere 2007 Etna,White Blend,"Sicily, Southern Italy, Italy",This is a mineral-driven wine from the blacken...,"[mineral, pear, peach, jasmine]",[],[],[],[],[],[],[],"[0.9331111, -3.00844, 4.531299, -9.727131, 6.2...",,,,,,,
2,Cantine Valenti 2015 Enrico IV Bianco (Etna),White Blend,"Sicily, Southern Italy, Italy","Delicate scents of white flower, flint and whi...","[white_flower, flint, stone, fruit, unripe, pe...",[],[],"[high_acid, high_acid, high_acid]",[],[],[],[],"[2.8463452, -3.3319855, 3.8698688, -6.294742, ...",,,"[-0.06676228, -0.8757693, -1.0497628, -0.44886...",,,,
3,Contrada Santo Spirito di Passopisciaro 2012 A...,White Blend,"Sicily, Southern Italy, Italy",Subdued aromas of Spanish broom and brimstone ...,"[apple, citrus_peel, minerality]",[],[],[high_acid],[],[],[],[],"[3.2716115, -3.0502708, 5.912527, -6.5354977, ...",,,"[-0.06676228, -0.87576926, -1.0497628, -0.4488...",,,,
4,Firriato 2009 Cavanera Ripa di Scorciavacca (...,White Blend,"Sicily, Southern Italy, Italy",Cavanera is a white blend produced by one of t...,[minerality],[],[dry],"[high_acid, high_acid]",[],[],[],[],"[3.5302823, 0.44590807, 6.1244144, -6.750567, ...",,"[0.13942882, -1.8551763, 1.710772, -3.8352144,...","[-0.06676228, -0.87576926, -1.0497628, -0.4488...",,,,


In [101]:
print(wine_df_vecs.shape)
print(wine_df_vecs['aroma'].isnull().sum(axis = 0))

(29373, 20)
139


If there is not a nonaroma embedding for one of the wines, take the average nonaroma embedding for all the wines in the dataset.

In [114]:
# pull the average embedding for the wine attribute across all wines. 
avg_taste_vecs = dict()
for k in core_tastes:
    # look at the average embedding for a taste, across all wines that have descriptors for that taste 
    review_arrays = wine_df_vecs[k].dropna()
    average_taste_vec = np.average(review_arrays)
    avg_taste_vecs[k] = average_taste_vec

In [122]:
avg_taste_vecs

{'aroma': array([ 0.81392586, -2.4280865 ,  3.3803036 , -5.6719713 ,  0.50349313,
        -1.085401  ,  0.47793332, -1.4958851 ,  0.44697052,  0.93451875,
         2.8767602 ,  5.579465  ,  0.52665836, -1.3484944 ,  0.56253725,
         1.1931461 , -0.8177047 , -1.7281162 , -0.12070356,  0.89030814,
         3.0349307 ,  0.43837   ,  5.239982  ,  2.0910823 ,  0.399469  ,
        -0.73639476,  4.130707  , -1.4159293 , -1.1690022 ,  0.16318634,
        -1.8165119 , -3.0381386 , -0.83788747, -0.6672929 ,  4.683531  ,
         3.0057762 , -0.07149196, -1.8383843 , -1.9233437 , -2.1461124 ,
        -0.8083863 , -3.370956  , -3.6041806 ,  2.9414213 ,  0.55599684,
        -0.660081  ,  2.3295577 , -1.0785248 , -0.39634427, -0.23678061,
        -0.97342014,  1.3977376 , -0.6915297 , -2.9769847 ,  2.7327397 ,
         2.1818762 ,  1.8410211 , -2.0737739 ,  3.0057352 ,  0.15505809,
         0.8134393 ,  0.5814624 , -0.58185554, -3.1508656 ,  2.521926  ,
        -0.9575736 ,  1.3087318 , -2.14962