In [10]:
from collections import Counter
import numpy as np
import nltk
import re
import sklearn.manifold
import pandas as pd
import gensim.models.word2vec as w2v

In [15]:
# Load the wine CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv('datasets/wineColors.csv')

In [25]:
# Merge every description into one space-separated string for sentence
# tokenization. str.join builds the result in a single pass instead of
# re-copying the growing string on each iteration; missing descriptions
# (NaN) are dropped because they cannot be concatenated with str.
descriptions = data['description']

corpus = " ".join(descriptions.dropna().astype(str))

In [26]:
# Use the NLTK tokenizer to break up the corpus into sentences.
# The pickled English Punkt model is looked up through NLTK's data loader.
# NOTE(review): presumably requires the `punkt` resource to have been
# downloaded beforehand (e.g. nltk.download('punkt')) — confirm.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(corpus)


In [103]:
def sent_to_words(sentence, lowercase=False):
    """Split a sentence into purely alphabetic word tokens.

    Every character that is not an ASCII letter (digits, punctuation,
    accented letters, whitespace) is replaced by a space, then the text is
    split on whitespace.

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    lowercase : bool, default False
        When True, fold the tokens to lower case so that case variants such
        as "Tannins" and "tannins" map to a single token. The default keeps
        the original casing.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty when the sentence contains
        no ASCII letters.
    """
    clean = re.sub("[^a-zA-Z]", " ", sentence)
    if lowercase:
        clean = clean.lower()
    return clean.split()

In [82]:
# Turn each raw sentence into its list of word tokens.
cleaned_sentences = [sent_to_words(raw_sentence) for raw_sentence in raw_sentences]

In [83]:
# Total number of word tokens across all cleaned sentences.
token_count = sum(len(sentence) for sentence in cleaned_sentences)
token_count

2362855

In [85]:
# Word2vec hyperparameters, consumed when the model is constructed.
num_features = 100    # passed as `size`
context_size = 5      # passed as `window`
min_word_count = 5    # passed as `min_count`

In [86]:
# Create the model without any sentences; the vocabulary and the training
# pass are supplied by separate build_vocab() / train() calls.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0 renamed it to
# `vector_size` — confirm the installed version before upgrading.
wine2vec = w2v.Word2Vec(
    sg=1,  # 1 selects the skip-gram training algorithm
    window=context_size,
    size=num_features,
    min_count=min_word_count,
)


In [87]:
# Build the model's vocabulary from the tokenized sentences before training.
wine2vec.build_vocab(cleaned_sentences)

In [88]:
# Train on the same sentences used for the vocabulary, reusing the sentence
# count and the epoch setting stored on the model.
wine2vec.train(
    cleaned_sentences,
    total_examples=wine2vec.corpus_count,
    epochs=wine2vec.epochs,
)

(8502718, 11814275)

In [91]:
# NOTE(review): get_tmpfile is not needed by this cell; the import is kept
# only in case other cells rely on it.
from gensim.test.utils import get_tmpfile

# Persist the trained model in the working directory. `path` names the file
# that is actually written, so it can be reused to reload the model.
path = "wine2vec.model"
wine2vec.save(path)


In [90]:
# Sanity check: list the words the model scores as most similar to "tannins".
wine2vec.wv.most_similar("tannins")

[('Tannins', 0.7540991306304932),
 ('tannin', 0.7201492786407471),
 ('grained', 0.666390061378479),
 ('firm', 0.659520149230957),
 ('tannic', 0.6561658382415771),
 ('framework', 0.6164436340332031),
 ('gripping', 0.6127274036407471),
 ('proportionate', 0.6055161952972412),
 ('clench', 0.6008082032203674),
 ('sugars', 0.5975766181945801)]

In [102]:
# Spot-check the word splitter on the first raw sentence.
sent_to_words(raw_sentences[0])

['Pineapple',
 'rind',
 'lemon',
 'pith',
 'and',
 'orange',
 'blossom',
 'start',
 'off',
 'the',
 'aromas']