In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
import nltk
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Load the small English spaCy model with the parser, text categorizer and
# NER components switched off; the tokenizer defined in this notebook only
# reads per-token attributes (lemma, POS tag, stop-word / number / punctuation
# flags).
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "textcat", "ner"],
)

In [3]:
# Load the wine-review table; index_col=0 uses the first CSV column as the
# row index instead of creating a fresh RangeIndex.
df = pd.read_csv('../data/df_nonans.csv', index_col = 0)

In [4]:
# List the available columns.
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'type'],
      dtype='object')

In [5]:
# Preview the first rows of the review table.
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,type
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,white
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,red
2,US,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,white
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,white
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,red


In [6]:
# (rows, columns) of the loaded table.
df.shape

(107539, 14)

In [50]:
def custom_tokenizer(text, keep_pos=("ADV",)):
    '''
    Tokenize ``text`` with the module-level spaCy pipeline ``nlp`` and return
    the lemmas of the tokens that survive the filters below.

    A token is discarded when it is shorter than two characters, is a stop
    word, looks like a number, or is punctuation. Of the remaining tokens,
    only those whose coarse part-of-speech tag (``token.pos_``) is listed in
    ``keep_pos`` are kept.

    Parameters
    ----------
    text : str
        Raw document to tokenize.
    keep_pos : str, iterable of str, or None, default ("ADV",)
        Part-of-speech tags to keep. The default keeps adverbs only, which is
        exactly what the previously hard-coded filter did. Pass ``None`` to
        disable the part-of-speech filter and keep every token that passes
        the other checks.

    Returns
    -------
    list of str
        Lemmas of the surviving tokens, in document order.
    '''
    # A bare string such as "NOUN" would otherwise be treated as a set of
    # single characters, so wrap it.
    if isinstance(keep_pos, str):
        keep_pos = (keep_pos,)
    allowed_pos = None if keep_pos is None else frozenset(keep_pos)

    return [
        token.lemma_
        for token in nlp(text)
        if len(token) >= 2
        and not token.is_stop
        and not token.like_num
        and not token.is_punct
        and (allowed_pos is None or token.pos_ in allowed_pos)
    ]

In [51]:
# Show the description stored under index label 0 (single label-based lookup).
df.loc[0, 'description']

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [52]:
# Run the tokenizer on the sample description shown above as a sanity check.
x = custom_tokenizer(df['description'][0])

In [53]:
# Only tokens tagged as adverbs pass custom_tokenizer's part-of-speech check,
# so very few lemmas survive from the sample sentence.
x

['overly']

In [12]:
# The free-text tasting notes are the documents to vectorize.
corpus = df['description']

In [13]:
# Preview the corpus (pandas truncates the display of long Series).
corpus

0         Aromas include tropical fruit, broom, brimston...
1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
                                ...                        
129966    Notes of honeysuckle and cantaloupe sweeten th...
129967    Citation is given as much as a decade of bottl...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 107539, dtype: object

In [14]:
# Label column: the 'variety' recorded for each review.
y = df['variety']

In [15]:
# Number of reviews per variety, most frequent first.
y.value_counts()

Pinot Noir                       13272
Chardonnay                       11753
Cabernet Sauvignon                9472
Red Blend                         8946
Bordeaux-style Red Blend          6915
Riesling                          5189
Sauvignon Blanc                   4967
Syrah                             4142
Rosé                              3564
Merlot                            3102
Nebbiolo                          2804
Zinfandel                         2714
Sangiovese                        2707
Malbec                            2652
Portuguese Red                    2466
White Blend                       2360
Tempranillo                       1810
Rhône-style Red Blend             1471
Pinot Gris                        1455
Cabernet Franc                    1353
Grüner Veltliner                  1345
Portuguese White                  1159
Bordeaux-style White Blend        1066
Pinot Grigio                      1052
Gamay                             1025
Gewürztraminer           

In [16]:
# Bag-of-words vectorizer driven by the spaCy-based custom_tokenizer.
# token_pattern=None: the regex pattern is ignored whenever a custom tokenizer
# is supplied, and leaving it at its default makes scikit-learn warn on fit
# that the parameter will not be used.
# min_df / max_df are given as proportions of documents: terms found in fewer
# than 1% or more than 99% of the descriptions are left out of the vocabulary.
bow = CountVectorizer(tokenizer=custom_tokenizer,
                      token_pattern=None,
                      ngram_range=(1, 1),
                      min_df=0.01,
                      max_df=0.99)

In [17]:
# Learn the vocabulary from every description (runs the tokenizer over the
# whole corpus, so this is likely the slow step).
# NOTE(review): the tokenizer cell carries a later execution count (In [50])
# than this fit (In [17]), and the vocabulary listed further down contains
# nouns and adjectives that an adverb-only tokenizer could not produce, so this
# fit appears to have used an earlier version of custom_tokenizer. Re-run the
# notebook top to bottom to confirm the results.
bags_fit = bow.fit(corpus)



In [18]:
# Encode every description as term counts over the fitted vocabulary.
bags_transform = bags_fit.transform(corpus)

In [19]:
# Inspect the document-term matrix: one row per description, one column per
# vocabulary term.
bags_transform

<107539x400 sparse matrix of type '<class 'numpy.int64'>'
	with 1751389 stored elements in Compressed Sparse Row format>

In [26]:
# Column-wise sum: total count of each vocabulary term across all documents.
# The result is two-dimensional with a single row, which is why it is indexed
# as sum_words[0, idx] when the frequencies are collected.
sum_words = bags_transform.sum(axis=0)

In [27]:
# Pair each vocabulary term with its corpus-wide count; vocabulary_ maps a
# term to its column index in the document-term matrix.
words_freq = [
    (term, sum_words[0, column])
    for term, column in bags_fit.vocabulary_.items()
]

In [28]:
# Order the (term, count) pairs from most to least frequent without touching
# the original list.
words_freq_sort = list(words_freq)
words_freq_sort.sort(key=lambda pair: pair[1], reverse=True)

In [29]:
# Display the terms ranked by total count (prints the whole vocabulary).
words_freq_sort

[('wine', 72341),
 ('flavor', 60805),
 ('fruit', 55001),
 ('finish', 33822),
 ('palate', 31324),
 ('cherry', 30096),
 ('acidity', 29778),
 ('tannin', 29396),
 ('drink', 28534),
 ('aroma', 27838),
 ('black', 25600),
 ('ripe', 24126),
 ('dry', 23228),
 ('note', 20798),
 ('spice', 20499),
 ('red', 19156),
 ('berry', 16717),
 ('rich', 16133),
 ('oak', 15936),
 ('fresh', 14315),
 ('nose', 14118),
 ('plum', 14038),
 ('show', 13371),
 ('blackberry', 13084),
 ('blend', 12907),
 ('offer', 12481),
 ('apple', 12352),
 ('soft', 12317),
 ('texture', 11990),
 ('light', 11746),
 ('sweet', 11704),
 ('good', 11220),
 ('dark', 11064),
 ('crisp', 10757),
 ('age', 10551),
 ('cabernet', 10189),
 ('raspberry', 9886),
 ('bodied', 9840),
 ('vanilla', 9687),
 ('hint', 9401),
 ('white', 9339),
 ('balance', 9198),
 ('herb', 9083),
 ('bright', 9020),
 ('touch', 9012),
 ('pepper', 8996),
 ('year', 8840),
 ('citrus', 8783),
 ('structure', 8484),
 ('firm', 8465),
 ('fruity', 8441),
 ('green', 8286),
 ('juicy', 8208)