In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [33]:
from sklearn import metrics

In [2]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [3]:
df = pd.read_csv('winemag-data-130k-v2.csv', index_col = 0)

In [4]:
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [6]:
df.shape

(129971, 13)

In [7]:
df['description'][10:12]

10    Soft, supple plum envelopes an oaky structure ...
11    This is a dry wine, very spicy, with a tight, ...
Name: description, dtype: object

In [8]:
df['taster_name'].value_counts()

Roger Voss            25514
Michael Schachner     15134
Kerin O’Keefe         10776
Virginie Boone         9537
Paul Gregutt           9532
Matt Kettmann          6332
Joe Czerwinski         5147
Sean P. Sullivan       4966
Anna Lee C. Iijima     4415
Jim Gordon             4177
Anne Krebiehl MW       3685
Lauren Buzzeo          1835
Susan Kostrzewa        1085
Mike DeSimone           514
Jeff Jenssen            491
Alexander Peartree      415
Carrie Dykes            139
Fiona Adams              27
Christina Pickard         6
Name: taster_name, dtype: int64

In [9]:
df.loc[df['taster_name']=='Roger Voss'].head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam
11,France,"This is a dry wine, very spicy, with a tight, ...",,87,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Leon Beyer 2012 Gewurztraminer (Alsace),Gewürztraminer,Leon Beyer
30,France,Red cherry fruit comes laced with light tannin...,Nouveau,86,,Beaujolais,Beaujolais-Villages,,Roger Voss,@vossroger,Domaine de la Madone 2012 Nouveau (Beaujolais...,Gamay,Domaine de la Madone


In [10]:
df['variety'].value_counts()

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
                            ...  
Carcajolu                       1
Thrapsathiri                    1
Pignolo                         1
Diamond                         1
Kotsifali                       1
Name: variety, Length: 707, dtype: int64

In [11]:
df['variety'].value_counts() == 1

Pinot Noir                  False
Chardonnay                  False
Cabernet Sauvignon          False
Red Blend                   False
Bordeaux-style Red Blend    False
                            ...  
Carcajolu                    True
Thrapsathiri                 True
Pignolo                      True
Diamond                      True
Kotsifali                    True
Name: variety, Length: 707, dtype: bool

In [12]:
df['single_variety'] = df['variety'].value_counts()==1

In [13]:
df.variety.value_counts().loc[lambda x: x>5]

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
                            ...  
Rieslaner                       6
Tannat-Syrah                    6
Avesso                          6
Grenache-Mourvèdre              6
Roter Traminer                  6
Name: variety, Length: 367, dtype: int64

In [14]:
df['taster_name'].value_counts()

Roger Voss            25514
Michael Schachner     15134
Kerin O’Keefe         10776
Virginie Boone         9537
Paul Gregutt           9532
Matt Kettmann          6332
Joe Czerwinski         5147
Sean P. Sullivan       4966
Anna Lee C. Iijima     4415
Jim Gordon             4177
Anne Krebiehl MW       3685
Lauren Buzzeo          1835
Susan Kostrzewa        1085
Mike DeSimone           514
Jeff Jenssen            491
Alexander Peartree      415
Carrie Dykes            139
Fiona Adams              27
Christina Pickard         6
Name: taster_name, dtype: int64

In [15]:
df['province'].value_counts()

California                   36247
Washington                    8639
Bordeaux                      5941
Tuscany                       5897
Oregon                        5373
                             ...  
Dingač                           1
Monemvasia                       1
Hrvatsko Primorje                1
Middle and South Dalmatia        1
Piekenierskloof                  1
Name: province, Length: 425, dtype: int64

In [16]:
roger = df.loc[df['taster_name']=='Roger Voss']

In [17]:
roger['description']

1         This is ripe and fruity, a wine that is smooth...
7         This dry and restrained wine offers spice in p...
9         This has great depth of flavor with its fresh ...
11        This is a dry wine, very spicy, with a tight, ...
30        Red cherry fruit comes laced with light tannin...
                                ...                        
129964    Initially quite muted, this wine slowly develo...
129965    While it's rich, this beautiful dry wine also ...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 25514, dtype: object

In [18]:
def custom_tokenizer(text):
    '''
    used to filter out unwanted words, punctuation, and so on
    '''
    tokens = []
    for t in nlp(text):
        if not(len(t) < 2 or t.is_stop or t.like_num or 
               t.is_punct or not t.is_alpha):
            tokens.append(t.lemma_)
    return tokens 

In [19]:
df['description'][0]

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [20]:
x = custom_tokenizer(df['description'][0])

In [21]:
x

['aroma',
 'include',
 'tropical',
 'fruit',
 'broom',
 'brimstone',
 'dry',
 'herb',
 'palate',
 'overly',
 'expressive',
 'offer',
 'unripened',
 'apple',
 'citrus',
 'dry',
 'sage',
 'alongside',
 'brisk',
 'acidity']

In [22]:
corpus = df['description']

In [23]:
corpus

0         Aromas include tropical fruit, broom, brimston...
1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
                                ...                        
129966    Notes of honeysuckle and cantaloupe sweeten th...
129967    Citation is given as much as a decade of bottl...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 129971, dtype: object

In [24]:
y = df['variety']

In [25]:
y.value_counts()

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
                            ...  
Carcajolu                       1
Thrapsathiri                    1
Pignolo                         1
Diamond                         1
Kotsifali                       1
Name: variety, Length: 707, dtype: int64

In [28]:
bow = CountVectorizer(tokenizer=custom_tokenizer, 
                      ngram_range=(1, 1), 
                      min_df=0.01, 
                      max_df=0.99)

In [26]:
with open('bow.p', 'rb') as f:
    bow = pickle.load(f)

In [27]:
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, 
                                                              test_size=0.4, 
                                                              train_size=0.6)

In [28]:
X_train = bow.fit_transform(corpus_train)

In [44]:
X_train.shape

(77982, 393)

In [29]:
X_test = bow.transform(corpus_test)

In [30]:
m_nb = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', BernoulliNB())
])

In [31]:
cross_val_score(m_nb, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()



0.42597010383647926

In [36]:
m_nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('TfIdf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('NB',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

In [38]:
m_nb.predict(X_test)

array(['Pinot Noir', 'Cabernet Sauvignon', 'Bordeaux-style Red Blend',
       ..., 'Red Blend', 'Portuguese White', 'Chardonnay'], dtype='<U35')

In [40]:
#metrics.accuracy_score(y_test, m_nb.predict(X_test))

In [42]:
# get the NaiveBayes form the pipeline and return marginal probabilites P(X|Y=k)
feat_imp = m_nb['NB'].feature_log_prob_
feat_imp.shape

(637, 393)

In [43]:
feat_imp = pd.DataFrame(feat_imp, columns=bow.get_feature_names()).T
np.exp(feat_imp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,627,628,629,630,631,632,633,634,635,636
accent,0.25,0.024390,0.039024,0.333333,0.2,0.0625,0.333333,0.017857,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.034056,0.333333,0.014493,0.333333,0.333333
acid,0.25,0.097561,0.004878,0.333333,0.2,0.0625,0.333333,0.053571,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.020433,0.333333,0.014493,0.333333,0.333333
acidic,0.25,0.024390,0.004878,0.333333,0.2,0.0625,0.333333,0.028571,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.4,0.006811,0.666667,0.021739,0.333333,0.333333
acidity,0.50,0.121951,0.107317,0.333333,0.2,0.7500,0.333333,0.253571,0.25,0.25,...,0.25,0.333333,0.363636,0.500000,0.2,0.115789,0.333333,0.275362,0.333333,0.333333
add,0.25,0.048780,0.039024,0.333333,0.2,0.0625,0.333333,0.007143,0.25,0.25,...,0.25,0.333333,0.090909,0.071429,0.4,0.028483,0.333333,0.050725,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow,0.25,0.024390,0.004878,0.333333,0.2,0.6250,0.666667,0.035714,0.25,0.25,...,0.25,0.333333,0.318182,0.285714,0.2,0.000619,0.666667,0.007246,0.333333,0.333333
young,0.25,0.024390,0.034146,0.333333,0.2,0.0625,0.333333,0.010714,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.014241,0.333333,0.065217,0.333333,0.333333
z,0.25,0.024390,0.019512,0.333333,0.2,0.1250,0.333333,0.010714,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.005573,0.333333,0.007246,0.333333,0.333333
zest,0.25,0.024390,0.009756,0.333333,0.2,0.5000,0.333333,0.007143,0.25,0.25,...,0.25,0.333333,0.409091,0.071429,0.2,0.001858,0.333333,0.007246,0.333333,0.333333


In [58]:
feat_imp.sum(axis=1).sort_values() #find most common words

riesling     -1546.004733
sangiovese   -1543.282498
verdot       -1528.567210
cab          -1527.819434
malbec       -1515.469469
                 ...     
aroma         -695.482612
finish        -677.110674
fruit         -587.061196
flavor        -517.176112
wine          -478.741285
Length: 393, dtype: float64

In [46]:
np.exp(feat_imp[1].sort_values(ascending=False))

wine      0.682927
red       0.658537
flavor    0.609756
cherry    0.609756
nose      0.560976
            ...   
nicely    0.024390
noir      0.024390
nuance    0.024390
oak       0.024390
accent    0.024390
Name: 1, Length: 393, dtype: float64

In [51]:
m_nb.predict_proba(bow.transform(['cherry flavor nice oak accent'])).max()

0.3052741119504553

In [183]:
m_nbg = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', GaussianNB())
])

In [184]:
cross_val_score(m_nbg, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()



nan

In [185]:
m_nbm = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', MultinomialNB())
])

In [186]:
cross_val_score(m_nbm, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()



0.4104793436297703

In [214]:
m_nbc = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', CategoricalNB())
])

In [215]:
cross_val_score(m_nbc, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()



nan

In [224]:
m_rf = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('RFC', RandomForestClassifier())
])

In [None]:
cross_val_score(m_rf, X_train, y_train, scoring='accuracy',  cv=4).mean()



In [222]:
m_rfr = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('RFC', RandomForestRegressor())
])

In [223]:
cross_val_score(m_rfr, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()

nan

In [None]:
m = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('LogReg', LogisticRegression(class_weight='balanced'))
    ])

In [None]:
cross_val_score(m, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()