In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Load spaCy's small English pipeline with the parser, textcat and ner
# components switched off. custom_tokenizer only reads per-token flags and
# `lemma_`, so those components are presumably disabled for speed.
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [3]:
# Read the Merlot reviews; the first CSV column becomes the row index.
# The path is relative to the notebook's working directory.
df = pd.read_csv('dfs_variety/Merlot.csv', index_col = 0)

In [4]:
# List the columns available in the loaded frame.
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'type', 'sparkling'],
      dtype='object')

In [5]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,type,sparkling
23,US,This wine from the Geneseo district offers aro...,Signature Selection,87,22.0,California,Paso Robles,Central Coast,Matt Kettmann,@mattkettmann,Bianchi 2011 Signature Selection Merlot (Paso ...,Merlot,Bianchi,red,no
44,Chile,A berry aroma comes with cola and herb notes. ...,Merlot,86,9.0,Maule Valley,,,Michael Schachner,@wineschach,Sundance 2011 Merlot (Maule Valley),Merlot,Sundance,red,no
87,US,"Softened tannins surround a light-bodied, lean...",Blau Vineyards,86,55.0,California,Knights Valley,Sonoma,Virginie Boone,@vboone,Passaggio 2014 Blau Vineyards Merlot (Knights ...,Merlot,Passaggio,red,no
168,US,"A fairly elegant expression of the variety, th...",Rector Creek Vineyard,91,95.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Duckhorn 2012 Rector Creek Vineyard Merlot (Na...,Merlot,Duckhorn,red,no
188,Chile,Bisquertt usually does well with its value-pri...,Casa La Joya Reserve,88,11.0,Colchagua Valley,,,Michael Schachner,@wineschach,Viña Bisquertt 2007 Casa La Joya Reserve Merlo...,Merlot,Viña Bisquertt,red,no


In [6]:
def custom_tokenizer(text):
    '''
    Run ``text`` through the module-level spaCy pipeline ``nlp`` and return
    the lemmas of the tokens that survive filtering.

    A token is kept only if it has at least two characters, is purely
    alphabetic, and is neither a stop word, number-like, nor punctuation.
    '''
    return [
        tok.lemma_
        for tok in nlp(text)
        if len(tok) >= 2
        and not tok.is_stop
        and not tok.like_num
        and not tok.is_punct
        and tok.is_alpha
    ]

In [7]:
# The free-text tasting notes are the documents that get vectorized.
corpus = df['description']

In [8]:
# Show the description Series (pandas truncates the display).
corpus

23        This wine from the Geneseo district offers aro...
44        A berry aroma comes with cola and herb notes. ...
87        Softened tannins surround a light-bodied, lean...
168       A fairly elegant expression of the variety, th...
188       Bisquertt usually does well with its value-pri...
                                ...                        
129840    The entry is oaky and the fruit light; it seem...
129846    Simple but meaty, this 100% Merlot has substan...
129847    Rich, soft and lush, if a little obvious, this...
129857    Dusty tannins make for a soft texture in this ...
129900    This wine offers a delightful bouquet of black...
Name: description, Length: 3102, dtype: object

In [9]:
# Target label: the wine title. value_counts() on this column reports 2889
# distinct titles over 3102 rows, so nearly every row is its own class.
y = df['title']

In [10]:
# Count how many reviews each title has.
y.value_counts()

Twin Arches 2010 Merlot (California)                           2
Ballentine 2014 Estate Grown Merlot (Napa Valley)              2
Rexford 2009 Merlot (Santa Cruz Mountains)                     2
Gnarly Head 2014 Merlot (California)                           2
The Williamsburg Winery 2006 Adagio Merlot (Virginia)          2
                                                              ..
Three Rivers 2011 Merlot (Columbia Valley (WA))                1
Abeja 2011 Merlot (Columbia Valley (WA))                       1
Frog's Leap 2012 Merlot (Rutherford)                           1
Terre de Vignerons 2016 Le Phare Merlot Merlot (Atlantique)    1
Castle Rock 2006 Merlot (Columbia Valley (WA))                 1
Name: title, Length: 2889, dtype: int64

In [11]:
# Bag-of-words vectorizer driven by custom_tokenizer (unigrams only).
# token_pattern=None: the default regex is never applied once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn about the
# unused parameter on fit.
# min_df / max_df are document-frequency proportions: terms found in fewer than
# 1% or more than 99% of the documents are dropped from the vocabulary.
bow = CountVectorizer(tokenizer=custom_tokenizer,
                      token_pattern=None,
                      ngram_range=(1, 1),
                      min_df=0.01,
                      max_df=0.99)

In [12]:
# Hold out 40% of the descriptions for evaluation. random_state pins the
# shuffle so the split, and every number computed from it, is identical on
# each Restart & Run All; without it each run draws a different split.
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y,
                                                              test_size=0.4,
                                                              train_size=0.6,
                                                              random_state=42)

In [13]:
# Learn the vocabulary from the training descriptions only and build their
# term-count matrix.
X_train = bow.fit_transform(corpus_train)

In [14]:
# (number of training documents, number of vocabulary terms kept)
X_train.shape

(1861, 364)

In [15]:
# Vectorize the held-out descriptions with the vocabulary fitted on the
# training split (transform only, no refit).
X_test = bow.transform(corpus_test)

In [16]:
# TF-IDF reweighting followed by Bernoulli naive Bayes.
# BernoulliNB's default binarize=0.0 thresholds every feature to 0/1, so the
# TF-IDF weights only act as "term present / absent" indicators in this model.
m_nb = Pipeline(steps=[
    ('TfIdf', TfidfTransformer()),
    ('NB', BernoulliNB()),
])

In [17]:
# Inspect the training labels (wine titles).
y_train

24899                    Pennywise 2010 Merlot (California)
11697        Santa Alba 2013 Reserve Merlot (Curicó Valley)
105359    Viña Casablanca 2014 Cefiro Reserva Merlot (Ma...
79555     Mount Pleasant Winery NV Estates Merlot (America)
24867                    Illustrious 2011 Merlot (Carneros)
                                ...                        
50196            Oak Grove 2008 Reserve Merlot (California)
92058             HRM Rex Goliath NV Merlot (Central Coast)
108873    Milbrandt 2012 The Estates Merlot (Wahluke Slope)
22462     Carmel 2009 Sha'al Single Vineyard Merlot (Gal...
40203     Carmel 2006 Appellation Kosher Merlot (Upper G...
Name: title, Length: 1861, dtype: object

In [18]:
# Fit the TF-IDF + Bernoulli NB pipeline on the training count matrix.
m_nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('TfIdf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('NB',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

In [19]:
# Predicted title for each held-out description.
m_nb.predict(X_test)

array(["Frog's Leap 2010 Merlot (Rutherford)",
       'Folie à Deux 2008 Merlot (Napa Valley)',
       "Gualdo del Re 2007 I'Rennero Merlot (Val di Cornia Suvereto)",
       ..., 'Bandit NV Merlot (California)',
       'Hightower 2013 Merlot (Red Mountain)',
       'Midnight 2010 Estate Merlot (Paso Robles)'], dtype='<U100')

In [20]:
# Share of held-out descriptions mapped to their exact title. Titles are almost
# unique (2889 distinct values over 3102 rows), so most test titles never occur
# in training and this score is expected to sit near zero.
m_nb.score(X_test, y_test)

0.0

In [21]:
# Ask the Bernoulli NB pipeline for the single best-matching title for a
# free-text list of tasting descriptors.
m_nb.predict(bow.transform(['dry savory bitter smooth balanced polished earthy earth soil minerality graphite carmel toast spice oak oaky smokey toasty smoke cedar tea']))

array(['Bandit NV Merlot (California)'], dtype='<U100')

In [22]:
# Highest class probability the Bernoulli NB pipeline assigns to a short query.
m_nb.predict_proba(bow.transform(['cherry flavor nice oak accent'])).max()

0.20850200578748665

In [23]:
# Free-text descriptor query reused by the model comparisons below. It is a
# one-element list because bow.transform expects an iterable of documents.
# NOTE(review): 'carmel' may be a misspelling of 'caramel'; terms missing from
# the fitted vocabulary are ignored by bow.transform — confirm.
query = ['dry savory bitter smooth balanced polished earthy earth soil minerality graphite carmel toast spice oak oaky smokey toasty smoke cedar tea']

In [24]:
# One row per known title with the Bernoulli NB probability for `query`,
# rounded to 4 decimals.
df_pred = pd.DataFrame(
    {'probability': m_nb.predict_proba(bow.transform(query))[0].round(4)},
    index=m_nb.classes_,
)

In [25]:
# Five most probable titles. Rows with equal rounded probabilities have no
# guaranteed relative order under the default sort.
df_pred.sort_values(by='probability', ascending=False).head(5)

Unnamed: 0,probability
Bandit NV Merlot (California),0.2609
Kenwood 2009 Merlot (Sonoma County),0.209
Hightower 2013 Merlot (Red Mountain),0.209
Gualdo del Re 2007 I'Rennero Merlot (Val di Cornia Suvereto),0.0697
Happy Camper 2010 Merlot (California),0.0697


In [26]:
# Same TF-IDF front end, but with multinomial naive Bayes, which uses the
# weighted term values rather than just their presence.
m_nbm = Pipeline(steps=[
    ('TfIdf', TfidfTransformer()),
    ('NB', MultinomialNB()),
])

In [27]:
# Fit the TF-IDF + multinomial NB pipeline on the training count matrix.
m_nbm.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('TfIdf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('NB',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [28]:
# Predicted title for each held-out description (multinomial NB).
m_nbm.predict(X_test)

array(["Frog's Leap 2010 Merlot (Rutherford)",
       'Materra Cunat Family Vineyards 2010 Estate Grown Merlot (Oak Knoll District)',
       'Happy Camper 2010 Merlot (California)', ...,
       'Folie à Deux 2008 Merlot (Napa Valley)',
       'Happy Camper 2010 Merlot (California)',
       'Viña Casablanca 2014 Nimbus Single Vineyard Merlot (Casablanca Valley)'],
      dtype='<U100')

In [29]:
# Exact-title accuracy on the held-out split. With 2889 distinct titles over
# 3102 rows, a test title is rarely present in training, so values close to
# zero are expected here.
m_nbm.score(X_test, y_test)

0.0008058017727639

In [30]:
# Best-matching title for `query` according to the multinomial NB pipeline.
m_nbm.predict(bow.transform(query))

array(['Dieu Donne 2000 Merlot (Franschhoek)'], dtype='<U100')

In [31]:
# Highest class probability the multinomial NB pipeline assigns to `query`.
m_nbm.predict_proba(bow.transform(query)).max()

0.0014049587048620045

In [34]:
# One row per known title with the multinomial NB probability for `query`,
# rounded to 4 decimals.
df_pred = pd.DataFrame(
    {'probability': m_nbm.predict_proba(bow.transform(query)).ravel().round(4)},
    index=m_nbm.classes_,
)

In [35]:
# Five most probable titles. Rows with equal rounded probabilities have no
# guaranteed relative order under the default sort.
df_pred.sort_values(by='probability', ascending=False).head(5)

Unnamed: 0,probability
Kenwood 2009 Merlot (Sonoma County),0.0014
Dieu Donne 2000 Merlot (Franschhoek),0.0014
Spellbound 2010 Merlot (California),0.0014
Decoy 2014 Merlot (Sonoma County),0.0013
Rancho Sisquoc 2013 Merlot (Santa Barbara County),0.0013


In [36]:
# TF-IDF features fed to logistic regression. class_weight='balanced' weights
# each class inversely to its frequency in the training labels.
m = Pipeline(steps=[
    ('TfIdf', TfidfTransformer()),
    ('LogReg', LogisticRegression(class_weight='balanced')),
])

In [37]:
# Fit the TF-IDF + logistic regression pipeline on the training count matrix.
m.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('TfIdf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('LogReg',
                 LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [38]:
# Predicted title for each held-out description (logistic regression).
m.predict(X_test)

array(['Nickel & Nickel 2010 Suscol Ranch Merlot (Napa Valley)',
       'Bogle 2009 Merlot (California)', 'Hahn 2006 Merlot (Monterey)',
       ..., 'Robert Mondavi 2007 Merlot (Napa Valley)',
       'Woodbridge by Robert Mondavi 2010 Merlot (California)',
       'Bulgariana 2012 Merlot (Thracian Valley)'], dtype=object)

In [39]:
# Exact-title accuracy on the held-out split. Because titles are almost unique
# (2889 distinct values over 3102 rows), a correct prediction is only possible
# for the few titles that appear in both splits.
m.score(X_test, y_test)

0.08460918614020951

In [40]:
# One row per known title with the logistic-regression probability for
# `query`, rounded to 4 decimals.
df_pred = pd.Series(
    m.predict_proba(bow.transform(query))[0].round(4),
    index=m.classes_,
    name='probability',
).to_frame()

In [41]:
# Five most probable titles. Rows with equal rounded probabilities have no
# guaranteed relative order under the default sort.
df_pred.sort_values(by='probability', ascending=False).head(5)

Unnamed: 0,probability
Clos du Val 2012 Merlot (Napa Valley),0.0007
Dieu Donne 2000 Merlot (Franschhoek),0.0007
Montpezat 2005 Les Enclos Prestige Merlot (Vin de Pays d'Oc),0.0007
Cycles Gladiator 2014 Merlot (Central Coast),0.0007
WildHaven 2010 Reserve Merlot (Washington),0.0007
