In [1]:
import pickle
import spacy
import sys
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Extend the import search path three directories up, presumably so that the
# `utils.wine_functions` import in the next cell resolves.
# NOTE(review): the path is relative, so this only works when the kernel's
# working directory is this notebook's folder.
sys.path.append('../../../')

In [3]:
from utils.wine_functions import custom_tokenizer

In [4]:
# Load spaCy's small English pipeline with the parser, textcat and ner
# components disabled.
# NOTE(review): among the cells visible in this notebook, `nlp` is only
# referenced by the commented-out tokenizer further down. Presumably
# utils.wine_functions loads its own pipeline -- confirm before removing.
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [5]:
# Read the red-wine review table; the CSV's first column becomes the index.
df = pd.read_csv(
    '../../../data/df_red.csv',
    index_col=0,
)

In [6]:
# Column inventory of the loaded frame.
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'type'],
      dtype='object')

In [7]:
# (rows, columns) of the loaded frame.
df.shape

(71030, 14)

In [8]:
# Preview the frame; pandas elides the middle rows of a large frame in its repr.
df

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,type
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,red
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,red
10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,red
12,US,"Slightly reduced, this wine offers a chalky, t...",Cabernet Sauvignon,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini,red
16,Argentina,"Baked plum, molasses, balsamic vinegar and che...",Felix,87,30.0,Other,Cafayate,,Michael Schachner,@wineschach,Felix Lavaque 2010 Felix Malbec (Cafayate),Malbec,Felix Lavaque,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129957,Spain,Lightly baked berry aromas vie for attention w...,Crianza,90,17.0,Northern Spain,Rioja,,Michael Schachner,@wineschach,Viñedos Real Rubio 2010 Crianza (Rioja),Tempranillo Blend,Viñedos Real Rubio,red
129958,New Zealand,This blend of Cabernet Sauvignon-Merlot and Ca...,Irongate,90,35.0,Hawke's Bay,,,Joe Czerwinski,@JoeCz,Babich 2010 Irongate Red (Hawke's Bay),Bordeaux-style Red Blend,Babich,red
129960,Portugal,"Fresh and fruity, this is full of red cherry f...",Vértice,90,48.0,Douro,,,Roger Voss,@vossroger,Caves Transmontanas 2006 Vértice Pinot Noir (D...,Pinot Noir,Caves Transmontanas,red
129963,Israel,"A bouquet of black cherry, tart cranberry and ...",Oak Aged,90,20.0,Galilee,,,Mike DeSimone,@worldwineguys,Dalton 2012 Oak Aged Cabernet Sauvignon (Galilee),Cabernet Sauvignon,Dalton,red


In [9]:
# NOTE(review): disabled copy of the tokenizer. The active `custom_tokenizer`
# is imported from utils.wine_functions; whether that version matches the code
# below is not visible from this notebook -- confirm, then consider removing
# this cell.
#def custom_tokenizer(text):
#    '''
#    used to filter out unwanted words, punctuation, and so on
#    '''
#    tokens = []
#    for t in nlp(text):
#        if not(len(t) < 2 or t.is_stop or t.like_num or
#               t.is_punct or not t.is_alpha):
#            tokens.append(t.lemma_)
#    return tokens

In [10]:
# Model input: the free-text `description` column.
corpus = df['description']

In [11]:
# Preview the description texts.
corpus

1         This is ripe and fruity, a wine that is smooth...
4         Much like the regular bottling from 2012, this...
10        Soft, supple plum envelopes an oaky structure ...
12        Slightly reduced, this wine offers a chalky, t...
16        Baked plum, molasses, balsamic vinegar and che...
                                ...                        
129957    Lightly baked berry aromas vie for attention w...
129958    This blend of Cabernet Sauvignon-Merlot and Ca...
129960    Fresh and fruity, this is full of red cherry f...
129963    A bouquet of black cherry, tart cranberry and ...
129967    Citation is given as much as a decade of bottl...
Name: description, Length: 71030, dtype: object

In [12]:
# Classification target: the `variety` label of each review.
y = df['variety']

In [13]:
# Class balance: number of reviews per variety.
y.value_counts()

Pinot Noir                       13272
Cabernet Sauvignon                9472
Red Blend                         8946
Bordeaux-style Red Blend          6915
Syrah                             4142
Merlot                            3102
Nebbiolo                          2804
Zinfandel                         2714
Sangiovese                        2707
Malbec                            2652
Portuguese Red                    2466
Tempranillo                       1810
Rhône-style Red Blend             1471
Cabernet Franc                    1353
Gamay                             1025
Shiraz                             836
Petite Sirah                       770
Sangiovese Grosso                  751
Barbera                            721
Port                               668
Grenache                           651
Corvina, Rondinella, Molinara      619
Tempranillo Blend                  588
Carmenère                          575
Name: variety, dtype: int64

In [14]:
# Bag-of-words encoder over unigrams and bigrams, tokenised by the imported
# custom_tokenizer. Float document-frequency bounds are proportions: keep only
# terms that occur in at least 1% and at most 99% of the documents.
bow = CountVectorizer(
    min_df=0.01,
    max_df=0.99,
    ngram_range=(1, 2),
    tokenizer=custom_tokenizer,
)

In [15]:
# Hold out 40% of the reviews for testing.
# - random_state pins the shuffle so that Restart & Run All reproduces the
#   same split, and therefore the same scores and the same pickled model.
# - stratify=y keeps every variety's share identical in both partitions; the
#   class counts are heavily imbalanced, so an unstratified draw can skew the
#   rare varieties between train and test.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y,
    test_size=0.4,
    train_size=0.6,
    random_state=42,
    stratify=y,
)

In [16]:
# Learn the n-gram vocabulary from the training texts only and encode them as
# a document-term count matrix.
X_train = bow.fit_transform(corpus_train)

In [17]:
# (training documents, vocabulary terms kept by the vectorizer).
X_train.shape

(42618, 498)

In [18]:
# Encode the held-out texts with the vocabulary learned on the training split
# (transform only, no refit).
X_test = bow.transform(corpus_test)

In [19]:
# Two-stage classifier: re-weight the raw term counts with TF-IDF, then fit a
# multinomial Naive Bayes model on the weighted features.
nb_steps = [
    ('TfIdf', TfidfTransformer()),
    ('NB', MultinomialNB()),
]
m_nb = Pipeline(steps=nb_steps)

In [20]:
# Mean accuracy over a 4-fold cross-validation of the pipeline on the
# training split, run on 4 worker processes.
cv_scores = cross_val_score(
    estimator=m_nb,
    X=X_train,
    y=y_train,
    cv=4,
    scoring='accuracy',
    n_jobs=4,
)
cv_scores.mean()

0.48322303694106955

In [21]:
# Fit the pipeline on the whole training split; this is the model that is
# evaluated on the test split and pickled below.
m_nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('TfIdf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('NB',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [22]:
# Predicted variety label for every held-out review.
pred = m_nb.predict(X_test)

In [23]:
# Share of held-out reviews whose variety was predicted exactly.
metrics.accuracy_score(y_true=y_test, y_pred=pred)

0.4877516542306068

In [24]:
# Per-class F1 averaged with weights proportional to each class's support.
metrics.f1_score(
    y_true=y_test,
    y_pred=pred,
    average='weighted',
)

0.44202749781747636

In [25]:
# Smoke test: classify a single hand-written tasting note.
sample_features = bow.transform(['cherry flavor nice oak accent'])
m_nb.predict(sample_features)

array(['Cabernet Sauvignon'], dtype='<U29')

In [26]:
# Probability the model assigns to its top class for the same sample note.
sample_proba = m_nb.predict_proba(
    bow.transform(['cherry flavor nice oak accent'])
)
sample_proba.max()

0.24554251067453212

In [27]:
# Persist the fitted TF-IDF + Naive Bayes pipeline. It was trained on count
# vectors produced by `bow`, so the vectorizer pickle written in the next cell
# has to be shipped and loaded together with this file.
with open('m_red.p', 'wb') as f:
    pickle.dump(m_nb, f)

In [28]:
# Persist the fitted vectorizer. pickle stores the `custom_tokenizer` function
# by reference (module path + name), not by value, so utils.wine_functions
# must be importable wherever this file is unpickled.
with open('bow_red.p', 'wb') as f:
    pickle.dump(bow, f)