In [1]:
import pickle
import spacy
import sys
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
sys.path.append('../../../')

In [3]:
from utils.wine_functions import custom_tokenizer

In [4]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [5]:
df = pd.read_csv('../../../data/df_white.csv', index_col = 0)

In [6]:
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'type'],
      dtype='object')

In [7]:
df.shape

(32945, 14)

In [8]:
df

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,type
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,white
2,US,"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,white
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,white
7,France,This dry and restrained wine offers spice in p...,Gewürztraminer,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,white
8,Germany,Savory dried thyme notes accent sunnier flavor...,Shine,87,12.0,Rheinhessen,,,Anna Lee C. Iijima,,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129965,France,"While it's rich, this beautiful dry wine also ...",Seppi Landmann Vallée Noble,90,28.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,Pinot Gris,Domaine Rieflé-Landmann,white
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef),white
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser,white
129969,France,"A dry style of Pinot Gris, this is crisp with ...",Pinot Gris,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss,white


In [9]:
#def custom_tokenizer(text):
#    '''
#    used to filter out unwanted words, punctuation, and so on
#    '''
#    tokens = []
#    for t in nlp(text):
#        if not(len(t) < 2 or t.is_stop or t.like_num or 
#               t.is_punct or not t.is_alpha):
#            tokens.append(t.lemma_)
#    return tokens 

In [10]:
corpus = df['description']

In [11]:
corpus

0         Aromas include tropical fruit, broom, brimston...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
7         This dry and restrained wine offers spice in p...
8         Savory dried thyme notes accent sunnier flavor...
                                ...                        
129965    While it's rich, this beautiful dry wine also ...
129966    Notes of honeysuckle and cantaloupe sweeten th...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 32945, dtype: object

In [12]:
y = df['variety']

In [13]:
y.value_counts()

Chardonnay                    11753
Riesling                       5189
Sauvignon Blanc                4967
White Blend                    2360
Pinot Gris                     1455
Grüner Veltliner               1345
Portuguese White               1159
Bordeaux-style White Blend     1066
Pinot Grigio                   1052
Gewürztraminer                 1012
Viognier                        996
Chenin Blanc                    591
Name: variety, dtype: int64

In [14]:
bow = CountVectorizer(tokenizer=custom_tokenizer, 
                      ngram_range=(1, 1), 
                      min_df=0.01, 
                      max_df=0.99)

In [15]:
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, 
                                                              test_size=0.4, 
                                                              train_size=0.6)

In [16]:
X_train = bow.fit_transform(corpus_train)

In [17]:
X_train.shape

(19767, 362)

In [18]:
X_test = bow.transform(corpus_test)

In [19]:
m_nb = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('GBC', GradientBoostingClassifier())
])

In [20]:
cross_val_score(m_nb, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()

0.661253653082087

In [21]:
m_nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('TfIdf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('GBC',
                 GradientBoostingClassifier(ccp_alpha=0.0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100,
                              

In [22]:
pred = m_nb.predict(X_test)

In [23]:
metrics.accuracy_score(y_test, m_nb.predict(X_test))

0.6654272271968432

In [24]:
metrics.f1_score(y_test, pred, average='weighted')

0.6463498226312664

In [27]:
m_nb.predict(bow.transform(['cherry flavor nice oak accent']))

array(['White Blend'], dtype=object)

In [28]:
m_nb.predict_proba(bow.transform(['cherry flavor nice oak accent'])).max()

0.7236354469908594

In [30]:
with open('m_white.p', 'wb') as f:
    pickle.dump(m_nb, f)

In [29]:
with open('bow_white.p', 'wb') as f:
    pickle.dump(bow, f)