In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Load the small English spaCy pipeline with the parser, textcat and NER
# components disabled; the tokenizer defined in this notebook only reads
# per-token flags and lemmas.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "textcat", "ner"],
)

In [11]:
# Load the Merlot-only review table; the first CSV column becomes the index.
# NOTE(review): the `corpus` and `class_labels` outputs shown further down
# contain many varieties, so they were presumably produced from a different
# file -- confirm which dataset this notebook is meant to load before re-running.
df = pd.read_csv(
    '../data/dfs_variety/Merlot.csv',
    index_col=0,
)

In [12]:
# Show the column names of the loaded review table.
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'type', 'sparkling'],
      dtype='object')

In [13]:
# Display the loaded table (pandas truncates the middle rows in the rendering).
df

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,type,sparkling
23,US,This wine from the Geneseo district offers aro...,Signature Selection,87,22.0,California,Paso Robles,Central Coast,Matt Kettmann,@mattkettmann,Bianchi 2011 Signature Selection Merlot (Paso ...,Merlot,Bianchi,red,no
44,Chile,A berry aroma comes with cola and herb notes. ...,Merlot,86,9.0,Maule Valley,,,Michael Schachner,@wineschach,Sundance 2011 Merlot (Maule Valley),Merlot,Sundance,red,no
87,US,"Softened tannins surround a light-bodied, lean...",Blau Vineyards,86,55.0,California,Knights Valley,Sonoma,Virginie Boone,@vboone,Passaggio 2014 Blau Vineyards Merlot (Knights ...,Merlot,Passaggio,red,no
168,US,"A fairly elegant expression of the variety, th...",Rector Creek Vineyard,91,95.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Duckhorn 2012 Rector Creek Vineyard Merlot (Na...,Merlot,Duckhorn,red,no
188,Chile,Bisquertt usually does well with its value-pri...,Casa La Joya Reserve,88,11.0,Colchagua Valley,,,Michael Schachner,@wineschach,Viña Bisquertt 2007 Casa La Joya Reserve Merlo...,Merlot,Viña Bisquertt,red,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129840,US,The entry is oaky and the fruit light; it seem...,Chateau Puryear Vineyard,86,20.0,Washington,Rattlesnake Hills,Columbia Valley,Paul Gregutt,@paulgwine,Bonair 2006 Chateau Puryear Vineyard Merlot (R...,Merlot,Bonair,red,no
129846,US,"Simple but meaty, this 100% Merlot has substan...",Merlot,86,10.0,Washington,Rattlesnake Hills,Columbia Valley,Paul Gregutt,@paulgwine,Hyatt 2005 Merlot (Rattlesnake Hills),Merlot,Hyatt,red,no
129847,US,"Rich, soft and lush, if a little obvious, this...",Black Clover,86,48.0,California,Napa Valley,Napa,,,JAQK Cellars 2006 Black Clover Merlot (Napa Va...,Merlot,JAQK Cellars,red,no
129857,US,Dusty tannins make for a soft texture in this ...,Reserve,90,44.0,California,Carneros,Napa-Sonoma,Virginie Boone,@vboone,Ca' Momi 2013 Reserve Merlot (Carneros),Merlot,Ca' Momi,red,no


In [6]:
# The free-text tasting notes are the documents fed to the vectorizer.
corpus = df["description"]

In [7]:
# Preview the description Series.
corpus

0         Aromas include tropical fruit, broom, brimston...
1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
                                ...                        
129966    Notes of honeysuckle and cantaloupe sweeten th...
129967    Citation is given as much as a decade of bottl...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 107539, dtype: object

In [8]:
# The grape variety column, one value per review.
y = df["variety"]

In [9]:
# Materialise the variety column as a plain Python list, preserving row order.
class_labels = [label for label in y]

In [10]:
# Display the label list.
# NOTE(review): this prints one entry per review; consider showing a slice
# (e.g. the first few labels) to keep the notebook output readable.
class_labels

['White Blend',
 'Portuguese Red',
 'Pinot Gris',
 'Riesling',
 'Pinot Noir',
 'Gewürztraminer',
 'Gewürztraminer',
 'Pinot Gris',
 'Cabernet Sauvignon',
 'Gewürztraminer',
 'Cabernet Sauvignon',
 'Chardonnay',
 'Riesling',
 'Malbec',
 'Malbec',
 'Tempranillo Blend',
 'Red Blend',
 'Pinot Noir',
 'White Blend',
 'Merlot',
 'Pinot Noir',
 'White Blend',
 'Red Blend',
 'Chenin Blanc',
 'Gamay',
 'Red Blend',
 'White Blend',
 'Red Blend',
 'Sauvignon Blanc',
 'Pinot Noir',
 'Cabernet Sauvignon',
 'Pinot Noir',
 'Gamay',
 'Sauvignon Blanc',
 'Merlot',
 'Red Blend',
 'Riesling',
 'Sauvignon Blanc',
 'Gamay',
 'Red Blend',
 'Bordeaux-style White Blend',
 'Red Blend',
 'Chardonnay',
 'Chardonnay',
 'Pinot Noir',
 'Malbec',
 'Cabernet Sauvignon',
 'Sangiovese',
 'Cabernet Franc',
 'Sauvignon Blanc',
 'Chardonnay',
 'Chardonnay',
 'Bordeaux-style Red Blend',
 'Red Blend',
 'Chardonnay',
 'Cabernet Sauvignon',
 'Cabernet Sauvignon',
 'Petite Sirah',
 'Bordeaux-style Red Blend',
 'Riesling',
 'Ch

In [30]:
def custom_tokenizer(text):
    '''
    Tokenize ``text`` with the notebook-level spaCy pipeline ``nlp`` and
    return the lemmas of the tokens worth keeping.

    A token is kept only when it is at least two characters long, purely
    alphabetic, and is neither a stop word, number-like, nor punctuation.
    '''
    # De Morgan form of the exclusion test, evaluated in the same order.
    return [
        token.lemma_
        for token in nlp(text)
        if len(token) >= 2
        and not token.is_stop
        and not token.like_num
        and not token.is_punct
        and token.is_alpha
    ]

In [31]:
# Bag-of-words vectorizer over unigrams, tokenized by custom_tokenizer.
# Float document-frequency bounds are proportions of documents: terms present in
# fewer than 1% or more than 99% of documents are left out of the vocabulary.
bow = CountVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(1, 1),
    min_df=0.01,
    max_df=0.99,
)

In [16]:
# Disabled: fitting the vectorizer on the corpus. The resulting matrix is loaded
# from ../data/word_count_vector.p further down instead of being recomputed.
# NOTE(review): with this line disabled, `bow` itself is never fitted in this notebook.
#word_count_vector = bow.fit_transform(corpus)

In [14]:
# Disabled: one-off write of the document-term matrix to the pickle cache
# that the next cell reads back.
#with open('../data/word_count_vector.p', 'wb') as f:
#    pickle.dump(word_count_vector, f)

In [14]:
# Reload the cached document-term matrix instead of re-fitting the vectorizer.
# pickle executes arbitrary code on load -- only read cache files produced locally.
with open('../data/word_count_vector.p', mode='rb') as cache_file:
    word_count_vector = pickle.load(cache_file)

In [15]:
# Shape of the cached matrix -- presumably (documents, vocabulary terms).
# NOTE(review): the row count shown below differs from the length of the `corpus`
# output earlier in the notebook; confirm the cache matches the current data.
word_count_vector.shape

(111797, 392)

In [16]:
# Learn inverse-document-frequency weights (with smoothing) from the raw counts.
tfidf_transformer = TfidfTransformer(
    use_idf=True,
    smooth_idf=True,
)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [18]:
# Build a table with the IDF weight learned for every vocabulary term.
# NOTE(review): `bow` must already be fitted for its vocabulary to exist. The
# `bow.fit_transform(corpus)` cell is commented out in this notebook, so on a
# fresh kernel this lookup fails -- persist and reload the fitted vectorizer too.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back to the old name on older installs.
get_vocabulary = getattr(bow, "get_feature_names_out", None) or bow.get_feature_names
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=get_vocabulary(), columns=["idf_weights"])

# sort descending: rarest terms (highest IDF) first
df_idf.sort_values(by=['idf_weights'], ascending=False)

# IDF measures how rare a term is across the whole corpus:
# a low weight means the word appears in many documents (less distinctive),
# a high weight means it appears in few documents (more distinctive).

In [2]:
# Ascending order (the sort_values default): lowest IDF weights first.
df_idf.sort_values(by=['idf_weights'])

In [20]:
# The 50 terms with the highest IDF weights.
df_idf.sort_values(by=['idf_weights'], ascending = False).head(50)

Unnamed: 0,idf_weights
low,5.604258
hot,5.603365
racy,5.602473
petite,5.60069
menthol,5.592708
focus,5.592708
mild,5.587421
opulent,5.583913
winemaker,5.581289
feature,5.575194
