# Blind Taste Wine


## 1. Import

In [1]:
import os

import warnings  
warnings.filterwarnings('ignore')

import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from scipy.sparse import hstack

# nltk packages
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
import unidecode

In [2]:
data = pd.read_csv('./input/winemag-data_first150k.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


## 2. Preprocessing

### 2.1 Drop useless features

In [3]:
data_sel = data.drop(['Unnamed: 0','designation','points','region_2',], axis = 1)

In [4]:
data_sel.shape

(150930, 7)

### 2.2 Remove duplicates, NAs
We do not want any duplication in the description column since that might falsely over emphasize certain features. Therefore, we start by dropping any description duplicates.

In [5]:
#data_single = data_sel.drop_duplicates('description')
data_single = data_sel.drop_duplicates('description',keep=False)
data_single.sort_values('description').head(5)

Unnamed: 0,country,description,price,province,region_1,variety,winery
96052,Austria,"""Chremisa,"" the ancient name of Krems, is comm...",24.0,Niederösterreich,,Grüner Veltliner,Winzer Krems
77587,Portugal,"""New moon on old vines” is the name of this po...",15.0,Douro,,Portuguese Red,Wines & Winemakers
83019,US,$10 for this very drinkable Cab? That's crazy....,10.0,California,North Coast,Cabernet Sauvignon,Line 39
32824,US,$14 is a pretty good price for a Chardonnay th...,14.0,California,California,Chardonnay,Jamieson Ranch
39937,US,). Certainly not lacking in richness. Shows a ...,42.0,California,Sonoma Coast,Syrah,Keller Estate


In [6]:
data_single.shape

(58537, 7)

In [7]:
data_single = data_single.dropna()

In [8]:
data_single.head()

Unnamed: 0,country,description,price,province,region_1,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",110.0,Northern Spain,Toro,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,90.0,California,Knights Valley,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",65.0,Oregon,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",66.0,Provence,Bandol,Provence red blend,Domaine de la Bégude


In [9]:
data_single.shape

(44901, 7)

### 2.3 Text format ( lower )

In [10]:
for col in ['variety', 'description', 'province', 'region_1', 'winery', 'country']:
    data_single[col] = data_single[col].str.lower()


In [11]:
def unidecode_text(text):
    try:
        #pdb.set_trace()
        text = unidecode.unidecode(text)
    except:
        pass
    return text


In [12]:
for col in ['description', 'variety', 'province', 'winery']:
    data_single[col] = data_single.apply(lambda row: unidecode_text(row[col]), axis=1)

### 2.4 Remove hard targets

In [13]:
data_single.variety.value_counts()

pinot noir                  4794
chardonnay                  4398
cabernet sauvignon          4267
red blend                   3214
bordeaux-style red blend    2152
                            ... 
madeira blend                  1
viognier-valdiguie             1
groppello                      1
macabeo-gewurztraminer         1
duras                          1
Name: variety, Length: 423, dtype: int64

Remove all blend types.

In [14]:
filtered_name = ['red blend', 'portuguese red', 'white blend', 'sparkling blend', 'champagne blend', 
                 'portuguese white', 'rose', 'bordeaux-style red blend', 'rhone-style red blend',
                 'bordeaux-style white blend', 'alsace white blend', 'austrian red blend',
                 'austrian white blend', 'cabernet blend', 'malbec blend', 'portuguese rose',
                 'portuguese sparkling', 'provence red blend', 'provence white blend',
                 'rhone-style white blend', 'tempranillo blend', 'grenache blend',
                 'meritage' # beaurdaux blend
                ]

In [15]:
data_filtered = data_single.copy()
data_filtered = data_filtered[~data_filtered['variety'].isin(filtered_name)]
data_filtered.variety.value_counts()

pinot noir            4794
chardonnay            4398
cabernet sauvignon    4267
syrah                 1817
merlot                1506
                      ... 
angevine                 1
albarossa                1
ansonica                 1
clairette                1
greco bianco             1
Name: variety, Length: 406, dtype: int64

Unify different words with same meannings.

In [16]:
def correct_grape_names(row):
    regexp = [r'shiraz', r'ugni blanc', r'cinsaut', r'carinyena', r'^ribolla$', r'palomino', r'turbiana', r'verdelho', r'viura', r'pinot bianco|weissburgunder', r'garganega|grecanico', r'moscatel', r'moscato', r'melon de bourgogne', r'trajadura|trincadeira', r'cannonau|garnacha', r'grauburgunder|pinot grigio', r'pinot noir|pinot nero', r'colorino', r'mataro|monastrell', r'mourv(\w+)']
    grapename = ['syrah', 'trebbiano', 'cinsault', 'carignan', 'ribolla gialla', 'palomino','verdicchio', 'verdejo','macabeo', 'pinot blanc', 'garganega', 'muscatel', 'muscat', 'muscadet', 'treixadura', 'grenache', 'pinot gris', 'pinot noir', 'lambrusco', 'mourvedre', 'mourvedre']
    f = row
    for exsearch, gname in zip(regexp, grapename):
        f = re.sub(exsearch, gname, f)
    return f

name_pairs = [('spatburgunder', 'pinot noir'), ('garnacha', 'grenache'), ('pinot nero', 'pinot noir'),
              ('alvarinho', 'albarino'), ('assyrtico', 'assyrtiko'), ('black muscat', 'muscat hamburg'),
              ('kekfrankos', 'blaufrankisch'), ('garnacha blanca', 'grenache blanc'),
              ('garnacha tintorera', 'alicante bouschet'), ('sangiovese grosso', 'sangiovese')
             ]

In [17]:
data_corrected = data_filtered.copy()
data_corrected['variety'] = data_corrected['variety'].apply(lambda row: correct_grape_names(row))
for start, end in name_pairs:
    data_corrected['variety'] = data_corrected['variety'].replace(start, end) 


In [18]:
data_corrected.variety.value_counts()

pinot noir              4845
chardonnay              4398
cabernet sauvignon      4267
syrah                   2370
merlot                  1506
                        ... 
merlot-petite verdot       1
chardonel                  1
muscat di noto             1
sauvignon musque           1
grignolino                 1
Name: variety, Length: 376, dtype: int64

Remove types which has not enough samples to train

In [19]:
data_reduced = data_corrected.groupby('variety').filter(lambda x: len(x) > 200)
data_reduced.variety.value_counts()

pinot noir                       4845
chardonnay                       4398
cabernet sauvignon               4267
syrah                            2370
merlot                           1506
sauvignon blanc                  1473
sangiovese                       1368
zinfandel                        1347
malbec                           1033
riesling                         1011
tempranillo                       927
pinot gris                        867
nebbiolo                          560
corvina, rondinella, molinara     530
viognier                          492
cabernet franc                    475
grenache                          397
gewurztraminer                    375
petite sirah                      353
muscat                            269
pinot blanc                       235
albarino                          228
Name: variety, dtype: int64

In [20]:
len(data_reduced.variety.value_counts())

22

In [21]:
grapes = list(np.unique(data_reduced.variety.value_counts().index.tolist()))

### 2.5 Add colour feature

In [22]:
colour_map = {'aglianico': 'red', 'albarino': 'white', 'barbera': 'red', 'cabernet franc': 'red',
              'cabernet sauvignon': 'red', 'carmenere': 'red', 'chardonnay': 'white', 'chenin blanc': 'white',
              'corvina, rondinella, molinara': 'red', 'gamay': 'red', 'garganega': 'white', 
              'gewurztraminer': 'white', 'glera': 'white', 'grenache': 'red', 'gruner veltliner': 'white',
              'malbec': 'red', 'merlot': 'red', 'mourvedre': 'red', 'muscat': 'white', 'nebbiolo': 'red',
              "nero d'avola": 'red', 'petite sirah': 'red', 'pinot blanc': 'white', 'pinot gris': 'white',
              'pinot noir': 'red', 'port': 'red', 'prosecco': 'white', 'riesling': 'white', 'sangiovese': 'red',
              'sauvignon blanc': 'white', 'syrah': 'red', 'tempranillo': 'red', 'torrontes': 'white', 
              'verdejo': 'white', 'viognier': 'white', 'zinfandel': 'white'
             }

We create two more columns with names *red* and *white*, and their values can be 1 if the wine has that colour or 0 if it does not. In *sklearn* this corresponds to *one hot incoding*, we transform categorical data into vectors.

In [23]:
final_input = data_reduced.copy()
final_input['colour'] = final_input.apply(lambda row: colour_map[row['variety']], axis=1)
colour_dummies = pd.get_dummies(final_input['colour'])
final_input = final_input.merge(colour_dummies, left_index=True, right_index=True)

In [24]:
final_input.reset_index(inplace=True)
final_input.head()

Unnamed: 0,index,country,description,price,province,region_1,variety,winery,colour,red,white
0,0,us,this tremendous 100% varietal wine hails from ...,235.0,california,napa valley,cabernet sauvignon,heitz,red,1,0
1,2,us,mac watson honors the memory of a wine once ma...,90.0,california,knights valley,sauvignon blanc,macauley,white,0,1
2,3,us,"this spent 20 months in 30% new french oak, an...",65.0,oregon,willamette valley,pinot noir,ponzi,red,1,0
3,8,us,this re-named vineyard was formerly bottled as...,65.0,oregon,chehalem mountains,pinot noir,bergstrom,red,1,0
4,9,us,the producer sources from two blocks of the vi...,60.0,california,sonoma coast,pinot noir,blue farm,red,1,0


## 3. Modify descriptions use NLTK

Stop words

These are words that we do not want to come accross in the text and do not want to analyse them. That is why we remove any occurance of the country, province and winery from the text. 

In [25]:
# stop words for countries
stop_country = list(np.unique(final_input.country.dropna().str.lower().tolist()))

#stop words for province
stop_province = list(np.unique(final_input.province.dropna().str.lower().tolist()))

#stop words for winery
stop_winery = list(np.unique(final_input.winery.dropna().str.lower().tolist()))

stop_words_nltk = stopwords.words('english')
stop_append = ['.', ',', '`', '"', "'", '!', ';', 'wine', 'fruit', '%', 'flavour', 'aromas', 'palate']
stop_words1 = stop_words_nltk + stop_append + grapes + stop_country + stop_province + stop_winery
#stop_words1 = stop_words_nltk + stop_append

Other steps

In [26]:
# list of word types (nouns and adjectives) to leave in the text
defTags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR']#, 'RB', 'RBS', 'RBR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# functions to determine the type of a word
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']

def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']

def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

# transform tag forms
def penn_to_wn(tag):
    if is_adjective(tag):
        return nltk.stem.wordnet.wordnet.ADJ
    elif is_noun(tag):
        return nltk.stem.wordnet.wordnet.NOUN
    elif is_adverb(tag):
        return nltk.stem.wordnet.wordnet.ADV
    elif is_verb(tag):
        return nltk.stem.wordnet.wordnet.VERB
    return nltk.stem.wordnet.wordnet.NOUN
    
# lemmatizer, tokenizer, stemming
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.stemmer = nltk.stem.SnowballStemmer('english') 
    def __call__(self, doc):
        # pattern for numbers | words of length=2 | punctuations | words of length=1
        pattern = re.compile(r'[0-9]+|\b[\w]{2,2}\b|[%.,_`!"&?\')({~@;:#}+-]+|\b[\w]{1,1}\b')
        # tokenize document
        doc_tok = word_tokenize(doc)
        #filter out patterns from words
        doc_tok = [x for x in doc_tok if x not in stop_words1]
        doc_tok = [pattern.sub('', x) for x in doc_tok]
        # get rid of anything with length=1
        doc_tok = [x for x in doc_tok if len(x) > 1]
        # position tagging
        doc_tagged = nltk.pos_tag(doc_tok)
        # selecting nouns and adjectives
        doc_tagged = [(t[0], t[1]) for t in doc_tagged if t[1] in defTags]
        # preparing lemmatization
        doc = [(t[0], penn_to_wn(t[1])) for t in doc_tagged]
        # lemmatization
        doc = [self.wnl.lemmatize(t[0], t[1]) for t in doc]
        # stemming
        doc = [self.stemmer.stem(x) for x in doc]
        return doc

Text vectorizer

In [27]:
vec_tdidf = TfidfVectorizer(ngram_range=(1,1), analyzer='word', #stop_words=stop_words1, 
                                               norm='l2', tokenizer=LemmaTokenizer())

In [28]:
# split the data into train and test
combined_features = ['description', 'white', 'red', 'price']
target = 'variety'

X_train, X_test, y_train, y_test = train_test_split(final_input[combined_features], final_input[target], 
                                                    test_size=0.33, random_state=42, stratify=final_input[target])

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(19648, 4) (9678, 4) (19648,) (9678,)


In [29]:
vect = CountVectorizer(stop_words = stop_words1)
X_train_dtm = vect.fit_transform(X_train.description)
price = X_train.price.values[:,None]
X_train_dtm = hstack((X_train_dtm, price))
X_train_dtm

<19648x14271 sparse matrix of type '<class 'numpy.float64'>'
	with 459400 stored elements in COOrdinate format>

In [30]:
X_test_dtm = vect.transform(X_test.description)
price_test = X_test.price.values[:,None]
X_test_dtm = hstack((X_test_dtm, price_test))
X_test_dtm

<9678x14271 sparse matrix of type '<class 'numpy.float64'>'
	with 223740 stored elements in COOrdinate format>

In [31]:
wine =final_input.variety.unique().tolist()

In [32]:
# from sklearn.model_selection import KFold, cross_val_score
# k_fold = KFold(n_splits=5)
# cross_val_score(clf, X_digits, y_digits, cv=k_fold, n_jobs=-1)

NameError: name 'clf' is not defined

Some help functions

In [33]:
class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        return self

    def transform(self, X):
        # returns the input as a string
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # returns the input as a dataframe
        return X[[self.key]]

In [34]:
def print_stats(preds, target, labels, sep='-', sep_len=40, fig_size=(10,8)):
    print('Accuracy = %.3f' % metrics.accuracy_score(target, preds))
    print(sep*sep_len)
    print('Classification report:')
    print(metrics.classification_report(target, preds))
    print(sep*sep_len)
    print('Confusion matrix')
    cm=metrics.confusion_matrix(target, preds)
    cm = cm / np.sum(cm, axis=1)[:,None]
    sns.set(rc={'figure.figsize':fig_size})
    sns.heatmap(cm, 
        xticklabels=labels,
        yticklabels=labels,
           annot=True, cmap = 'YlGnBu')
    plt.pause(0.05)

## 4. Algorithm

### 4.1 LogisticRegression

In [35]:
# Logistic Regression Classifier
clf = LogisticRegression()

In [36]:
models = {}
for z in wine:
    y = y_train == z
    clf.fit(X_train_dtm, y)
    models[z] = clf

testing_probs = pd.DataFrame(columns = wine)

In [37]:
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
    
predicted_wine = testing_probs.idxmax(axis=1)

comparison = pd.DataFrame({'actual':y_test.values, 'predicted':predicted_wine.values})   

metrics.accuracy_score(comparison.actual, comparison.predicted)

0.1454846042570779

### 4.2 SVM

In [38]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
clf = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

from sklearn import svm
clf = svm.SVC(probability=True)

In [None]:
models = {}
for z in wine:
    y = y_train == z
    clf.fit(X_train_dtm, y)
    models[z] = clf

testing_probs = pd.DataFrame(columns = wine)

In [None]:
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
    
predicted_wine = testing_probs.idxmax(axis=1)

comparison = pd.DataFrame({'actual':y_test.values, 'predicted':predicted_wine.values})   

metrics.accuracy_score(comparison.actual, comparison.predicted)

#### 4.2.1 Kernalize

In [None]:
clf = svm.SVC(probability=True, kernel='rbf')
# kernel='Radial basis function kernel'

In [None]:
models = {}
for z in wine:
    y = y_train == z
    clf.fit(X_train_dtm, y)
    models[z] = clf

testing_probs = pd.DataFrame(columns = wine)

In [None]:
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
    
predicted_wine = testing_probs.idxmax(axis=1)

comparison = pd.DataFrame({'actual':y_test.values, 'predicted':predicted_wine.values})   

metrics.accuracy_score(comparison.actual, comparison.predicted)

### 4.3 XGBoost

In [None]:
# XGBoost classifier
clf = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7)

In [None]:
text = Pipeline([
                ('selector', TextSelector(key='description')),
                ('vectorizer', vec_tdidf)
                ])
red = Pipeline([
                ('selector', NumberSelector(key='red')),
                ])
white = Pipeline([
                ('selector', NumberSelector(key='white')),
                ])
# To combine all feature, we use the *FeatureUnion* object. That makes sure there will not be any errors from combining text and number based inputs.
feats = FeatureUnion([('description', text),
                      ('red', red),
                      ('white', white)
                      ])
# Finally, we are ready to combine the input features with the classifier into one single pipeline .  
pipe = Pipeline([
                 ('feats', feats),
                 ('clf',clf)
                 ])

In [None]:
# Hyperparameter we can tuning
pipe.named_steps['clf'].get_params()

In [None]:
# definition of parameter grid to scan through
param_grid = {
     'clf__n_estimators': [50,100,300]
}

In [None]:
# grid search cross validation instantiation
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

In [None]:
#hyperparameter fitting
grid_search.fit(X_train, y_train)

In [None]:
grid_search.cv_results_['mean_train_score']

In [None]:
grid_search.cv_results_['mean_test_score']

In [None]:
# test stats
clf_test = grid_search.best_estimator_
preds = clf_test.predict(X_test)
print_stats(y_test, preds, clf_test.classes_)

### 4.4 Adaboost

In [None]:
# AdaBoost classifier
clf = AdaBoostClassifier(n_estimators=600, learning_rate=1)

In [None]:
pipe = Pipeline([
                 ('feats', feats),
                 ('clf',clf)
                 ])

In [None]:
param_grid = {
     'clf__n_estimators': [50,100,300]
}

In [None]:
# grid search cross validation instantiation
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

In [None]:
#hyperparameter fitting
grid_search.fit(X_train, y_train)

In [None]:
grid_search.cv_results_['mean_train_score']

In [None]:
grid_search.cv_results_['mean_test_score']

In [None]:
# test stats
clf_test = grid_search.best_estimator_
preds = clf_test.predict(X_test)
print_stats(y_test, preds, clf_test.classes_)

### 4.5 Random Forest

In [None]:
# Random Forest Classifier
clf = RandomForestClassifier(max_depth=40, random_state=42, n_estimators=600)

In [None]:
pipe = Pipeline([
                 ('feats', feats),
                 ('clf',clf)
                 ])

In [None]:
param_grid = {
     'clf__n_estimators': [50,100,300]
}

In [None]:
# grid search cross validation instantiation
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

In [None]:
#hyperparameter fitting
grid_search.fit(X_train, y_train)

In [None]:
grid_search.cv_results_['mean_train_score']

In [None]:
grid_search.cv_results_['mean_test_score']

In [None]:
# test stats
clf_test = grid_search.best_estimator_
preds = clf_test.predict(X_test)
print_stats(y_test, preds, clf_test.classes_)