# DS5559 Naive Bayes Wine Reviews v5 FINAL

## Synopsis

We attempt a Naive Bayes classifier for the sentiment of a corpus of wine reviews. We use the score of the review as our sentiment value, assuming that a bad review will have negative sentiment and a good review will have positive sentiment.

## Configuration

In [372]:
# Path of the CSV holding the wine reviews
src_file = 'winereviews.csv'


class Params:
    """Hyperparameters for the Naive Bayes experiment."""

    # Quantile of `points` at or above which a review is labelled positive
    qntile_P = 0.9
    # Quantile of `points` at or below which a review is labelled negative
    qntile_N = 0.1
    # Number of random buckets docs are dealt into (bucket 0 is the test set)
    n_sets = 4
    # Additive smoothing constant applied to the term/label counts
    smooth_alpha = 0.1
    # When True, duplicate (doc, term) rows are dropped before counting
    binary_mode = True


params = Params()

## Libraries

In [373]:
import pandas as pd
import numpy as np
from numpy import log2 as log
from numpy import exp2 as exp
from numpy.random import randint
import textman as tx

## Pragmas

In [374]:
%matplotlib inline

# Process

## PREPARATION

### Import wine reviews

In [375]:
# Load the reviews from the path set in the Configuration cell (`src_file`)
# instead of repeating the file name here; rows are indexed by `doc_id`.
docs = pd.read_csv(src_file, index_col='doc_id')

In [376]:
docs.head()

Unnamed: 0_level_0,doc_content,points
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"Aromas include tropical fruit, broom, brimston...",87
1,"This is ripe and fruity, a wine that is smooth...",87
2,"Tart and snappy, the flavors of lime flesh and...",87
3,"Pineapple rind, lemon pith and orange blossom ...",87
4,"Much like the regular bottling from 2012, this...",87


### Convert points to labels

#### Keep only reviews with high and low ratings

In [377]:
# Score thresholds: the configured upper/lower quantiles of `points`,
# truncated to whole points.
bound_P = int(docs['points'].quantile(params.qntile_P))
bound_N = int(docs['points'].quantile(params.qntile_N))

# Keep only the two tails. Take an explicit copy so that columns assigned to
# this frame later are written to an independent DataFrame rather than to a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
docs = docs[(docs.points <= bound_N) | (docs.points >= bound_P)].copy()

In [378]:
docs.sample(5)

Unnamed: 0_level_0,doc_content,points
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
91785,"The '04 is considerably better than the '03, p...",84
58887,"Soft and sweet, this Merlot has a rustic mouth...",84
79887,"This smells rather strange, with stewed fruit ...",83
56169,"Soft and simple with sugary peach, citrus and ...",82
19040,"A smooth, fruity wine, with fine red-cherry fl...",84


#### Assign labels for high and low

In [379]:
# Tag each review: 'P' at or above the upper bound, 'N' at or below the lower
# bound. 'N' is applied last, so it wins if the two bounds coincide.
for label, selected in (
    ('P', docs['points'] >= bound_P),
    ('N', docs['points'] <= bound_N),
):
    docs.loc[selected, 'doc_label'] = label

In [380]:
docs.head()

Unnamed: 0_level_0,doc_content,points,doc_label
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
336,"Gritty, heavily roasted aromas of peanuts and ...",83,N
337,"An easy and inviting selection, there's a ment...",83,N
338,The wine is earthy and somewhat rustic. There ...,82,N
339,"Red in color, with berry and apple aromas, thi...",82,N
340,"The nose is muted, despite the slight spritz o...",82,N


In [381]:
# The raw score is no longer needed once the label exists. Name the axis via
# `columns=`: passing it positionally (`drop('points', 1)`) was deprecated in
# pandas 1.3 and is rejected from pandas 2.0 on.
docs = docs.drop(columns='points')

In [382]:
docs.head()

Unnamed: 0_level_0,doc_content,doc_label
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
336,"Gritty, heavily roasted aromas of peanuts and ...",N
337,"An easy and inviting selection, there's a ment...",N
338,The wine is earthy and somewhat rustic. There ...,N
339,"Red in color, with berry and apple aromas, thi...",N
340,"The nose is muted, despite the slight spritz o...",N


### Divide docs into train and test sets

#### Assign random numbers to docs

In [383]:
# Deal every doc into one of `n_sets` buckets, numbered 0 .. n_sets - 1.
docs['set'] = randint(low=0, high=params.n_sets, size=len(docs))

In [384]:
docs.head()

Unnamed: 0_level_0,doc_content,doc_label,set
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
336,"Gritty, heavily roasted aromas of peanuts and ...",N,2
337,"An easy and inviting selection, there's a ment...",N,3
338,The wine is earthy and somewhat rustic. There ...,N,1
339,"Red in color, with berry and apple aromas, thi...",N,2
340,"The nose is muted, despite the slight spritz o...",N,1


#### Split docs by assigned number 

In [385]:
# Bucket 0 is held out for testing; every other bucket is used for training.
held_out = docs['set'].eq(0)
testing_docs = docs.loc[held_out].copy()
training_docs = docs.loc[~held_out].copy()

# The combined frame is not used again.
del docs

In [386]:
# The bucket number has served its purpose. Name the axis via `columns=`:
# the positional form `drop('set', 1)` was deprecated in pandas 1.3 and is
# rejected from pandas 2.0 on.
training_docs = training_docs.drop(columns='set')
testing_docs = testing_docs.drop(columns='set')

In [387]:
round(len(training_docs) / len(testing_docs), 2)

2.93

### Convert docs to tokens

In [388]:
# Tokenize the `doc_content` column of both corpora with the textman helper.
# Only the vocabulary built from the training docs is kept; the second value
# returned for the test docs is discarded (`_`).
# NOTE(review): create_tokens_and_vocab lives in `textman`, which is not shown
# here; presumably it returns a (tokens, vocab) pair of DataFrames -- confirm.
training_tokens, vocab = tx.create_tokens_and_vocab(training_docs, src_col='doc_content')
testing_tokens, _ = tx.create_tokens_and_vocab(testing_docs, src_col='doc_content')

In [389]:
training_tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token,term_str,term_id
doc_id,sent_id,token_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
336,0,0,Gritty,gritty,6461
336,0,1,heavily,heavily,6722
336,0,2,roasted,roasted,11750
336,0,3,aromas,aromas,995
336,0,5,peanuts,peanuts,10136


### Apply training vocab to testing -- CRUCIAL

In [390]:
# Re-key the test tokens against the *training* vocabulary (term_str -> term_id).
training_term_ids = vocab.reset_index().set_index('term_str')['term_id']

testing_tokens = (
    testing_tokens
    .assign(term_id=testing_tokens['term_str'].map(training_term_ids))
    # Rows with a missing value are removed; this includes every test token
    # whose term_str has no entry in the training vocabulary.
    .dropna()
    # The mapping leaves floats behind (because of the NaNs); restore ints.
    .astype({'term_id': 'int'})
)

In [391]:
testing_tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token,term_str,term_id
doc_id,sent_id,token_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
341,0,0,Plummy,plummy,10553
341,0,2,hinting,hinting,6839
341,0,4,black,black,1634
341,0,5,cherry,cherry,2722
341,0,7,wine,wine,15619


#### Simplify tokens table 

In [392]:
# Move the index levels into columns and keep only the (doc, term) pair.
token_columns = ['doc_id', 'term_id']
training_tokens = training_tokens.reset_index().loc[:, token_columns]
testing_tokens = testing_tokens.reset_index().loc[:, token_columns]

In [393]:
# In binary mode a term counts at most once per document, so repeated
# (doc_id, term_id) rows are collapsed in both token tables.
if params.binary_mode:
    training_tokens, testing_tokens = (
        tokens.drop_duplicates()
        for tokens in (training_tokens, testing_tokens)
    )

In [394]:
training_tokens.head()

Unnamed: 0,doc_id,term_id
0,336,6461
1,336,6722
2,336,11750
3,336,995
4,336,10136


### Transfer doc labels and splits to tokens -- CRUCIAL

In [395]:
# Tag every training token with the label of the document it came from
# (matched on doc_id). The test tokens are deliberately left unlabelled here.
doc_labels = training_docs[['doc_label']]
training_tokens = training_tokens.join(doc_labels, how='inner', on='doc_id')

In [396]:
training_tokens.head()

Unnamed: 0,doc_id,term_id,doc_label
0,336,6461,N
1,336,6722,N
2,336,11750,N
3,336,995,N
4,336,10136,N


## TRAINING

### Estimate class priors $p(c)$

$$
\hat{P}(c) = \dfrac{N_{c}}{N_{d}}
$$

In [397]:
# Prior of a class = share of training documents carrying that label.
Nd = len(training_docs)                        # all training documents
Nc = training_docs.doc_label.value_counts()    # training documents per label
class_priors = Nc.div(Nd)

In [398]:
np.round(class_priors, 2)

P    0.5
N    0.5
Name: doc_label, dtype: float64

Since these are literally $50/50$, we really don't need them.

#### Convert priors to logs

In [399]:
# Base-2 logarithm of the priors (the notebook's `log` alias is numpy's log2).
class_priors_log = np.log2(class_priors)

In [400]:
np.round(class_priors_log)

P   -1.0
N   -1.0
Name: doc_label, dtype: float64

### Estimate likelihoods $p(w|c)$

$$
\hat{P}(w_i|c) = \dfrac{count(w_i,c)}{\sum_{w \in V} count(w,c)}
$$

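$$
\hat{P}(w_i|c) = \dfrac{count(w_i,c)+\alpha}{\sum_{w \in V} (count(w,c)+\alpha)} = \dfrac{count(w_i,c)+\alpha}{(\sum_{w \in V} count(w,c))+\alpha|V|} 
$$

Here $\alpha$ is the smoothing constant (`params.smooth_alpha`); $\alpha = 1$ gives the familiar add-one (Laplace) smoothing.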



Now we compute the probability of a token given the label. This will in effect produce two language models, one for each label. Key idea = **the likelihoods are language models** (see Pearl for interpretation of likelihoods).

In [401]:
# Count the training-token rows for every (term, label) pair and lay them out
# as a term_id x doc_label table; pairs that never occur become 0.
class_likelihoods = (
    training_tokens
    .groupby(['term_id', 'doc_label'])['doc_label']
    .count()
    .unstack()
    .fillna(0)
)

In [402]:
# Additive smoothing: add alpha to every count so no term has zero mass.
smoothed_counts = class_likelihoods.add(params.smooth_alpha)

# Divide each label's column by its own total so it sums to 1.
class_likelihoods = smoothed_counts.div(smoothed_counts.sum(), axis='columns')

In [403]:
class_likelihoods.head()

doc_label,N,P
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
344,6.109508e-07,1.143083e-05
346,6.109508e-07,1.143083e-05
349,6.109508e-07,1.143083e-05
350,1.893947e-05,3.687366e-07
352,6.109508e-07,1.143083e-05


#### Convert likelihoods to logs

In [404]:
# Base-2 logarithm of the likelihoods (the notebook's `log` alias is numpy's
# log2), so per-token terms can be summed rather than multiplied.
class_likelihoods_log = np.log2(class_likelihoods)

In [405]:
class_likelihoods_log.head()

doc_label,N,P
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
344,-20.64244,-16.41671
346,-20.64244,-16.41671
349,-20.64244,-16.41671
350,-15.688244,-21.370906
352,-20.64244,-16.41671


## TESTING

### Add likelihood columns to test tokens table

This is effectively how we apply our model to the test set.

In [406]:
testing_tokens.head()

Unnamed: 0,doc_id,term_id
0,341,10553
1,341,6839
2,341,1634
3,341,2722
4,341,15619


In [407]:
# Attach each test token's log-likelihood under both classes, matched on
# term_id; the inner join drops tokens absent from the likelihood table.
log_likelihood_cols = class_likelihoods_log[['P', 'N']]
testing_tokens = testing_tokens.join(log_likelihood_cols, how='inner', on='term_id')

In [408]:
testing_tokens.sample(5)

Unnamed: 0,doc_id,term_id,P,N
71800,62780,5158,-12.291421,-20.64244
80219,73166,15710,-11.743372,-10.51187
17825,15264,9141,-9.888602,-8.76554
125853,112664,7303,-8.724572,-10.892571
138110,123074,7947,-8.83199,-9.997683


### Compute posteriors $p(c|w)$

$$
c_{NB} = \underset{c \in \{P, N\}}{\arg\max} \left[ \log{P(c)} + \sum_{id=1}^{id_{max}} \log{P(token_{id}|c)} \right]
$$




In [409]:
# Score every test document under both classes: sum its per-token
# log-likelihoods, then add the class log-prior (aligned on the P/N column
# labels). A vectorized groupby-sum replaces the former per-group Python
# lambda: same totals, much faster, and the grouping column is no longer
# passed through `apply`.
doc_log_scores = (
    testing_tokens
    .groupby('doc_id')[['P', 'N']]
    .sum()
    .add(class_priors_log, axis='columns')
)

# Predict the class with the larger log-score. The assignment aligns on doc_id,
# so a test doc with no rows left in `testing_tokens` gets a missing prediction.
testing_docs['prediction'] = doc_log_scores.idxmax(axis='columns')

In [410]:
testing_docs.head()

Unnamed: 0_level_0,doc_content,doc_label,prediction
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
341,"Plummy and hinting at black cherry, this wine ...",N,N
344,"Aromas of pumpkin, squash and corn chips are s...",N,N
348,Deep mahogany. Dried fig and black tea on the ...,P,P
356,"Dusty, firm, powerful: just a few apt descript...",P,P
363,This is an opulent wine from one of the fabled...,P,P


## EVALUATION

In [411]:
# True where the predicted label matches the actual label.
testing_docs['result'] = testing_docs['doc_label'].eq(testing_docs['prediction'])

### Show raw T & F count

In [412]:
# Tally of correct (True) versus incorrect (False) predictions.
raw = testing_docs['result'].value_counts()

In [413]:
raw

True     5953
False     217
Name: result, dtype: int64

In [414]:
raw[True] / raw[False]

27.433179723502302

### Create confusion matrix

In [415]:
# Confusion matrix: rows are predicted labels, columns are actual labels,
# cells count documents; combinations that never occur become 0.
CM = (
    testing_docs
    .reset_index()
    .groupby(['prediction', 'doc_label'])['doc_id']
    .count()
    .unstack()
    .fillna(0)
)

In [416]:
# Label the column axis so the printed matrix reads "actual" vs "prediction".
CM.columns = CM.columns.rename('actual')

In [417]:
CM

actual,N,P
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
N,2926,54
P,163,3027


In [472]:
def get_results(CM):
    """Derive standard binary-classification metrics from a confusion matrix.

    Parameters
    ----------
    CM : pandas.DataFrame
        2x2 confusion matrix read positionally with ``.iloc``: rows are
        predictions, columns are actual labels. The class in row 0 / column 0
        is scored as the "positive" class.

        NOTE(review): with the matrix built in this notebook the labels sort
        as N, P, so 'N' is the class scored as positive -- confirm that this
        is the intended reading of TPR/PPV/F1.

    Returns
    -------
    Results
        Object exposing TP, FP, TN, FN, T, F, ALL, ACC, TPR, TNR, PPV, BA and
        F1 as attributes, plus a ``show_results()`` method that prints a
        summary.

    Raises
    ------
    ValueError
        If ``CM`` is not 2x2 (for example when one class was never predicted),
        since the positional cell lookups below would be meaningless.
    """
    # Explicit check instead of the former `assert R.ALL == ...`, which read
    # the global `R` before it existed and failed with NameError on a fresh
    # kernel. For a 2x2 matrix the four cells always sum to the grand total.
    if CM.shape != (2, 2):
        raise ValueError(
            'get_results expects a 2x2 confusion matrix, got shape %s'
            % (CM.shape,)
        )

    class Results():
        TP = CM.iloc[0, 0]  # hits
        FP = CM.iloc[0, 1]  # Type I errors; false alarms
        TN = CM.iloc[1, 1]  # correct rejections
        FN = CM.iloc[1, 0]  # Type II errors; misses
        T = TP + TN  # all correct predictions
        F = FP + FN  # all incorrect predictions
        ALL = T + F
        ACC = T / ALL  # Accuracy
        TPR = TP / (TP + FN)  # Recall, Sensitivity
        TNR = TN / (TN + FP)  # Specificity
        PPV = TP / (TP + FP)  # Precision; Positive predictive value
        BA = (TNR + TPR) / 2  # Balanced Accuracy
        F1 = (2 * TP) / (2 * TP + FP + FN)  # F1 score (F-beta with beta = 1)

        def show_results(self):
            """Print the headline metrics rounded to two decimals."""
            print('TPR:', round(self.TPR, 2), '(sensitivity)')
            print('TNR:', round(self.TNR, 2), '(specificity)')
            print('F1: ', round(self.F1, 2), '<-- GRADE')
            print('-'*9)
            print('PPV:', round(self.PPV, 2),  '(precision)')
            print('ACC:', round(self.ACC, 2), '(accuracy)')

    return Results()

In [473]:
R = get_results(CM)

In [474]:
R.show_results()

TPR: 0.95 (sensitivity)
TNR: 0.98 (specificity)
F1:  0.96 <-- GRADE
---------
PPV: 0.98 (precision)
ACC: 0.96 (accuracy)
