# Synopsis

#  Configuration

In [1]:
base_path = '/Users/rca2t/COURSES/DSI/DS5559/UVA_DSI_REPO'
local_lib = base_path + '/lib'
src_file = 'winereviews.csv'

# Set Hyperparameters
params = dict(
    qntile_B=.1,
    qntile_A=.9,
    n_sets=4,
    smooth_alpha=1,
    binary_counts=True
)

# Libraries

In [2]:
import pandas as pd
from numpy.random import randint
import sys; sys.path.append(local_lib)
import textman.textman as tx

# Process

## Import raw review data

In [3]:
try:
    docs = pd.read_csv('winereviews.csv', index_col='doc_id')
except FileNotFoundError as e:
    print(e)

In [4]:
docs.head()

Unnamed: 0_level_0,doc_content,points
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"Aromas include tropical fruit, broom, brimston...",87
1,"This is ripe and fruity, a wine that is smooth...",87
2,"Tart and snappy, the flavors of lime flesh and...",87
3,"Pineapple rind, lemon pith and orange blossom ...",87
4,"Much like the regular bottling from 2012, this...",87


## Clip DOC table by quantile

We only want reviews that are very good or very bad.

In [5]:
bound_A = int(docs.points.quantile(params['qntile_A']))
bound_B = int(docs.points.quantile(params['qntile_B']))
docs = docs[(docs.points <= bound_B) | (docs.points >= bound_A)].copy()

In [6]:
print('A:', bound_A, '\nB:', bound_B)

A: 93 
B: 84


## Convert DOC points feature to A and B labels

In [7]:
docs.loc[docs.points >= bound_A, 'doc_label'] = 'A'
docs.loc[docs.points <= bound_B, 'doc_label'] = 'B'

## TRAINING

### Split out training and test sets from DOC 

We randomly assign each doc a value from 0 to 9, and then select one group for testing, i.e. 10%.

In [8]:
docs['set'] = randint(0,params['n_sets'], len(docs.index))
training = docs.query('set != 0').copy()
testing = docs.query('set == 0').copy()

### Get TOKEN and VOCAB from training corpus

We use our text parsing library tokenize.

In [9]:
tokens, _ = tx.create_tokens_and_vocab(training, src_col='doc_content')

In [10]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token,term_str,term_id
doc_id,sent_id,token_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
336,0,0,Gritty,gritty,6427
336,0,1,heavily,heavily,6689
336,0,2,roasted,roasted,11733
336,0,3,aromas,aromas,972
336,0,5,peanuts,peanuts,10104


### Add sentiment label to TOKEN

We now transfer the doc label to the tokens, by transitive inheritance.

In [11]:
tokens = tokens.join(training.doc_label, on='doc_id')

### Create VOCAB from TOKEN

In [16]:
vocab = tokens.groupby('term_id').term_str.value_counts()\
    .to_frame().rename(columns={'term_str':'n'})
vocab = vocab.reset_index().set_index('term_id')

In [17]:
vocab

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
337,aaron,5
340,abbey,4
342,abbott,3
343,abbreviated,3
345,abeja,4
348,ability,30
349,able,13
353,abound,22
354,abounds,7
358,abrasive,30


### Adjust TOKENS

In [18]:
tokens = tokens.reset_index()[['doc_label', 'doc_id', 'term_id']]

In [19]:
tokens.sample(5)

Unnamed: 0,doc_label,doc_id,term_id
412831,B,120305,13063
371854,A,108462,3183
58699,B,16093,9136
361212,A,105183,11663
417802,B,121724,660


### Compute Priors

We compute the probability of each label in the corpus.

In [20]:
priors = tokens.groupby('doc_label').doc_id.count()
priors = priors / priors.sum()

### Compute Likelihoods

Now we compute the probability of a token given the label. This will in effect product two language models, one for each label. Key idea = **the likelihoods are language models** (see Pearl for interpretation of likelihoods).

In [21]:
likelihoods = tokens.groupby(['doc_label'])\
    .term_id.value_counts()\
    .to_frame().rename(columns={'term_id':'n'})\
    .reset_index()
likelihoods = likelihoods.set_index(['term_id','doc_label']).n.unstack().fillna(0)

In [22]:
likelihoods = (likelihoods + params['smooth_alpha'])\
    .div(likelihoods.sum() + (len(vocab.index) * params['smooth_alpha']))

## TESTING

### Get test corpus

Note that we replace the vocabulary IDs with the old.

In [24]:
test, _ = tx.create_tokens_and_vocab(testing, src_col='doc_content')
test['term_id'] = test.term_str.map(vocab.reset_index().set_index('term_str').term_id)
test = test.dropna()

### Convert corpus to BOW

In [25]:
test_docs = test.groupby(['doc_id','term_id']).term_id.count()\
    .unstack().apply(lambda x: x.dropna().index.astype('int').tolist(), 1)\
    .to_frame().rename(columns={0:'bow'})
test_docs['doc_label'] = testing.doc_label
if params['binary_counts']:
    # set() forces BOW to consist of only one token for each term
    test_docs['bow'] = test_docs.bow.apply(lambda x: set(x))

### Compute POSTERIOR and make prediction

In [26]:
posteriors = test_docs.bow.apply(lambda x: likelihoods.loc[x].product() * priors)
test_docs['prediction'] = posteriors.T.idxmax()

### Evaluation 

In [27]:
test_docs['result'] = test_docs.doc_label == test_docs.prediction
T, F = test_docs.result.value_counts()
grade = round(T/(T+F) * 100, 4)
CM = test_docs.reset_index().groupby(['doc_label','prediction']).doc_id.count().unstack().fillna(0)

### Results

In [28]:
print("______________________")
print("      RESULTS")
print("----------------------")
print('Grade:', grade)
print("----------------------")
print("Confusion matrix:")
print(CM)
print("______________________")

______________________
      RESULTS
----------------------
Grade: 96.0781
----------------------
Confusion matrix:
prediction     A     B
doc_label             
A           2986    42
B            195  2820
______________________
