In [118]:
import pandas as pd
import numpy as np
from afinn import Afinn
import nltk
from nltk.corpus import sentiwordnet as swn
from normalization import normalize_accented_characters, html_parser, strip_html
from utils import display_evaluation_metrics, display_confusion_matrix, display_classification_report
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
import dynet as dy

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


## Movie review dataset

In [100]:
def prepare_movie_dataset(train_start,train_end,test_start,test_end) : 

    dataset = pd.read_csv(r'datasets/movie_reviews.csv')
    print('dataset size : ',dataset.shape[0])

    train_data = dataset[train_start:train_end]
    test_data = dataset[test_start:test_end]
    
    print('Train_X : ',train_data.shape[0])
    print('Test_X  : ',test_data.shape[0])

    test_reviews = np.array(test_data['review'])
    test_sentiments = np.array(test_data['sentiment'])

    return train_data,test_reviews,test_sentiments


## Evaluation for unsupervised Lexicon sentiment tagging

#### compare against the sentence tagging (already provided in the dataset )

In [85]:
### BORROWED FROM THE AR_SARKAR METRIC
def analyze_sentiment_sentiwordnet_lexicon(review,
                                           verbose=False):
    
    review = html_parser.unescape(review)
    review = strip_html(review)
    
    text_tokens = nltk.word_tokenize(review)
    tagged_text = nltk.pos_tag(text_tokens)
    pos_score = neg_score = token_count = obj_score = 0

    for word, tag in tagged_text:
        ss_set = None
        if 'NN' in tag and swn.senti_synsets(word, 'n'):
            ss_set = list(swn.senti_synsets(word, 'n'))
            if ss_set : 
                ss_set=ss_set[0]
        elif 'VB' in tag and swn.senti_synsets(word, 'v'):
            ss_set = list(swn.senti_synsets(word, 'v'))
            if ss_set : 
                ss_set=ss_set[0]
        elif 'JJ' in tag and swn.senti_synsets(word, 'a'):
            ss_set = list(swn.senti_synsets(word, 'a'))
            if ss_set : 
                ss_set=ss_set[0]
        elif 'RB' in tag and swn.senti_synsets(word, 'r'):
            ss_set = list(swn.senti_synsets(word, 'r'))
            if ss_set : 
                ss_set=ss_set[0]
        
        if ss_set:
            
            pos_score += ss_set.pos_score()
            neg_score += ss_set.neg_score()
            obj_score += ss_set.obj_score()
            token_count += 1
    
    
    final_score = pos_score - neg_score
    norm_final_score = round(float(final_score) / token_count, 2)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'
    if verbose:
        norm_obj_score = round(float(obj_score) / token_count, 2)
        norm_pos_score = round(float(pos_score) / token_count, 2)
        norm_neg_score = round(float(neg_score) / token_count, 2)
        
        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score,
                                         norm_pos_score, norm_neg_score,
                                         norm_final_score]],
                                         columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'], 
                                                                      ['Predicted Sentiment', 'Objectivity',
                                                                       'Positive', 'Negative', 'Overall']], 
                                                              labels=[[0,0,0,0,0],[0,1,2,3,4]]))
        print (sentiment_frame)   
    return final_sentiment
            
                                                               
def evaluate_lexicons(TRUE_LABELS,PREDICTED_LABELS) : 

    print ('Performance metrics:')
    display_evaluation_metrics(true_labels=TRUE_LABELS,
                               predicted_labels=PREDICTED_LABELS,
                               positive_class='positive')  
    print ('\nConfusion Matrix:'             )              
    display_confusion_matrix(true_labels=TRUE_LABELS,
                             predicted_labels=PREDICTED_LABELS,
                             classes=['positive', 'negative'])
    print ('\nClassification report:' )                        
    display_classification_report(true_labels=TRUE_LABELS,
                                  predicted_labels=PREDICTED_LABELS,
                                  classes=['positive', 'negative'])
    return

                               

## Basline lexicon evaluation

#### test size = 2000 sentences with binary classification

In [116]:
train_x,test_x,test_y=prepare_movie_dataset(0,1000,1000,3000)
sentiwordnet_predictions = [analyze_sentiment_sentiwordnet_lexicon(review) for review in test_x]
evaluate_lexicons(test_y.tolist(),sentiwordnet_predictions)

dataset size :  50000
Train_X :  1000
Test_X  :  2000
Performance metrics:
Accuracy: 0.59
Precision: 0.56
Recall: 0.93
F1 Score: 0.69

Confusion Matrix:
                 Predicted:         
                   positive negative
Actual: positive        934       73
        negative        747      246

Classification report:
              precision    recall  f1-score   support

    positive       0.56      0.93      0.69      1007
    negative       0.77      0.25      0.38       993

   micro avg       0.59      0.59      0.59      2000
   macro avg       0.66      0.59      0.53      2000
weighted avg       0.66      0.59      0.54      2000



In [120]:
# create a parameter collection and add the parameters.
m = dy.ParameterCollection()
W = m.add_parameters((8,2))
V = m.add_parameters((1,8))
b = m.add_parameters((8))

dy.renew_cg() # new computation graph. not strictly needed here, but good practice.
b.value() ## bias values

[0.4914777874946594,
 -0.2553456127643585,
 0.3203781247138977,
 0.06885159015655518,
 0.1069909930229187,
 0.14929407835006714,
 -0.13909754157066345,
 -0.27597248554229736]

In [141]:
## SIMPLE NETWORK WITH THE sigma(V*tanh(WX+B)) ## for the XOR problem

x=dy.vecInput(2) ## 2 sized inputs 
output=dy.logistic(V*(dy.tanh(W*x)+b)) ## output node

y = dy.scalarInput(0) ## objective function
loss = dy.binary_log_loss(output,y) ## loss function
## trainer with the initialized parameters m 
trainer=dy.SimpleSGDTrainer(m)
x.set([1,0])
y.set(1)
loss_value = loss.value() # this performs a forward through the network.
print("the loss before step is:",loss_value)

loss.backward()  # compute the gradients
trainer.update()

loss_value = loss.value(recalculate=True) 
print("the loss after step is:",loss_value)


the loss before step is: 0.4169560670852661
the loss after step is: 0.3646645247936249


In [142]:
pc = dy.ParameterCollection()
NUM_LAYERS=2
INPUT_DIM=50
HIDDEN_DIM=10
builder = dy.LSTMBuilder(NUM_LAYERS, INPUT_DIM, HIDDEN_DIM, pc)

In [143]:
s0 = builder.initial_state()
x1 = dy.vecInput(INPUT_DIM)
s1=s0.add_input(x1)
y1 = s1.output()
s2=s1.add_input(x1) # we can add another input
y2=s2.output()