In [3]:
import pandas as pd
import numpy as np
from afinn import Afinn
import nltk
import itertools
from nltk.corpus import sentiwordnet as swn
from normalization import normalize_accented_characters, html_parser, strip_html
from utils import display_evaluation_metrics, display_confusion_matrix, display_classification_report
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
import dynet as dy
from nltk.corpus import wordnet as wn
from collections import defaultdict

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


## Movie review dataset

In [163]:
def prepare_movie_dataset(train_start,train_end,test_start,test_end,path=r'datasets/movie_reviews.csv') : 
    """Load the movie-review CSV and split it into train and test slices by row position.

    Parameters
    ----------
    train_start, train_end : int
        Positional row bounds (end exclusive) of the training slice.
    test_start, test_end : int
        Positional row bounds (end exclusive) of the test slice.
    path : str, optional
        Location of the CSV file. It must contain 'review' and 'sentiment'
        columns. Defaults to the path the notebook originally hard-coded.

    Returns
    -------
    tuple
        (train_data, test_reviews, test_sentiments): the training rows as a
        DataFrame, and the test reviews and test labels as numpy arrays.
    """
    dataset = pd.read_csv(path)
    print('dataset size : ',dataset.shape[0])

    # .iloc makes it explicit that the bounds are row positions, not index labels.
    train_data = dataset.iloc[train_start:train_end]
    test_data = dataset.iloc[test_start:test_end]

    print('Train_X : ',train_data.shape[0])
    print('Test_X  : ',test_data.shape[0])

    test_reviews = np.array(test_data['review'])
    test_sentiments = np.array(test_data['sentiment'])

    return train_data,test_reviews,test_sentiments

def prepare_labeled_data(train_start,train_end,test_start,test_end,path="datasets/labeledTrainData.tsv") : 
    """Load the tab-separated labeled reviews and split them by row position.

    Every line of the file is split on tabs into (ind, sentiment, review).
    No line is skipped and no field is stripped, so a header line (if the
    file has one) becomes row 0, sentiments are strings, and each review
    keeps its trailing newline.

    Parameters
    ----------
    train_start, train_end : int
        Positional row bounds (end exclusive) of the training slice.
    test_start, test_end : int
        Positional row bounds (end exclusive) of the test slice.
    path : str, optional
        Location of the TSV file. Defaults to the path the notebook
        originally hard-coded.

    Returns
    -------
    tuple
        (train_data, test_reviews, test_sentiments): the training rows as a
        DataFrame with 'sentiment' and 'review' columns, and the test
        reviews and test labels as numpy arrays.
    """
    # The context manager closes the handle; the original left it open.
    with open(path,"r") as labeled_data:
        rows=[line.split("\t") for line in labeled_data]
    sa_data=pd.DataFrame(rows,columns=['ind','sentiment','review'])
    sa_data=sa_data[['sentiment','review']]

    print('dataset size : ',sa_data.shape[0])

    train_data = sa_data.iloc[train_start:train_end]
    test_data = sa_data.iloc[test_start:test_end]

    print('Train_X : ',train_data.shape[0])
    print('Test_X  : ',test_data.shape[0])

    test_reviews = np.array(test_data['review'])
    test_sentiments = np.array(test_data['sentiment'])

    return train_data,test_reviews,test_sentiments

In [165]:
# train_start=1 leaves row 0 of the file out of the training slice.
train_x,test_x,test_y=prepare_labeled_data(1,1000,1000,2000)
# Preview a few reviews only; echoing the full array floods the cell output.
test_x[:3]

dataset size :  25001
Train_X :  999
Test_X  :  1000


array(['"This move was on TV last night. I guess as a time filler, because it sucked bad! The movie is just an excuse to show some tits and ass at the start and somewhere about half way. (Not bad tits and ass though). But the story is too ridiculous for words. The \\"wolf\\", if that is what you can call it, is hardly shown fully save his teeth. When it is fully in view, you can clearly see they had some interns working on the CGI, because the wolf runs like he\'s running in a treadmill, and the CGI fur looks like it\'s been waxed, all shiny :)<br /><br />The movie is full of gore and blood, and you can easily spot who is going to get killed/slashed/eaten next. Even if you like these kind of splatter movies you will be disappointed, they didn\'t do a good job at it.<br /><br />Don\'t even get me started on the actors... Very corny lines and the girls scream at everything about every 5 seconds. But then again, if someone asked me to do bad acting just to give me a few bucks, then hey, w

## Evaluation for unsupervised Lexicon sentiment tagging

#### compare against the sentence tagging (already provided in the dataset )

[add markdown #11 here]

In [177]:
### BORROWED FROM THE AR_SARKAR METRIC
def analyze_sentiment_sentiwordnet_lexicon(review,verbose=False):
    """Label a review 'positive' or 'negative' from SentiWordNet scores.

    The review is HTML-unescaped, stripped of markup, tokenised and POS-tagged.
    Each token tagged as a noun, verb, adjective or adverb is looked up in
    SentiWordNet, and the scores of its first listed sense are accumulated.
    The label comes from the sign of (positive - negative) averaged over the
    scored tokens.

    Parameters
    ----------
    review : str
        Raw review text; may contain HTML entities and tags.
    verbose : bool, optional
        When true, print a one-row table with the normalised objectivity,
        positive, negative and overall scores.

    Returns
    -------
    str
        'positive' when the normalised score is >= 0, otherwise 'negative'.
        A review in which no token could be scored gets a score of 0 and is
        therefore labelled 'positive' (the original raised ZeroDivisionError).
    """
    review = html_parser.unescape(review)
    review = strip_html(review)

    text_tokens = nltk.word_tokenize(review)
    tagged_text = nltk.pos_tag(text_tokens)
    pos_score = neg_score = token_count = obj_score = 0

    # (POS-tag fragment, WordNet part of speech), tried in this order.
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    for word, tag in tagged_text:
        wordnet_pos = next((wn_pos for fragment, wn_pos in tag_to_wordnet_pos
                            if fragment in tag), None)
        if wordnet_pos is None:
            continue

        # senti_synsets may hand back a lazy iterator, which is always truthy;
        # materialise it before testing whether any sense was found.
        senses = list(swn.senti_synsets(word, wordnet_pos))
        if not senses:
            continue

        first_sense = senses[0]
        pos_score += first_sense.pos_score()
        neg_score += first_sense.neg_score()
        obj_score += first_sense.obj_score()
        token_count += 1

    def _normalise(total):
        # Average per scored token; 0.0 when nothing was scored.
        return round(float(total) / token_count, 2) if token_count else 0.0

    norm_final_score = _normalise(pos_score - neg_score)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'
    if verbose:
        norm_obj_score = _normalise(obj_score)
        norm_pos_score = _normalise(pos_score)
        norm_neg_score = _normalise(neg_score)

        # from_product builds the same two-level header without the `labels`
        # keyword, which newer pandas releases no longer accept.
        header = pd.MultiIndex.from_product([['SENTIMENT STATS:'],
                                             ['Predicted Sentiment', 'Objectivity',
                                              'Positive', 'Negative', 'Overall']])
        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score,
                                         norm_pos_score, norm_neg_score,
                                         norm_final_score]],
                                       columns=header)
        print (sentiment_frame)
    return final_sentiment
            
                                                               
def evaluate_lexicons(TRUE_LABELS,PREDICTED_LABELS,POS_CLASS,NEG_CLASS) : 
    """Print evaluation metrics, a confusion matrix and a classification report.

    TRUE_LABELS and PREDICTED_LABELS are forwarded unchanged to the three
    display helpers. POS_CLASS and NEG_CLASS are converted to strings; the
    positive one is passed as `positive_class`, and the pair (positive first)
    is passed as `classes`. Returns None.
    """
    positive_label = str(POS_CLASS)
    negative_label = str(NEG_CLASS)
    label_kwargs = dict(true_labels=TRUE_LABELS, predicted_labels=PREDICTED_LABELS)

    print('Performance metrics:')
    display_evaluation_metrics(positive_class=positive_label, **label_kwargs)

    print('\nConfusion Matrix:')
    display_confusion_matrix(classes=[positive_label, negative_label], **label_kwargs)

    print('\nClassification report:')
    display_classification_report(classes=[positive_label, negative_label], **label_kwargs)

    return None

                               

## Baseline lexicon evaluation

#### movie dataset 

In [116]:
train_x,test_x,test_y=prepare_movie_dataset(0,1000,1000,3000)
sentiwordnet_predictions = [analyze_sentiment_sentiwordnet_lexicon(review) for review in test_x]
# The predictions are the strings 'positive'/'negative', so the positive class
# must be spelled exactly that way (it was previously 'postitive').
evaluate_lexicons(test_y.tolist(),sentiwordnet_predictions,'positive','negative')

dataset size :  50000
Train_X :  1000
Test_X  :  2000
Performance metrics:
Accuracy: 0.59
Precision: 0.56
Recall: 0.93
F1 Score: 0.69

Confusion Matrix:
                 Predicted:         
                   positive negative
Actual: positive        934       73
        negative        747      246

Classification report:
              precision    recall  f1-score   support

    positive       0.56      0.93      0.69      1007
    negative       0.77      0.25      0.38       993

   micro avg       0.59      0.59      0.59      2000
   macro avg       0.66      0.59      0.53      2000
weighted avg       0.66      0.59      0.54      2000



#### labeled dataset 

In [180]:
# SentiWordNet baseline on the labeled data. The predicted 'positive'/'negative'
# strings are mapped to '1'/'0' so they can be compared with test_y.
train_x,test_x,test_y=prepare_labeled_data(0,1000,1000,3000)
sentiwordnet_predictions = list(map(analyze_sentiment_sentiwordnet_lexicon, test_x))
binary_predicted=[{'positive': '1'}.get(label, '0') for label in sentiwordnet_predictions]
evaluate_lexicons(test_y.tolist(),binary_predicted,'1','0')

dataset size :  25001
Train_X :  1000
Test_X  :  2000
Performance metrics:
Accuracy: 0.6
Precision: 0.56
Recall: 0.93
F1 Score: 0.7

Confusion Matrix:
          Predicted:     
                   1    0
Actual: 1        947   70
        0        734  249

Classification report:
              precision    recall  f1-score   support

           1       0.56      0.93      0.70      1017
           0       0.78      0.25      0.38       983

   micro avg       0.60      0.60      0.60      2000
   macro avg       0.67      0.59      0.54      2000
weighted avg       0.67      0.60      0.54      2000



### Simple network for learning (do afterwards)

In [141]:
## SIMPLE NETWORK WITH sigma(V*tanh(W*x+b)) ## for the XOR problem
# create a parameter collection and add the parameters.
m = dy.ParameterCollection()
W = m.add_parameters((8,2))
V = m.add_parameters((1,8))
b = m.add_parameters((8))

dy.renew_cg() # new computation graph. not strictly needed here, but good practice.
x=dy.vecInput(2) ## 2 sized inputs 
# The bias belongs inside the tanh, as in the formula above; it was previously
# added after the non-linearity.
output=dy.logistic(V*dy.tanh(W*x+b)) ## output node

y = dy.scalarInput(0) ## target label fed to the loss
loss = dy.binary_log_loss(output,y) ## loss function
## trainer with the initialized parameters m 
trainer=dy.SimpleSGDTrainer(m)
x.set([1,0])
y.set(1)
loss_value = loss.value() # this performs a forward through the network.
print("the loss before step is:",loss_value)

loss.backward()  # compute the gradients
trainer.update()

loss_value = loss.value(recalculate=True) 
print("the loss after step is:",loss_value)

## Separate experiment: build a 2-layer LSTM and feed it the same input twice.
pc = dy.ParameterCollection()
NUM_LAYERS=2
INPUT_DIM=50
HIDDEN_DIM=10
builder = dy.LSTMBuilder(NUM_LAYERS, INPUT_DIM, HIDDEN_DIM, pc)
s0 = builder.initial_state()
x1 = dy.vecInput(INPUT_DIM)
s1=s0.add_input(x1)
y1 = s1.output()
s2=s1.add_input(x1) # we can add another input
y2=s2.output()

the loss before step is: 0.4169560670852661
the loss after step is: 0.3646645247936249


### IMPLEMENTING 2.1 SECTION OF GOOGLE PAPER FOR LEXICON EXPANSION [this is for the lexicon expansion ]

In [251]:
ALL_WORDS=list(wn.words())
SCORES=defaultdict()
df=pd.DataFrame(ALL_WORDS,columns=['word'])

# TODO: unfinished cell. It previously ended with the incomplete expression
# `swn.all_`, which is not a complete attribute name and fails when the cell
# is run (presumably `swn.all_senti_synsets()` was intended; confirm before
# filling SCORES).
    

### Comparing the lexicons from the Stanford paper "Inducing Domain-Specific Sentiment Lexicons from Unlabeled Corpora"

In [10]:
import socialsent_util
def load_lexicon(name, remove_neutral=True):
    """Load the lexicon stored as ./lexicons_socialsent/<name>.json.

    When `remove_neutral` is truthy, return a new dict holding only the
    entries whose value is not 0; otherwise return the loaded object as is.
    """
    lexicon_path = "./lexicons_socialsent/"+ name + '.json'
    loaded = socialsent_util.load_json(lexicon_path)
    if not remove_neutral:
        return loaded
    return {word: polarity for word, polarity in loaded.items() if polarity != 0}

def compare_lexicons(print_disagreements=False):
    """Print size and pairwise agreement statistics for three lexicons.

    Loads the "inquirer", "mpqa" and "bingliu" lexicons with neutral entries
    kept. For each lexicon it prints the name, the total number of entries
    and the number of non-zero entries. For every pair it prints:

    * the percentage of shared words with identical values, and
    * the same percentage restricted to words that are non-zero in both
      lexicons (agreement there means the product of the two values is 1).

    A pair with no shared words reports `nan` instead of dividing by zero.

    Parameters
    ----------
    print_disagreements : bool, optional
        When true, also print every word that is non-zero in both the
        "bingliu" and "inquirer" lexicons but has different values in them.
        The original check compared against the name 'opinion', which is not
        one of the loaded lexicons, so it could never fire.
    """
    lexicons = {name: load_lexicon(name, False)
                for name in ("inquirer", "mpqa", "bingliu")}

    def _percent(matches, total):
        # Guard against an empty intersection between two lexicons.
        return 100.0 * matches / total if total else float('nan')

    for l in lexicons:
        print( l, len(lexicons[l]), len([w for w in lexicons[l] if lexicons[l][w] != 0]))

    for l1, l2 in itertools.combinations(lexicons.keys(), 2):
        ps1, ps2 = lexicons[l1], lexicons[l2]

        common_words = set(ps1.keys()) & set(ps2.keys())
        print( l1, l2, "agreement: {:.2f}".format(
            _percent(sum(1 for w in common_words if ps1[w] == ps2[w]), len(common_words))))

        # From here on only words that carry a polarity in both lexicons count.
        common_words = {w for w in ps1 if ps1[w] != 0} & {w for w in ps2 if ps2[w] != 0}
        print (l1, l2, "agreement ignoring neutral: {:.2f}".format(
            _percent(sum(1 for w in common_words if ps1[w] * ps2[w] == 1), len(common_words))))

        # Order-independent match, so the pair is found whichever way
        # itertools.combinations happens to emit it.
        if print_disagreements and {l1, l2} == {'bingliu', 'inquirer'}:
            for w in common_words:
                if ps1[w] != ps2[w]:
                    print (w, ps1[w], ps2[w])
                                   

# Load each lexicon once at module level. load_lexicon's default
# remove_neutral=True applies, so entries whose value is 0 are dropped.
finance_lexicons=load_lexicon('finance')
bingliu_lexicons=load_lexicon('bingliu')
inquirer_lexicons=load_lexicon('inquirer')
mpqa_lexicons=load_lexicon('mpqa')
twitter_lexicons=load_lexicon('twitter')

### Compares the different lexicon repositories through their percentage agreement on the words they share (common words)

In [281]:
# Print per-lexicon sizes and pairwise agreement for inquirer, mpqa and bingliu.
compare_lexicons()

inquirer 8640 3457
mpqa 6886 6462
bingliu 6785 6785
inquirer mpqa agreement: 82.47
inquirer mpqa agreement ignoring neutral: 98.50
inquirer bingliu agreement: 84.39
inquirer bingliu agreement ignoring neutral: 98.74
mpqa bingliu agreement: 99.19
mpqa bingliu agreement ignoring neutral: 99.44


#### Lexicon induction: the idea is to generate the lexicon from the corpus itself, so that the lexicon is sensitive to the context it is drawn from. Such lexicons are useful when sentiment is assessed in a similar context; for instance, a finance-specific lexicon will reflect sentiment in financial text better than a general-purpose lexicon such as SentiWordNet. Three induction methods are proposed:

- SENTPROP
- DENSIFIER
- Sentiment140

In [31]:
import seeds
from representations.representation_factory import create_representation
import constants

def evaluate_methods():
    """
    Induce polarities with SentProp on standard English and evaluate them.

    Loads the "inquirer" lexicon as the evaluation target and "kuperman" as
    the `tau_lexicon`, builds the "GIGA" representation restricted to the
    evaluation words plus the seed words, runs
    `polarity_induction_methods.label_propagate_probabilistic` through
    `run_method`, passes the result to `evaluate`, and pickles the induced
    polarities to "tmp/gi-cc-walk-pols.pkl".

    NOTE(review): `DEFAULT_ARGUMENTS`, `evaluate` and `util` are not defined
    in the visible notebook cells, and `polarity_induction_methods` and
    `run_method` are only defined in a later cell -- confirm they are in
    scope before calling this function.
    """
    print ("Getting evalution words..")
    # Fixed seed for numpy's global random state.
    np.random.seed(0)
    
    ## inquirer is ternary: -1, 0, 1
    lexicon = load_lexicon("inquirer", remove_neutral=False)
    
    ## kuperman is continuous: -5.0 to 5
    kuperman = load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # Give every inquirer word a qwn score, defaulting to 0.
    # NOTE(review): `qwn` is not read again in this function.
    qwn = load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    # Restrict the representation to the evaluation words plus both seed sets.
    common_embed = create_representation("GIGA", constants.GOOGLE_EMBEDDINGS,eval_words.union(positive_seeds).union(negative_seeds))
    
    
    embed_words = set(common_embed.iw)
    
    
    # Keep only evaluation words that the representation actually contains ...
    eval_words = eval_words.intersection(embed_words)

    # ... and that are not themselves seed words.
    eval_words = [word for word in eval_words 
            if not word in positive_seeds 
            and not word in negative_seeds]
    
    
    print ("Evaluating with ", len(eval_words), "out of", len(lexicon))
    print ("SentProp:")
    
    
    # Run the induction on the sub-embedding of evaluation words and seeds.
    polarities = run_method(positive_seeds, negative_seeds, 
            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.label_propagate_probabilistic,beta=0.99, nn=10, **DEFAULT_ARGUMENTS)
    
    
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    # NOTE(review): only `socialsent_util` is imported in the visible cells; `util` may be a leftover name -- confirm.
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")

In [34]:
import polarity_induction_methods

### THIS IS THE FUNCTION FOR INDUCING LEXICONS GIVEN THE SEEDS, EMBEDDINGS AND THE METHOD.
def run_method(positive_seeds, negative_seeds, embeddings, transform_embeddings=False, post_densify=False,
        method=polarity_induction_methods.densify, **kwargs):
    """Induce word polarities from seed words, embeddings and an induction method.

    Parameters
    ----------
    positive_seeds, negative_seeds : iterable
        Seed words for each polarity. Seeds that are not contained in
        `embeddings` are dropped before the induction method is called.
    embeddings :
        Embedding object; it must support `word in embeddings` and is passed
        through to `method`.
    transform_embeddings : bool, optional
        When true, first replace `embeddings` with the result of
        `embedding_transformer.apply_embedding_transformation(..., n_dim=50)`.
    post_densify : bool, optional
        When true, the 150 highest- and 150 lowest-scoring words produced by
        `method` are added to the seeds and
        `polarity_induction_methods.densify` is run on the enlarged seed sets.
    method : callable, optional
        Called as `method(embeddings, positive_seeds, negative_seeds, **kwargs)`;
        its result is used as a mapping from word to polarity.
    **kwargs :
        Forwarded to `method`.

    Returns
    -------
    The result of `method`, or of the final `densify` call when
    `post_densify` is true.
    """
    if transform_embeddings:
        print ("Transforming embeddings...")
        # NOTE(review): `embedding_transformer` is not imported in the visible cells -- confirm it is in scope.
        embeddings = embedding_transformer.apply_embedding_transformation(embeddings, positive_seeds, negative_seeds, n_dim=50)

    # Drop out-of-vocabulary seeds before any induction call. Previously this
    # happened only on the default path, so the post_densify path handed
    # unfiltered seeds to `method` and to `densify`.
    positive_seeds = [s for s in positive_seeds if s in embeddings]
    negative_seeds = [s for s in negative_seeds if s in embeddings]

    polarities = method(embeddings, positive_seeds, negative_seeds, **kwargs)
    if not post_densify:
        return polarities

    ## using densify method on the most confidently scored words
    n_top = 150
    top_pos = sorted(polarities, key = lambda w : -polarities[w])[:n_top]
    top_neg = sorted(polarities, key = lambda w : polarities[w])[:n_top]
    top_pos.extend(positive_seeds)
    top_neg.extend(negative_seeds)
    return polarity_induction_methods.densify(embeddings, top_pos, top_neg)


In [33]:
def dist(embeds, positive_seeds, negative_seeds, **kwargs):
    """Score each non-seed word by its similarity to the seed words.

    A word's polarity is the sum of its similarity-matrix entries against
    the positive seeds minus the sum against the negative seeds. Seed words
    themselves are left out of the result. `kwargs` are forwarded to
    `similarity_matrix`.
    """
    similarities = similarity_matrix(embeds, **kwargs)
    scores = {}
    for column, word in enumerate(embeds.iw):
        if word in positive_seeds or word in negative_seeds:
            continue
        toward_positive = sum(similarities[embeds.wi[seed], column] for seed in positive_seeds)
        toward_negative = sum(similarities[embeds.wi[seed], column] for seed in negative_seeds)
        scores[word] = toward_positive - toward_negative
    return scores


def pmi(count_embeds, positive_seeds, negative_seeds, smooth=0.01, **kwargs):
    """
    Learns polarity scores using PMI with seed words.
    Adapted from Turney, P. and M. Littman. "Measuring Praise and Criticism: Inference of semantic orientation from association".
    ACM Trans. Inf. Sys., 2003. 21(4) 315-346.

    Parameters
    ----------
    count_embeds :
        Explicit embedding containing raw co-occurrence counts. Must expose
        `wi` (word -> row index), `ci` (context -> column index), `m` (the
        count matrix) and `iw` (iterable of words to score).
    positive_seeds, negative_seeds : iterable
        Seed words; they are excluded from the returned scores.
    smooth : float, optional
        Added to each word/seed co-occurrence count before taking the log.
    **kwargs :
        Accepted and ignored.

    Returns
    -------
    dict
        word -> polarity, where polarity is the sum over positive seeds of
        log(count(word, seed) + smooth) - log(row total of seed), minus the
        same sum over negative seeds.
    """
    w_index = count_embeds.wi
    c_index = count_embeds.ci
    counts = count_embeds.m

    # A seed's row total does not depend on the word being scored, so it is
    # computed once per seed (on first use) instead of once per word/seed pair.
    log_row_totals = {}

    def _log_row_total(seed):
        if seed not in log_row_totals:
            log_row_totals[seed] = np.log(counts[w_index[seed],:].sum())
        return log_row_totals[seed]

    def _association(word, seed_group):
        # Summed log-association of `word` with every seed in the group.
        return sum(np.log(counts[w_index[word], c_index[seed]] + smooth)
                   - _log_row_total(seed) for seed in seed_group)

    polarities = {}
    for w in count_embeds.iw:
        if w in positive_seeds or w in negative_seeds:
            continue
        polarities[w] = _association(w, positive_seeds) - _association(w, negative_seeds)
    return polarities

In [36]:
# NOTE: the recorded run of this cell stopped with FileNotFoundError for the embeddings file path.
evaluate_methods()

Getting evalution words..


FileNotFoundError: [Errno 2] No such file or directory: '/dfs/scratch0/gigawordvecs/GoogleNews-vectors-negative300_transformed.txt'