In [1]:
import pandas as pd
import numpy as np
import nltk
import itertools
from nltk.corpus import sentiwordnet as swn
from normalization import normalize_accented_characters, html_parser, strip_html
from utils import display_evaluation_metrics, display_confusion_matrix, display_classification_report
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
import dynet as dy
from nltk.corpus import wordnet as wn
from collections import defaultdict

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


## Movie review dataset

In [2]:
def prepare_movie_dataset(train_start,train_end,test_start,test_end) :
    """Load the movie-review CSV and cut it into a train slice and a test slice.

    Rows ``train_start:train_end`` form the training frame and rows
    ``test_start:test_end`` form the test portion. The dataset size and the
    size of both slices are printed.

    Returns:
        (train_data, test_reviews, test_sentiments): the training DataFrame,
        followed by the test 'review' and 'sentiment' columns as numpy arrays.
    """
    reviews_frame = pd.read_csv(r'datasets/movie_reviews.csv')
    print('dataset size : ',len(reviews_frame))

    train_data = reviews_frame[train_start:train_end]
    held_out = reviews_frame[test_start:test_end]

    # Report both slice sizes with the same labels as before.
    for caption, frame in (('Train_X : ', train_data), ('Test_X  : ', held_out)):
        print(caption, len(frame))

    return train_data, np.array(held_out['review']), np.array(held_out['sentiment'])

def prepare_labeled_data(train_start,train_end,test_start,test_end) : 
    """Load the tab-separated labeled reviews and cut them into train/test slices.

    Every line of ``datasets/labeledTrainData.tsv`` is split on tabs into the
    columns ('ind', 'sentiment', 'review'); only 'sentiment' and 'review' are
    kept. No line is skipped and nothing is stripped, so:

    * a header line, if the file has one, becomes row 0 of the frame;
    * all values are strings (e.g. sentiment '1' / '0');
    * each review keeps the trailing newline of its line.

    Rows ``train_start:train_end`` form the training frame and rows
    ``test_start:test_end`` form the test portion. The dataset size and both
    slice sizes are printed.

    Returns:
        (train_data, test_reviews, test_sentiments): the training DataFrame,
        followed by the test 'review' and 'sentiment' columns as numpy arrays.
    """
    # The context manager closes the file even if reading or splitting fails;
    # previously the handle was opened and never closed.
    with open("datasets/labeledTrainData.tsv","r") as labeled_data:
        data=[d.split("\t") for d in labeled_data]

    sa_data=pd.DataFrame(data,columns=['ind','sentiment','review'])
    sa_data=sa_data[['sentiment','review']]
    
    print('dataset size : ',sa_data.shape[0])

    train_data = sa_data[train_start:train_end]
    test_data = sa_data[test_start:test_end]
    
    print('Train_X : ',train_data.shape[0])
    print('Test_X  : ',test_data.shape[0])

    test_reviews = np.array(test_data['review'])
    test_sentiments = np.array(test_data['sentiment'])

    return train_data,test_reviews,test_sentiments

In [3]:
# Rows 1-999 become the training frame and rows 1000-1999 the test set.
# NOTE(review): the training slice starts at 1, so row 0 is left out (999 rows);
# the baseline cell further down calls the same function with train_start=0.
train_x,test_x,test_y=prepare_movie_dataset(1,1000,1000,2000)

dataset size :  50000
Train_X :  999
Test_X  :  1000


## Evaluation for unsupervised Lexicon sentiment tagging

#### compare against the sentence tagging (already provided in the dataset )

[add markdown #11 here]

In [4]:
### BORROWED FROM THE AR_SARKAR METRIC
def analyze_sentiment_sentiwordnet_lexicon(review,verbose=False):
    """Label a review 'positive' or 'negative' from SentiWordNet scores.

    The review is HTML-unescaped, stripped of HTML, tokenized and POS-tagged.
    For every token tagged as a noun, verb, adjective or adverb, the FIRST
    SentiWordNet sense for that part of speech contributes its positive,
    negative and objective scores. The overall score is
    (positive - negative) averaged over the scored tokens and rounded to two
    decimals; a score >= 0 yields 'positive', anything else 'negative'.

    A review in which no token has a SentiWordNet sense (for example an empty
    string) has all normalised scores set to 0.0 and is therefore labelled
    'positive', instead of raising ZeroDivisionError.

    Args:
        review: raw review text.
        verbose: when True, also print a one-row table with the predicted
            sentiment and the normalised objectivity / positive / negative /
            overall scores.

    Returns:
        'positive' or 'negative'.
    """
    # NOTE(review): html_parser comes from the project's normalization module;
    # if it is an HTMLParser instance, .unescape() is gone in Python 3.9+ - confirm.
    review = html_parser.unescape(review)
    review = strip_html(review)

    text_tokens = nltk.word_tokenize(review)
    tagged_text = nltk.pos_tag(text_tokens)

    # Tag fragment -> WordNet POS letter, checked in this order
    # (same order as the former if/elif chain).
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    pos_score = neg_score = obj_score = 0.0
    token_count = 0

    for word, tag in tagged_text:
        wordnet_pos = next(
            (pos for fragment, pos in tag_to_wordnet_pos if fragment in tag), None)
        if wordnet_pos is None:
            continue  # not a noun / verb / adjective / adverb

        # Only the first listed sense is used. next(iter(...)) handles both a
        # list and a lazy iterator, and looks the word up once instead of twice.
        first_sense = next(iter(swn.senti_synsets(word, wordnet_pos)), None)
        if first_sense is None:
            continue  # word unknown to SentiWordNet for this part of speech

        pos_score += first_sense.pos_score()
        neg_score += first_sense.neg_score()
        obj_score += first_sense.obj_score()
        token_count += 1

    def _normalise(total):
        # Average per scored token; 0.0 when nothing could be scored.
        return round(float(total) / token_count, 2) if token_count else 0.0

    norm_final_score = _normalise(pos_score - neg_score)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'

    if verbose:
        norm_obj_score = _normalise(obj_score)
        norm_pos_score = _normalise(pos_score)
        norm_neg_score = _normalise(neg_score)

        # from_product builds the same two-level header without the `labels=`
        # keyword, which newer pandas no longer accepts (it was renamed `codes`).
        stats_columns = pd.MultiIndex.from_product(
            [['SENTIMENT STATS:'],
             ['Predicted Sentiment', 'Objectivity',
              'Positive', 'Negative', 'Overall']])
        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score,
                                         norm_pos_score, norm_neg_score,
                                         norm_final_score]],
                                       columns=stats_columns)
        print (sentiment_frame)
    return final_sentiment
            
                                                               
def evaluate_lexicons(TRUE_LABELS,PREDICTED_LABELS,POS_CLASS,NEG_CLASS) :
    """Print evaluation metrics, a confusion matrix and a classification report.

    The three display helpers are called in that order, each preceded by its
    heading. POS_CLASS and NEG_CLASS are converted to strings before being
    handed over as class names (positive class first).

    Returns:
        None.
    """
    positive_label = str(POS_CLASS)
    negative_label = str(NEG_CLASS)
    shared_kwargs = dict(true_labels=TRUE_LABELS, predicted_labels=PREDICTED_LABELS)

    print ('Performance metrics:')
    display_evaluation_metrics(positive_class=positive_label, **shared_kwargs)

    # The two remaining reports take the same arguments; only heading and helper differ.
    sections = (
        ('\nConfusion Matrix:', display_confusion_matrix),
        ('\nClassification report:', display_classification_report),
    )
    for heading, reporter in sections:
        print (heading)
        reporter(classes=[positive_label, negative_label], **shared_kwargs)

                               

## Baseline lexicon evaluation

#### movie dataset 

In [None]:
# Baseline on the movie-review CSV: rows 0-999 for training, rows 1000-1999 for testing.
train_x,test_x,test_y=prepare_movie_dataset(0,1000,1000,2000)
# One 'positive'/'negative' prediction per test review from the SentiWordNet scorer.
sentiwordnet_predictions = [analyze_sentiment_sentiwordnet_lexicon(review) for review in test_x]
# Class names are passed as 'positive'/'negative'
# (assumes the CSV's sentiment column uses these same strings - TODO confirm).
evaluate_lexicons(test_y.tolist(),sentiwordnet_predictions,'positive','negative')

#### labeled dataset 

In [None]:
# Baseline on the labeled TSV data: rows 0-999 for training, rows 1000-1999 for testing.
train_x,test_x,test_y=prepare_labeled_data(0,1000,1000,2000)
# One 'positive'/'negative' prediction per test review from the SentiWordNet scorer.
sentiwordnet_predictions = [analyze_sentiment_sentiwordnet_lexicon(review) for review in test_x]
# The TSV is split as text, so its sentiment values are strings; map the
# predictions to '1' (positive) / '0' (negative) to compare like with like.
binary_predicted=['1' if p=='positive' else '0' for p in sentiwordnet_predictions ]
evaluate_lexicons(test_y.tolist(),binary_predicted,'1','0')

### Simple network for learning (do afterwards)

In [None]:
## SIMPLE NETWORK WITH THE sigma(V*tanh(WX+B)) ## for the XOR problem
# create a parameter collection and add the parameters.
m = dy.ParameterCollection()
W = m.add_parameters((8,2))   # hidden-layer weights: 8 hidden units, 2 inputs
V = m.add_parameters((1,8))   # output-layer weights: 1 output, 8 hidden units
b = m.add_parameters((8))     # hidden-layer bias

dy.renew_cg() # new computation graph. not strictly needed here, but good practice.
x=dy.vecInput(2) ## 2 sized inputs
# The bias belongs INSIDE the tanh so the graph matches sigma(V*tanh(W*x+b));
# it was previously added to the tanh output instead.
output=dy.logistic(V*dy.tanh(W*x+b)) ## output node

y = dy.scalarInput(0) ## target value the loss compares the output against
loss = dy.binary_log_loss(output,y) ## loss function
## trainer with the initialized parameters m
trainer=dy.SimpleSGDTrainer(m)
x.set([1,0])
y.set(1)
loss_value = loss.value() # this performs a forward through the network.
print("the loss before step is:",loss_value)

loss.backward()  # compute the gradients
trainer.update()

# recalculate=True forces a fresh forward pass with the updated parameters.
loss_value = loss.value(recalculate=True)
print("the loss after step is:",loss_value)

## Separate toy example: feed the same input twice through a 2-layer LSTM.
pc = dy.ParameterCollection()
NUM_LAYERS=2
INPUT_DIM=50
HIDDEN_DIM=10
builder = dy.LSTMBuilder(NUM_LAYERS, INPUT_DIM, HIDDEN_DIM, pc)
s0 = builder.initial_state()
x1 = dy.vecInput(INPUT_DIM)
s1=s0.add_input(x1)
y1 = s1.output()
s2=s1.add_input(x1) # we can add another input
y2=s2.output()

### IMPLEMENTING 2.1 SECTION OF GOOGLE PAPER FOR LEXICON EXPANSION [this is for the lexicon expansion ]

In [None]:
# Every word listed by WordNet, held in a one-column DataFrame.
ALL_WORDS=list(wn.words())
# NOTE: defaultdict() without a default factory behaves like a plain dict
# (a missing key still raises KeyError).
SCORES=defaultdict()
df=pd.DataFrame(ALL_WORDS,columns=['word'])

# TODO: this cell is unfinished. It used to end in the dangling expression
# `swn.all_`, which raises AttributeError when the cell is run; presumably
# `swn.all_senti_synsets()` was intended - confirm before filling SCORES.
    

### Comparing the lexicons from the Stanford paper "Inducing Domain-Specific Sentiment Lexicons from Unlabeled Corpora"

In [5]:
import socialsent_util
def load_lexicon(name, remove_neutral=True):
    """Load the lexicon ``./lexicons_socialsent/<name>.json`` as a word -> polarity mapping.

    With ``remove_neutral`` set (the default) a new dict without the entries
    whose polarity equals 0 is returned; otherwise the loaded mapping is
    returned untouched.
    """
    source_path = "./lexicons_socialsent/"+ name + '.json'
    polarities = socialsent_util.load_json(source_path)
    if not remove_neutral:
        return polarities
    return dict((word, polarity) for word, polarity in polarities.items() if polarity != 0)

def compare_lexicons(print_disagreements=False):
    """Print size statistics and pairwise agreement for three sentiment lexicons.

    The "inquirer", "mpqa" and "bingliu" lexicons are loaded with their neutral
    entries kept. For each lexicon one line is printed with its name, its total
    number of words and its number of non-neutral (polarity != 0) words.

    For every pair of lexicons two percentages are printed:

    * "agreement": share of the words present in both lexicons whose polarity
      is identical in both;
    * "agreement ignoring neutral": share of the words that are non-neutral in
      both lexicons whose polarities multiply to 1.

    A pair with no words in common reports ``nan`` rather than raising
    ZeroDivisionError.

    Args:
        print_disagreements: when True, also list the words on which the
            bingliu and inquirer lexicons disagree (among the words that are
            non-neutral in both), as ``word polarity polarity``.
    """
    lexicons = {
        "inquirer": load_lexicon("inquirer", False),
        "mpqa": load_lexicon("mpqa", False),
        "bingliu": load_lexicon("bingliu", False),
    }

    def _percent(hits, total):
        # An empty overlap has no defined agreement; report NaN instead of dividing by zero.
        return 100.0 * hits / total if total else float('nan')

    for l in lexicons:
        print( l, len(lexicons[l]), len([w for w in lexicons[l] if lexicons[l][w] != 0]))

    for l1, l2 in itertools.combinations(lexicons.keys(), 2):
        ps1, ps2 = lexicons[l1], lexicons[l2]

        common_words = set(ps1.keys()) & set(ps2.keys())
        same_polarity = sum(1 for w in common_words if ps1[w] == ps2[w])
        print( l1, l2, "agreement: {:.2f}".format(_percent(same_polarity, len(common_words))))

        # Restrict to the words that carry a non-zero polarity in BOTH lexicons.
        common_words = set(w for w in ps1 if ps1[w] != 0) & \
                       set(w for w in ps2 if ps2[w] != 0)
        same_sign = sum(1 for w in common_words if ps1[w] * ps2[w] == 1)
        print (l1, l2, "agreement ignoring neutral: {:.2f}".format(_percent(same_sign, len(common_words))))

        # The old test compared against 'opinion', which is not a key of
        # `lexicons`, so this branch could never run. Match the bingliu/inquirer
        # pair regardless of the order in which combinations() yields it.
        if print_disagreements and {l1, l2} == {'bingliu', 'inquirer'}:
            for w in common_words:
                if lexicons[l1][w] != lexicons[l2][w]:
                    print (w, lexicons[l1][w], lexicons[l2][w])
      
    
## ALL THESE LEXICONS ARE 2-CLASS SENTIMENTS. 1 = POSITIVE; -1 = NEGATIVE
## load_lexicon is called with its default remove_neutral=True, so entries
## whose polarity is 0 are dropped from each of these dictionaries.
finance_lexicons=load_lexicon('finance')
bingliu_lexicons=load_lexicon('bingliu')
inquirer_lexicons=load_lexicon('inquirer')
mpqa_lexicons=load_lexicon('mpqa')
twitter_lexicons=load_lexicon('twitter')

### Compares the different lexicon repositories by how well they agree on the words they have in common

### The comparison takes the words that two lexicon dictionaries L1 and L2 have in common and reports the percentage of those shared words that have the same score in both.

In [6]:
## Prints, for the inquirer, mpqa and bingliu lexicons, the total and non-neutral
## word counts, followed by the pairwise agreement percentages on shared words
## (once over all shared words, once ignoring neutral words).
compare_lexicons()

inquirer 8640 3457
mpqa 6886 6462
bingliu 6785 6785
inquirer mpqa agreement: 82.47
inquirer mpqa agreement ignoring neutral: 98.50
inquirer bingliu agreement: 84.39
inquirer bingliu agreement ignoring neutral: 98.74
mpqa bingliu agreement: 99.19
mpqa bingliu agreement ignoring neutral: 99.44


#### Lexicon Induction : the idea is to generate the lexicons from a given corpus. This makes the lexicons sensitive to the context they are drawn from, which is useful when they are later applied in a similar context. For instance, financial lexicons will reflect sentiment in financial text better than general lexicons such as SentiWordNet. Three methods are proposed for induction:

- SENTPROP
- DENSIFIER
- Sentiment140

### POLARITY INDUCTION METHOD : This is used for re-scoring of the lexicons(tokens) by taking information from the word-embeddings (domain-specific), positive and the negative seed words.

In [7]:
import polarity_induction_methods

### THIS IS THE FUNCTION FOR INDUCING LEXICONS GIVEN THE SEEDS, EMBEDDINGS AND THE METHOD.
def run_method(positive_seeds, negative_seeds, embeddings, transform_embeddings=False, post_densify=False,
        method=polarity_induction_methods.densify, **kwargs):
    """Induce word polarities from seed words and embeddings with the given method.

    Args:
        positive_seeds: seed words for the positive class.
        negative_seeds: seed words for the negative class.
        embeddings: embedding object; must support ``word in embeddings``.
        transform_embeddings: when True, the embeddings are first passed
            through ``embedding_transformer.apply_embedding_transformation``
            (with n_dim=50) together with the seeds.
        post_densify: when True, the 150 highest- and 150 lowest-scoring words
            found by ``method`` are added to the seeds and the result of
            ``polarity_induction_methods.densify`` on those enlarged seed lists
            is returned instead.
        method: induction function called as
            ``method(embeddings, positive_seeds, negative_seeds, **kwargs)``.
        **kwargs: forwarded to ``method`` unchanged.

    Returns:
        Whatever ``method`` (or, with ``post_densify``, ``densify``) returns;
        the code below treats the result of ``method`` as a mapping from word
        to polarity score.
    """
    # Drop seeds that have no embedding BEFORE anything uses them. This used to
    # happen only on the final return path, so the embedding transformation and
    # the post_densify branch were handed seeds that may be missing from the
    # embeddings.
    positive_seeds = [s for s in positive_seeds if s in embeddings]
    negative_seeds = [s for s in negative_seeds if s in embeddings]

    if transform_embeddings:
        print ("Transforming embeddings...")
        # NOTE(review): `embedding_transformer` is not imported in any cell visible
        # here; this branch raises NameError unless it is imported elsewhere.
        embeddings = embedding_transformer.apply_embedding_transformation(embeddings, positive_seeds, negative_seeds, n_dim=50)

    polarities = method(embeddings, positive_seeds, negative_seeds, **kwargs)
    if not post_densify:
        return polarities

    ## using densify method: grow the seed sets with the most extreme words
    ## found by `method`, then run densify on the enlarged seed lists.
    top_pos = sorted(polarities, key = lambda w : -polarities[w])[:150]
    top_neg = sorted(polarities, key = lambda w : polarities[w])[:150]
    top_pos.extend(positive_seeds)
    top_neg.extend(negative_seeds)
    return polarity_induction_methods.densify(embeddings, top_pos, top_neg)


data =  /home/ubuntu/workspace/nlpclass-1187-g-Mad_Titans/sa/embeddings_socialsent/


Using Theano backend.


In [8]:
import seeds
from representations.representation_factory import create_representation
import constants

def evaluate_methods():
    """
    Evaluates different methods on standard English.

    Steps, as performed below:
      1. load the "inquirer" and "kuperman" lexicons with neutral entries kept;
      2. fetch the seed words and build the "GIGA" GloVe representation over
         the inquirer words plus the seeds;
      3. keep as evaluation words the inquirer words that are present in the
         embedding vocabulary and are not seeds;
      4. run label propagation (beta=0.99, nn=10) on the sub-embedding of the
         evaluation words plus the seeds;
      5. evaluate the induced polarities against the inquirer lexicon, with
         kuperman as the tau lexicon, and pickle them to
         "tmp/gi-cc-walk-pols.pkl".
    """
    print ("Getting evalution words..")
    np.random.seed(0)

    ## inquirer is ternary: -1, 0, 1
    gold_lexicon = load_lexicon("inquirer", remove_neutral=False)

    ## kuperman is continuous: -5.0 to 5
    tau_reference = load_lexicon("kuperman", remove_neutral=False)
    candidate_words = set(gold_lexicon.keys())

    ## qwn gets a 0 entry for every inquirer word it lacks; it is not used further below.
    qwn_scores = load_lexicon("qwn-scores")
    for entry in gold_lexicon:
        qwn_scores.setdefault(entry, 0)

    positive_seeds, negative_seeds = seeds.hist_seeds()

    vocabulary = candidate_words.union(positive_seeds).union(negative_seeds)
    common_embed = create_representation("GIGA", constants.GLOVE_EMBEDDINGS, vocabulary)

    ## only words that actually have an embedding can be evaluated
    in_vocabulary = candidate_words.intersection(set(common_embed.iw))

    ## seeds are excluded from the evaluation set
    eval_words = []
    for candidate in in_vocabulary:
        if candidate in positive_seeds or candidate in negative_seeds:
            continue
        eval_words.append(candidate)

    print ("Evaluating with ", len(eval_words), "out of", len(gold_lexicon))
    print ("SentProp:")

    propagation_words = set(eval_words).union(negative_seeds).union(positive_seeds)
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(propagation_words),
        method=polarity_induction_methods.label_propagate_probabilistic,
        beta=0.99,
        nn=10,
    )

    evaluate(polarities, gold_lexicon, eval_words, tau_lexicon=tau_reference)
    socialsent_util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")

### This is using evaluate_methods() 

### THIS IS FOR THE STANDARD ENGLISH ACCORDING TO THE PAPER.
### It evaluates the induced polarities against the "inquirer" lexicon, with the "kuperman" lexicon used for the Kendall tau score


### THE WORKFLOW IS AS FOLLOWS : 


### DOMAIN-CORPUS---> WORD_EMBEDDINGS+SEEDS(POSITIVE+NEGATIVE) ---> METHOD(LABEL_PROPAGATION) ---> NEW LEXICON (NEW POLARITY SCORES WHICH IS SENSITIVE TO THE CONTEXT DOMAIN)


#### IN THE END, THESE LEXICONS CAN BE USED TO FIND THE SENTIMENT OF THE WHOLE SENTENCE. 

In [9]:
# Reload edited project modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# evaluate() is looked up when evaluate_methods() runs, so it must be imported
# before the call below (the function itself is defined in an earlier cell).
from evaluate_methods import evaluate
# NOTE(review): the recorded output below reports "Accuracy with optimal
# threshold: 1.1549" (above 1) and a confusion matrix with nearly every
# prediction in the last column - verify evaluate() and the thresholding
# before relying on these numbers.
evaluate_methods()

Getting evalution words..
Evaluating with  8528 out of 8640
SentProp:
Binary metrics:
Accuracy with optimal threshold: 1.1549
ROC AUC Score: 0.8001
Average Precision Score: 0.7663
Ternary metrics:
Majority macro F1 baseline 0.2497
Macro F1 with cmn threshold: 0.1024
Kendall Tau 0.3498
Confusion matrix: 
[[   0    1 1874]
 [   0    0 5106]
 [   0    0 1547]]
Neg : 0.0
Neut : 0.0
Pos : 1.0
Latex table line: 80.0 & 10.2 & 0.35\\


  'precision', 'predicted', average, warn_for)
