In [1]:
import pandas as pd
import numpy as np

# Load the cleaned movie reviews dataset
# The file is expected to provide at least the 'review' and 'sentiment'
# columns, which are the only ones accessed below.
dataset = pd.read_csv(r'movie_reviews_short.csv')
# Print the first few data points
print(dataset.head())

# Divide data into training and testing sets
# The first 7000 rows form the training split and every remaining row forms
# the test split. The lexicon-based analyzers below need no training, so
# only test_data is consumed by the code shown here.
train_data = dataset[:7000]
test_data = dataset[7000:]

# Divide the test data into the data (review) and the label (sentiment)
# Converting to NumPy arrays makes the positions below 0-based offsets into
# the test split rather than row labels of the original dataset.
test_reviews = np.array(test_data['review'])
test_sentiments = np.array(test_data['sentiment'])


# Let's first try a sample dataset for experimenting
# Hand-picked positions inside test_reviews / test_sentiments; each entry of
# sample_data is a (review text, labeled sentiment) pair.
sample_docs = [92, 817, 626, 356, 1008, 1155, 2533, 2002]
sample_data = [(test_reviews[index],
                test_sentiments[index])
                  for index in sample_docs]

print(sample_data)

# AFINN is a rich sentiment lexicon with values for polarity and intensity
# It even has scores for smileys!
from afinn import Afinn
# emoticons=True switches on the emoticon scoring mentioned above.
afn = Afinn(emoticons=True)
# Score the same sentence without and with a trailing sad emoticon so the
# two printed values can be compared.
print(afn.score('I really hated the plot of this movie'))

print(afn.score('I really hated the plot of this movie :('))



# We will try the SentiWordNet and VADER lexicons for Sentiment Analysis

# NLTK provides a nice interface to SentiWordNet
import nltk
import html
from nltk.corpus import sentiwordnet as swn

# Get synset for 'good' from sentiwordnet (SWN)
# Only the first entry returned for the 'n' (noun) part of speech is kept;
# the other senses of 'good' are ignored in this demo.
good = list(swn.senti_synsets('good', 'n'))[0]
# Print synset sentiment scores
# Each senti-synset exposes a positive, a negative and an objective score.
print('Positive Polarity Score:', good.pos_score())
print('Negative Polarity Score:', good.neg_score())
print('Objective Score:', good.obj_score())

from normalization import normalize_accented_characters

def analyze_sentiment_sentiwordnet_lexicon(review, verbose=False):
    """Classify a review as 'positive' or 'negative' using SentiWordNet.

    The review is normalized (accented characters, HTML entities), tokenized
    and POS-tagged. Every token whose tag marks it as a noun, verb, adjective
    or adverb contributes the scores of the *first* SentiWordNet synset found
    for that part of speech. The summed positive minus negative score is
    averaged over the number of contributing tokens.

    Args:
        review: Raw review text.
        verbose: When True, also print a one-row table with the predicted
            sentiment and the per-token average objectivity, positive,
            negative and overall scores.

    Returns:
        'positive' when the averaged score is >= 0, otherwise 'negative'.
        A review in which no token maps to a synset has an averaged score of
        0 and is therefore reported as 'positive'.
    """
    # POS-tag fragment -> WordNet part of speech, tested in this order.
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    # Pre-process text
    review = normalize_accented_characters(review)
    review = html.unescape(review)
    # TODO: strip HTML tags as well once a strip_html helper is written.

    # Tokenize and POS tag text tokens
    text_tokens = nltk.word_tokenize(review)
    tagged_text = nltk.pos_tag(text_tokens)

    pos_score = neg_score = obj_score = 0
    token_count = 0
    for word, tag in tagged_text:
        ss_set = None
        for tag_fragment, wordnet_pos in tag_to_wordnet_pos:
            if tag_fragment not in tag:
                continue
            # Look the word up once and reuse the result.
            synsets = list(swn.senti_synsets(word, wordnet_pos))
            if synsets:
                ss_set = synsets[0]
                break
        if ss_set is None:
            # No usable synset for this token: it does not count.
            continue
        pos_score += ss_set.pos_score()
        neg_score += ss_set.neg_score()
        obj_score += ss_set.obj_score()
        token_count += 1

    # Aggregate final scores. When nothing was scored all sums are 0, so
    # dividing by 1 yields 0.0 instead of raising ZeroDivisionError.
    divisor = max(token_count, 1)
    final_score = pos_score - neg_score
    norm_final_score = round(final_score / divisor, 2)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'

    if verbose:
        norm_obj_score = round(obj_score / divisor, 2)
        norm_pos_score = round(pos_score / divisor, 2)
        norm_neg_score = round(neg_score / divisor, 2)
        # Display results in a nice (pandas) table. from_product is used
        # because the MultiIndex constructor's `labels` keyword no longer
        # exists in current pandas releases.
        columns = pd.MultiIndex.from_product(
            [['SENTIMENT STATS:'],
             ['Predicted Sentiment', 'Objectivity', 'Positive', 'Negative',
              'Overall']])
        sentiment_frame = pd.DataFrame(
            [[final_sentiment, norm_obj_score, norm_pos_score,
              norm_neg_score, norm_final_score]],
            columns=columns)
        print(sentiment_frame)

    return final_sentiment
            
# Detailed sentiment analysis for sample reviews
# For each hand-picked sample, print the review text and its labeled
# sentiment, then let the analyzer print its own statistics table
# (verbose=True) so prediction and label can be compared by eye.
for review, review_sentiment in sample_data:
    print('Review:')
    print(review)
    print()
    print('Labeled Sentiment:', review_sentiment)
    print()
    # The returned label is not used further here; the verbose table printed
    # inside the call already shows the prediction.
    final_sentiment = analyze_sentiment_sentiwordnet_lexicon(review, verbose=True)
    # Visual separator between consecutive reviews.
    print('-'*60)



# Predict sentiment for test movie reviews dataset - Warning: will take some time
# One predicted label per entry of test_reviews, in the same order as
# test_sentiments, so the two sequences can be compared element-wise.
sentiwordnet_predictions = [analyze_sentiment_sentiwordnet_lexicon(review)
                            for review in test_reviews]

# Project-local reporting helpers; their exact output format is defined in
# utils and is not visible from this file.
from utils import display_evaluation_metrics, display_confusion_matrix, display_classification_report

# Get model performance statistics
# 'positive' is passed as the positive class for the summary metrics.
print('Performance metrics (SentiWordNet):')
display_evaluation_metrics(true_labels=test_sentiments,
                           predicted_labels=sentiwordnet_predictions,
                           positive_class='positive')
print('\nConfusion Matrix:')
display_confusion_matrix(true_labels=test_sentiments,
                         predicted_labels=sentiwordnet_predictions,
                         classes=['positive', 'negative'])
print('\nClassification report:')
display_classification_report(true_labels=test_sentiments,
                              predicted_labels=sentiwordnet_predictions,
                              classes=['positive', 'negative'])
print()
                                                



# Now we use the VADER lexicon for Sentiment Analysis

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared analyzer instance, created on first use (see _get_vader_analyzer).
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def _format_percent(fraction):
    """Format a 0..1 fraction as a percentage string such as '58.0%'."""
    # Rounding to 2 places and multiplying by 100 can leave float noise
    # (e.g. 57.99999999999999), so the result is formatted to one decimal.
    return '{:.1f}%'.format(round(fraction, 2) * 100)


def analyze_sentiment_vader_lexicon(review, threshold=0.1, verbose=False):
    """Classify a review as 'positive' or 'negative' using the VADER lexicon.

    The review is normalized (accented characters, HTML entities) and scored
    with NLTK's SentimentIntensityAnalyzer. The 'compound' score decides the
    label.

    Args:
        review: Raw review text.
        threshold: Minimum compound score for a review to be labeled
            'positive'; anything below it is 'negative'.
        verbose: When True, also print a one-row table with the predicted
            sentiment, the compound score and the positive / negative /
            neutral proportions as percentages.

    Returns:
        'positive' when the compound score is >= threshold, otherwise
        'negative'.
    """
    # Pre-process text
    review = normalize_accented_characters(review)
    review = html.unescape(review)
    # TODO: strip HTML tags as well once a strip_html helper is written.

    # Analyze the sentiment for review. The analyzer is reused across calls
    # so it is not rebuilt for each review.
    scores = _get_vader_analyzer().polarity_scores(review)

    # Get aggregate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold else 'negative'

    if verbose:
        # Display detailed sentiment statistics. from_product is used because
        # the MultiIndex constructor's `labels` keyword no longer exists in
        # current pandas releases.
        columns = pd.MultiIndex.from_product(
            [['SENTIMENT STATS:'],
             ['Predicted Sentiment', 'Polarity Score', 'Positive',
              'Negative', 'Neutral']])
        sentiment_frame = pd.DataFrame(
            [[final_sentiment,
              round(agg_score, 2),
              _format_percent(scores['pos']),
              _format_percent(scores['neg']),
              _format_percent(scores['neu'])]],
            columns=columns)
        print(sentiment_frame)

    return final_sentiment
        
    
    
# Get detailed sentiment statistics
# For each hand-picked sample, print the review text and its labeled
# sentiment, then let the VADER analyzer print its statistics table
# (verbose=True) so prediction and label can be compared by eye.
for review, review_sentiment in sample_data:
    print('Review:')
    print(review)
    print()
    print('Labeled Sentiment:', review_sentiment)
    print()
    # The returned label is not used further here; the verbose table printed
    # inside the call already shows the prediction.
    final_sentiment = analyze_sentiment_vader_lexicon(review, threshold=0.1, verbose=True)
    # Visual separator between consecutive reviews.
    print('-'*60)

# Predict sentiment for test movie reviews dataset - Warning: will take some time
# One predicted label per entry of test_reviews, in the same order as
# test_sentiments; the same 0.1 threshold as in the sample run is used.
vader_predictions = [analyze_sentiment_vader_lexicon(review, threshold=0.1)
                     for review in test_reviews]

# Get model performance statistics
# Same reporting calls as for SentiWordNet, so the two lexicons can be
# compared on identical test data.
print('Performance metrics (Vader):')
display_evaluation_metrics(true_labels=test_sentiments,
                           predicted_labels=vader_predictions,
                           positive_class='positive')
print('\nConfusion Matrix:')
display_confusion_matrix(true_labels=test_sentiments,
                         predicted_labels=vader_predictions,
                         classes=['positive', 'negative'])
print('\nClassification report:')
display_classification_report(true_labels=test_sentiments,
                              predicted_labels=vader_predictions,
                              classes=['positive', 'negative'])
print()




                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
-3.0
-5.0
Positive Polarity Score: 0.5
Negative Polarity Score: 0.0
Objective Score: 0.5
Review:
This has to be the most boring movie I ever sat through. It is dreary and drab, has no excitement, the acting by Hulce is terrible as Hulce cannot pull off the proper accent required for this film. The story is stupid and I sure wouldn't recommend this crap for anyone unless you want to die of boredom.

Labeled Sentiment: negative





     SENTIMENT STATS:                                      
  Predicted Sentiment Objectivity Positive Negative Overall
0            negative        0.72     0.12     0.16   -0.04
------------------------------------------------------------
Review:
University Professor Justin Thorne (Jimmy Smits) has got it made. A good-looking, sophisticated teacher, with a loving wife and two adorable children. He plays the saxophone, owns an expensive car and his students love and respect him. But when temptation calls, in the form of one of his bright, pretty, sexy and willing students, Jennifer Carter (Naomi Watts), he foolishly gives in. The next day, he is being charged with her rape, and his perfect life could be forever ruined.<br /><br />When we see an American actor in Australian film, we know we are not in for a masterpiece. But even viewed with low expectations, "Gross Misconduct" is a huge flop. Based on a play with a rather unimaginative title and then adapted into a reasonably enjoyable

     SENTIMENT STATS:                                      
  Predicted Sentiment Objectivity Positive Negative Overall
0            positive        0.88     0.08     0.04    0.04
------------------------------------------------------------
Performance metrics (SentiWordNet):
Accuracy: 0.61
Precision: 0.58
Recall: 0.91
F1 Score: 0.71

Confusion Matrix:
                 Predicted:         
                   positive negative
Actual: positive       1400      133
        negative       1025      442

Classification report:


  labels=[[0,0],[0,1]]),
  labels=[[0,0],[0,1]]))


              precision    recall  f1-score   support

    positive       0.58      0.91      0.71      1533
    negative       0.77      0.30      0.43      1467

    accuracy                           0.61      3000
   macro avg       0.67      0.61      0.57      3000
weighted avg       0.67      0.61      0.57      3000


Review:
This has to be the most boring movie I ever sat through. It is dreary and drab, has no excitement, the acting by Hulce is terrible as Hulce cannot pull off the proper accent required for this film. The story is stupid and I sure wouldn't recommend this crap for anyone unless you want to die of boredom.

Labeled Sentiment: negative

     SENTIMENT STATS:                                                     
  Predicted Sentiment Polarity Score Positive Negative             Neutral
0            negative          -0.95     9.0%    33.0%  57.99999999999999%
------------------------------------------------------------
Review:
University Professor Justin Thorne (





     SENTIMENT STATS:                                         
  Predicted Sentiment Polarity Score Positive Negative Neutral
0            negative          -0.81     4.0%     8.0%   89.0%
------------------------------------------------------------
Review:
This movie is supposed to take place in Milford NJ. I know the house that it is based on as well as the person. As you see at the end of themovie, she was killed in the world trade center incident. I know that, because I was one of the police officers that helped with the identification of her remains. (She was the only one in our area lost). The nudity in the movie went a bit far. I am not a prude but the actors could have filmed the scene with the two woman without actually showing the whole thing. This movie is in poor taste and I cannot see how her family would give there blessing to it. This is an insult to the person whom it is based on.

Labeled Sentiment: negative

     SENTIMENT STATS:                                     