# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk

# Loading Data

In [2]:
dataset = pd.read_csv('movie_reviews.csv')

reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# Cleaning Text

Removing HTML Tags

In [3]:
from bs4 import BeautifulSoup

In [4]:
def strip_html_tag(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

Remove Accented Characters

In [5]:
import unicodedata

In [6]:
def strip_accents(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

Expanding Contraction

In [7]:
CONTRACTION_MAP = {"ain't": "is not",
                   "aren't": "are not",
                   "can't": "cannot",
                   "can't've": "cannot have",
                   "'cause": "because",
                   "could've": "could have",
                   "couldn't": "could not",
                   "couldn't've": "could not have",
                   "didn't": "did not",
                   "doesn't": "does not",
                   "don't": "do not",
                   "hadn't": "had not",
                   "hadn't've": "had not have",
                   "hasn't": "has not",
                   "haven't": "have not",
                   "he'd": "he would",
                   "he'd've": "he would have",
                   "he'll": "he will",
                   "he'll've": "he will have",
                   "he's": "he is",
                   "how'd": "how did",
                   "how'd'y": "how do you",
                   "how'll": "how will",
                   "how's": "how is",
                   "I'd": "I would",
                   "I'd've": "I would have",
                   "I'll": "I will",
                   "I'll've": "I will have",
                   "I'm": "I am",
                   "I've": "I have",
                   "i'd": "i would",
                   "i'd've": "i would have",
                   "i'll": "i will",
                   "i'll've": "i will have",
                   "i'm": "i am",
                   "i've": "i have",
                   "isn't": "is not",
                   "it'd": "it would",
                   "it'd've": "it would have",
                   "it'll": "it will",
                   "it'll've": "it will have",
                   "it's": "it is",
                   "let's": "let us",
                   "ma'am": "madam",
                   "mayn't": "may not",
                   "might've": "might have",
                   "mightn't": "might not",
                   "mightn't've": "might not have",
                   "must've": "must have",
                   "mustn't": "must not",
                   "mustn't've": "must not have",
                   "needn't": "need not",
                   "needn't've": "need not have",
                   "o'clock": "of the clock",
                   "oughtn't": "ought not",
                   "oughtn't've": "ought not have",
                   "shan't": "shall not",
                   "sha'n't": "shall not",
                   "shan't've": "shall not have",
                   "she'd": "she would",
                   "she'd've": "she would have",
                   "she'll": "she will",
                   "she'll've": "she will have",
                   "she's": "she is",
                   "should've": "should have",
                   "shouldn't": "should not",
                   "shouldn't've": "should not have",
                   "so've": "so have",
                   "so's": "so as",
                   "that'd": "that would",
                   "that'd've": "that would have",
                   "that's": "that is",
                   "there'd": "there would",
                   "there'd've": "there would have",
                   "there's": "there is",
                   "they'd": "they would",
                   "they'd've": "they would have",
                   "they'll": "they will",
                   "they'll've": "they will have",
                   "they're": "they are",
                   "they've": "they have","to've": "to have",
                   "wasn't": "was not",
                   "we'd": "we would",
                   "we'd've": "we would have",
                   "we'll": "we will",
                   "we'll've": "we will have",
                   "we're": "we are",
                   "we've": "we have",
                   "weren't": "were not",
                   "what'll": "what will",
                   "what'll've": "what will have",
                   "what're": "what are",
                   "what's": "what is",
                   "what've": "what have",
                   "when's": "when is",
                   "when've": "when have",
                   "where'd": "where did",
                   "where's": "where is",
                   "where've": "where have",
                   "who'll": "who will",
                   "who'll've": "who will have",
                   "who's": "who is",
                   "who've": "who have",
                   "why's": "why is",
                   "why've": "why have",
                   "will've": "will have",
                   "won't": "will not",
                   "won't've": "will not have",
                   "would've": "would have",
                   "wouldn't": "would not",
                   "wouldn't've": "would not have",
                   "y'all": "you all",
                   "y'all'd": "you all would",
                   "y'all'd've": "you all would have",
                   "y'all're": "you all are",
                   "y'all've": "you all have",
                   "you'd": "you would",
                   "you'd've": "you would have",
                   "you'll": "you will",
                   "you'll've": "you will have",
                   "you're": "you are",
                   "you've": "you have"}

In [8]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [9]:
expand_contractions("  It's an amazing language which can be used for Scripting")

'  It is an amazing language which can be used for Scripting'

Removing Special Characters

In [10]:
def strip_special_characters(text):
    text = re.sub('[^a-zA-z0-9\s]', '', text)
    return text

Lemmatizing Text

In [11]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer()

In [12]:
def lemmatize_text(text):
    tokens = nltk.word_tokenize (text)
    text =' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return text

Removing Stopwords

In [13]:
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [14]:
def strip_stopwords(text, is_lower_case=False):
    tokens = nltk.word_tokenize (text)
    #tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

Cleaned Text

In [15]:
def clean_text(text,strip_html=True, expand_contraction=True,
               accent_remove=True, text_lower_case=True,text_lemmatize=True, 
               special_char_remove=True, stopword_remove=True):
    
    processed_text=[]
    for doc in text:
        
        #HTML tah striping
        if strip_html:
            doc=strip_html_tag(doc)
        
        ## remove accented characters
        if accent_remove:
            doc = strip_accents(doc)
            
        # expand contractions    
        if expand_contraction:
            doc = expand_contractions(doc)
            
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
            
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        
        # insert spaces between special characters to isolate them    
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        
        # lemmatizing text
        if text_lemmatize:
            doc = lemmatize_text(doc)
        
        # remove special characters    
        if special_char_remove:
            doc = strip_special_characters(doc)  
        
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        
        # remove stopwords
        if stopword_remove:
            doc = strip_stopwords(doc, is_lower_case=text_lower_case)
            
        processed_text.append(doc)
        
    return processed_text


# Sentiment Analysis with AFINN

In [20]:
from afinn import Afinn

afn = Afinn(emoticons=True)

# Predict Sentiment

In [24]:
sample_id= [4726,12103,25726,49255]
sample_reviews= reviews[sample_id]

In [32]:
sample_reviews[2]

'What the (beep) is going wrong with Disney the last years? Are there totally run out of good ideas? Where is the magic? Where are the good animators, the good songwriters, the good directors, the good... Okay, i know, Walt himself and the famous "nine old man" can\'t come back. But is this a reason to crank out countless of those cheap sequels and slowly but surely destroying the ideals of Walt Disney? I never rent or bought a Disney-sequel of what movie however. Because i had read much enough about its (absence of) quality. But "Atlantis: Milo\'s Return" was aired today on TV in Germany and so i watch it. It confirmed my doubts about sequels. It was absolutely boring. Flaw animation, primitive color-rotation, simple characters, some unsuccessful tries to simulate the famous Multiplane-Camera with CGI, mediocre music and a patchwork of different, simple stories. It looks absolutely not like Disney! Not like Disney i know! It looks like one of the countless, cheap and simple animation-

In [27]:
norm_reviews=clean_text(sample_reviews)

In [33]:
norm_reviews[2]

'beep going wrong disney last year totally run good idea magic good animator good songwriter good director good okay know walt famous `` nine old man not come back reason crank countless cheap sequel slowly surely destroying ideal walt disney never rent bought disneysequel movie however read much enough absence quality `` atlantis milo return wa aired today tv germany watch confirmed doubt sequel wa absolutely boring flaw animation primitive colorrotation simple character unsuccessful try simulate famous multiplanecamera cgi mediocre music patchwork different simple story look absolutely not like disney not like disney know look like one countless cheap simple animationseries like `` dragonballz `` beyblade etc aired every day tv child first reaction showing crap wa load `` bambi dvdplayer see disney immortal magic depth spirit charm see disney climax see awesome art handmade animation `` bambi wa first today movie give 10 10 star `` atlantis milo return no magic no depth no charm no s

In [31]:
for review, sentiment in zip(reviews[sample_id], sentiments[sample_id]):
    print('REVIEW:', review)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', afn.score(review))
    print('*'*120)

REVIEW: Utter dreck. I got to the 16 minute/27 second point, and gave up. I'd have given it a negative number review if that were possible (although 'pissible' is a more fitting word...). Unlike the sizzle you could see and practically feel between MacMurray and Stanwyck in the original, the chemistry between dumb ol' Dicky Crenna and whats-her-face here is just non-existent. The anklet becomes an unattractive chunky bracelet? There's no ciggy-lighting-by-fingertip? And I thought I'd be SICK when they have a mortified-looking (and rightly so, believe you me) Lee J. Cobb as Keyes practically burping/upchucking his way through the explanation of his "Little Man" to Mr. Garloupis. No offence to the non-sighted, but it looks as though a posse of blind men ran amuck with the set design of both the Dietrichson and Neff houses. The same goes for those horrid plaid pants that Phyllis wears. And crikey, how much $$ does Neff make, that he lives overlooking a huge marina? This, folks, again, all

# Sentiment for Complete Data Set

In [34]:
sent_polarity = [afn.score(review) for review in reviews]
pred_sentiment = ['positive' if score >= 1.0 else 'negative' for score in sent_polarity]

# Evaluate Model Performance

In [35]:
from sklearn import metrics

In [37]:
def display_metric(actual_sent, predicted_sent):
    print("Accuracy:", np.round(metrics.accuracy_score(actual_sent,predicted_sent),4))
    print('Precision:', np.round(metrics.precision_score(actual_sent, predicted_sent, average='weighted'),4))
    print('Recall:', np.round(metrics.recall_score(actual_sent,predicted_sent,average='weighted'),4))
    print('F1 Score:', np.round( metrics.f1_score(actual_sent,predicted_sent,average='weighted'),4))

In [38]:
def display_confusion_matrix(actual_sent, predicted_sent, classes=[1,0]):
    
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]

    cm = metrics.confusion_matrix(y_true=actual_sent, y_pred=predicted_sent, labels=classes)
    cm_frame = pd.DataFrame(data=cm, columns=pd.MultiIndex(levels=[['Predicted:'], classes], labels=level_labels),
                            index=pd.MultiIndex(levels=[['Actual:'], classes], labels=level_labels)) 
    print(cm_frame)

In [39]:
def display_classification_report(actual_sent, predicted_sent, classes=[1,0]):

    report = metrics.classification_report(y_true=actual_sent,y_pred=predicted_sent, labels=classes) 
    print(report)

In [40]:
def display_model_performance(actual_sent, predicted_sent, classes=[1,0]):
    print('\nPrediction Confusion Matrix:')
    print('*'*50)
    display_confusion_matrix(actual_sent=actual_sent, predicted_sent=predicted_sent,classes=classes)
    print('\nModel Classification report:')
    print('*'*50)
    display_classification_report(actual_sent=actual_sent, predicted_sent=predicted_sent,classes=classes)    
    print('Model Performance metrics:')
    print('*'*50)
    display_metric(actual_sent=actual_sent, predicted_sent=predicted_sent)

In [42]:
display_model_performance(actual_sent=sentiments, predicted_sent=pred_sentiment, 
                                  classes=['positive', 'negative'])


Prediction Confusion Matrix:
**************************************************
                 Predicted:         
                   positive negative
Actual: positive      21147     3853
        negative      10539    14461

Model Classification report:
**************************************************
             precision    recall  f1-score   support

   positive       0.67      0.85      0.75     25000
   negative       0.79      0.58      0.67     25000

avg / total       0.73      0.71      0.71     50000

Model Performance metrics:
**************************************************
Accuracy: 0.7122
Precision: 0.7285
Recall: 0.7122
F1 Score: 0.7069


# Work in Progres.......

# Sentiment Analysis with SentiWordNet

In [43]:
from nltk.corpus import sentiwordnet as swn

In [46]:
def analyze_sentiment_sentiwordnet_lexicon(review,verbose=False):

    # tokenize and POS tag text tokens
    #tagged_text = [(token.text, token.tag_) for token in tn.nlp(review)]
    tokens = nltk.word_tokenize (review)
    tagged_text = [token.strip() for token in tokens]
    pos_score = neg_score = token_count = obj_score = 0
    # get wordnet synsets based on POS tags
    # get sentiment scores if synsets are found
    for word, tag in tagged_text:
        ss_set = None
        if 'NN' in tag and list(swn.senti_synsets(word, 'n')):
            ss_set = list(swn.senti_synsets(word, 'n'))[0]
        elif 'VB' in tag and list(swn.senti_synsets(word, 'v')):
            ss_set = list(swn.senti_synsets(word, 'v'))[0]
        elif 'JJ' in tag and list(swn.senti_synsets(word, 'a')):
            ss_set = list(swn.senti_synsets(word, 'a'))[0]
        elif 'RB' in tag and list(swn.senti_synsets(word, 'r')):
            ss_set = list(swn.senti_synsets(word, 'r'))[0]
        # if senti-synset is found        
        if ss_set:
            # add scores for all found synsets
            pos_score += ss_set.pos_score()
            neg_score += ss_set.neg_score()
            obj_score += ss_set.obj_score()
            token_count += 1
    
    # aggregate final scores
    final_score = pos_score - neg_score
    norm_final_score = round(float(final_score) / token_count, 2)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'
    if verbose:
        norm_obj_score = round(float(obj_score) / token_count, 2)
        norm_pos_score = round(float(pos_score) / token_count, 2)
        norm_neg_score = round(float(neg_score) / token_count, 2)
        # to display results in a nice table
        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score, norm_pos_score, 
                                         norm_neg_score, norm_final_score]],
                                       columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'], 
                                                             ['Predicted Sentiment', 'Objectivity',
                                                              'Positive', 'Negative', 'Overall']], 
                                                             labels=[[0,0,0,0,0],[0,1,2,3,4]]))
        print(sentiment_frame)
        
    return final_sentiment

In [47]:
for review, sentiment in zip(reviews[sample_id], sentiments[sample_id]):
    print('REVIEW:', review)
    print('Actual Sentiment:', sentiment)
    pred = analyze_sentiment_sentiwordnet_lexicon(review, verbose=True)    
    print('*'*120)

REVIEW: Utter dreck. I got to the 16 minute/27 second point, and gave up. I'd have given it a negative number review if that were possible (although 'pissible' is a more fitting word...). Unlike the sizzle you could see and practically feel between MacMurray and Stanwyck in the original, the chemistry between dumb ol' Dicky Crenna and whats-her-face here is just non-existent. The anklet becomes an unattractive chunky bracelet? There's no ciggy-lighting-by-fingertip? And I thought I'd be SICK when they have a mortified-looking (and rightly so, believe you me) Lee J. Cobb as Keyes practically burping/upchucking his way through the explanation of his "Little Man" to Mr. Garloupis. No offence to the non-sighted, but it looks as though a posse of blind men ran amuck with the set design of both the Dietrichson and Neff houses. The same goes for those horrid plaid pants that Phyllis wears. And crikey, how much $$ does Neff make, that he lives overlooking a huge marina? This, folks, again, all

ValueError: too many values to unpack (expected 2)