# Sentiment Analysis on Movie Reviews

The sentiment labels are:
- 0 - negative
- 1 - somewhat negative
- 2 - neutral
- 3 - somewhat positive
- 4 - positive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import string
from string import punctuation
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 

## Data Loading and Preparation

In [None]:
# Load the competition files: tab-separated train/test phrases and the CSV sample submission.
train = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv',
    sep="\t",
    encoding="utf-8",
)
test = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv',
    sep="\t",
    encoding="utf-8",
)
submission = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/sampleSubmission.csv'
)

In [None]:
# Preview the first rows of the sample submission file.
submission.head()

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

## Data Modeling

In [None]:
# Stack train and test into one frame (fresh 0..n-1 index) so the text cleaning
# below runs over both at once.
# NOTE(review): presumably test.tsv has no Sentiment column, in which case the
# test rows carry NaN in Sentiment after this concat — confirm.
df = pd.concat([train, test], ignore_index=True)
print(df.shape)

In [None]:
# First rows of the combined frame (these come from the training file).
df.head()

In [None]:
# Last rows of the combined frame (these come from the test file).
df.tail()

In [None]:
# Text normalizers. cleaning() uses newLemma; newStemmer is not referenced by
# any active code in the visible part of this notebook.
newStemmer = SnowballStemmer('english')
newLemma = WordNetLemmatizer()

#### Algorithm to clean the existing reviews before processing them into our model

In [None]:
def cleaning(review_col):
    """Normalize raw review phrases into lower-cased, lemmatized text.

    Each entry is converted to ``str``, every character that is not an ASCII
    letter is replaced by a space, the result is lower-cased and tokenized,
    and each token is passed through the module-level ``newLemma`` lemmatizer
    before the tokens are re-joined with single spaces.

    Parameters
    ----------
    review_col : iterable
        Raw phrases, e.g. ``df.Phrase.values``. Entries need not be strings.
        The collection is iterated directly, so a pandas Series with a
        non-positional index works as well as an array or list.

    Returns
    -------
    list of str
        One cleaned string per input entry, in input order.
    """
    # Compile once; the same pattern is applied to every phrase.
    non_letters = re.compile('[^a-zA-Z]')
    review_corpus = []
    for raw_review in review_col:
        letters_only = non_letters.sub(' ', str(raw_review)).lower()
        lemmas = [newLemma.lemmatize(token) for token in word_tokenize(letters_only)]
        review_corpus.append(' '.join(lemmas))
    return review_corpus

In [None]:
# Store the cleaned text of every phrase (train and test rows alike) next to the raw phrase.
df['optimized_reviews'] = cleaning(df['Phrase'].values)
df.head()

In [None]:
# TF-IDF vectorizer over unigrams and bigrams. Terms found in more than 95% of
# the documents or in fewer than 10 documents are dropped, and term frequency
# is scaled sub-linearly (1 + log(tf)).
tfidf=TfidfVectorizer(ngram_range=(1,2),max_df=0.95,min_df=10,sublinear_tf=True)

In [None]:
# Training rows are the ones that carry a real label. Nothing in this notebook
# fills missing labels with -999, so unlabelled rows hold NaN and a bare
# `!= -999` test would let them through; the -999 check is kept for frames
# where that sentinel has been filled in.
is_unlabelled = df.Sentiment.isna() | (df.Sentiment == -999)
df_train = df[~is_unlabelled]
df_train.shape

In [None]:
# Test rows are the ones without a label: NaN after the concat, or -999 if that
# sentinel has been filled in. A bare `== -999` test matches nothing when the
# column holds NaN.
is_unlabelled = df.Sentiment.isna() | (df.Sentiment == -999)
# drop() returns a new frame, which avoids mutating a slice of df in place
# (the source of pandas' SettingWithCopyWarning).
df_test = df[is_unlabelled].drop(columns='Sentiment')
print(df_test.shape)
df_test.head()

## Data Preprocessing and Elementary EDA

<b>Check whether the data contains nulls or NaNs</b>

In [None]:
# Number of missing values in each column of the training data.
train.isna().sum()

In [None]:
# isnull() is an alias of isna(), so this reports the same per-column counts.
train.isnull().sum()

<b> Below, we are converting the sentiment score to its meaningful string assignment to be used during the analysis</b>

In [None]:
# Human-readable name for each numeric sentiment class.
sentiment_names = {
    0: 'Negative',
    1: 'Somewhat Negative',
    2: 'Neutral',
    3: 'Somewhat Positive',
    4: 'Positive',
}
# Any score outside 0-4 (or a missing score) keeps the empty-string label.
train['sentiment_label'] = train.Sentiment.map(sentiment_names).fillna('')

In [None]:
# Check that the new sentiment_label column lines up with the numeric Sentiment.
train.head()

<b>Sentiment labels distribution</b>

- The most frequent label is Neutral, which represents about 50% of the data.
- The total positive reviews represent about 27%.
- The total negative reviews represent about 23%.

In [None]:
# Number of phrases per sentiment label, most frequent first.
train.sentiment_label.value_counts()

In [None]:
# (rows, columns) of the training data.
train.shape

Below, let us get rid of PhraseId and SentenceId, as these assignments don't help in solving our problem

In [None]:
# The identifier columns carry no signal for the analysis below, so remove them.
train = train.drop(columns=['PhraseId', 'SentenceId'])

In [None]:
# Confirm that PhraseId and SentenceId are gone.
train.head()

Below, let us analyze the length of each phrase for the reviews

In [None]:
# Length of every phrase in characters (spaces included).
train['lengthOfPhrase'] = train.Phrase.map(len)
train.head()

Notice that above, the length of each string - by characters - is calculated. Spaces are included.

## Data Visualization - Finding different sentiment distributions

In [None]:
# Use seaborn's "dark" colour palette for the plots that follow.
sns.set_palette("dark")

In [None]:
# Bar chart of how many phrases carry each sentiment label.
sentiment_count = train.sentiment_label.value_counts()
sentiment_labels = sentiment_count.index
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5), dpi=100)
sns.barplot(x=sentiment_labels, y=sentiment_count, ax=ax)
ax.set_xlabel('Sentiment Type', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_xticklabels(sentiment_labels, rotation=30)

In [None]:
# Pie chart of the share of each sentiment label.
fig = plt.figure(figsize=[10, 10])
# Count once and take the labels from the same result so slices and labels
# are guaranteed to be in the same order.
sentiment_count = train.sentiment_label.value_counts()
sentiment_labels = sentiment_count.index
plt.pie(x=sentiment_count, labels=sentiment_labels, autopct='%0.2f %%')
# show must be called; a bare `plt.show` only references the function.
plt.show()

#### <b>More Visualizations for better insights :: WordClouds for Positive, Negative and Neutral Reviews.</b>

Define a list of english stopwords to be eliminated from the text during text analysis process.

In [None]:
# ENGLISH_STOP_WORDS is not imported by the import cell at the top of the
# notebook, so bring it into scope here.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Combine scikit-learn's and NLTK's English stop words. The language is passed
# explicitly because stopwords.words() with no argument returns the lists of
# every language in the corpus. Kept as a list because cloud() extends it.
Stopwords = list(ENGLISH_STOP_WORDS) + stopwords.words('english')

In [None]:
def textPreparation(text):
    """Return *text* stripped of punctuation and digits, lower-cased and tidied.

    Steps, in order:
    1. delete every ASCII punctuation character and every digit (characters
       are removed, not replaced, so ``"it's"`` becomes ``"its"``);
    2. collapse each run of whitespace into a single space;
    3. lower-case and trim leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw phrase.

    Returns
    -------
    str
        The normalized phrase; may be empty.
    """
    text = re.sub('[%s]' % re.escape(string.punctuation + string.digits), '', text)
    # A character class built from ' +' matches a single space or a single
    # plus sign, so it cannot collapse runs; match the run explicitly instead.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

In [None]:
# Normalized copy of every phrase, used as input for the word clouds.
train['cleaned_phrase'] = train.Phrase.map(textPreparation)
test['cleaned_phrase'] = test.Phrase.map(textPreparation)

In [None]:
def cloud(sentiment):
    """Draw a word cloud for the training phrases of the given sentiment classes.

    Reads the module-level ``train`` frame (``Sentiment``, ``sentiment_label``
    and ``cleaned_phrase`` columns) and the module-level ``Stopwords``
    collection. Prints the labels that were selected and shows the figure.

    Parameters
    ----------
    sentiment : list of int
        Sentiment codes to include, e.g. ``[3, 4]`` for the positive classes.

    Returns
    -------
    None
    """
    # Build a fresh set on every call: extending the shared Stopwords list in
    # place would append the same extra words again each time a cloud is drawn.
    # The extra words are common to all classes and so do not help tell them apart.
    stopwordslist = set(Stopwords) | {
        'movie', 'movies', 'film', 'nt', 'rrb', 'lrb',
        'make', 'work', 'like', 'story', 'time', 'little',
    }
    reviews = train.loc[train.Sentiment.isin(sentiment)]
    print("Word Cloud for Sentiment Labels: ", reviews.sentiment_label.unique())
    words = ' '.join(reviews.cleaned_phrase)
    wordcloud = WordCloud(
        stopwords=stopwordslist,
        width=3000,
        height=2500,
        background_color='white',
    ).generate(words)
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


In [None]:
# Word cloud for the positive classes (3 = somewhat positive, 4 = positive).
cloud([3,4])

In [None]:
# Word cloud for the negative classes (0 = negative, 1 = somewhat negative).
cloud([0,1])

Now, this is important: we are also interested in the wording that leads to a neutral sentiment. Here, we analyze which words are most commonly associated with neutral sentiment in this dataset. The mere presence of words like 'good' or 'bad' does not necessarily mean that a review is positive or negative, so we have to analyze the data further to truly determine the sentiment.

In [None]:
# Word cloud for the neutral class (2).
cloud([2])

#### More Analysis and visualizations: Measuring frequency of words. 

In order to do this, we utilized a model known as Zipf's Law. 

Zipf's Law states that, if $t_1$ is the most common term in the collection, $t_2$ is the next most common, and so on, then the collection frequency cf(i) of the i'th most common term is proportional to $1/i$:

Formula: $cf(i) \propto 1/i$

Theoretical Description: If the most frequent term occurs $cf_1$ times, then the second most frequent term occurs about half as often, the third about a third as often, and so on. In other words, frequency decreases rapidly with rank. It is a good model for formalizing this rapid decrease in the probability of occurrence.

In order to apply the above model, we need to measure the frequency of the terms that appear in the corpus. We can use sklearn's CountVectorizer to do this.

In [None]:
# Count unigrams and bigrams over the raw training phrases; the document-frequency
# bounds are left fully open, so no term is filtered out.
vectorizor = CountVectorizer(
    ngram_range=(1, 2),
    min_df=0.0,
    max_df=1.0,
)
vectorizor.fit(train.Phrase)

#### We are concerned with finding term frequencies for neutral reviews. So let us use our vectorizer to count the terms that appear in phrases with neutral sentiment

In [None]:
# Document-term count matrix restricted to the neutral phrases (Sentiment == 2).
neutral_frequency = vectorizor.transform(train.loc[train.Sentiment == 2, 'Phrase'])

In [None]:
# Column sums of the count matrix give the total occurrences of each term.
neutral_words = neutral_frequency.sum(axis=0)
# Pair every vocabulary term with its total, looked up by the term's column index.
neutral_words_frequency = [
    (term, neutral_words[0, column])
    for term, column in vectorizor.vocabulary_.items()
]
# Most frequent terms first, then index the table by term.
neutral_words_tf = pd.DataFrame(
    sorted(neutral_words_frequency, key=lambda pair: pair[1], reverse=True),
    columns=['Terms', 'neutral'],
)
neutral_words_tf_df = neutral_words_tf.set_index('Terms')
neutral_words_tf_df.head()

#### Below, we are creating a dataframe that we will keep track of to measure neutral phrase/term frequency. We will use this later to create our frequency plot

In [None]:
# Only one frame is concatenated here, so the result is a new frame holding
# the same single 'neutral' column, indexed by term.
term_freq_df = pd.concat([neutral_words_tf_df],axis=1)

In [None]:
# 'total' simply mirrors 'neutral' because neutral is the only class tracked in this frame.
term_freq_df['total'] = term_freq_df['neutral']
# Show the 20 most frequent terms.
term_freq_df.sort_values(by='total', ascending=False).head(20)

#### Here, we plot the 50 most frequently used terms in neutral movie reviews

In [None]:
# Sort once and reuse the result for both the bar heights and the tick labels.
top_neutral = term_freq_df.sort_values(by='neutral', ascending=False)['neutral'][:50]
# Derive the x positions from the data so the plot still works when the
# vocabulary holds fewer than 50 terms.
position = np.arange(len(top_neutral))
plt.figure(figsize=(12,10))
plt.bar(position, top_neutral, align='center', alpha=0.5)
plt.xticks(position, top_neutral.index, rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top 50 neutral words')
plt.title('Top 50 words in neutral movie reviews')