# Overview
This project scrapes and analyzes audience reviews of the movie "Eternals" from the Rotten Tomatoes website.

The first part of this project covers data acquisition through web scraping, followed by data cleaning/wrangling. The Rotten Tomatoes website is JavaScript-rendered, so the Python package Selenium is used for web scraping.

The second part of this project focuses on sentiment analysis of the audience reviews. It uses the VADER sentiment algorithm provided by NLTK, the Natural Language Toolkit package for Python.


# I. Data acquisition + Data cleaning

### 1. Data acquisition - Web Scraping

In [None]:
# import needed python packages 
# !pip install selenium
import selenium 
from selenium import webdriver
import os
import pandas as pd
import time
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist, word_tokenize
import re
%matplotlib inline
import matplotlib.pyplot as plt

# Show the full text of every cell when a DataFrame is displayed.
# None means "no limit"; the old sentinel -1 was deprecated in pandas 1.0
# and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Launch a Chrome webdriver, open the verified-audience review page for "Eternals"
# on Rotten Tomatoes, and collect [user name, review text] pairs page by page.
# NOTE(review): `executable_path` and `find_element(s)_by_xpath` are the Selenium 3 API;
# Selenium 4 replaces them with `Service(...)` and `find_element(By.XPATH, ...)`.
driver = webdriver.Chrome(executable_path=r'/Users/stephanie/Downloads/chromedriver')

# scraped rows; each entry is [user_name, user_review]
data = []

try:
    driver.set_page_load_timeout(30)
    driver.get("https://www.rottentomatoes.com/m/eternals/reviews?type=verified_audience&intcmp=rt-scorecard_audience-score-reviews")

    # visit at most 199 pages, stopping early as soon as the "next" button cannot be clicked
    for page in range(1, 200):
        reviews = driver.find_elements_by_xpath("//ul[@class='audience-reviews']/li")
        for review in reviews:
            user_name = review.find_element_by_xpath("div/div/span[@class='audience-reviews__name']").text
            user_review = review.find_element_by_xpath("div[2]/p[1]").text
            data.append([user_name, user_review])

        try:
            # click "next" button to the next page
            driver.find_element_by_xpath("//nav[@class='prev-next-paging__wrapper']/button[@class='js-prev-next-paging-next btn prev-next-paging__button prev-next-paging__button-right']").click()
        except Exception as e:
            # The original `except e:` evaluated the undefined name `e` and raised
            # NameError; bind the exception so the loop reports it and stops cleanly.
            print(e)
            break

        # give the JavaScript-rendered page time to load the next batch of reviews
        time.sleep(10)
finally:
    # Always release the browser, even when scraping fails part-way through.
    # quit() ends the whole WebDriver session, not just the current window.
    driver.quit()


### 2. Data Cleaning - Text processing

In [None]:
# separate the scraped [user, review] pairs into two parallel lists:
# "data_review" holds the review texts and "data_user" holds the user names
data_review = [row[1] for row in data]
data_user = [row[0] for row in data]


In [None]:
def cleanedreview(reviews, stoplist=None):
    """Lowercase each review, strip a few symbols, and drop stop words and short tokens.

    For every review the text is lowercased, each of the characters
    ``@ # $ % ^ & * ( )`` is replaced by a space, and the result is split on
    whitespace. Tokens that are in the stop list or have two characters or
    fewer are discarded; the survivors are re-joined with single spaces.
    Other punctuation (``.``, ``!``, ``?``, ``'`` ...) is left untouched.

    Args:
        reviews: iterable of review strings.
        stoplist: optional iterable of words to remove. When omitted, NLTK's
            English stop-word list is used.

    Returns:
        A list with one cleaned string per input review, in the same order.
    """
    # Build the stop-word set once: it does not change between reviews, and
    # set membership keeps the per-token lookup cheap.
    if stoplist is None:
        stoplist = stopwords.words('english')
    stop_words = set(stoplist)

    clean_reviews = []
    for review in reviews:
        review = re.sub(r"[@#$%^&*()]", " ", review.lower())
        kept_words = [
            word for word in review.split()
            if word not in stop_words and len(word) > 2
        ]
        clean_reviews.append(" ".join(kept_words))
    return clean_reviews

In [None]:
# clean the raw review texts
clean_reviews = cleanedreview(data_review)
# pair every user name with the cleaned review in a dictionary of two equal-length columns
data_cleaned = {'User': data_user, 'Review': clean_reviews}
# turn the dictionary into a dataframe
df = pd.DataFrame(data_cleaned)
# save the cleaned data to a csv file.
# index=False: the default row index carries no information, and writing it
# would come back as an extra "Unnamed: 0" column when the file is read again.
df.to_csv('rotten_movie_review.csv', index=False)
# display summary info of this dataframe
df.info()

In [None]:
# preview the cleaned data: first 10 rows of the dataframe
df.head(n=10)

# II. Data Analysis

### 1. Vader Sentiment analysis

In [None]:
# load the saved review data from the csv file into a new dataframe called "df_1" for further analysis.
# keep_default_na=False: a review that became empty after cleaning (or that is
# literally "nan", "null", "none", ...) must stay a string; the default parsing
# would turn it into a float NaN, which the sentiment analyzer and the
# word-cloud join cannot handle.
df_1 = pd.read_csv('rotten_movie_review.csv', keep_default_na=False)

In [None]:
# sanity check: show the first 5 rows of the reloaded dataframe
df_1.head(n=5)

In [None]:
# instantiate a SentimentIntensityAnalyzer object
analyzer = SentimentIntensityAnalyzer()
# polarity_scores returns a dictionary with the keys 'neg', 'neu', 'pos' and 'compound'.
# 'neg', 'neu' and 'pos' are the proportions of the text falling in each category.
# 'compound' sums the valence scores of the individual words, adjusts them by VADER's
# rules, and normalizes the result to lie between -1 (most negative) and 1 (most positive).

# Guard against missing values: a NaN review is a float, which the analyzer cannot process.
review_texts = df_1['Review'].fillna('').astype(str)
# Score every review exactly once, then fan the four values out into their own columns.
review_scores = [analyzer.polarity_scores(text) for text in review_texts]
for score_name in ('compound', 'neg', 'neu', 'pos'):
    df_1[score_name] = [scores[score_name] for scores in review_scores]
# Create a new column to mark if the review is negative or positive based on the compound score:
# a compound score >= 0 is marked "pos", anything below 0 is marked "neg"
df_1['Rating'] = df_1['compound'].apply(lambda x: 'pos' if x >= 0 else 'neg')




In [None]:
# inspect the first 15 rows, now including the sentiment columns
df_1.head(n=15)

In [None]:
# the 5 most positive reviews: order by compound score, highest first
df_1.sort_values(by='compound', ascending=False).head(n=5)


In [None]:
# the 5 most negative reviews: order by compound score, lowest first
df_1.sort_values(by='compound').head(n=5)

In [None]:
# descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = df_1.describe()
summary_stats

In [None]:
# histogram of the compound scores, 20 bins, on a 14x6 inch figure
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(df_1['compound'], bins=20)
ax.set_title('Histogram of Movie Reviews')
ax.set_xlabel('Compound score')

In [None]:
# count the reviews per "Rating" value and draw the counts as a bar chart
rating_counts = df_1.groupby('Rating').size()
rating_counts.plot(kind='bar')

In [None]:
# import python package for wordcloud
from wordcloud import WordCloud

In [None]:
# create wordcloud to show words that are frequently used in the reviews.
# Missing reviews (NaN) are dropped and everything else is coerced to str,
# because str.join raises TypeError on any non-string item.
all_review_text = ' '.join(df_1['Review'].dropna().astype(str))
wordcloud = WordCloud().generate(all_review_text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()