In [61]:
#Importing relevant libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from time import sleep
from random import randint
import numpy as np
import feedparser
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pymongo import MongoClient
import json
import sys
import certifi

#import OAuth #-- this is a module where I keep my connection string with secret passwords and authentication. You'll have to make yours

In [3]:
#Assigning columns to pandas dataframe
tag = []      #The unique movie tag from IMDB
titles = []   #Movie titles/names
years = []    #Year movie was released
time = []     #The length of movie run time
rating = []    #Ratings on IMDB
genre = []    #Genre(s) of movie
votes = []    #Number of votes on movie title
link = []     #URL link to movie's IMDB web page
image = []    #URL link to movie poster
review = []   #Reviews for each movie

In [4]:
#Setting headers to match browser type
headers = dict()
headers[
    "User-Agent"
] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

#setting the number of pages to be scraped using numpy
#Two pages are needed to get the first 100 movies
pages = np.arange(1, 101, 50)

#Assigning url link and iterating scraping code over pages
for page in pages:
    url = "https://www.imdb.com/search/title/?groups=top_1000&start=" + str(page)
    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, "html.parser")
    movie_div = soup.find_all("div", class_="lister-item mode-advanced")
    sleep(randint(2, 10))
    #print(page)

In [5]:
    #iterating scraping code over each section in the webpage
    for movieSection in movie_div:
        name = movieSection.h3.a.text
        titles.append(name)

        year = movieSection.h3.find("span", class_="lister-item-year").text
        years.append(year)

        runTime = movieSection.find("span", class_="runtime").text
        time.append(runTime) 

        ratings = movieSection.strong.text
        rating.append(ratings)

        category = movieSection.find("span", class_="genre").text.strip()
        genre.append(category)

        nv = movieSection.find_all("span", attrs={"name": "nv"})
        vote = nv[0].text
        votes.append(vote)

        links = "https://www.imdb.com" + movieSection.a.attrs['href']
        link.append(links)

        thumbs = movieSection.img.attrs['loadlate']
        images = thumbs.replace("._V1_UX67_CR0,0,67,98_AL_.jpg", "._V1_FMjpg_UX1000_.jpg")
        image.append(images)
        
        tagg = links.replace("https://www.imdb.com/title/", "")
        tags = tagg.replace("/?ref_=adv_li_i", "")
        tag.append(tags)
        


            #getting reviews for each movie link
    for l in link:
        rev = l.replace("?ref_=adv_li_i", "reviews?ref_=tt_ov_rt")
        response = requests.get(rev, headers=headers)
        stew = BeautifulSoup(response.text, "html.parser")
        content = stew.find_all('div', class_=['text','show-more__control'])
        #print(rev)

        reviews = [tag.get_text() for tag in content]
        review.append(reviews[:5])

#print (review[:5])
        

In [6]:
#Saving to a dataframe called movie
movie = pd.DataFrame(
    {
        "UniqueTags": tag,
        "Movie": titles,
        "Year": years,
        "RunTime": time,
        "imdb": rating,
        "Genre": genre,
        "votes": votes,
        "URL": link,
        "ImageLinks": image,
        "Reviews": review,
    }
)

In [7]:
#printing dataframe to console
print(movie)

   UniqueTags                                              Movie         Year  \
0   tt0108358                                          Tombstone       (1993)   
1   tt2543164                                            Arrival  (II) (2016)   
2   tt8503618                                           Hamilton       (2020)   
3   tt0119217                                  Good Will Hunting       (1997)   
4   tt3281548                                       Little Women       (2019)   
5   tt3501632                                     Thor: Ragnarok       (2017)   
6   tt8367814                                      The Gentlemen       (2019)   
7   tt0075314                                        Taxi Driver       (1976)   
8   tt0144084                                    American Psycho       (2000)   
9   tt0106677                                 Dazed and Confused       (1993)   
10  tt0137523                                         Fight Club       (1999)   
11  tt0114369               

In [8]:
#Cleaning the dataframe

#Removing brackets and converting from text to number
movie["Year"] = movie["Year"].str.extract("(\\d+)").astype(int)

#Changing min to minutes
movie["RunTime"] = movie["RunTime"].str.replace("min", "minutes")

#Converting votes from text to numbers and removing commas
movie["votes"] = movie["votes"].str.replace(",", "").astype(int)


movie.head()

Unnamed: 0,UniqueTags,Movie,Year,RunTime,imdb,Genre,votes,URL,ImageLinks,Reviews
0,tt0108358,Tombstone,1993,130 minutes,7.8,"Action, Biography, Drama",136206,https://www.imdb.com/title/tt0108358/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BODRkYz...,[I wont review the film here as it has all bee...
1,tt2543164,Arrival,2016,116 minutes,7.9,"Drama, Sci-Fi",634923,https://www.imdb.com/title/tt2543164/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMTExMz...,"[""I used to think this was the beginning of yo..."
2,tt8503618,Hamilton,2020,160 minutes,8.4,"Biography, Drama, History",74102,https://www.imdb.com/title/tt8503618/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BNjViNW...,"[I mean, i WAS pretty high, but still.It defin..."
3,tt0119217,Good Will Hunting,1997,126 minutes,8.3,"Drama, Romance",901714,https://www.imdb.com/title/tt0119217/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BOTI0Mz...,"[Sure, this film's plot is fairly predictable...."
4,tt3281548,Little Women,2019,135 minutes,7.8,"Drama, Romance",166035,https://www.imdb.com/title/tt3281548/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BY2QzYT...,[Some comments lament the movie has changed fr...


In [9]:
#Converting column from list to string
movie['Reviews'] = movie['Reviews'].apply(', '.join)

In [10]:
#Deleting Punctuations from Column
movie['Reviews'] = movie['Reviews'].str.replace('[^\w\s]','')

  movie['Reviews'] = movie['Reviews'].str.replace('[^\w\s]','')


In [11]:
#Deleting stop Words
stop = stopwords.words('english')
movie['Reviews']= movie['Reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


In [12]:
#Ensures each row in column has a minimum of 3 words
movie= movie[movie['Reviews'].str.len()>3]

In [13]:
#Convert to lowercase
movie['Reviews']= movie['Reviews'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [14]:
#Dataframe head to console
movie.head()

Unnamed: 0,UniqueTags,Movie,Year,RunTime,imdb,Genre,votes,URL,ImageLinks,Reviews
0,tt0108358,Tombstone,1993,130 minutes,7.8,"Action, Biography, Drama",136206,https://www.imdb.com/title/tt0108358/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BODRkYz...,i wont review film said i want praise val kilm...
1,tt2543164,Arrival,2016,116 minutes,7.9,"Drama, Sci-Fi",634923,https://www.imdb.com/title/tt2543164/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMTExMz...,i used think beginning story were bound time d...
2,tt8503618,Hamilton,2020,160 minutes,8.4,"Biography, Drama, History",74102,https://www.imdb.com/title/tt8503618/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BNjViNW...,i mean was pretty high stillit definitely live...
3,tt0119217,Good Will Hunting,1997,126 minutes,8.3,"Drama, Romance",901714,https://www.imdb.com/title/tt0119217/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BOTI0Mz...,sure films plot fairly predictable sure boiled...
4,tt3281548,Little Women,2019,135 minutes,7.8,"Drama, Romance",166035,https://www.imdb.com/title/tt3281548/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BY2QzYT...,some comments lament movie changed book especi...


In [15]:
#Saving to a csv file on local disk
#movies.to_csv(r"C:\Users\dtosi\Downloads\movies_.csv", index=False, header=True)

In [18]:
#Calculating polarity scores for each entry
analyzer = SentimentIntensityAnalyzer()
emptyline = []
for row in movie['Reviews']:
    ps = analyzer.polarity_scores(row)
    emptyline.append(ps)

In [19]:
#Creating a dataframe for the sentiments data
df_sentiments = pd.DataFrame(emptyline)
df_sentiments.head()

Unnamed: 0,neg,neu,pos,compound
0,0.093,0.595,0.312,0.9988
1,0.068,0.727,0.205,0.9974
2,0.0,0.586,0.414,0.9729
3,0.149,0.589,0.262,0.9933
4,0.077,0.722,0.201,0.9879


In [21]:
#Merging main data frame and sentiment data frame
df_join = pd.concat([movie.reset_index(drop = True), df_sentiments], axis=1)
df_join.head(5)

Unnamed: 0,UniqueTags,Movie,Year,RunTime,imdb,Genre,votes,URL,ImageLinks,Reviews,neg,neu,pos,compound
0,tt0108358,Tombstone,1993,130 minutes,7.8,"Action, Biography, Drama",136206,https://www.imdb.com/title/tt0108358/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BODRkYz...,i wont review film said i want praise val kilm...,0.093,0.595,0.312,0.9988
1,tt2543164,Arrival,2016,116 minutes,7.9,"Drama, Sci-Fi",634923,https://www.imdb.com/title/tt2543164/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMTExMz...,i used think beginning story were bound time d...,0.068,0.727,0.205,0.9974
2,tt8503618,Hamilton,2020,160 minutes,8.4,"Biography, Drama, History",74102,https://www.imdb.com/title/tt8503618/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BNjViNW...,i mean was pretty high stillit definitely live...,0.0,0.586,0.414,0.9729
3,tt0119217,Good Will Hunting,1997,126 minutes,8.3,"Drama, Romance",901714,https://www.imdb.com/title/tt0119217/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BOTI0Mz...,sure films plot fairly predictable sure boiled...,0.149,0.589,0.262,0.9933
4,tt3281548,Little Women,2019,135 minutes,7.8,"Drama, Romance",166035,https://www.imdb.com/title/tt3281548/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BY2QzYT...,some comments lament movie changed book especi...,0.077,0.722,0.201,0.9879


In [23]:
#Populating sentiment field and words field
df_join['sentiment'] = df_join['compound'].apply(lambda score: 'positive' if score > 0.01 else 'negative')
df_join['words'] = df_join['Reviews'].apply(lambda lst: lst.split(" "))
df_join.head(5)


Unnamed: 0,UniqueTags,Movie,Year,RunTime,imdb,Genre,votes,URL,ImageLinks,Reviews,neg,neu,pos,compound,sentiment,words
0,tt0108358,Tombstone,1993,130 minutes,7.8,"Action, Biography, Drama",136206,https://www.imdb.com/title/tt0108358/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BODRkYz...,i wont review film said i want praise val kilm...,0.093,0.595,0.312,0.9988,positive,"[i, wont, review, film, said, i, want, praise,..."
1,tt2543164,Arrival,2016,116 minutes,7.9,"Drama, Sci-Fi",634923,https://www.imdb.com/title/tt2543164/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMTExMz...,i used think beginning story were bound time d...,0.068,0.727,0.205,0.9974,positive,"[i, used, think, beginning, story, were, bound..."
2,tt8503618,Hamilton,2020,160 minutes,8.4,"Biography, Drama, History",74102,https://www.imdb.com/title/tt8503618/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BNjViNW...,i mean was pretty high stillit definitely live...,0.0,0.586,0.414,0.9729,positive,"[i, mean, was, pretty, high, stillit, definite..."
3,tt0119217,Good Will Hunting,1997,126 minutes,8.3,"Drama, Romance",901714,https://www.imdb.com/title/tt0119217/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BOTI0Mz...,sure films plot fairly predictable sure boiled...,0.149,0.589,0.262,0.9933,positive,"[sure, films, plot, fairly, predictable, sure,..."
4,tt3281548,Little Women,2019,135 minutes,7.8,"Drama, Romance",166035,https://www.imdb.com/title/tt3281548/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BY2QzYT...,some comments lament movie changed book especi...,0.077,0.722,0.201,0.9879,positive,"[some, comments, lament, movie, changed, book,..."


In [63]:
#Saving to database. Here I use MongoDB (I find it very convinient, friendly and interesting), you could also use any database management system that suites you.

#MongoDB connection and authenication
ca = certifi.where() #certificates

client = MongoClient("mongodb+srv://et:1234@sdmt.ddltj.mongodb.net/myFirstDatabase?retryWrites=true&w=majority", tlsCAFile=ca) #dbSvr in the module OAuth is my database server address in MongoDB
db = client['sentiment'] #'sentiment' is the name I chose for the database
collection = db.imdb


In [64]:
#Now we convert our data to dictionary format for ease in JSON format. You do not have to save
#in JSON format or dictionaries. Mongo DB is versatile and can work with flat data frames.
df_join.reset_index(drop = True, inplace = True)
data_dict = df_join.to_dict("records")



In [65]:
#Now we upload to Mongo
collection.insert_many(data_dict)
#collection.insert_one(df_join)

<pymongo.results.InsertManyResult at 0x2694a3c24c0>

In [None]:
#I continued my visualization on Tableau. You can see my vizzes on 