In [1]:
#Importing relevant libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from time import sleep
from random import randint
import numpy as np
import feedparser
from nltk.corpus import stopwords

In [2]:
#Assigning columns to pandas dataframe
tag = []      #The unique movie tag from IMDB
titles = []   #Movie titles/names
years = []    #Year movie was released
time = []     #The length of movie run time
rating = []    #Ratings on IMDB
genre = []    #Genre(s) of movie
votes = []    #Number of votes on movie title
link = []     #URL link to movie's IMDB web page
image = []    #URL link to movie poster
review = []   #Reviews for each movie

In [3]:
#Setting headers to match browser type
headers = dict()
headers[
    "User-Agent"
] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

#setting the number of pages to be scraped using numpy
#Two pages are needed to get the first 100 movies
pages = np.arange(1, 101, 50)

#Assigning url link and iterating scraping code over pages
for page in pages:
    url = "https://www.imdb.com/search/title/?groups=top_1000&start=" + str(page)
    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, "html.parser")
    movie_div = soup.find_all("div", class_="lister-item mode-advanced")
    sleep(randint(2, 10))
    #print(page)

In [4]:
    #iterating scraping code over each section in the webpage
    for movieSection in movie_div:
        name = movieSection.h3.a.text
        titles.append(name)

        year = movieSection.h3.find("span", class_="lister-item-year").text
        years.append(year)

        runTime = movieSection.find("span", class_="runtime").text
        time.append(runTime) 

        ratings = movieSection.strong.text
        rating.append(ratings)

        category = movieSection.find("span", class_="genre").text.strip()
        genre.append(category)

        nv = movieSection.find_all("span", attrs={"name": "nv"})
        vote = nv[0].text
        votes.append(vote)

        links = "https://www.imdb.com" + movieSection.a.attrs['href']
        link.append(links)

        thumbs = movieSection.img.attrs['loadlate']
        images = thumbs.replace("._V1_UX67_CR0,0,67,98_AL_.jpg", "._V1_FMjpg_UX1000_.jpg")
        image.append(images)
        
        tagg = links.replace("https://www.imdb.com/title/", "")
        tags = tagg.replace("/?ref_=adv_li_i", "")
        tag.append(tags)
        


            #getting reviews for each movie link
    for l in link:
        rev = l.replace("?ref_=adv_li_i", "reviews?ref_=tt_ov_rt")
        response = requests.get(rev, headers=headers)
        stew = BeautifulSoup(response.text, "html.parser")
        content = stew.find_all('div', class_=['text','show-more__control'])
        #print(rev)

        reviews = [tag.get_text() for tag in content]
        review.append(reviews[:5])

#print (review[:5])
        

In [5]:
#Saving to a dataframe called movie
movie = pd.DataFrame(
    {
        "UniqueTags": tag,
        "Movie": titles,
        "Year": years,
        "RunTime": time,
        "imdb": rating,
        "Genre": genre,
        "votes": votes,
        "URL": link,
        "ImageLinks": image,
        "Reviews": review,
    }
)

In [6]:
#printing dataframe to console
print(movie)

   UniqueTags                                          Movie    Year  RunTime  \
0   tt0093779                             The Princess Bride  (1987)   98 min   
1   tt0796366                                      Star Trek  (2009)  127 min   
2   tt3281548                                   Little Women  (2019)  135 min   
3   tt0477348                         No Country for Old Men  (2007)  122 min   
4   tt0102926                       The Silence of the Lambs  (1991)  118 min   
5   tt1130884                                 Shutter Island  (2010)  138 min   
6   tt0144084                                American Psycho  (2000)  101 min   
7   tt0090605                                         Aliens  (1986)  137 min   
8   tt0118749                                  Boogie Nights  (1997)  155 min   
9   tt0091042                       Ferris Bueller's Day Off  (1986)  103 min   
10  tt0083658                                   Blade Runner  (1982)  117 min   
11  tt0482571               

In [7]:
#Cleaning the dataframe

#Removing brackets and converting from text to number
movie["Year"] = movie["Year"].str.extract("(\\d+)").astype(int)

#Changing min to minutes
movie["RunTime"] = movie["RunTime"].str.replace("min", "minutes")

#Converting votes from text to numbers and removing commas
movie["votes"] = movie["votes"].str.replace(",", "").astype(int)


movie.head()

Unnamed: 0,UniqueTags,Movie,Year,RunTime,imdb,Genre,votes,URL,ImageLinks,Reviews
0,tt0093779,The Princess Bride,1987,98 minutes,8.1,"Adventure, Family, Fantasy",405780,https://www.imdb.com/title/tt0093779/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMGM4M2...,[I have been waiting for the right time to rev...
1,tt0796366,Star Trek,2009,127 minutes,7.9,"Action, Adventure, Sci-Fi",587732,https://www.imdb.com/title/tt0796366/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMjE5ND...,[I'm not going to hit this from a scientific p...
2,tt3281548,Little Women,2019,135 minutes,7.8,"Drama, Romance",161921,https://www.imdb.com/title/tt3281548/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BY2QzYT...,[Some comments lament the movie has changed fr...
3,tt0477348,No Country for Old Men,2007,122 minutes,8.1,"Crime, Drama, Thriller",893434,https://www.imdb.com/title/tt0477348/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMjA5Nj...,[The message of this film is that there is no ...
4,tt0102926,The Silence of the Lambs,1991,118 minutes,8.6,"Crime, Drama, Thriller",1321862,https://www.imdb.com/title/tt0102926/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BNjNhZT...,[The Silence of the Lambs runs two hours.Antho...


In [8]:
#Converting column from list to string
movie['Reviews'] = movie['Reviews'].apply(', '.join)

In [9]:
#Deleting Punctuations from Column
movie['Reviews'] = movie['Reviews'].str.replace('[^\w\s]','')

In [10]:
#Deleting stop Words
stop = stopwords.words('english')
movie['Reviews']= movie['Reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


In [11]:
#Ensures each row in column has a minimum of 3 words
movie= movie[movie['Reviews'].str.len()>3]

In [12]:
#Convert to lowercase
movie['Reviews']= movie['Reviews'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [13]:
#Dataframe head to console
movie.head()

Unnamed: 0,UniqueTags,Movie,Year,RunTime,imdb,Genre,votes,URL,ImageLinks,Reviews
0,tt0093779,The Princess Bride,1987,98 minutes,8.1,"Adventure, Family, Fantasy",405780,https://www.imdb.com/title/tt0093779/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMGM4M2...,i waiting right time review film i feel today ...
1,tt0796366,Star Trek,2009,127 minutes,7.9,"Action, Adventure, Sci-Fi",587732,https://www.imdb.com/title/tt0796366/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMjE5ND...,im going hit scientific perspective that would...
2,tt3281548,Little Women,2019,135 minutes,7.8,"Drama, Romance",161921,https://www.imdb.com/title/tt3281548/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BY2QzYT...,some comments lament movie changed book especi...
3,tt0477348,No Country for Old Men,2007,122 minutes,8.1,"Crime, Drama, Thriller",893434,https://www.imdb.com/title/tt0477348/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BMjA5Nj...,the message film symmetry life what goes aroun...
4,tt0102926,The Silence of the Lambs,1991,118 minutes,8.6,"Crime, Drama, Thriller",1321862,https://www.imdb.com/title/tt0102926/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BNjNhZT...,the silence lambs runs two hoursanthony hopkin...


In [14]:
#Saving to a csv file on local disk
#movies.to_csv(r"C:\Users\dtosi\Downloads\movies_.csv", index=False, header=True)