In [1]:

import requests
from bs4 import BeautifulSoup

In [2]:
import pandas as pd
import numpy as np
import itertools
import re
import random

In [3]:
# IMDb title-search query used as the starting page. Query parameters:
#   title_type=feature, release_date=2020-01-01,2021-12-31,
#   num_votes=20000, (open-ended range), count=20 results.
url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=2020-01-01,2021-12-31"
    "&num_votes=20000,"
    "&count=20"
)
def getSoup(url, timeout=30):
    """
    Utility function which takes a url and returns a Soup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Without an explicit timeout a stalled connection blocks the notebook forever.
    response = requests.get(url, timeout=timeout)

    # Fail loudly on an error status instead of parsing the error page and
    # letting the scraping code downstream silently find nothing.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')
def getReviews(soup, n_reviews=5):
    '''Return links to randomly chosen user reviews found on a reviews page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed reviews page. Every ``<a class="title">`` element in it is
        treated as a link to one user review.
    n_reviews : int, optional
        Maximum number of review links to pick. Defaults to 5.

    Returns
    -------
    list of str
        Absolute review URLs ("https://www.imdb.com" + href), each taken from
        a different anchor. Holds ``n_reviews`` entries, or fewer when the
        page offers fewer review anchors (an empty list when it offers none).
    '''
    # get the review tags
    user_review_list = soup.find_all('a', attrs={'class': 'title'})

    # Cap the sample size so short or empty pages return what is available
    # instead of raising.
    sample_size = max(0, min(n_reviews, len(user_review_list)))

    # Sample without replacement so the same review is never picked twice.
    chosen = random.sample(list(user_review_list), sample_size)

    return ["https://www.imdb.com" + tag['href'] for tag in chosen]

def getReviewText(review_url):
    '''Download one review page and return the text of its review body.

    The body is taken from the first <div> whose class attribute is
    'text show-more__control'. The lookup result is used without a check,
    so a page lacking that div makes the ``getText`` call fail.
    '''
    review_page = getSoup(review_url)
    body_div = review_page.find('div', attrs={'class': 'text show-more__control'})
    return body_div.getText()

def getMovieTitle(review_url):
    '''Download one review page and return the movie title shown in its <h1>.'''
    heading = getSoup(review_url).find('h1')

    # The title is read from the second child node (index 1) of the heading.
    heading_children = list(heading.children)
    return heading_children[1].getText()

def getNounChunks(user_review):
    '''Return the noun chunks of a review as a list of plain strings.

    The review is passed to a module-level callable named ``nlp`` and the
    ``noun_chunks`` of the returned document are collected.

    NOTE(review): ``nlp`` is not defined in the cells visible here
    (presumably a spaCy pipeline) - confirm it is created before this
    function is called.
    '''
    parsed = nlp(user_review)

    # Keep only each chunk's text: the span objects themselves won't pickle.
    return [span.text for span in parsed.noun_chunks]
# Download and parse the search-results page addressed by `url`.
movies_soup = getSoup(url)

In [4]:
# Start from every anchor on the search page that has no class attribute.
movie_tags = movies_soup.find_all('a', attrs={'class': None})

# Keep only hrefs that start with '/title' and end with '/'.
# - .get('href') skips anchors that have no href instead of raising KeyError.
# - boolean `and` replaces bitwise `&`: same result on bools, but it
#   short-circuits and states the intent.
movie_tags = [
    href
    for href in (tag.get('href') for tag in movie_tags)
    if href and href.startswith('/title') and href.endswith('/')
]

# remove duplicate links (dict.fromkeys keeps first-seen order)
movie_tags = list(dict.fromkeys(movie_tags))

print(f"There are a total of {len(movie_tags)} movie titles")
print("Displaying 10 titles")
movie_tags[:10]

There are a total of 20 movie titles
Displaying 10 titles


['/title/tt1160419/',
 '/title/tt9376612/',
 '/title/tt6264654/',
 '/title/tt3811906/',
 '/title/tt10954652/',
 '/title/tt9347730/',
 '/title/tt7737528/',
 '/title/tt6334354/',
 '/title/tt10155932/',
 '/title/tt3228774/']

In [5]:
# Turn each '/title/<id>/' path into the absolute URL of that title's
# user-reviews page.
base_url = "https://www.imdb.com"
movie_links = [f"{base_url}{title_path}reviews" for title_path in movie_tags]

print(f"There are a total of {len(movie_links)} movie user reviews")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 20 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt1160419/reviews',
 'https://www.imdb.com/title/tt9376612/reviews',
 'https://www.imdb.com/title/tt6264654/reviews',
 'https://www.imdb.com/title/tt3811906/reviews',
 'https://www.imdb.com/title/tt10954652/reviews',
 'https://www.imdb.com/title/tt9347730/reviews',
 'https://www.imdb.com/title/tt7737528/reviews',
 'https://www.imdb.com/title/tt6334354/reviews',
 'https://www.imdb.com/title/tt10155932/reviews',
 'https://www.imdb.com/title/tt3228774/reviews']

In [6]:
# Download and parse the user-reviews page of every movie.
movie_soups = list(map(getSoup, movie_links))

# One list of review links per movie (nested; flattened in the next cell).
movie_review_list = [getReviews(reviews_page) for reviews_page in movie_soups]

In [7]:
# Flatten the per-movie lists into one flat list of review links.
# Entries that are already plain strings are kept whole, so re-running this
# cell is harmless. Chaining over an already-flat list would split every
# URL into single characters.
movie_review_list = [
    link
    for entry in movie_review_list
    for link in ([entry] if isinstance(entry, str) else entry)
]
print(len(movie_review_list))

print(f"There are a total of {len(movie_review_list)} individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

100
There are a total of 100 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw7336413/',
 'https://www.imdb.com/review/rw7389798/',
 'https://www.imdb.com/review/rw7348970/',
 'https://www.imdb.com/review/rw7359489/',
 'https://www.imdb.com/review/rw7333470/',
 'https://www.imdb.com/review/rw7306727/',
 'https://www.imdb.com/review/rw7306235/',
 'https://www.imdb.com/review/rw7298598/',
 'https://www.imdb.com/review/rw7304999/',
 'https://www.imdb.com/review/rw7301900/']

In [8]:
# Review body for every review link.
# NOTE: getReviewText and getMovieTitle each download the page themselves,
# so every review URL is fetched twice in this cell.
review_texts = list(map(getReviewText, movie_review_list))

# Movie name, read from the same review pages.
movie_titles = list(map(getMovieTitle, movie_review_list))

# No negative/positive label is attached to the reviews here.

# Assemble one row per review: title, permalink, review text.
df = pd.DataFrame(
    {
        'movie': movie_titles,
        'user_review_permalink': movie_review_list,
        'user_review': review_texts,
    }
)

In [9]:
# Preview the first rows of the assembled review table.
df.head()

Unnamed: 0,movie,user_review_permalink,user_review
0,Dune,https://www.imdb.com/review/rw7336413/,"""If you loved Arrival and Blade Runner 2049, t..."
1,Dune,https://www.imdb.com/review/rw7389798/,When the e-mail came I was kinda dreading it.A...
2,Dune,https://www.imdb.com/review/rw7348970/,Denis Villeneuve's Dune might be one of the vi...
3,Dune,https://www.imdb.com/review/rw7359489/,"The visuals and the score are amazing, there's..."
4,Dune,https://www.imdb.com/review/rw7333470/,I saw the film and I loved it. The film have e...


In [10]:
# Colab-only export: write the review table to 'filename.csv' in the
# runtime's working directory, then pass that file to Colab's download helper.
# to_csv is called with default options, so the row index is written as well.
from google.colab import files
df.to_csv('filename.csv') 
files.download('filename.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Mount Google Drive

In [None]:
# Colab-only: mount Google Drive at /content/drive.
# This cell has no execution count, i.e. it was not run in the saved notebook.
from google.colab import drive
drive.mount('/content/drive')