In [1]:
from requests import get
from bs4 import BeautifulSoup
from time import sleep,time
from warnings import warn
from IPython.core.display import clear_output
from random import randint
import pandas as pd

In [2]:
# Accumulators filled by the scraping loop: one entry per scraped movie
names, years, imdb_ratings, metascores, votes = [], [], [], [], []

# Query-string fragments, kept as strings so they can be concatenated into the URL
pages = list(map(str, range(1, 5)))
years_url = list(map(str, range(2000, 2018)))

# Sent with every request to ask for English-language content
headers = {"Accept-Language": "en-US, en;q=0.5"}

In [3]:
# Scrape the IMDb search results: for every release year, walk the result
# pages (sorted by number of votes) and record each movie that has a Metascore.

# Preparing the monitoring of the loop
start_time = time()
requests = 0
# Exactly one request is expected per (year, page) pair; deriving the limit
# from the inputs keeps it correct if the year or page ranges are changed.
expected_requests = len(years_url) * len(pages)

# Iterate over one flattened sequence of (year, page) pairs so that the
# `break` below stops the whole scrape rather than only the current year.
for year_url, page in ((y, p) for y in years_url for p in pages):
    print(f"page {page}")
    # Make a get request
    response = get('http://www.imdb.com/search/title?release_date=' + year_url +
                   '&sort=num_votes,desc&page=' + page, headers = headers)

    # Pause the loop for a random 8-15 seconds between requests
    sleep(randint(8,15))

    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

    # Safety net: stop everything if more requests were made than expected
    if requests > expected_requests:
        warn('Number of requests was greater than expected.')
        break

    # Throw a warning for non-200 status codes and skip the page:
    # there is nothing useful to parse in an error response
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        continue

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the movie containers from a single page
    mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
    print()
    # For every movie on the page
    for container in mv_containers:
        # Movies without a Metascore are skipped entirely so that all five
        # lists stay the same length
        if container.find('div', class_ = 'ratings-metascore') is None:
            continue

        # Scrape the name
        name = container.h3.a.text
        names.append(name)

        # Scrape the year (kept as the raw text of the year span)
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)

        # Scrape the IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)

        # Scrape the Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))

        # Scrape the number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))




In [5]:
# Assemble the scraped lists into one table, one column per list
df = pd.DataFrame(dict(
    movie=names,
    year=years,
    imdb=imdb_ratings,
    metascore=metascores,
    votes=votes,
))
# Summarise column dtypes and non-null counts as a quick sanity check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 5 columns):
movie        3276 non-null object
year         3276 non-null object
imdb         3276 non-null float64
metascore    3276 non-null int64
votes        3276 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 102.4+ KB


In [15]:
# Preview the first five scraped rows
df.head(n=5)

Unnamed: 0,movie,year,imdb,metascore,votes
0,Gladiator,2000,8.5,67,1227358
1,Memento,2000,8.4,80,1040921
2,Snatch,2000,8.3,55,726145
3,Requiem for a Dream,2000,8.3,68,707720
4,X-Men,2000,7.4,64,541485


## Removing duplicates

In [46]:
# Drop fully identical rows and renumber the index from 0.
# Chained reassignment is used instead of `inplace=True`, so the step reads
# top-down as one expression and no frame is mutated behind the reader's back.
df = df.drop_duplicates().reset_index(drop=True)
df.head()

Unnamed: 0,movie,year,imdb,metascore,votes
0,Gladiator,2000,8.5,67,1227358
1,Memento,2000,8.4,80,1040921
2,Snatch,2000,8.3,55,726145
3,Requiem for a Dream,2000,8.3,68,707720
4,X-Men,2000,7.4,64,541485


## Movies with an IMDb rating > 8.0 and a Metascore > 80


In [49]:
# Keep only the movies rated above 8.0 on IMDb and above 80 on Metascore
high_imdb = df['imdb'].gt(8.0)
high_metascore = df['metascore'].gt(80)
res = df.loc[high_imdb & high_metascore]
# Show just the title and release year of the matches
res.loc[:, ['movie', 'year']]

Unnamed: 0,movie,year
20,Amores Perros,2000
43,In the Mood for Love,2000
49,The Lord of the Rings: The Fellowship of the Ring,2001
56,Spirited Away,2001
94,The Lord of the Rings: The Two Towers,2002
96,The Pianist,2002
142,The Lord of the Rings: The Return of the King,2003
145,Finding Nemo,2003
189,Eternal Sunshine of the Spotless Mind,2004
192,Million Dollar Baby,2004
