<a href="https://colab.research.google.com/github/WittmannF/imdb-tv-ratings/blob/master/notebooks/00.scrape-imdb-ratings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from bs4 import BeautifulSoup as BS
import requests
from time import sleep
import numpy as np
import re
import pandas as pd

In [39]:
# Regex patterns applied (via extract_re) to the string form of IMDb chart
# table cells. Each one exposes the wanted value as capture group 1.

# Patterns for <td class="titleColumn"> cells.
RE_KEY = 'href="/title/(.*)/" title='      # title code inside the link href
RE_TITLE = '">(.*)</a>'                    # link text
# The doubled parentheses are regex groups, not literals: group 1 captures
# everything between the tags, and get_top_ratings strips '(' and ')' afterwards.
RE_YEAR = 'secondaryInfo">((.*))</span>'

# Patterns for <td class="ratingColumn imdbRating"> cells.
RE_RATING = '<strong title="(.*) based on'
RE_NRATINGS = 'based on (.*) user ratings">'

# Alternate names used by get_all_series_info; aliased so the patterns are
# defined in exactly one place.
RE_CODE = RE_KEY
RE_NAME = RE_TITLE
RE_RATE = RE_RATING
RE_NRATES = RE_NRATINGS

def get_top_ratings(URL):
    """Scrape an IMDb chart page into a DataFrame indexed by title code.

    Progress messages are printed at each stage.

    Parameters
    ----------
    URL : str
        Address of the chart page to download (fetched through get_soup).

    Returns
    -------
    pandas.DataFrame
        One row per chart entry, indexed by the value captured with RE_KEY,
        with columns 'Title', 'Year', 'Rating' and 'Rating Count'. Every
        value is the raw string captured from the HTML; nothing is converted
        to a numeric type here.

    Raises
    ------
    AttributeError
        Propagated from extract_re when a pattern does not match a cell.
    """
    print('Send request')
    soup = get_soup(URL)

    print('Create lists with html tags')
    # find_all is the supported Beautiful Soup 4 spelling; findAll is a
    # legacy alias carried over from Beautiful Soup 3.
    title_cells = soup.find_all("td", {"class": "titleColumn"})
    rating_cells = soup.find_all("td", {"class": "ratingColumn imdbRating"})

    print('Extract info from html formatting')
    keys = [extract_re(RE_KEY, cell) for cell in title_cells]
    titles = [extract_re(RE_TITLE, cell) for cell in title_cells]
    ratings = [extract_re(RE_RATING, cell) for cell in rating_cells]
    nratings = [extract_re(RE_NRATINGS, cell) for cell in rating_cells]
    # RE_YEAR captures the surrounding parentheses too, so drop them here.
    years = [
        extract_re(RE_YEAR, cell).replace('(', '').replace(')', '')
        for cell in title_cells
    ]

    print('Convert to a dataframe')
    df = pd.DataFrame(
        list(zip(titles, years, ratings, nratings)),
        index=keys,
        columns=['Title', 'Year', 'Rating', 'Rating Count'],
    )

    print('Done')
    return df

def get_ratings(serie_code, season, timeout=30):
    """Return the episode ratings listed on one season page of an IMDb title.

    Parameters
    ----------
    serie_code : str
        IMDb title code, inserted into the URL path (e.g. 'tt0944947').
    season : int
        Season number, passed as the ``season`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Defaults to 30.

    Returns
    -------
    list of float
        One rating per ``<div class="ipl-rating-star small">`` element found
        on the page, in document order. Empty when the page has none.

    Raises
    ------
    AttributeError
        If a matched div does not contain the expected rating span.
    ValueError
        If the captured rating text cannot be converted to float.
    """
    URL = f'https://www.imdb.com/title/{serie_code}/episodes?season={season}'
    # requests never times out unless told to; without this a stalled
    # connection would hang the whole scraping run.
    page = requests.get(URL, timeout=timeout).content
    soup = BS(page, 'html.parser')
    # find_all is the supported spelling of the legacy findAll alias.
    star_blocks = soup.find_all("div", {"class": 'ipl-rating-star small'})
    rating_pattern = re.compile('ipl-rating-star__rating">(.*)</span')
    return [
        float(rating_pattern.search(str(block)).group(1))
        for block in star_blocks
    ]

def extract_re(code, array):
    """Return capture group 1 of the first match of ``code`` in ``str(array)``.

    ``array`` can be any object; it is converted with ``str()`` before the
    search. If the pattern does not match, ``re.search`` returns None and the
    ``.group`` access raises AttributeError.
    """
    text = str(array)
    match = re.search(code, text)
    return match.group(1)

def get_soup(URL, timeout=30):
    """Download ``URL`` and parse the response body with html.parser.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from the raw response bytes.
    """
    # requests never times out unless told to; without this a stalled
    # connection would block the notebook indefinitely.
    response = requests.get(URL, timeout=timeout)
    return BS(response.content, 'html.parser')

def flatten(l):
    """Flatten one level of nesting, e.g. [[1, 2], [3]] -> [1, 2, 3]."""
    return [item for sublist in l for item in sublist]

def print_stats(rates):
    """Print the mean and median over every value in the nested ``rates``.

    Parameters
    ----------
    rates : iterable of iterables of numbers
        For example, one list of episode ratings per season.
    """
    # Flatten once: this avoids doing the work twice and keeps the median
    # correct when ``rates`` is a one-shot iterable, which a second flatten
    # pass would find already exhausted.
    flat = flatten(rates)
    print(f'Mean: {np.mean(flat)}')
    print(f'Median: {np.median(flat)}')

def get_all_ratings(serie_code, max_seasons=100):
    """Collect episode ratings season by season for one IMDb title.

    Seasons are requested in order starting at 1. Collection stops at the
    first season that yields no ratings, or whose ratings are identical to
    the previous season's, or once ``max_seasons`` seasons were collected.

    Parameters
    ----------
    serie_code : str
        IMDb title code passed through to get_ratings.
    max_seasons : int, optional
        Upper bound on the number of seasons to request. Defaults to 100.

    Returns
    -------
    pandas.DataFrame
        Result of ``convert2df`` for the collected seasons (columns
        'Season', 'Episode', 'Rating', 'Code').
    """
    all_ratings = []
    for season in range(1, max_seasons + 1):
        ratings = get_ratings(serie_code, season)
        # A page repeating the previous season's ratings is treated as
        # "no more seasons" (presumably the site serves an existing season
        # for out-of-range numbers -- TODO confirm). Side effect: two real
        # consecutive seasons with identical ratings would also stop here.
        is_repeat = bool(all_ratings) and ratings == all_ratings[-1]
        if not ratings or is_repeat:
            break
        all_ratings.append(ratings)
        # Pause between successful requests to go easy on the server.
        sleep(1)
    return convert2df(all_ratings, serie_code)

def convert2df(all_ratings, code):
    """Turn nested per-season ratings into a long-format DataFrame.

    Parameters
    ----------
    all_ratings : list of lists
        ``all_ratings[i][j]`` is the rating of episode ``j + 1`` in season
        ``i + 1`` (numbering is purely positional).
    code : object
        Value stored in the 'Code' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Season', 'Episode', 'Rating', 'Code'; one row per rating.
    """
    rows = [
        (season_no, episode_no, rating)
        for season_no, season_ratings in enumerate(all_ratings, start=1)
        for episode_no, rating in enumerate(season_ratings, start=1)
    ]
    serie_df = pd.DataFrame(rows, columns=['Season', 'Episode', 'Rating'])
    serie_df['Code'] = code
    return serie_df

def get_all_series_info():
    """Scrape the IMDb Top TV chart into a DataFrame indexed by title code.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title', 'Code', 'Rating', 'Rating Count', holding the raw
        strings captured from the HTML. The index is a copy of the 'Code'
        column, so the code is available both ways.

    Raises
    ------
    IndexError
        If the page contains no title or rating cells.
    AttributeError
        Propagated from extract_re when a pattern does not match a cell.
    """
    chart_url = 'https://www.imdb.com/chart/toptv'
    soup = get_soup(chart_url)
    # find_all is the supported spelling of the legacy findAll alias.
    name_code = soup.find_all("td", {"class": "titleColumn"})
    serie_ranks = soup.find_all("td", {"class": "ratingColumn imdbRating"})
    if not name_code or not serie_ranks:
        # Replaces the stray `name_code[0]` / `serie_ranks[0]` expressions,
        # which failed the same way on an empty result but with an opaque
        # message. IndexError is kept so existing handlers still apply.
        raise IndexError(
            f'No chart rows found at {chart_url}; the page layout may have changed'
        )

    names = [extract_re(RE_NAME, cell) for cell in name_code]
    codes = [extract_re(RE_CODE, cell) for cell in name_code]
    series_rates = [extract_re(RE_RATE, cell) for cell in serie_ranks]
    number_rates = [extract_re(RE_NRATES, cell) for cell in serie_ranks]

    all_series = pd.DataFrame(
        list(zip(names, codes, series_rates, number_rates)),
        columns=['Title', 'Code', 'Rating', 'Rating Count'],
    )
    all_series.index = all_series['Code']
    return all_series


In [7]:
# Spot check: episode ratings for season 2 of title tt2802850.
get_ratings('tt2802850', 2)

[8.8, 8.5, 8.6, 8.7, 9.1, 9.2, 8.8, 9.4, 9.4, 8.5]

In [9]:
# Collect ratings for up to 8 seasons of title tt0944947 into one DataFrame.
# NOTE(review): the saved output under this cell (per-season lists, Mean/Median)
# comes from print calls that are no longer active in get_all_ratings, so a
# fresh run of this cell prints nothing.
got = get_all_ratings('tt0944947', max_seasons=8)

Season 1 = [8.9, 8.6, 8.5, 8.6, 9.0, 9.1, 9.1, 8.9, 9.6, 9.4]
Season 2 = [8.7, 8.4, 8.7, 8.6, 8.6, 8.9, 8.8, 8.6, 9.6, 9.3]
Season 3 = [8.6, 8.5, 8.7, 9.5, 8.9, 8.7, 8.6, 8.9, 9.9, 9.1]
Season 4 = [9.0, 9.7, 8.7, 8.7, 8.6, 9.7, 9.0, 9.7, 9.6, 9.6]
Season 5 = [8.4, 8.4, 8.4, 8.6, 8.5, 7.9, 8.9, 9.8, 9.4, 9.1]
Season 6 = [8.4, 9.3, 8.6, 9.0, 9.7, 8.3, 8.5, 8.3, 9.9, 9.9]
Season 7 = [8.5, 8.8, 9.1, 9.7, 8.7, 9.0, 9.4]
Season 8 = [7.6, 7.9, 7.5, 5.5, 6.0, 4.0]
Mean: 8.747945205479452
Median: 8.8


In [11]:
# Preview the first rows of the per-episode ratings frame.
got.head()

Unnamed: 0,Season,Episode,Rating,Code
0,1,1,8.9,tt0944947
1,1,2,8.6,tt0944947
2,1,3,8.5,tt0944947
3,1,4,8.6,tt0944947
4,1,5,9.0,tt0944947


In [28]:
# Build the series-level table from the IMDb Top TV chart and preview it.
all_series = get_all_series_info()
all_series.head()

Unnamed: 0_level_0,Title,Code,Rating,Rating Count
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt5491994,Planet Earth II,tt5491994,9.4,142844
tt0903747,Breaking Bad,tt0903747,9.4,1817275
tt0795176,Planet Earth,tt0795176,9.4,208191
tt0185906,Band of Brothers,tt0185906,9.4,460468
tt7366338,Chernobyl,tt7366338,9.3,729461


# Scrape Episode Ratings for All Top 250 Series

In [43]:
# Scrape the episode ratings of every series listed in `all_series` and stack
# them into one long DataFrame.
series_frames = []
# Walk codes and titles in parallel instead of looking each title up again
# with a boolean mask followed by `[0]`: integer keys on a label-indexed
# Series rely on positional fallback, which pandas has deprecated.
for i, (code, title) in enumerate(zip(all_series['Code'], all_series['Title'])):
    print(i, title)
    try:
        all_ratings = get_all_ratings(code)
        all_ratings['Title'] = title
        series_frames.append(all_ratings)
    except Exception as e:
        # Best effort: report the failure and continue with the next series.
        print(e)

# Each per-series frame keeps its own 0-based index, so index values repeat
# across series in the combined frame.
ratings_df = pd.concat(series_frames)

0 Planet Earth II
1 Breaking Bad
2 Planet Earth
3 Band of Brothers
4 Chernobyl
5 The Wire
6 Blue Planet II
7 Avatar: The Last Airbender
8 Cosmos: A Spacetime Odyssey
9 The Sopranos
10 Our Planet
11 Cosmos
12 Game of Thrones
13 Rick and Morty
14 The World at War
15 Fullmetal Alchemist: Brotherhood
16 The Last Dance
17 Life
18 Sherlock
19 The Twilight Zone
20 The Vietnam War
21 Batman: The Animated Series
22 Scam 1992: The Harshad Mehta Story
23 Arcane
24 The Blue Planet
25 Attack on Titan
26 The Office
27 Firefly
28 Human Planet
29 Frozen Planet
30 Death Note
31 Only Fools and Horses
32 True Detective
33 The Civil War
34 Hunter x Hunter
35 The Beatles: Get Back
36 Seinfeld
37 Persona
38 Fargo
39 Dekalog
40 Clarkson's Farm
41 Cowboy Bebop
42 Better Call Saul
43 Nathan for You
44 Gravity Falls
45 Friends
46 When They See Us
47 Last Week Tonight with John Oliver
48 Africa
49 TVF Pitchers
50 Apocalypse: The Second World War
51 Monty Python's Flying Circus
52 It's Always Sunny in Philadelphi

In [44]:
# Display the combined per-episode ratings of all scraped series.
ratings_df

Unnamed: 0,Season,Episode,Rating,Code,Title
0,1,1,9.4,tt5491994,Planet Earth II
1,1,2,9.1,tt5491994,Planet Earth II
2,1,3,8.9,tt5491994,Planet Earth II
3,1,4,8.8,tt5491994,Planet Earth II
4,1,5,8.6,tt5491994,Planet Earth II
...,...,...,...,...,...
176,6,23,8.2,tt0043208,I Love Lucy
177,6,24,8.8,tt0043208,I Love Lucy
178,6,25,8.4,tt0043208,I Love Lucy
179,6,26,8.5,tt0043208,I Love Lucy


In [45]:
# Persist the series-level table. 'Code' is both the index and a regular
# column of `all_series`, so the CSV contains it twice.
all_series.to_csv("00.imdb_top_250_series_global_ratings.csv")

In [46]:
# Persist the per-episode ratings. The index restarts for every series
# (frames were concatenated as-is) and is written as an unnamed first column.
ratings_df.to_csv('00.imdb_top_250_series_episode_ratings.csv')

# Get All Movie Ratings from Top 250 IMDb Movies


In [None]:
# Chart page to scrape; get_top_ratings prints its progress while it runs.
URL = 'https://www.imdb.com/chart/top'

top_movies = get_top_ratings(URL)

Send request
Create lists with html tags
Extract info from html formatting
Convert to a dataframe
Done


In [None]:
# Preview the first rows; the index holds the IMDb title codes.
top_movies.head()

Unnamed: 0,Title,Year,Rating,Rating Count
tt0111161,The Shawshank Redemption,1994,9.2,2089382
tt0068646,The Godfather,1972,9.2,1433884
tt0071562,The Godfather: Part II,1974,9.0,995997
tt0468569,The Dark Knight,2008,9.0,2055819
tt0050083,12 Angry Men,1957,8.9,591688


In [None]:
# Persist the movie chart; the title-code index is written as the first column.
top_movies.to_csv('top-250-movie-ratings.csv')