<a href="https://colab.research.google.com/github/WittmannF/imdb-tv-ratings/blob/master/IMDB_get_ratings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get all ratings from IMDb TV series
Get all ratings from a IMDb TV Series and plot them:

In [0]:
from bs4 import BeautifulSoup as BS
import requests
from time import sleep
import numpy as np
import re
import pandas as pd

In [0]:
def get_ratings(serie_code, season):
    URL = f'https://www.imdb.com/title/{serie_code}/episodes?season={season}'
    c = requests.get(URL).content
    soup = BS(c, 'html.parser')
    allrates = soup.findAll("div", { "class" : 'ipl-rating-star small' })
    ratings = [float(re.search('ipl-rating-star__rating">(.*)</span', str(ar)).group(1)) for ar in allrates]
    return ratings

In [0]:
def extract_re(code, array):
    return re.search(code, str(array)).group(1)

In [0]:
def get_soup(URL):
    c = requests.get(URL).content
    soup = BS(c, 'html.parser')
    return soup

In [0]:
RE_KEY = 'href="/title/(.*)/" title='
RE_TITLE = '">(.*)</a>'
RE_YEAR = 'secondaryInfo">((.*))</span>'
RE_RATING = '<strong title="(.*) based on'
RE_NRATINGS = 'based on (.*) user ratings">'

def get_top_ratings(URL):
    print('Send request')
    soup = get_soup(URL)
    
    print('Create lists with html tags')
    key_title_year = soup.findAll("td", {"class": "titleColumn"})
    rating_and_count = soup.findAll("td", {"class": "ratingColumn imdbRating"})
    
    print('Extract info from html formatting')
    keys = [extract_re(RE_KEY, n) for n in key_title_year]
    titles = [extract_re(RE_TITLE, n) for n in key_title_year]
    ratings = [extract_re(RE_RATING, n) for n in rating_and_count]
    nratings = [extract_re(RE_NRATINGS, n) for n in rating_and_count]
    years = [extract_re(RE_YEAR, n).replace('(','').replace(')','') for n in key_title_year]
    
    print('Convert to a dataframe')
    df = pd.DataFrame(zip(titles, years, ratings, nratings), index=keys, columns=['Title', 'Year', 'Rating', 'Rating Count'])
    
    print('Done')
    return df

In [0]:
get_ratings('tt2802850', 2)

[9.0, 8.7, 8.8, 8.9, 9.3, 9.4, 9.0, 9.5, 9.6, 8.6]

In [0]:
flatten = lambda l: [item for sublist in l for item in sublist]

def print_stats(rates):
    print(f'Mean: {np.mean(flatten(rates))}')
    print(f'Median: {np.median(flatten(rates))}')

def get_all_ratings(serie_code, max_seasons=100):
    all_ratings = []
    for season in range(1,max_seasons+1):
        ratings = get_ratings(serie_code, season)
        try:
            unrepeated = ratings!=all_ratings[-1]
        except:
            unrepeated = True
        if len(ratings)>0 and unrepeated:
            all_ratings.append(ratings)
            print(f'Season {season} = {ratings}')
            sleep(1)
        else:
            print_stats(all_ratings)
            return convert2df(all_ratings, serie_code)
            break
    print_stats(all_ratings)
    return convert2df(all_ratings, serie_code)

In [0]:
def convert2df(all_ratings, code):
    season_number = []
    episode_number = []
    ratings = []
    for i in range(len(all_ratings)):
        for j in range(len(all_ratings[i])):
            season_number.append(i+1)
            episode_number.append(j+1)
            ratings.append(all_ratings[i][j])

    serie_df = pd.DataFrame(zip(season_number, episode_number, ratings), columns=['Season', 'Episode', 'Rating'])
    serie_df['Code'] = code
    return serie_df

In [0]:
get_all_ratings('tt2802850', max_seasons=8)

Season 1 = [9.4, 8.5, 8.7, 9.1, 8.6, 9.4, 9.1, 8.8, 9.4, 9.4]
Season 2 = [9.0, 8.7, 8.8, 8.9, 9.3, 9.4, 9.0, 9.5, 9.6, 8.6]
Season 3 = [8.7, 8.2, 8.1, 8.4, 8.5, 8.9, 8.5, 9.1, 9.0, 8.7]
Mean: 8.91
Median: 8.9


[[9.4, 8.5, 8.7, 9.1, 8.6, 9.4, 9.1, 8.8, 9.4, 9.4],
 [9.0, 8.7, 8.8, 8.9, 9.3, 9.4, 9.0, 9.5, 9.6, 8.6],
 [8.7, 8.2, 8.1, 8.4, 8.5, 8.9, 8.5, 9.1, 9.0, 8.7]]

In [0]:
fargo = [[9.4, 8.5, 8.7, 9.1, 8.6, 9.4, 9.1, 8.8, 9.4, 9.4],
 [9.0, 8.7, 8.8, 8.9, 9.3, 9.4, 9.0, 9.5, 9.6, 8.6],
 [8.7, 8.2, 8.1, 8.4, 8.5, 8.9, 8.5, 9.1, 9.0, 8.7]]

In [0]:
list(range(1,6))

[1, 2, 3, 4, 5]

In [0]:
np.mean(fargo)

8.91

In [0]:
np.median(fargo)

8.9

In [0]:
got = get_all_ratings('tt0944947', max_seasons=8)

Season 1 = [9.1, 8.8, 8.7, 8.8, 9.1, 9.2, 9.3, 9.1, 9.6, 9.5]
Season 2 = [8.9, 8.6, 8.9, 8.9, 8.9, 9.1, 9.0, 8.9, 9.7, 9.4]
Season 3 = [8.9, 8.7, 8.9, 9.6, 9.0, 8.9, 8.8, 9.1, 9.9, 9.2]
Season 4 = [9.1, 9.7, 8.9, 8.9, 8.8, 9.7, 9.2, 9.7, 9.6, 9.7]
Season 5 = [8.6, 8.6, 8.6, 8.8, 8.7, 8.1, 9.1, 9.9, 9.5, 9.1]
Season 6 = [8.6, 9.4, 8.8, 9.2, 9.7, 8.5, 8.7, 8.5, 9.9, 9.9]
Season 7 = [8.7, 9.0, 9.3, 9.8, 9.0, 9.2, 9.5]
Season 8 = [8.3, 8.6, 8.3, 6.4, 7.1]


In [0]:
got

[[9.1, 8.8, 8.7, 8.8, 9.1, 9.2, 9.3, 9.1, 9.6, 9.5],
 [8.9, 8.6, 8.9, 8.9, 8.9, 9.1, 9.0, 8.9, 9.7, 9.4],
 [8.9, 8.7, 8.9, 9.6, 9.0, 8.9, 8.8, 9.1, 9.9, 9.2],
 [9.1, 9.7, 8.9, 8.9, 8.8, 9.7, 9.2, 9.7, 9.6, 9.7],
 [8.6, 8.6, 8.6, 8.8, 8.7, 8.1, 9.1, 9.9, 9.5, 9.1],
 [8.6, 9.4, 8.8, 9.2, 9.7, 8.5, 8.7, 8.5, 9.9, 9.9],
 [8.7, 9.0, 9.3, 9.8, 9.0, 9.2, 9.5],
 [8.3, 8.6, 8.3, 6.4, 7.1]]

In [0]:
np.mean(flatten(got))

9.016666666666667

In [0]:
np.median(flatten(got))

9.0

In [0]:
rickmorty = get_all_ratings('tt2861424')

Season 1 = [8.0, 8.5, 8.3, 8.5, 8.8, 8.8, 8.0, 8.5, 8.3, 9.1, 8.3]
Season 2 = [8.6, 8.6, 8.4, 9.3, 8.1, 9.0, 8.3, 7.5, 8.3, 9.0]
Season 3 = [9.5, 8.1, 9.2, 8.1, 8.4, 8.8, 9.8, 8.8, 7.9, 8.1]
Mean: 8.545161290322582
Median: 8.5


In [0]:
avatar = get_all_ratings('tt0417299')

Season 1 = [8.6, 8.2, 8.4, 8.5, 8.2, 8.3, 8.1, 8.3, 8.9, 8.1, 8.0, 7.2, 9.0, 9.0, 8.0, 8.0, 8.5, 8.1, 8.6, 9.4]
Season 2 = [8.7, 8.3, 8.5, 7.9, 7.7, 9.1, 9.4, 8.9, 8.6, 8.8, 8.8, 8.4, 9.0, 8.7, 9.1, 8.8, 9.0, 8.6, 9.0, 9.5]
Season 3 = [8.7, 8.5, 8.1, 8.8, 8.5, 9.4, 8.3, 9.0, 7.9, 9.1, 9.3, 8.9, 9.1, 8.7, 9.2, 8.9, 8.7, 9.1, 9.4, 9.6, 9.8]
Mean: 8.675409836065572
Median: 8.7


In [0]:
it_crowd = get_all_ratings('tt0487831')

Season 1 = [7.7, 8.3, 7.9, 8.0, 8.5, 8.2]
Season 2 = [9.4, 8.2, 8.3, 8.7, 7.9, 7.7]
Season 3 = [7.8, 8.4, 8.2, 9.0, 8.5, 7.8]
Season 4 = [8.1, 8.4, 7.5, 8.3, 8.4, 7.6]
Season 5 = [8.6]
Mean: 8.216000000000001
Median: 8.2


In [0]:
soup = get_soup('https://www.imdb.com/chart/toptv')

In [0]:
name_code = soup.findAll("td", {"class": "titleColumn"}); name_code[0]

<td class="titleColumn">
      1.
      <a href="/title/tt5491994/" title="David Attenborough">Planet Earth II</a>
<span class="secondaryInfo">(2016)</span>
</td>

In [0]:
serie_ranks = soup.findAll("td", {"class": "ratingColumn imdbRating"}); serie_ranks[0]

<td class="ratingColumn imdbRating">
<strong title="9.5 based on 69,834 user ratings">9.5</strong>
</td>

In [0]:
RE_CODE = 'href="/title/(.*)/" title='
RE_NAME = '">(.*)</a>'
RE_RATE = '<strong title="(.*) based on'
RE_NRATES = 'based on (.*) user ratings">'

bs4.element.Tag

In [0]:
names = [extract_re(RE_NAME, n) for n in name_code]
codes = [extract_re(RE_CODE, n) for n in name_code]
series_rates = [extract_re(RE_RATE, n) for n in serie_ranks]
number_rates = [extract_re(RE_NRATES, n) for n in serie_ranks]

In [0]:
all_series = pd.DataFrame(zip(names, codes, series_rates, number_rates), columns=['Title', 'Code', 'Rating', 'Rating Count'])
all_series.index = all_series['Code']

In [0]:
all_series.head()

Unnamed: 0,Title,Code,Rating,Rating Count,Rank
0,Planet Earth II,tt5491994,9.5,69834,1
1,Band of Brothers,tt0185906,9.4,314596,2
2,Game of Thrones,tt0944947,9.4,1477002,3
3,Planet Earth,tt0795176,9.4,146143,4
4,Breaking Bad,tt0903747,9.4,1185584,5


In [0]:
ratings_df = None
for code in all_series['Code']:
    print(all_series[all_series.index==code]['Title'][0])
    try:
        ratings_df = ratings_df.append(get_all_ratings(code))
    except: #initialize ratings_df if it is the first time
        ratings_df = get_all_ratings(code)
    
    

Planet Earth II
Season 1 = [8.0, 7.3, 6.9, 6.8, 6.8, 6.8]
Mean: 7.1000000000000005
Median: 6.85
Band of Brothers
Season 1 = [8.5, 8.8, 8.6, 8.5, 8.4, 8.7, 8.9, 8.4, 8.9, 8.6]
Mean: 8.63
Median: 8.6
Game of Thrones
Season 1 = [9.1, 8.8, 8.7, 8.8, 9.1, 9.2, 9.3, 9.1, 9.6, 9.5]
Season 2 = [8.9, 8.6, 8.9, 8.9, 8.9, 9.1, 9.0, 8.9, 9.7, 9.4]
Season 3 = [8.9, 8.7, 8.9, 9.6, 9.0, 8.9, 8.8, 9.1, 9.9, 9.2]
Season 4 = [9.1, 9.7, 8.9, 8.9, 8.8, 9.7, 9.2, 9.7, 9.6, 9.7]
Season 5 = [8.6, 8.6, 8.6, 8.8, 8.7, 8.1, 9.1, 9.9, 9.5, 9.1]
Season 6 = [8.6, 9.4, 8.8, 9.2, 9.7, 8.5, 8.7, 8.5, 9.9, 9.9]
Season 7 = [8.7, 9.0, 9.3, 9.8, 9.0, 9.2, 9.5]
Season 8 = [8.3, 8.6, 8.2, 6.4, 7.3]
Mean: 9.018055555555556
Median: 9.0
Planet Earth
Season 1 = [7.9, 7.5, 7.4, 7.3, 7.3, 7.3, 7.2, 7.3, 7.2, 7.2, 7.2]
Mean: 7.345454545454547
Median: 7.3
Breaking Bad
Season 1 = [8.9, 8.7, 8.7, 8.2, 8.3, 9.2, 8.8]
Season 2 = [8.7, 9.2, 8.3, 8.1, 8.3, 8.8, 8.7, 9.0, 8.9, 8.6, 8.8, 9.1, 9.1]
Season 3 = [8.6, 8.7, 8.4, 8.1, 8.7, 9.2,

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Mean: nan
Median: nan
Borgen
Season 1 = [8.0, 8.1, 7.8, 8.1, 7.9, 7.9, 7.9, 7.9, 8.3, 8.1]
Season 2 = [7.9, 8.2, 8.0, 8.3, 8.0, 8.5, 7.8, 8.1, 8.1, 8.3]
Season 3 = [7.7, 8.0, 7.7, 7.8, 8.0, 7.8, 8.0, 8.1, 8.2, 8.3]
Mean: 8.026666666666666
Median: 8.0
Inside No. 9
Season 1 = [8.1, 8.9, 8.5, 7.0, 7.1, 7.3]
Season 2 = [8.2, 9.3, 7.4, 8.5, 7.6, 7.3]
Season 3 = [7.8, 8.3, 8.6, 7.1, 7.9, 7.7]
Season 4 = [8.1, 8.4, 8.6, 8.0, 6.9, 8.0]
Season 5 = [8.1]
Mean: 7.9479999999999995
Median: 8.0
Doctor Who
Season 1 = [8.2, 8.4, 6.6, 6.5, 6.7, 7.9, 7.9, 7.8, 7.8, 7.5, 7.3, 7.8, 7.0, 7.1, 7.6, 7.3, 7.3, 7.3, 7.3, 7.3, 7.4, 7.3, 7.4, 7.2, 7.0, 7.4, 7.4, 7.5, 7.6, 7.7, 7.7, 7.1, 6.9, 6.7, 6.8, 6.6, 6.6, 7.0, 6.8, 6.7, 6.6, 6.6, 6.8]
Season 2 = [7.1, 7.1, 6.9, 8.1, 8.2, 8.2, 8.0, 8.0, 8.6, 7.4, 7.4, 7.6, 7.6, 7.7, 7.8, 6.6, 6.2, 6.0, 5.9, 6.0, 5.9, 7.3, 7.2, 7.4, 7.3, 7.9, 7.1, 6.8, 6.7, 7.4, 7.2, 7.3, 7.0, 7.3, 8.0, 7.9, 7.7, 7.9, 8.3]
Season 3 = [6.7, 6.6, 6.8, 6.7, 7.4, 7.5, 7.6, 7.6, 7.9, 8.2, 8.2, 7.

In [0]:
!pip install easycolab
import easycolab as ec
ec.mount()

Collecting easycolab
  Downloading https://files.pythonhosted.org/packages/7a/ff/017693c8f12c9b586d2bc9965ebed05b8aa1d2ba0ec9f1d88df9f0a70542/easycolab-0.1b29.tar.gz
Building wheels for collected packages: easycolab
  Building wheel for easycolab (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/1b/c3/03/9d9371c4d3a117eff9caf88148e2f18ddf556543b4475055f1
Successfully built easycolab
Installing collected packages: easycolab
Successfully installed easycolab-0.1b29
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
·····

In [0]:
!mkdir imdb-tv

In [0]:
cd imdb-tv

/content/gdrive/My Drive/imdb-tv


In [0]:
all_series.to_csv("top250ratings.csv")

In [0]:
ratings_df.to_csv('allratings.csv')

In [0]:
rating_means = ratings_df.groupby('Code')['Rating'].mean()

In [0]:
rating_means_sorted = rating_means.sort_values(ascending=False)

In [0]:
all_series.join(rating_means_sorted, on='Code')

ValueError: ignored

In [0]:
all_series = all_series.drop('Code', axis=1)

In [0]:
rating_means_sorted.name='Rating Mean'

In [0]:
all_series_mean = all_series.join(rating_means_sorted)

In [0]:
all_series_mean_sorted = all_series_mean.sort_values('Rating Mean', ascending=False)

In [0]:
all_series_mean_sorted.to_csv('all-series-ep-average.csv')

In [0]:
all_series_mean_sorted

Unnamed: 0_level_0,Title,Rating,Rating Count,Rank,Rating Mean
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt7366338,Chernobyl,9.1,5431,12,9.300000
tt9253866,Our Planet,9.3,8356,6,9.262500
tt2560140,Attack on Titan,8.7,109643,66,9.258824
tt8595766,Yeh Meri Family,8.5,15026,159,9.200000
tt2395695,Cosmos,9.2,94934,9,9.076923
tt3322312,Daredevil,8.6,326756,101,9.023077
tt0944947,Game of Thrones,9.4,1477002,3,9.018056
tt4269716,Umbre,8.5,5697,179,9.007143
tt4508902,One Punch Man,8.8,81313,34,9.000000
tt2433738,Wentworth,8.5,14125,205,8.984286


# Get All Movie Ratings from Top 250 IMDb Movies


In [35]:
URL = 'https://www.imdb.com/chart/top'

top_movies = get_top_ratings(URL)

Send request
Create lists with html tags
Extract info from html formatting
Convert to a dataframe
Done


In [36]:
top_movies.head()

Unnamed: 0,Title,Year,Rating,Rating Count
tt0111161,The Shawshank Redemption,1994,9.2,2089382
tt0068646,The Godfather,1972,9.2,1433884
tt0071562,The Godfather: Part II,1974,9.0,995997
tt0468569,The Dark Knight,2008,9.0,2055819
tt0050083,12 Angry Men,1957,8.9,591688


In [37]:
!pip install easycolab
import easycolab as ec
ec.mount()

Collecting easycolab
  Downloading https://files.pythonhosted.org/packages/7a/ff/017693c8f12c9b586d2bc9965ebed05b8aa1d2ba0ec9f1d88df9f0a70542/easycolab-0.1b29.tar.gz
Building wheels for collected packages: easycolab
  Building wheel for easycolab (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/1b/c3/03/9d9371c4d3a117eff9caf88148e2f18ddf556543b4475055f1
Successfully built easycolab
Installing collected packages: easycolab
Successfully installed easycolab-0.1b29
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
·····

In [38]:
cd imdb-tv/

/content/gdrive/My Drive/imdb-tv


In [0]:
top_movies.to_csv('top-250-movie-ratings.csv')