# API

In [45]:
import pandas as pd
import json
import requests
import time
pd.options.display.max_columns = 30

In [36]:
# Query-string fragment ("api_key=<key>") appended to every TMDB request URL.
# NOTE(review): the key is committed in plain text; consider loading it from
# configuration instead. It is kept unchanged here so the URLs stay the same.
api_key = "api_key=08cd2508b22085e68cf85cbcadecf313"


def urltv(i):
    """Return the TMDB v3 details URL for the TV show with id ``i``."""
    return f"https://api.themoviedb.org/3/tv/{i}?{api_key}"


def urlmovie(i):
    """Return the TMDB v3 details URL for the movie with id ``i``."""
    return f"https://api.themoviedb.org/3/movie/{i}?{api_key}"

def api_thmdb(tv_id, tv=True, trst=0.2, timeout=10):
    """Fetch TMDB detail records for several ids and return them as a table.

    Args:
        tv_id: Iterable of TMDB ids to look up.
        tv: If true, each URL is built with ``urltv``; otherwise with
            ``urlmovie``.
        trst: Seconds to sleep before every request.
        timeout: Seconds ``requests`` waits on the server before raising, so
            a stalled connection cannot block the loop indefinitely.

    Returns:
        A ``pandas.DataFrame`` with one row per id whose request answered
        with HTTP 200. Ids answered with any other status are skipped
        silently, so the result may have fewer rows than ``tv_id`` has ids.
    """
    json_lst = []
    for i in tv_id:
        time.sleep(trst)
        r = requests.get(urltv(i) if tv else urlmovie(i), timeout=timeout)
        if r.status_code == 200:
            json_lst.append(r.json())
    return pd.DataFrame(json_lst)

## Fetching a single item

In [34]:
# Fetch the details of one item (id 550); tv=False selects the movie URL.
df = api_thmdb(tv_id=[550], tv=False)

In [None]:
# Normalize semi-structured JSON data into a flat table: one row per entry of
# the nested list named by ``record_path``, with the ``meta`` fields repeated
# on every row.
# NOTE(review): ``data`` is not assigned anywhere above this cell; presumably
# it holds the raw JSON of the item fetched earlier -- confirm before running.
pd.json_normalize(
    data=data,
    record_path="genres",
    meta="title",
)
pd.json_normalize(
    data=data,
    record_path="production_companies",
    meta=["title", "release_date"],
)

## Getting the most popular TV series with air dates from 2015 to 2022

In [42]:
# Assemble the discover/tv request: endpoint first, then the key fragment,
# then the sort order and air-date filters.
root_url = "https://api.themoviedb.org/3/discover/tv?"
query = (
    "&sort_by=popularity.desc"
    "&air_date.gte=2015"
    "&air_date.lte=2022"
    "&include_null_first_air_dates=false"
)
url = f"{root_url}{api_key}{query}"

In [43]:
# Query the discover endpoint. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() reports an HTTP error directly
# instead of a later KeyError on the missing "results" field.
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.json()
df = pd.DataFrame(data["results"])

## Get all info about the most popular TV-series

In [44]:
# Collect the ids from the discover results, then rebind ``df`` to the
# detail records fetched for those ids.
tv_id = df["id"].to_list()
df = api_thmdb(tv_id=tv_id, tv=True)

## Saving our JSON file

In [None]:
# Write the table to disk as a JSON array holding one object per row.
df.to_json(path_or_buf="tvs.json", orient="records")

In [None]:
# Opening json

# Decode explicitly as UTF-8 (the standard JSON encoding) rather than with
# the platform's default encoding, so the load behaves the same everywhere.
with open("tvs.json", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Json to DataFrame

# Either from the records already loaded in memory ...
pd.json_normalize(data=data)
# ... or straight from the file on disk.
pd.read_json(path_or_buf="tvs.json")

# Web scraping

In [70]:
from bs4 import BeautifulSoup
import re
import numpy as np

## Scrape IMDB

In [64]:
url = "https://www.imdb.com/search/title/?year=2008-01-01,2008-12-31&sort=num_votes,desc&start=1&ref_=adv_nxt"

# The timeout stops a stalled connection from hanging the notebook.
html = requests.get(url, timeout=10).content
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html, "html.parser")

In [65]:
# Regex matching a run of one or more digits (applied to the year text below).
pattern = r"[\d]+"

# CSS selectors, one per field of a search-result entry.
mnr = "h3.lister-item-header"                        # the whole header element
mt = "h3.lister-item-header>a"                       # link directly inside the header
md = "h3.lister-item-header>span:last-child"         # last span inside the header
mrat = "div.ratings-imdb-rating>strong"              # <strong> inside the rating div
mrun = "span.runtime"                                # runtime span
mgro = "p.sort-num_votes-visible>span:nth-child(5)"  # fifth child of the votes paragraph

In [66]:
# Title
# NOTE(review): this uses the whole-header selector ``mnr`` while ``mt`` is
# defined but not used in the visible cells -- confirm which one was intended.
[tag.text for tag in soup.select(mnr)[:5]]
# Year: the digit runs found in the text before the first en dash
[re.findall(pattern, tag.text.split("–")[0]) for tag in soup.select(md)[:8]]
# Rating
[tag.text for tag in soup.select(mrat)[:8]]
# Runtime
[tag.text for tag in soup.select(mrun)[:8]]
# Gross
[tag.text for tag in soup.select(mgro)[:8]]

['$534.86M',
 '$223.81M',
 '$318.41M',
 '$141.32M',
 '$148.10M',
 '$127.51M',
 '$145.00M',
 '$134.52M']

In [62]:
# Get links / href
# Select every result container, then read the href of each link that is a
# direct child of the first container's <h3>.
cont = soup.select("div.lister-item")
[anchor["href"] for anchor in cont[0].select("h3 > a")]

['/title/tt0468569/']

## Container method

In [71]:
# Build a function that takes care of missing data

def item_extr(item_cont, key):
    """Return the text of the first element matching ``key``, or NaN if none.

    ``item_cont`` is any object whose ``select(key)`` returns a sequence of
    elements exposing a ``.text`` attribute. An empty result yields ``np.nan``
    so that a missing field does not raise.
    """
    matches = item_cont.select(key)
    if not matches:
        return np.nan
    return matches[0].text

In [72]:
# Selector for one result container, plus the selector of each field
# looked up inside a container.
mm = "div.lister-item"
name_key = "h3 > a"
gross_key = "p.sort-num_votes-visible > span:nth-child(5)"
year_key = "h3 > span.lister-item-year"
rating_key = "div.inline-block > strong"
runtime_key = "span.runtime"

# One output list per field; entry n of every list comes from container n.
name_lst, gross_lst, year_lst, rating_lst, runtime_lst = [], [], [], [], []

# Pair each output list with the selector that feeds it, in the order
# name, gross, year, rating, runtime.
targets = (
    (name_lst, name_key),
    (gross_lst, gross_key),
    (year_lst, year_key),
    (rating_lst, rating_key),
    (runtime_lst, runtime_key),
)

for i in soup.select(mm):
    # item_extr supplies NaN for a field the container lacks, which keeps
    # all five lists the same length.
    for column, selector in targets:
        column.append(item_extr(i, selector))