In [None]:
from dotenv import load_dotenv
import os
import time
import datetime
import json
import requests
from requests.utils import quote
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the OMDb API key from the local .env file / environment and build the
# URL prefix to which a URL-quoted title is appended for each lookup.
load_dotenv('.env')
OMDB_KEY = os.environ.get("OMDB_API_KEY")
if not OMDB_KEY:
    # Fail fast: otherwise the literal text "None" would be sent as the key for
    # every one of the per-title requests made further down.
    raise RuntimeError("OMDB_API_KEY is not set; add it to .env or export it in the environment")
# HTTPS so the API key is not transmitted in clear text.
omdb = 'https://www.omdbapi.com/?apikey={}&t='.format(OMDB_KEY)

# 1. Netflix

In [None]:
# Load the raw Netflix catalogue, discard the unused columns, switch to the
# capitalised column names used throughout this notebook, and lay the frame out
# in its final column order. reindex() creates any listed column that the CSV
# does not provide as an all-NaN column.
netflix = (
    pd.read_csv('../../data/netflix_titles.csv', sep=',', header=0)
    .drop(columns=['show_id', 'date_added', 'description'])
    .rename(columns={
        'type': 'Type',
        'title': 'Title',
        'director': 'Director',
        'cast': 'Actors',
        'country': 'Country',
        'release_year': 'Release',
        'rating': 'Rating',
        'duration': 'Runtime',
        'listed_in': 'Genre',
    })
    .reindex(columns=[
        'Title', 'Type', 'Director', 'Actors', 'Release', 'Genre', 'Runtime',
        'Language', 'Country', 'Rating', 'IMDb', 'RottenTomatoes', 'Metacritic',
    ])
)

In [None]:
# Preview the first rows of the freshly loaded Netflix frame.
netflix.head()

In [None]:
# Column dtypes and non-null counts before the OMDb enrichment.
netflix.info()

In [None]:
# Inspect the raw Actors value of the second row.
netflix.iloc[1]['Actors']

In [None]:
# Try out the ", " split on the second row's Actors string; str.split already
# returns a list, so the result is displayed as is.
listRes = netflix['Actors'].iloc[1].split(", ")
listRes

In [None]:
def extract_json_data(info, index, dataset):
    """Overwrite one row of ``dataset`` with the fields of an OMDb lookup result.

    Nothing is written unless ``info['Response']`` is the string ``'True'``.
    Fields absent from ``info`` are stored as ``'N/A'`` and ratings that are
    not listed as ``'NaN'`` (both as strings) instead of raising ``KeyError``.

    Parameters
    ----------
    info : dict
        Decoded JSON payload of an OMDb title lookup.
    index : label
        Row label of ``dataset`` to overwrite.
    dataset : pandas.DataFrame
        Frame with exactly these columns, in this order: Title, Type,
        Director, Actors, Release, Genre, Runtime, Language, Country, Rating,
        IMDb, RottenTomatoes, Metacritic. Modified in place.

    Returns
    -------
    None
    """
    if info.get('Response') != 'True':
        return

    def field(key):
        # Missing keys fall back to the same 'N/A' placeholder string that the
        # notebook's cleaning cells later replace with NaN.
        return info.get(key, 'N/A')

    media_type = field('Type')
    # Non-movie titles report a season count in the Runtime column. Types that
    # carry no 'totalSeasons' key and are not series (e.g. episodes) keep their
    # plain runtime instead of raising KeyError.
    if media_type != 'movie' and ('totalSeasons' in info or media_type == 'series'):
        runtime = str(field('totalSeasons')) + ' seasons'
    else:
        runtime = field('Runtime')

    # Rating source name -> value; 'NaN' marks a source that is not listed.
    scores = {
        'Internet Movie Database': 'NaN',
        'Rotten Tomatoes': 'NaN',
        'Metacritic': 'NaN',
    }
    for rate in info.get('Ratings') or []:
        source = rate.get('Source')
        if source in scores:
            scores[source] = rate.get('Value', 'NaN')

    # Order must match the dataset's column order documented above.
    dataset.loc[index] = [
        field('Title'),
        media_type,
        field('Director'),
        field('Actors'),
        field('Released'),
        field('Genre'),
        runtime,
        field('Language'),
        field('Country'),
        field('Rated'),
        scores['Internet Movie Database'],
        scores['Rotten Tomatoes'],
        scores['Metacritic'],
    ]

In [None]:
# Single trial lookup for the second title. The timeout keeps a stalled
# connection from blocking the kernel indefinitely.
r = requests.get(omdb+quote(netflix.iloc[1]['Title']), timeout=10)

In [None]:
# Decode the body of the trial response as JSON.
jsonRes = r.json()

In [None]:
# Display the decoded payload to see which fields are available.
jsonRes

In [None]:
# Look every Netflix title up on OMDb and overwrite its row with the result.
# The (label, title) pairs are materialised first because extract_json_data
# rewrites rows of `netflix` while the loop is running.
for index, title in tqdm(list(netflix['Title'].items())):
    if not isinstance(title, str):
        # Missing title: nothing to look up, keep the CSV row unchanged.
        continue
    try:
        # The timeout stops one stalled connection from hanging the whole run.
        r = requests.get(omdb+quote(title), timeout=10)
        info = r.json()
    except (requests.RequestException, ValueError) as err:
        # Network failure or non-JSON body: report it, keep the CSV row, move on.
        tqdm.write('OMDb lookup failed for {!r}: {}'.format(title, err))
        continue
    extract_json_data(info, index, netflix)

In [None]:
# Column dtypes and non-null counts after the OMDb enrichment.
netflix.info()

In [None]:
# Distinct Type labels now present in the frame.
netflix['Type'].unique()

In [None]:
# Rows that have no Actors value.
netflix[netflix['Actors'].isnull()]

In [None]:
# Turn the placeholder strings into real NaN *before* counting missing values,
# as the Prime Video and Disney+ sections do; otherwise 'N/A' entries are not
# counted as missing when deciding which rows to drop.
netflix.replace(['N/A', 'N/A seasons', 'NaN'], np.nan, inplace=True)
# Drop rows with 6 or more missing fields.
netflix.drop(netflix[netflix.isnull().sum(axis=1)>=6].index, inplace=True)

In [None]:
# Convert the placeholder strings ('N/A', the 'N/A seasons' produced when the
# season count is 'N/A', and the 'NaN' rating sentinel) to real missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
netflix.replace(['N/A', 'N/A seasons', 'NaN'], np.nan, inplace=True)

In [None]:
# Split the comma-separated text columns into Python lists; missing values
# stay NaN. np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column in ['Actors', 'Genre', 'Language', 'Country']:
    netflix[column] = netflix[column].apply(lambda x: np.nan if pd.isnull(x) else x.split(", "))

In [None]:
# Collapse one-element lists to their single item; longer lists are kept and
# float entries (the missing values) become NaN.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column in ['Actors', 'Genre', 'Language', 'Country']:
    netflix[column] = netflix[column].apply(
        lambda x: np.nan if isinstance(x, float) else (x[0] if len(x) == 1 else x)
    )

In [None]:
# Remove the scale marker from each rating column ('7.5/10' -> '7.5',
# '80%' -> '80', '70/100' -> '70'); values remain strings, missing stays NaN.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column, marker in [('IMDb', '/10'), ('RottenTomatoes', '%'), ('Metacritic', '/100')]:
    netflix[column] = netflix[column].apply(lambda x: np.nan if pd.isnull(x) else x.replace(marker, ''))

In [None]:
# Remove the unit text from Runtime so only the number remains (as a string).
# Plural forms come first so ' seasons' is not left behind as a stray 's'.
# The singular forms are an addition: without them a value such as '1 Season'
# would keep its unit. NOTE(review): assumes the source CSV can contain the
# singular spelling; confirm against the data.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for unit in (' min', ' seasons', ' Seasons', ' season', ' Season'):
    netflix['Runtime'] = netflix['Runtime'].apply(lambda x: np.nan if pd.isnull(x) else x.replace(unit, ''))

In [None]:
# Map the lowercase 'movie' / 'series' labels onto the 'Movie' / 'TV Show'
# spelling in a single pass.
netflix['Type'] = netflix['Type'].apply(
    lambda label: label.replace('movie', 'Movie').replace('series', 'TV Show')
)

In [None]:
# Renumber the rows 0..n-1 now that rows have been dropped; the old index is discarded.
netflix = netflix.reset_index(drop=True)

In [None]:
# Preview the cleaned Netflix frame.
netflix.head()

In [None]:
# Final column dtypes and non-null counts for the cleaned Netflix frame.
netflix.info()

In [None]:
#netflix.to_csv('../../data/netflix.csv')

# 2. Prime Video

In [None]:
# Load the raw Prime Video catalogue, discard the unused columns, switch to the
# capitalised column names used throughout this notebook, and lay the frame out
# in its final column order. reindex() creates any listed column that the CSV
# does not provide as an all-NaN column.
prime = (
    pd.read_csv('../../data/amazon_prime_titles.csv', sep=',', header=0)
    .drop(columns=['show_id', 'date_added', 'description'])
    .rename(columns={
        'type': 'Type',
        'title': 'Title',
        'director': 'Director',
        'cast': 'Actors',
        'country': 'Country',
        'release_year': 'Release',
        'rating': 'Rating',
        'duration': 'Runtime',
        'listed_in': 'Genre',
    })
    .reindex(columns=[
        'Title', 'Type', 'Director', 'Actors', 'Release', 'Genre', 'Runtime',
        'Language', 'Country', 'Rating', 'IMDb', 'RottenTomatoes', 'Metacritic',
    ])
)

In [None]:
# Preview the first rows of the freshly loaded Prime Video frame.
prime.head()

In [None]:
# Column dtypes and non-null counts before the OMDb enrichment.
prime.info()

In [None]:
# Look every Prime Video title up on OMDb and overwrite its row with the result.
# The (label, title) pairs are materialised first because extract_json_data
# rewrites rows of `prime` while the loop is running.
for index, title in tqdm(list(prime['Title'].items())):
    if not isinstance(title, str):
        # Missing title: nothing to look up, keep the CSV row unchanged.
        continue
    try:
        # The timeout stops one stalled connection from hanging the whole run.
        r = requests.get(omdb+quote(title), timeout=10)
        info = r.json()
    except (requests.RequestException, ValueError) as err:
        # Network failure or non-JSON body: report it, keep the CSV row, move on.
        tqdm.write('OMDb lookup failed for {!r}: {}'.format(title, err))
        continue
    extract_json_data(info, index, prime)

In [None]:
# Column dtypes and non-null counts after the OMDb enrichment.
prime.info()

In [None]:
# Preview the enriched rows.
prime.head()

In [None]:
#prime.to_csv('./prime.csv')

In [None]:
# Distinct Type labels now present in the frame.
prime['Type'].unique()

In [None]:
# Rows that have no Actors value.
prime[prime['Actors'].isnull()]

In [None]:
# Preview rows with at least 7 missing values.
# NOTE(review): the drop further down uses a threshold of 6, and at this point
# the 'N/A' placeholders have not been converted to NaN yet, so this preview
# does not show exactly the rows that get dropped. Confirm that is intended.
prime[prime.isnull().sum(axis=1)>=7]

In [None]:
# Convert the placeholder strings ('N/A', the 'N/A seasons' produced when the
# season count is 'N/A', and the 'NaN' rating sentinel) to real missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
prime.replace(['N/A', 'N/A seasons', 'NaN'], np.nan, inplace=True)

In [None]:
# Discard every row that has 6 or more missing fields.
prime = prime.drop(index=prime.index[prime.isnull().sum(axis=1) >= 6])

In [None]:
# Column dtypes and non-null counts after dropping the sparse rows.
prime.info()

In [None]:
# Split the comma-separated text columns into Python lists; missing values
# stay NaN. np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column in ['Actors', 'Genre', 'Language', 'Country']:
    prime[column] = prime[column].apply(lambda x: np.nan if pd.isnull(x) else x.split(", "))

In [None]:
# Collapse one-element lists to their single item; longer lists are kept and
# float entries (the missing values) become NaN.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column in ['Actors', 'Genre', 'Language', 'Country']:
    prime[column] = prime[column].apply(
        lambda x: np.nan if isinstance(x, float) else (x[0] if len(x) == 1 else x)
    )

In [None]:
# Remove the scale marker from each rating column ('7.5/10' -> '7.5',
# '80%' -> '80', '70/100' -> '70'); values remain strings, missing stays NaN.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column, marker in [('IMDb', '/10'), ('RottenTomatoes', '%'), ('Metacritic', '/100')]:
    prime[column] = prime[column].apply(lambda x: np.nan if pd.isnull(x) else x.replace(marker, ''))

In [None]:
# Remove the unit text from Runtime so only the number remains (as a string).
# Plural forms come first so ' seasons' is not left behind as a stray 's'.
# The singular forms are an addition: without them a value such as '1 Season'
# would keep its unit. NOTE(review): assumes the source CSV can contain the
# singular spelling; confirm against the data.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for unit in (' min', ' seasons', ' Seasons', ' season', ' Season'):
    prime['Runtime'] = prime['Runtime'].apply(lambda x: np.nan if pd.isnull(x) else x.replace(unit, ''))

In [None]:
# Map the lowercase 'movie' / 'series' labels onto the 'Movie' / 'TV Show'
# spelling in a single pass.
prime['Type'] = prime['Type'].apply(
    lambda label: label.replace('movie', 'Movie').replace('series', 'TV Show')
)

In [None]:
# Renumber the rows 0..n-1 now that rows have been dropped; the old index is discarded.
prime = prime.reset_index(drop=True)

In [None]:
# Preview the cleaned Prime Video frame.
prime.head()

In [None]:
# Final column dtypes and non-null counts for the cleaned Prime Video frame.
prime.info()

In [None]:
#prime.to_csv('../../data/prime.csv')

# 3. Disney+

In [None]:
# Load the raw Disney+ catalogue, discard the unused columns, switch to the
# capitalised column names used throughout this notebook, and lay the frame out
# in its final column order. reindex() creates any listed column that the CSV
# does not provide as an all-NaN column.
disney = (
    pd.read_csv('../../data/disney_plus_titles.csv', sep=',', header=0)
    .drop(columns=['show_id', 'date_added', 'description'])
    .rename(columns={
        'type': 'Type',
        'title': 'Title',
        'director': 'Director',
        'cast': 'Actors',
        'country': 'Country',
        'release_year': 'Release',
        'rating': 'Rating',
        'duration': 'Runtime',
        'listed_in': 'Genre',
    })
    .reindex(columns=[
        'Title', 'Type', 'Director', 'Actors', 'Release', 'Genre', 'Runtime',
        'Language', 'Country', 'Rating', 'IMDb', 'RottenTomatoes', 'Metacritic',
    ])
)

In [None]:
# Preview the first rows of the freshly loaded Disney+ frame.
disney.head()

In [None]:
# Column dtypes and non-null counts before the OMDb enrichment.
disney.info()

In [None]:
# Look every Disney+ title up on OMDb and overwrite its row with the result.
# The (label, title) pairs are materialised first because extract_json_data
# rewrites rows of `disney` while the loop is running.
for index, title in tqdm(list(disney['Title'].items())):
    if not isinstance(title, str):
        # Missing title: nothing to look up, keep the CSV row unchanged.
        continue
    try:
        # The timeout stops one stalled connection from hanging the whole run.
        r = requests.get(omdb+quote(title), timeout=10)
        info = r.json()
    except (requests.RequestException, ValueError) as err:
        # Network failure or non-JSON body: report it, keep the CSV row, move on.
        tqdm.write('OMDb lookup failed for {!r}: {}'.format(title, err))
        continue
    extract_json_data(info, index, disney)

In [None]:
# Column dtypes and non-null counts after the OMDb enrichment.
disney.info()

In [None]:
# Preview the enriched rows.
disney.head()

In [None]:
#disney.to_csv('./disney.csv')

In [None]:
# Distinct Type labels now present in the frame.
disney['Type'].unique()

In [None]:
# Rows that have no Actors value.
disney[disney['Actors'].isnull()]

In [None]:
# Preview rows with at least 6 missing values.
# NOTE(review): at this point the 'N/A' placeholders have not been converted to
# NaN yet, so this can list fewer rows than the later drop removes.
disney[disney.isnull().sum(axis=1)>=6]

In [None]:
# Convert the placeholder strings ('N/A', the 'N/A seasons' produced when the
# season count is 'N/A', and the 'NaN' rating sentinel) to real missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
disney.replace(['N/A', 'N/A seasons', 'NaN'], np.nan, inplace=True)

In [None]:
# Discard every row that has 6 or more missing fields.
disney = disney.drop(index=disney.index[disney.isnull().sum(axis=1) >= 6])

In [None]:
# Column dtypes and non-null counts after dropping the sparse rows.
disney.info()

In [None]:
# Split the comma-separated text columns into Python lists; missing values
# stay NaN. np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column in ['Actors', 'Genre', 'Language', 'Country']:
    disney[column] = disney[column].apply(lambda x: np.nan if pd.isnull(x) else x.split(", "))

In [None]:
# Collapse one-element lists to their single item; longer lists are kept and
# float entries (the missing values) become NaN.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column in ['Actors', 'Genre', 'Language', 'Country']:
    disney[column] = disney[column].apply(
        lambda x: np.nan if isinstance(x, float) else (x[0] if len(x) == 1 else x)
    )

In [None]:
# Remove the scale marker from each rating column ('7.5/10' -> '7.5',
# '80%' -> '80', '70/100' -> '70'); values remain strings, missing stays NaN.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for column, marker in [('IMDb', '/10'), ('RottenTomatoes', '%'), ('Metacritic', '/100')]:
    disney[column] = disney[column].apply(lambda x: np.nan if pd.isnull(x) else x.replace(marker, ''))

In [None]:
# Remove the unit text from Runtime so only the number remains (as a string).
# Plural forms come first so ' seasons' is not left behind as a stray 's'.
# The singular forms are an addition: without them a value such as '1 Season'
# would keep its unit. NOTE(review): assumes the source CSV can contain the
# singular spelling; confirm against the data.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
for unit in (' min', ' seasons', ' Seasons', ' season', ' Season'):
    disney['Runtime'] = disney['Runtime'].apply(lambda x: np.nan if pd.isnull(x) else x.replace(unit, ''))

In [None]:
# Map the lowercase 'movie' / 'series' labels onto the 'Movie' / 'TV Show'
# spelling in a single pass.
disney['Type'] = disney['Type'].apply(
    lambda label: label.replace('movie', 'Movie').replace('series', 'TV Show')
)

In [None]:
# Renumber the rows 0..n-1 now that rows have been dropped; the old index is discarded.
disney = disney.reset_index(drop=True)

In [None]:
# Preview the cleaned Disney+ frame.
disney.head()

In [None]:
# Final column dtypes and non-null counts for the cleaned Disney+ frame.
disney.info()

In [None]:
#disney.to_csv('../../data/disney.csv')