# Gathering Aditional Movie Data by Wesley Giles

## Load modules and environment variables

In [14]:
import pandas as pd
import numpy as np
import sqlalchemy
import os
import tmdbsimple as tmdb
import json
from tqdm import  tqdm
from dotenv import load_dotenv
load_dotenv()
tmdb.API_KEY = os.environ["TMDB_API_KEY"]

## Load the data

In [15]:
basics = pd.read_csv("./data/title_basics.csv.gz")
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [16]:
ratings = pd.read_csv("./data/title_ratings.csv.gz")
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,84456
1,tt0062336,6.4,161
2,tt0069049,6.7,7323
3,tt0088751,5.2,323
4,tt0094859,7.9,83


In [17]:
akas = pd.read_csv("./data/title_akas.csv.gz")
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0035423,10,Kate et Léopold,FR,,imdbDisplay,,0
1,tt0035423,11,Kate & Leopold,ES,,imdbDisplay,,0
2,tt0035423,12,Kate e Leopold,PT,,,,0
3,tt0035423,13,Kate i Leopold,PL,,,,0
4,tt0035423,14,Кейт и Леополд,BG,bg,imdbDisplay,,0


## Now let's clean up the dataframes so we can concatenate them into a flat file

### Let's start by dropping all the akas where the region is not US

In [18]:
akas.drop(akas[akas["region"] != "US"].index, inplace=True)
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88787 entries, 27 to 702487
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   titleId          88787 non-null  object
 1   ordering         88787 non-null  int64 
 2   title            88787 non-null  object
 3   region           88787 non-null  object
 4   language         891 non-null    object
 5   types            82484 non-null  object
 6   attributes       4147 non-null   object
 7   isOriginalTitle  88787 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 6.1+ MB


### Let's also change `titleId` to `tconst` to match the other dataframes

In [19]:
akas.rename(columns = {"titleId":"tconst"}, inplace=True)
akas.head()

Unnamed: 0,tconst,ordering,title,region,language,types,attributes,isOriginalTitle
27,tt0035423,35,Kate and Leopold,US,,,alternative spelling,0
29,tt0035423,37,Kate & Leopold,US,,imdbDisplay,,0
45,tt0062336,5,The Tango of the Widower and Its Distorting Mi...,US,,imdbDisplay,,0
65,tt0069049,3,The Other Side of the Wind,US,,imdbDisplay,,0
72,tt0088751,1,Attack of the B-Movie Monster,US,,working,,0


### Now to join the dataframes

In [20]:
master = basics.merge(ratings, on = "tconst", how="outer").merge(akas,on = "tconst", how="left")
master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88787 entries, 0 to 88786
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           88787 non-null  object 
 1   titleType        88787 non-null  object 
 2   primaryTitle     88787 non-null  object 
 3   originalTitle    88787 non-null  object 
 4   isAdult          88787 non-null  int64  
 5   startYear        88787 non-null  int64  
 6   endYear          0 non-null      float64
 7   runtimeMinutes   88787 non-null  int64  
 8   genres           88787 non-null  object 
 9   averageRating    74859 non-null  float64
 10  numVotes         74859 non-null  float64
 11  ordering         88787 non-null  int64  
 12  title            88787 non-null  object 
 13  region           88787 non-null  object 
 14  language         891 non-null    object 
 15  types            82484 non-null  object 
 16  attributes       4147 non-null   object 
 17  isOriginalTi

### Now let's get the list of movies which we need data for

In [21]:
movie_list = master["tconst"].unique()
len(movie_list)

79544

In [22]:
def get_tmdb_info(movie_id):
  try:
    movie = tmdb.Movies(movie_id)
    info = {k:v for k,v in movie.info().items() if k in ["imdb_id","budget","revenue"]}
    for c in movie.releases()['countries']:
      if c['iso_3166_1' ] =='US':
        info['rating'] = c['certification']
    return info
  except:
    return {"imdb_id":movie_id, "budget":None, "revenue":None, "rating":None}

get_tmdb_info("tt1361336")

{'budget': 50000000,
 'imdb_id': 'tt1361336',
 'revenue': 132000000,
 'rating': 'PG'}

### And now to define a function which will get this data

In [23]:
def get_tmdb_data(file_name, data_list):
  try:
    os.makedirs("./data/")
  except:
    pass
  path = f"./data/{file_name}"
  try:
    with open(path,"r") as f:
      results = json.load(f)
  except:
    print("Initializing new file as empty list")
    with open(path,"w") as f:
      f.writelines(["[]"])
      results = []
  for i in tqdm(range(len(results),len(data_list))):
    try:
      new_results = get_tmdb_info(data_list[i])
      if new_results:
        with open(path,"w") as f:
          results.append(new_results)
          json.dump(results,f)
    except Exception as e:
      pass
  return results

get_tmdb_data("tmbd.json", movie_list)

0it [00:00, ?it/s]


[{'budget': 48000000,
  'imdb_id': 'tt0035423',
  'revenue': 76019048,
  'rating': 'PG-13'},
 {'budget': 0, 'imdb_id': 'tt0062336', 'revenue': 0},
 {'budget': 12000000, 'imdb_id': 'tt0069049', 'revenue': 0, 'rating': 'R'},
 {'budget': 350000, 'imdb_id': 'tt0088751', 'revenue': 0, 'rating': ''},
 {'budget': 187, 'imdb_id': 'tt0094859', 'revenue': 0, 'rating': ''},
 {'budget': 0, 'imdb_id': 'tt0096056', 'revenue': 0},
 {'budget': 0, 'imdb_id': 'tt0100275', 'revenue': 0},
 {'imdb_id': 'tt0108549', 'budget': None, 'revenue': None, 'rating': None},
 {'budget': 10000000, 'imdb_id': 'tt0113026', 'revenue': 0, 'rating': ''},
 {'budget': 0, 'imdb_id': 'tt0113092', 'revenue': 0, 'rating': ''},
 {'budget': 0, 'imdb_id': 'tt0114447', 'revenue': 0},
 {'imdb_id': 'tt0115937', 'budget': None, 'revenue': None, 'rating': None},
 {'budget': 0, 'imdb_id': 'tt0116391', 'revenue': 0},
 {'imdb_id': 'tt0116628', 'budget': None, 'revenue': None, 'rating': None},
 {'budget': 0, 'imdb_id': 'tt0116991', 'revenue

### Now to add this data into a dataframe

In [24]:
tmdb_df= pd.read_json("./data/tmbd.json")
tmdb_df.head()

Unnamed: 0,budget,imdb_id,revenue,rating
0,48000000.0,tt0035423,76019048.0,PG-13
1,0.0,tt0062336,0.0,
2,12000000.0,tt0069049,0.0,R
3,350000.0,tt0088751,0.0,
4,187.0,tt0094859,0.0,


### And merging it with master

In [25]:
master = master.merge(tmdb_df, left_on = "tconst", right_on="imdb_id" )
master.to_csv("data/tmdb_results_combined.csv.gz",compression='gzip',index=False)