## Importing the Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import requests 
import os 
import json
from iso3166 import countries
import plotly.express as px
# TMDB API key, read from the environment so it is not hardcoded in the notebook.
# os.getenv returns None when the variable is not set.
TMDB_API_KEY = os.getenv('TMDB_API_KEY')


##  Data Collection 
This is the first step of the project. I'll use the TMDB API to collect movie data, including the title, release date, language, genre, etc.

### genre collection

Collect the genres from the API and put them into a list; the list is used later to collect movie data for each genre.

In [None]:
# First, list the available movie genres using the genre endpoint of the API.
import requests  # repeated import; requests is already imported in the first cell
root = 'https://api.themoviedb.org/3/'
endpoint = "genre/movie/list"
# Query-string parameters sent with the request; the API key comes from TMDB_API_KEY.
parameters = {'format':'json',
              'offset':0,
              'limit':250,
              'language': 'en',
              'api_key': TMDB_API_KEY
             }

headers = {"accept": "application/json"}
response = requests.get(root+endpoint, params = parameters, headers=headers)
# Keep the raw response body as text; it is parsed with json.loads in the next cell.
genre_dict_str = response.text

In [None]:
# Then parse the JSON text of the response into a dictionary.
# NOTE(review): `res` is a second name bound to the same object; it is not used in the visible cells.
genre_dict = res = json.loads(genre_dict_str)

In [None]:
# Finally, collect the genre names from the dictionary into a list.
genre_list =  [x['name'] for x in genre_dict['genres'] ]
# Display the list.
genre_list

### movie data collection from different genres

Based on the movie genres, I would like to collect the most popular movies from each genre (the collection code below requests result pages 1–499 per genre, sorted by popularity).

In [None]:
# Test query against the discover endpoint: page 2 of the results with with_genres='Action'.
# NOTE(review): TMDB documents `with_genres` as genre ids -- confirm that a genre name is accepted here.
root = 'https://api.themoviedb.org/3/'
endpoint = 'discover/movie'
parameters = {
    'format':'json',
    'language': 'en-US',
    'include_video':False,
    'include_adult':False,
    'page':2,
    'with_genres':'Action',
    'api_key': TMDB_API_KEY
}

headers = {"accept": "application/json"}

response = requests.get(root+endpoint, params = parameters, headers=headers)

In [None]:
# Parse the discover response body into a dictionary.
txt_resp = json.loads(response.text)

### Testing the movie details endpoint


In [None]:
# Fetch the details of a single movie to inspect the shape of the response.
root = 'https://api.themoviedb.org/3/'
# Sample movie id; named movie_id so the builtin id() is not shadowed at module level.
movie_id = 670292
endpoint = f'movie/{movie_id}'
parameters = {'format':'json',
              'api_key': TMDB_API_KEY
             }
# Defined here so this cell does not depend on `headers` left over from an earlier cell.
headers = {"accept": "application/json"}
response = requests.get(root+endpoint,params = parameters, headers=headers)


In [None]:
# Display the response object returned by the request above.
response

In [None]:
# Parse the movie-details response body into a dictionary.
txt = json.loads(response.text)

In [None]:
# Display the parsed movie details.
txt

In [None]:

# this function will help us build a data frame with a specific genre
def movie_collection (genre_list):
    """Collect movie details for every genre in ``genre_list`` via the TMDB discover endpoint.

    For each genre, result pages 1 to 499 of ``discover/movie`` (sorted by
    descending popularity) are requested, and ``movie_detail_extraction`` is
    called for every movie id found on a page.

    Collection is best-effort: when a page request or one of its movies fails,
    the remainder of that page is skipped and the loop moves on to the next page.

    NOTE(review): a second definition with the same name appears later in the
    notebook and replaces this one when the cells are run in order.
    NOTE(review): each genre is passed to ``with_genres`` exactly as given; TMDB
    documents that parameter as genre ids -- confirm the values in
    ``genre_list`` actually filter the results.

    Parameters
    ----------
    genre_list : iterable
        Genre values; one series of discover queries is made per value.

    Returns
    -------
    pandas.DataFrame
        One row per collected movie with the columns ``id``, ``title``,
        ``genre``, ``release_date``, ``language``, ``description``,
        ``runtime``, ``poster_image`` and ``score``.
    """
    columns = ['id','title','genre','release_date','language','description','runtime','poster_image','score']
    headers = {"accept": "application/json"}
    url = 'https://api.themoviedb.org/3/' + 'discover/movie'
    # Rows are accumulated in a list and turned into a frame once at the end;
    # growing a DataFrame one row at a time copies the data on every append.
    rows = []
    for genre in genre_list:
        for page in range(1,500,1):
            parameters = {
                'format':'json',
                'language': 'en-US',
                'include_video':False,
                'include_adult':False,
                'page':page,
                'with_genres': genre,
                'sort_by':'popularity.desc',
                'api_key': TMDB_API_KEY
            }
            try:
                # timeout so one stalled connection cannot hang the whole run
                response = requests.get(url, params=parameters, headers=headers, timeout=30)
                movie_list_txt = json.loads(response.text)['results']
                for movie in movie_list_txt:
                    rows.append(movie_detail_extraction(movie['id']))
            except Exception:
                # Best-effort: skip the rest of this page. `Exception` rather than
                # a bare except, so a kernel interrupt still stops the loop.
                continue
    return pd.DataFrame(rows, columns=columns)



def movie_detail_extraction(movie_id):
    """Fetch one movie from the TMDB ``movie/{movie_id}`` endpoint and flatten it into a row.

    Parameters
    ----------
    movie_id
        Identifier interpolated into the request path.

    Returns
    -------
    list
        ``[id, original_title, genres, release_date, original_language,
        overview, runtime, poster_image, score]`` where ``genres`` is a list of
        genre names, ``poster_image`` is the full image URL (or None when the
        response has no usable ``poster_path``) and ``score`` is
        ``vote_average`` formatted as a string with one decimal place.

    Raises
    ------
    KeyError
        If the decoded response lacks one of the fields read below.
    """
    root = 'https://api.themoviedb.org/3/'
    endpoint = f'movie/{movie_id}'
    # Defined locally so the function does not depend on a `headers` global
    # left behind by another cell.
    headers = {"accept": "application/json"}
    parameters = {'format':'json',
              'api_key': TMDB_API_KEY
             }
    response = requests.get(root+endpoint,params = parameters, headers=headers, timeout=30)
    movie_detail_txt = json.loads(response.text)
    genres = [x["name"] for x in movie_detail_txt['genres']]
    # A missing or null poster path yields None instead of a broken URL.
    poster_path = movie_detail_txt.get('poster_path')
    if isinstance(poster_path, str):
        poster_image = 'https://image.tmdb.org/t/p/original' + poster_path
    else:
        poster_image = None
    attribute_list = [
        movie_detail_txt['id'],
        movie_detail_txt['original_title'],
        genres,
        movie_detail_txt['release_date'],
        movie_detail_txt['original_language'],
        movie_detail_txt['overview'],
        movie_detail_txt['runtime'],
        poster_image,
        "{:.1f}".format(float(movie_detail_txt['vote_average'])),
    ]
    return attribute_list




In [None]:
# this function will help us build a data frame with a specific genre

def movie_collection (genre_list):
    """Collect movie details for every genre in ``genre_list`` via the TMDB discover endpoint.

    For each genre, result pages 1 to 499 of ``discover/movie`` (sorted by
    descending popularity) are requested, and ``movie_detail_extraction`` is
    called for every movie id found on a page. The number of rows collected so
    far is printed after every 50th page.

    Collection is best-effort: when a page request or one of its movies fails,
    the remainder of that page is skipped and the loop moves on to the next page.

    NOTE(review): each genre is passed to ``with_genres`` exactly as given; TMDB
    documents that parameter as genre ids -- confirm the values in
    ``genre_list`` actually filter the results.

    Parameters
    ----------
    genre_list : iterable
        Genre values; one series of discover queries is made per value.

    Returns
    -------
    pandas.DataFrame
        One row per collected movie with the columns ``id``, ``title``,
        ``genre``, ``release_date``, ``language``, ``description``,
        ``runtime``, ``poster_image`` and ``score``.
    """
    columns = ['id','title','genre','release_date','language','description','runtime','poster_image','score']
    headers = {"accept": "application/json"}
    url = 'https://api.themoviedb.org/3/' + 'discover/movie'
    # Rows are accumulated in a list and turned into a frame once at the end;
    # growing a DataFrame one row at a time copies the data on every append.
    rows = []
    for genre in genre_list:
        for page in range(1,500,1):
            parameters = {
                'format':'json',
                'language': 'en-US',
                'include_video':False,
                'include_adult':False,
                'page':page,
                'with_genres': genre,
                'sort_by':'popularity.desc',
                'api_key': TMDB_API_KEY
            }
            try:
                # timeout so one stalled connection cannot hang the whole run
                response = requests.get(url, params=parameters, headers=headers, timeout=30)
                movie_list_txt = json.loads(response.text)['results']
                for movie in movie_list_txt:
                    rows.append(movie_detail_extraction(movie['id']))
                # progress report: rows collected so far
                if(page%50==0):
                    print(len(rows))
            except Exception:
                # Best-effort: skip the rest of this page. `Exception` rather than
                # a bare except, so a kernel interrupt still stops the loop.
                continue
    return pd.DataFrame(rows, columns=columns)


def movie_detail_extraction(movie_id):
    """Fetch one movie from the TMDB ``movie/{movie_id}`` endpoint and flatten it into a row.

    Parameters
    ----------
    movie_id
        Identifier interpolated into the request path.

    Returns
    -------
    list
        ``[id, original_title, genres, release_date, original_language,
        overview, runtime, poster_image, score]`` where ``genres`` is a list of
        genre names, ``poster_image`` is the full image URL (or None when the
        response has no usable ``poster_path``) and ``score`` is
        ``vote_average`` formatted as a string with one decimal place.

    Raises
    ------
    KeyError
        If the decoded response lacks one of the fields read below.
    """
    root = 'https://api.themoviedb.org/3/'
    endpoint = f'movie/{movie_id}'
    # Defined locally so the function does not depend on a `headers` global
    # left behind by another cell.
    headers = {"accept": "application/json"}
    parameters = {'format':'json',
              'api_key': TMDB_API_KEY
             }
    response = requests.get(root+endpoint,params = parameters, headers=headers, timeout=30)
    movie_detail_txt = json.loads(response.text)
    genres = [x["name"] for x in movie_detail_txt['genres']]
    # A missing or null poster path yields None instead of a broken URL.
    poster_path = movie_detail_txt.get('poster_path')
    if isinstance(poster_path, str):
        poster_image = 'https://image.tmdb.org/t/p/original' + poster_path
    else:
        poster_image = None
    attribute_list = [
        movie_detail_txt['id'],
        movie_detail_txt['original_title'],
        genres,
        movie_detail_txt['release_date'],
        movie_detail_txt['original_language'],
        movie_detail_txt['overview'],
        movie_detail_txt['runtime'],
        poster_image,
        "{:.1f}".format(float(movie_detail_txt['vote_average'])),
    ]
    return attribute_list




In [None]:
# Build the movie data frame for all genres in genre_list.
df = movie_collection(genre_list)

In [None]:


# Movie-to-genre lookup: keep id and genre, then explode the genre lists into one row per (id, genre) pair.
look_up_table_temp = df[['id','genre']]
look_up_table = look_up_table_temp.explode("genre").reset_index(drop=True)
# De-duplicated copy of the lookup, written to genre.csv.
genre = look_up_table.drop_duplicates(ignore_index = True)
genre.to_csv("genre.csv",header=True,index=False)




In [None]:
# Movie table without the genre column; exact duplicate rows are then removed.
df_movie = df.drop(['genre'],axis=1)
df_movie = df_movie.drop_duplicates(ignore_index = True)

In [None]:
# Display the movie table.
df_movie

In [None]:
# Save the movie table to movies.csv.
df_movie.to_csv("movies.csv",header=True,index= False)


In [None]:
# Display the (id, genre) lookup table.
look_up_table

In [None]:
# Save the (id, genre) lookup to genres.csv.
# NOTE(review): this writes look_up_table (duplicates not removed), while the de-duplicated
# frame was written to genre.csv in an earlier cell -- confirm which file is intended.
look_up_table.to_csv("genres.csv",header=True,index=False)

In [2]:
# Reload the movie table from movies.csv, splitting rows on '\n' only
# (presumably so other line-break characters inside text fields do not split a row -- TODO confirm).
df_movie = pd.read_csv("movies.csv",lineterminator='\n')

In [None]:




# collecting the cast members for the movies.
def cast_collection(id_list):
    """Collect the cast of every movie in ``id_list`` from the TMDB ``movie/{id}/credits`` endpoint.

    Collection is best-effort: a movie whose request or payload fails is
    skipped and the loop continues with the next id. Rows already gathered for
    that movie before the failure are kept.

    A cast member without a profile image is kept, with ``profile_path`` set to
    None. Previously the missing path raised inside the loop and silently
    dropped that member and every remaining cast member of the same movie.

    Parameters
    ----------
    id_list : iterable
        Movie ids; each is interpolated into the request path.

    Returns
    -------
    pandas.DataFrame
        One row per (movie, cast member) with the columns ``movie_id``,
        ``cast_member_id``, ``gender``, ``known_for_department``, ``name``,
        ``original_name``, ``profile_path`` (full image URL or None),
        ``character`` and ``popularity``.
    """
    columns = ['movie_id','cast_member_id','gender','known_for_department','name','original_name','profile_path','character','popularity']
    headers = {"accept": "application/json"}
    root = 'https://api.themoviedb.org/3/'
    image_root = 'https://image.tmdb.org/t/p/original'
    parameters = {
        'format':'json',
        'api_key': TMDB_API_KEY
    }
    # Rows are accumulated in a list and converted once at the end instead of
    # growing the DataFrame one row at a time.
    rows = []
    for current_id in id_list:
        try:
            endpoint = f'movie/{current_id}/credits'
            # timeout so one stalled connection cannot hang the whole run
            response = requests.get(root+endpoint, params = parameters, headers=headers, timeout=30)
            movie_list_txt = json.loads(response.text)
            movie_id = movie_list_txt['id']
            for cast in movie_list_txt["cast"]:
                profile_path = cast.get('profile_path')
                if isinstance(profile_path, str):
                    profile_url = image_root + profile_path
                else:
                    profile_url = None
                rows.append([movie_id,cast['id'],cast['gender'],cast['known_for_department'],cast['name'],
                             cast['original_name'],
                             profile_url,
                             cast['character'],cast['popularity']])
        except Exception:
            # Best-effort: skip this movie. `Exception` rather than a bare
            # except, so a kernel interrupt still stops the loop.
            continue
    return pd.DataFrame(rows, columns=columns)


# NOTE: the split of df_cast into movie_cast_lookup / cast_member_df used to be repeated here.
# It was removed from this cell because df_cast is only created by a later cell
# (df_cast = cast_collection(...)), so running it here raised NameError on a fresh kernel.
# The same statements run in a later cell, after df_cast has been built.



In [None]:
# Collect the cast for every unique movie id in df_movie.
df_cast = cast_collection(df_movie['id'].unique())

In [None]:
# Display the raw cast table.
df_cast

In [None]:
# Save the raw cast table to cast.csv.
df_cast.to_csv("cast.csv",header=True,index=False)

In [None]:


# First, create a lookup table of movie_id, cast_member_id and character.
movie_cast_lookup = df_cast[["movie_id","cast_member_id",'character']]
# Cast-member table: every column except the per-movie ones (movie_id, character).
cast_member_df = df_cast.drop(['movie_id','character'],axis=1)
# Drop duplicate rows from the cast-member table.
cast_member_df =cast_member_df.drop_duplicates(ignore_index=True)





In [None]:
# Store those two tables in CSV files.
cast_member_df.to_csv("cast_members.csv",index=False,header=True)
movie_cast_lookup.to_csv('cast_lookup.csv',index=False,header=True)


In [None]:


# create movie release dataframes 
def realease_date_collection(id_list):
    """Collect per-country release information for every movie in ``id_list``.

    For each id, the TMDB ``movie/{id}/release_dates`` endpoint is requested.
    For every country entry in the response, only the first element of its
    ``release_dates`` list is recorded.

    The endpoint used to be the literal string ``'r'``, so every request failed,
    the failure was swallowed, and the returned frame was always empty.

    Collection is best-effort: a movie whose request or payload fails is
    skipped and the loop continues with the next id.

    Parameters
    ----------
    id_list : iterable
        Movie ids; each is interpolated into the request path.

    Returns
    -------
    pandas.DataFrame
        One row per (movie, country) with the columns ``movie_id``,
        ``iso_3166_1``, ``release_date`` and ``type``, holding the values
        exactly as they appear in the response.
    """
    columns = ['movie_id','iso_3166_1','release_date','type']
    headers = {"accept": "application/json"}
    root = 'https://api.themoviedb.org/3/'
    parameters = {
        'format':'json',
        'api_key': TMDB_API_KEY
    }
    # Rows are accumulated in a list and converted once at the end instead of
    # growing the DataFrame one row at a time.
    rows = []
    # for each of the movie in the movies data frame, we make an API call to extract their release date info
    for current_id in id_list:
        try:
            endpoint = f'movie/{current_id}/release_dates'
            # timeout so one stalled connection cannot hang the whole run
            response = requests.get(root+endpoint, params = parameters, headers=headers, timeout=30)
            release_txt = json.loads(response.text)
            movie_id = release_txt['id']
            for country in release_txt["results"]:
                iso_3166_1 = country['iso_3166_1']
                # only the first release entry of each country is kept
                release_info = country['release_dates'][0]
                rows.append([movie_id, iso_3166_1, release_info['release_date'], release_info['type']])
        except Exception:
            # Best-effort: skip this movie. `Exception` rather than a bare
            # except, so a kernel interrupt still stops the loop.
            continue
    return pd.DataFrame(rows, columns=columns)

df_release_date = realease_date_collection(df_movie['id'].unique())
# Adding extra columns: split the timestamp in the third column (release_date) into a date
# part and a time part. TMDB release timestamps look like "2023-07-19T00:00:00.000Z".
release_timestamps = df_release_date.iloc[:, 2]
# "YYYY-MM-DD" is 10 characters; the previous [:9] slice cut off the last digit of the day.
release_date_day = release_timestamps.str[:10]
# Everything after the "T", dropping only the trailing "Z" (the previous [11:-2] also cut a digit).
release_date_time = release_timestamps.str[11:-1]
df_release_date['release_date_day'] = release_date_day
df_release_date['release_date_time'] = release_date_time



In [None]:
# Load release-date data from movie_release_date.csv; this replaces df_release_date.
# NOTE(review): the only visible write of this file is a commented-out line that saves
# df_release_update, so the column layout may differ from the frame built by
# realease_date_collection -- confirm before relying on column positions.
df_release_date = pd.read_csv("movie_release_date.csv")

In [None]:
#df_release_date.to_csv("movie_release_date.csv",header=True,index=False)

In [None]:
# Display the release-date table.
df_release_date

In [None]:
# Split the timestamp in the third column into a date part and a time part.
# Assumes that column holds TMDB timestamps such as "2023-07-19T00:00:00.000Z" -- TODO confirm
# for data reloaded from CSV.
release_timestamps = df_release_date.iloc[:, 2]
# "YYYY-MM-DD" is 10 characters; the previous [:9] slice cut off the last digit of the day.
release_date_day = release_timestamps.str[:10]
# Everything after the "T", dropping only the trailing "Z" (the previous [11:-2] also cut a digit).
release_date_time = release_timestamps.str[11:-1]

df_release_date['release_date_day'] = release_date_day
df_release_date['release_date_time'] = release_date_time



In [None]:
# Display the release-date table with the two new columns.
df_release_date

In [None]:

# extract the iso from the countries
from iso3166 import countries
def extract_info_from_iso(lst):
    """Look up the alpha-3 code and the country name for each code in ``lst``.

    Each code is resolved with ``countries.get`` from the ``iso3166`` package.
    A code that cannot be resolved is printed and recorded as None in both
    output lists, so the outputs always line up with the input. Previously the
    handler repeated the failing lookup, which raised again.

    Parameters
    ----------
    lst : iterable
        Country codes to resolve.

    Returns
    -------
    list
        ``[alpha3_codes, names]``: two lists with one entry per input code.
    """
    alpha3_codes = []
    names = []
    for code in lst:
        try:
            country = countries.get(code)
        except KeyError:
            # unknown code: report it and keep the position so the lists stay aligned
            print(code)
            alpha3_codes.append(None)
            names.append(None)
            continue
        alpha3_codes.append(country.alpha3)
        names.append(country.name)
    return [alpha3_codes, names]




In [None]:
# Keep only the rows whose country code is not one of the excluded codes
# (presumably codes the iso3166 lookup cannot resolve -- TODO confirm).
df_release_update = df_release_date.loc[
    ~df_release_date['iso_3166_1'].isin(['CS', 'SU', 'XC', 'YU', 'XG', 'AN'])
]

In [None]:
# Drop rows with a missing country code and renumber the index.
df_release_update = df_release_update.dropna(subset=['iso_3166_1'],ignore_index = True)

In [None]:
# Resolve alpha-3 codes and country names for the country code of every remaining row.
lst = extract_info_from_iso(list(df_release_update['iso_3166_1']))

In [None]:
# Attach the looked-up values as new columns (first list: alpha-3 codes, second list: names).
df_release_update['alpha3'] = lst[0]
df_release_update['name'] = lst[1]

In [None]:
# Tidy the data: replace the numeric release types with labels, drop the original
# release_date column, and rename the derived date/time columns.
df_release_update = df_release_update.replace({"type":{1:'Premiere',2:"Theatrical (limited)",3:"Theatrical",
                                                      4:"Digital",5:"Physical",6:"TV"}})
df_release_update = df_release_update.drop('release_date',axis= 1)
df_release_update = df_release_update.rename(columns={"type": "release_type", "release_date_day": "release_date","release_date_time":"release_time"})


In [None]:
# Reload the release data and the country-code lookup from disk, then attach the lookup
# columns with an inner join on iso_3166_1.
# NOTE(review): this overwrites the df_release_update built in the cells above, and
# country_code_lookup.csv is only written by a commented-out line further down --
# confirm both files exist before running.
df_release_update = pd.read_csv("movie_release_date.csv")
country_code_lookup = pd.read_csv("country_code_lookup.csv")
df_release_update_new = pd.merge(df_release_update,country_code_lookup, left_on = 'iso_3166_1', right_on = 'iso_3166_1', how = 'inner')
# Display the merged table.
df_release_update_new

In [None]:
# Display the release table.
df_release_update

In [None]:
# Select the rows of a single movie (movie_id 872585) and print them.
oppen =df_release_update_new.query("movie_id == 872585 ")
print(oppen)
# World map located by alpha-3 code, coloured by release type; hover shows the country name,
# release date and release time.
fig = px.choropleth(oppen, locations="alpha3", hover_name="name", color = "release_type", hover_data = ['release_date','release_time'])
fig.show()

In [None]:

# Build a lookup table with one row per distinct country code: the code, its alpha-3 code, and the country name.
unique_values = df_release_update['iso_3166_1'].unique()
return_lst = extract_info_from_iso(unique_values)
alpha3 = return_lst[0]
name = return_lst[1]
country_code_lookup = pd.DataFrame(data = {"iso_3166_1":unique_values,'alpha3':alpha3,'name':name})
# Display the lookup table.
country_code_lookup

In [None]:
# Remove the alpha3 and name columns from the release table; they are kept in country_code_lookup.
df_release_update = df_release_update.drop(['alpha3','name'],axis =1)

In [None]:
# Display the release table.
df_release_update

In [None]:
# df_release_update.to_csv("movie_release_date.csv",header=True,index=False)
# country_code_lookup.to_csv("country_code_lookup.csv",header=True,index=False)

In [3]:
def comments_collection(id_list):
    """Collect user reviews for every movie in ``id_list`` from the TMDB ``movie/{id}/reviews`` endpoint.

    Only page 1 of the reviews (language ``en-US``) is requested per movie.
    Collection is best-effort: a movie whose request or payload fails is
    skipped. A running count of successfully processed movies is printed every
    100 movies.

    Parameters
    ----------
    id_list : iterable
        Movie ids; each is interpolated into the request path.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``movie_id``, ``author_name``,
        ``creat_date`` (first 10 characters of ``created_at``), ``rating``
        (taken from ``author_details``) and ``content``.
    """
    # NOTE: 'creat_date' keeps its original spelling because it is the column name written to CSV.
    columns = ['movie_id','author_name','creat_date','rating','content']
    headers = {"accept": "application/json"}
    root = 'https://api.themoviedb.org/3/'
    parameters = {
        'format':'json',
        'language':'en-US',
        'page':1,
        'api_key': TMDB_API_KEY
    }
    counter = 0
    # Rows are accumulated in a list and converted once at the end instead of
    # growing the DataFrame one row at a time.
    rows = []
    for current_id in id_list:
        try:
            endpoint = f'movie/{current_id}/reviews'
            # timeout so one stalled connection cannot hang the whole run
            response = requests.get(root+endpoint, params = parameters, headers=headers, timeout=30)
            release_txt = json.loads(response.text)
            movie_id = release_txt['id']
            for reviews in release_txt["results"]:
                rows.append([
                    movie_id,
                    reviews['author'],
                    reviews['created_at'][:10],
                    reviews['author_details']['rating'],
                    reviews['content'],
                ])
        except Exception:
            # Best-effort: skip this movie. `Exception` rather than a bare
            # except, so a kernel interrupt still stops the loop.
            continue
        # only movies processed without error are counted
        counter+=1
        if(counter%100==0):
            print(counter)

    return pd.DataFrame(rows, columns=columns)

In [4]:
# Collect reviews for every unique movie id in df_movie (a progress count is printed every 100 movies).
review_df = comments_collection(df_movie['id'].unique())

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900


In [None]:
# Keep only the reviews that carry a rating, and renumber the index.
review_df = review_df.dropna(subset=['rating'],ignore_index = True)

In [None]:
# Display the review table.
review_df

In [None]:
# Save the review table to reviews.csv.
review_df.to_csv("reviews.csv",index=False,header=True)

In [None]:
# Display the movie table.
df_movie

In [None]:
# Drop every movie row that has a missing value, and renumber the index.
movie_df = df_movie.dropna(ignore_index = True)


In [None]:
# Release year as an integer, taken from the first four characters of release_date.
movie_df["year"] = movie_df['release_date'].str[:4].astype('int')


In [None]:
# Write the cleaned table (now including the year column) to movies.csv, overwriting the earlier file.
movie_df.to_csv("movies.csv",header=True,index=False)