# Fetching Movies Data

Fetch each movie's details and videos from the TMDB API, then save the results to CSV files.

In [4]:
import pandas as pd

# Load the movies table from its CSV file into a DataFrame.
movies_csv_path = "data/final_data/movies.csv"
movies = pd.read_csv(movies_csv_path)

# Print the column index, then leave the frame as the cell's displayed value.
print(movies.columns)
movies

Index(['imdbId', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres', 'movieId',
       'tmdbId'],
      dtype='object')
293634


Unnamed: 0,imdbId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,movieId,tmdbId
0,5,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short",95541,16624.0
1,8,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0.0,1894.0,,1.0,"Documentary,Short",88674,105158.0
2,10,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0.0,1895.0,,1.0,"Documentary,Short",120869,774.0
3,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,0.0,1896.0,,1.0,"Documentary,Short",98981,160.0
4,14,short,The Waterer Watered,L'arroseur arrosé,0.0,1895.0,,1.0,"Comedy,Short",113048,82120.0
...,...,...,...,...,...,...,...,...,...,...,...
26689,4377918,movie,Crazy Beautiful You,Crazy Beautiful You,0.0,2015.0,,114.0,"Comedy,Drama,Romance",130089,327225.0
26690,4397346,movie,Bikes vs Cars,Bikes vs Cars,0.0,2015.0,,90.0,Documentary,129822,324260.0
26691,4438688,movie,Polskie gówno,Polskie gówno,0.0,2014.0,,93.0,"Comedy,Musical",128734,
26692,4475970,short,Power Rangers,Power Rangers,0.0,2015.0,,14.0,"Action,Sci-Fi,Short",130842,327029.0


In [33]:
# Read the TMDB API token from a local text file.
# Surrounding whitespace is stripped because the token is concatenated into the
# "Authorization" header; a trailing newline left by a text editor would
# otherwise end up inside the header value.
with open("tmdb_key.txt", encoding="utf-8") as file:
    api_key = file.read().strip()


import requests
def get_movie_details(id_tmdb, timeout=10) -> None:
    """Request one movie's details from the TMDB API and save them.

    On a 200 response the JSON body is reduced with ``parse_to_movie_details``
    and handed to ``save_movie_details``. Any other outcome is reported with
    ``print`` and the function returns without saving.

    Args:
        id_tmdb: TMDB movie id, interpolated into the request URL.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the calling loop forever.

    Returns:
        None.
    """
    url = f"https://api.themoviedb.org/3/movie/{id_tmdb}?language=en-US"

    headers = {
        "accept": "application/json",
        "Authorization": "Bearer " + api_key
    }

    # A connection error or timeout on one movie should not abort a long run.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        print(f"Error in request for {id_tmdb}")
        print(error)
        return

    if response.status_code == 200:
        save_movie_details(parse_to_movie_details(response.json()), id_tmdb)
        return

    print(f"Error in request for {id_tmdb}")
    # The error body is not guaranteed to be JSON; fall back to the raw text.
    try:
        print(response.json())
    except ValueError:
        print(response.text)
    
    
# keeps only the fields this notebook stores from a movie-details response
def parse_to_movie_details(api_response):
    """Build a dict holding the selected fields of ``api_response``.

    Each field is looked up with ``.get``, so a field that is absent from the
    response is stored as ``None``. Keys appear in the order listed below.
    """
    wanted_fields = (
        "backdrop_path",
        "original_language",
        "overview",
        "poster_path",
        "video",
        "title",
    )
    return {field: api_response.get(field) for field in wanted_fields}


import os
def save_movie_details(movie_details_received, id):
    """Append one movie's details as a row of the movie-details CSV.

    The row is appended to 'data/final_data/movie_details.csv' instead of
    re-reading and rewriting the whole file on every call, so the cost of a
    call no longer grows with the number of rows already saved.

    Args:
        movie_details_received: Dict of column name -> value for the movie.
            Every call is expected to pass the same keys in the same order,
            because appended rows are written without a header.
        id: TMDB id of the movie; stored in the 'tmdb_id' column.
    """
    file_path = 'data/final_data/movie_details.csv'

    new_row = pd.DataFrame([movie_details_received])
    new_row['tmdb_id'] = id

    # Write the header only when starting the file: either it does not exist
    # yet, or it exists but is empty (e.g. left behind by an interrupted run).
    write_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    new_row.to_csv(file_path, mode='a', header=write_header, index=False)
        
        
def get_movie_videos(movie_id, timeout=10) -> bool:
    """Request a movie's videos from the TMDB API and save one video key.

    The response is reduced with ``parse_to_movie_videos``; when that yields a
    key it is stored through ``save_movie_videos``.

    Args:
        movie_id: TMDB movie id, interpolated into the request URL.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the calling loop forever.

    Returns:
        True when a video key was found and saved; False when the request
        failed, the status was not 200, or no key was found.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/videos"

    headers = {
        "accept": "application/json",
        "Authorization": "Bearer " + api_key
    }

    # A connection error or timeout counts as "no video" rather than aborting
    # the whole run.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        print(f"Error in request for {movie_id}")
        print(error)
        return False

    if response.status_code != 200:
        return False

    parsed_response = parse_to_movie_videos(response.json())
    if parsed_response is None:
        return False

    save_movie_videos(parsed_response, movie_id)
    return True
        

def parse_to_movie_videos(api_response):
    """Return the "key" of the first YouTube entry in a videos response.

    Returns None when the response has no "results" value, when no entry has
    "site" equal to "YouTube", or when the first such entry has no "key".
    """
    videos = api_response.get("results")
    if videos is None:
        return None

    # Only the first YouTube entry is considered; later ones are ignored.
    youtube_keys = (entry.get("key") for entry in videos if entry.get("site") == "YouTube")
    return next(youtube_keys, None)


def save_movie_videos(movie_videos_received, movie_id):
    """Append one (video_key, tmdb_id) row to the movie-videos CSV.

    The row is appended to 'data/final_data/movie_videos.csv' instead of
    re-reading and rewriting the whole file on every call, so the cost of a
    call no longer grows with the number of rows already saved.

    Args:
        movie_videos_received: Video key to store in the 'video_key' column.
        movie_id: TMDB id of the movie; stored in the 'tmdb_id' column.
    """
    file_path = 'data/final_data/movie_videos.csv'

    new_row = pd.DataFrame({'video_key': [movie_videos_received], 'tmdb_id': [movie_id]})

    # Write the header only when starting the file: either it does not exist
    # yet, or it exists but is empty (e.g. left behind by an interrupted run).
    write_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    new_row.to_csv(file_path, mode='a', header=write_header, index=False)
        

In [15]:
# get movie details for each movie (by tmdb_id)
# The counter is initialised once, before the loop, so the progress message
# counts up across the run instead of being reset on every row.
counter = 0
for movie in movies.itertuples():
    tmdb_id = getattr(movie, 'tmdbId', None)

    # ensuring the tmdb_id is present and not a textual placeholder
    if pd.notna(tmdb_id) and tmdb_id not in ["N/A", "", "nan", "NaN"]:
        # parse from float to int
        tmdb_id = int(tmdb_id)
        get_movie_details(tmdb_id)
        counter += 1
        print(f"Processed {counter} movies")
        

Details for 16624 saved successfully
Details for 105158 saved successfully
Details for 774 saved successfully
Details for 160 saved successfully
Details for 82120 saved successfully
Details for 159900 saved successfully
Details for 775 saved successfully
Details for 5698 saved successfully
Details for 127105 saved successfully
Details for 118943 saved successfully
Details for 77133 saved successfully
Details for 2929 saved successfully
Details for 90056 saved successfully
Details for 193827 saved successfully
Details for 42553 saved successfully
Details for 46758 saved successfully
Details for 205504 saved successfully
Error in request for 56508
{'success': False, 'status_code': 34, 'status_message': 'The resource you requested could not be found.'}
Details for 147622 saved successfully
Details for 28627 saved successfully
Details for 133123 saved successfully
Details for 28196 saved successfully
Details for 5153 saved successfully
Details for 100246 saved successfully
Details for 1476

In [38]:
# Fetch a video key for every movie row that carries a usable TMDB id.
counter = 0
for movie in movies.itertuples():
    tmdb_id = getattr(movie, 'tmdbId', None)

    # Skip rows whose TMDB id is missing or is a textual placeholder.
    if pd.isna(tmdb_id) or tmdb_id in ["N/A", "", "nan", "NaN"]:
        continue

    # Convert the id from float to int before using it in the request.
    tmdb_id = int(tmdb_id)

    # Only movies for which a video was actually saved count as processed.
    if get_movie_videos(tmdb_id) is not True:
        continue
    counter += 1
    print(f"Processed {counter} movies")

Processed 1 movies
Processed 2 movies
Processed 3 movies
Processed 4 movies
Processed 5 movies
Processed 6 movies
Processed 7 movies
Processed 8 movies
Processed 9 movies
Processed 10 movies
Processed 11 movies
Processed 12 movies
Processed 13 movies
Processed 14 movies
Processed 15 movies
Processed 16 movies
Processed 17 movies
Processed 18 movies
Processed 19 movies
Processed 20 movies
Processed 21 movies
Processed 22 movies
Processed 23 movies
Processed 24 movies
Processed 25 movies
Processed 26 movies
Processed 27 movies
Processed 28 movies
Processed 29 movies
Processed 30 movies
Processed 31 movies
Processed 32 movies
Processed 33 movies
Processed 34 movies
Processed 35 movies
Processed 36 movies
Processed 37 movies
Processed 38 movies
Processed 39 movies
Processed 40 movies
Processed 41 movies
Processed 42 movies
Processed 43 movies
Processed 44 movies
Processed 45 movies
Processed 46 movies
Processed 47 movies
Processed 48 movies
Processed 49 movies
Processed 50 movies
Processed

In [18]:
# Load the saved movie details and keep a single row per tmdb_id.
details_csv_path = "data/final_data/movie_details.csv"
movie_details = pd.read_csv(details_csv_path).drop_duplicates(subset=['tmdb_id'])

# Overwrite the same file with the de-duplicated table.
movie_details.to_csv(details_csv_path, mode='w', header=True, index=False)

# Leave the frame as the cell's displayed value.
movie_details

16


Unnamed: 0,backdrop_path,original_language,overview,poster_path,video,title,tmdb_id
0,/mDD99APoTgMuNJrkmAfGicooJHa.jpg,xx,Three men hammer on an anvil and pass a bottle...,/c76bs0S90EFhB5ww3i6DlYQTVk.jpg,False,Blacksmithing Scene,16624
1,/zaPZItgUO3xicz0QBqrDKNuPSbD.jpg,xx,A man (Thomas Edison's assistant) takes a pinc...,/s7fhher78hv1I5tKl7NbgnwsKha.jpg,False,Edison Kinetoscopic Record of a Sneeze,105158
2,/wNxy6Fqvjh0FsIuk7cgqNNq5PhP.jpg,fr,Working men and women leave through the main g...,/cT2sefAXgEoICJUCEM6UfxXfuDM.jpg,False,Workers Leaving the Lumière Factory,774
3,/uusNog5m2aCuL53rrKw8RaBnprb.jpg,fr,A group of people are standing along the platf...,/m5HSlaNCzwV95rAriDmT19el5h1.jpg,False,The Arrival of a Train at La Ciotat,160
4,/A0hONP6cxTthAsw6IvPMX4Aowkr.jpg,fr,"A gardener is watering his flowers, when a mis...",/rSZghvrFWTGqi4UecyG9jimzpEO.jpg,False,The Sprinkler Sprinkled,82120
...,...,...,...,...,...,...,...
26183,/m0RDGraOSHznV5q9GBLEmwz98Os.jpg,en,"A story of love, sex and teen pregnancy in San...",/11a3Gt9Q4X93hqfAsSItZ01oDwh.jpg,False,Petting Zoo,323431
26184,/b0gVgI3ptWba6xME5rmCuvFVw0.jpg,en,This material was developed and prepared over ...,/9ULqaND2rzNFL4OAxCbSrY7UWem.jpg,False,Louis C.K.: Live at The Comedy Store,321594
26185,/d8ubvqYoYKlnWyrTPmJd6A9Gv7m.jpg,tl,A bad girl and a province boy found love in th...,/sZAi6bz8I31NrVPUAkza9qtxqZE.jpg,False,Crazy Beautiful You,327225
26186,/sV5Pk6zJR4Fsu8jtVDLN7BBjJUy.jpg,en,Bikes vs Cars depicts a global crisis that we ...,/8ARMq3gIxlzFcxSDQPSVmiE32KL.jpg,False,Bikes vs Cars,324260


In [39]:
# Load the saved video keys back from disk.
videos_csv_path = "data/final_data/movie_videos.csv"
movie_videos = pd.read_csv(videos_csv_path)

# Print the column index, then leave the frame as the cell's displayed value.
print(movie_videos.columns)
movie_videos

Index(['video_key', 'tmdb_id'], dtype='object')


Unnamed: 0,video_key,tmdb_id
0,KXf_Stwe_k4,82120
1,JEGIyo-dKmA,775
2,In3mRDX0uqk,5698
3,LrqUenAgVBU,28627
4,32pzHWUTcPc,100246
...,...,...
19795,ixgd38EZIR0,318224
19796,HNk19TXqnq0,328346
19797,ZZ-3VAxWwnk,319999
19798,2Lcng7xgOzE,321594
