In [1]:
import ast
import json
import os
import urllib.request as urllib
from datetime import datetime, timedelta

import pandas as pd
from retrying import retry

# suppress all warning, for formatting issues
import warnings
warnings.filterwarnings('ignore')

This is a scraper that collects, from TMDB, the number of movies a cast member appeared in prior to a certain date. The scraper takes a cast id and a specific date, and returns the number of movies that cast member appeared in before that date. Some basic data merging and formatting is therefore done first so that the scraper works as desired.

In [2]:
# Load the movie metadata (0 and "[]" are read as NaN), filter out the adult
# movies, then discard the columns that are unnecessary for this analysis.
datas = (
    pd.read_csv(
        "https://media.githubusercontent.com/media/Eric-szh/Eric-szh.github.io/main/movies_metadata.csv",
        na_values=[0, "[]"],
    )
    .loc[lambda frame: frame["adult"] != "True"]
    .drop(
        columns=[
            'adult', 'belongs_to_collection', 'homepage', 'overview',
            'poster_path', 'tagline', 'video', 'title', 'popularity',
            'vote_count', 'vote_average', 'status', 'spoken_languages',
        ]
    )
)

Drop every movie that contains a NaN value in any column.

In [3]:
# Keep only the rows in which every remaining column has a value.
data_step_1 = datas.dropna(axis=0, how="any")

In [4]:
# Three helpers that extract the day, month and year from a date string.
def return_month(date_str):
    """Return the zero-padded month (e.g. ``'07'``) of a ``YYYY-MM-DD`` string.

    Raises ``ValueError`` when ``date_str`` does not match ``%Y-%m-%d``.
    """
    # strptime validates the string and builds a datetime object from it
    parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return f"{parsed:%m}"
def return_day(date_str):
    """Return the zero-padded day of month (e.g. ``'05'``) of a ``YYYY-MM-DD`` string.

    Raises ``ValueError`` when ``date_str`` does not match ``%Y-%m-%d``.
    """
    parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return f"{parsed:%d}"
def return_year(date_str):
    """Return the year (e.g. ``'1995'``) of a ``YYYY-MM-DD`` string.

    Raises ``ValueError`` when ``date_str`` does not match ``%Y-%m-%d``.
    """
    parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return f"{parsed:%Y}"

Map the release-date column through the functions defined above to extract the year, month, and day into separate columns.

In [5]:
# Derive one column per date component from the release date,
# created in the order year, month, day.
for part_name, extract_part in (
    ("year", return_year),
    ("month", return_month),
    ("day", return_day),
):
    data_step_1[part_name] = data_step_1["release_date"].map(extract_part)


In [6]:
# Load the credits table; an empty-list string ("[]") is read as NaN.
cast = pd.read_csv(
    "https://media.githubusercontent.com/media/Eric-szh/Eric-szh.github.io/main/credits.csv",
    na_values=["[]"],
)

In [7]:
# Convert the movie id to int before joining
# (presumably to match the dtype of cast["id"] -- TODO confirm).
data_step_1["id"] = data_step_1["id"].astype(int)
# Inner-join the cast lists onto the movies by id, so each row carries its
# own cast and is easier to map over.
data_step_2 = data_step_1.merge(cast[["cast", "id"]], how="inner", on="id")

Now drop every movie that contains a NaN value after the merge.

In [8]:
# Keep only the merged rows in which every column has a value.
data_step_2 = data_step_2.dropna(axis=0, how="any")

In [9]:
def get_id_and_year(series):
    """Return the first-billed actor's id and the movie's release date for one row.

    This is the intermediate value used for scraping.

    Parameters
    ----------
    series : mapping-like row (e.g. a pandas Series)
        Must provide ``"cast"``, a string containing a Python-literal list of
        dicts that each have an ``"id"`` key, and ``"release_date"``.

    Returns
    -------
    tuple
        ``(id of the first cast entry, release_date)``; the release date is
        passed through unchanged.

    Raises
    ------
    IndexError
        If the cast list is empty, so there is no first actor to pick.
    """
    # ast.literal_eval safely converts the string form of a Python list
    # into a real Python list (no arbitrary code is executed)
    cast_members = ast.literal_eval(series["cast"])
    if cast_members == []:
        # fail with a descriptive message instead of a bare
        # "list index out of range" from the lookup below
        raise IndexError("cannot pick the main actor: the cast list is empty")
    release_date = series["release_date"]
    return (cast_members[0]["id"], release_date)

Map the main actor's id and the movie's release date into a new dataframe column, in preparation for scraping.

In [10]:
# Build the (actor id, release date) tuple for every row (row-wise apply).
data_step_2["ids_and_years"] = data_step_2.apply(get_id_and_year, axis="columns")

In [11]:
# `retry` re-runs the function when it fails:
# the exception is raised after five failed attempts in a row,
# and each failure is followed by a 2 s wait before the next try.
# This makes sure that scraping does not stop because of a network fluctuation.
@retry(stop_max_attempt_number=5, wait_fixed=2000)
def get_url(url_get, timeout=30):
    """Open ``url_get`` and return the response object.

    Parameters
    ----------
    url_get : str
        The URL to open.
    timeout : float, optional
        Seconds to wait on blocking network operations. Without a timeout a
        stalled connection would block forever and the retry logic above
        would never get a chance to run.

    Returns
    -------
    The object returned by ``urlopen``; the caller is responsible for
    reading and closing it.
    """
    return urllib.urlopen(url_get, timeout=timeout)

# API key that was originally embedded in this notebook; it is only used when
# the TMDB_API_KEY environment variable is not set.
# NOTE(review): a key committed to source should be rotated and removed.
_TMDB_FALLBACK_API_KEY = "9139d613c8dc188efc20bf6d20e4d463"


def _count_credits_before(credits, release_date):
    """Count the entries in ``credits`` dated strictly before ``release_date``."""
    count = 0
    for movie in credits:
        # skip credits without a usable release date (missing key, "" or null)
        raw_date = movie.get("release_date")
        if not raw_date:
            continue
        movie_date = datetime.strptime(raw_date, '%Y-%m-%d')
        # the other movie only counts if it was released before this one
        if movie_date < release_date:
            count += 1
    return count


# the main function that takes care of the scraping
def return_num_of_act(tupleple):
    """Return how many movies an actor appeared in before a given release date.

    Parameters
    ----------
    tupleple : tuple
        ``(id, release_date)`` where ``id`` is the person id placed in the
        TMDB ``movie_credits`` URL and ``release_date`` is a ``YYYY-MM-DD``
        string.

    Returns
    -------
    int
        The number of entries in the response's ``"cast"`` list whose release
        date is strictly earlier than ``release_date``. Returns 0 when the
        request fails.

    Raises
    ------
    ValueError
        If ``release_date`` (or a credit's date) does not match ``%Y-%m-%d``.
    """
    (idd, date) = tupleple
    print("doing id: ", idd)
    # store the date as a datetime object so that dates can be compared
    release_date = datetime.strptime(date, '%Y-%m-%d')
    api_key = os.environ.get("TMDB_API_KEY", _TMDB_FALLBACK_API_KEY)
    url = ("https://api.themoviedb.org/3/person/" + str(idd)
           + "/movie_credits?api_key=" + api_key + "&language=en-US")
    # Best effort: any ordinary failure while fetching yields 0. Only
    # `Exception` is caught, so KeyboardInterrupt/SystemExit still propagate
    # and a long scrape can be interrupted.
    try:
        html = get_url(url)
    except Exception as e:
        print("Shoot, we have", e)
        return 0
    if html is None:
        return 0
    # the API returns JSON; close the response as soon as it has been read
    with html:
        result = json.loads(html.read())
    num = _count_credits_before(result.get("cast", []), release_date)
    print("it have: ", num)
    return num

In [None]:
# Scrape the prior-movie count for every (actor id, release date) tuple and
# store it as a new column.
data_step_2["nums_of_first_actor"] = data_step_2["ids_and_years"].apply(return_num_of_act)
# Save only the movie id and the scraped count.
data_step_2.to_csv(
    "nums_of_first_actor.csv",
    columns=["id", "nums_of_first_actor"],
    index=False,
)
# I didn't execute this cell as this tooks a long time to scrap
# if you wish to see how it works, you can execute it yourself