In [None]:
# We cliqued

In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd

# People: identifier, display name and listed professions
names_basics = pd.read_csv(
    "../data/name.basics.tsv",
    sep="\t",
    usecols=['nconst', 'primaryName', 'primaryProfession'],
)
print("Successfully loaded names_basics with shape:", names_basics.shape)

# Titles: identifier, main title, start year and title type (everything kept as text)
titles_basics = pd.read_csv(
    "../data/title.basics.tsv",
    sep="\t",
    usecols=['tconst', 'primaryTitle', 'startYear', 'titleType'],
    dtype=str,
)
print("Successfully loaded titles_basics with shape:", titles_basics.shape)

# Alternative titles: only the title identifier and the region are needed
alternatives = pd.read_csv(
    "../data/title.akas.tsv",
    sep="\t",
    usecols=['titleId', 'region'],
    dtype=str,
)
print("Successfully loaded alternatives with shape:", alternatives.shape)

# Principals: links each title (tconst) to a person (nconst) together with the credit category
titles_principals = pd.read_csv(
    "../data/title.principals.tsv",
    sep="\t",
    usecols=['tconst', 'nconst', 'category'],
    dtype=str,
)
print("Successfully loaded titles_principals with shape:", titles_principals.shape)

# Ratings: every column of the ratings file, read as text
ratings = pd.read_csv(
    "../data/title.ratings.tsv",
    sep="\t",
    dtype=str,
)
print("Successfully loaded ratings with shape:", ratings.shape)

Successfully loaded names_basics with shape: (14451516, 3)
Successfully loaded titles_basics with shape: (11693908, 4)
Successfully loaded alternatives with shape: (52769492, 2)
Successfully loaded titles_principals with shape: (92874325, 3)
Successfully loaded ratings with shape: (1574221, 3)


In [4]:
def filter_datasets() -> None:
    """
    Narrow the module-level frames down to the rows and columns needed for the collaboration merge.

    Rebinds the globals ``names_basics``, ``titles_basics``, ``alternatives``, ``titles_principals``
    and ``ratings`` to filtered copies:

    * names_basics: people listing actor, actress or director as a profession -> nconst, primaryName
    * titles_basics: movies starting in 1990 or later that have a primary title -> tconst, primaryTitle
    * alternatives: entries whose region is US -> tconst (renamed from titleId)
    * titles_principals: credits in the actor, actress or director category -> tconst, nconst
    * ratings: more than 1000 votes and an average rating of at least 7.0 -> tconst

    A frame is only filtered while it still carries the column its filter reads. Calling the
    function a second time (for example when a cell is re-run) therefore leaves the already
    filtered frames untouched instead of failing with a KeyError.
    :return: None
    """
    global names_basics, titles_basics, alternatives, titles_principals, ratings

    if 'primaryProfession' in names_basics.columns:
        # primaryProfession is assumed to be a comma-separated list (IMDb dataset format).
        # Match whole entries only, so that e.g. "casting_director" does not count as "director".
        profession_pattern = r'(?:^|,)(?:actor|actress|director)(?:,|$)'
        has_profession = names_basics['primaryProfession'].str.contains(profession_pattern, na=False)
        names_basics = names_basics.loc[has_profession, ['nconst', 'primaryName']]

    print("Filtered names_basics with shape:", names_basics.shape)

    if 'titleType' in titles_basics.columns:
        # Missing or non-numeric years become NaN, which never satisfies the comparison below.
        start_year = pd.to_numeric(titles_basics['startYear'], errors='coerce')
        is_recent_movie = (
            (titles_basics['titleType'] == 'movie')
            & (start_year >= 1990)
            & titles_basics['primaryTitle'].notna()
        )
        titles_basics = titles_basics.loc[is_recent_movie, ['tconst', 'primaryTitle']]

    print("Filtered titles_basics with shape:", titles_basics.shape)

    if 'region' in alternatives.columns:
        # Keep US entries and rename the key so it matches the other frames.
        is_us = alternatives['region'] == 'US'
        alternatives = alternatives.loc[is_us, ['titleId']].rename(columns={'titleId': 'tconst'})

    print("Filtered alternatives with shape:", alternatives.shape)

    if 'category' in titles_principals.columns:
        # Same three roles as for names_basics, this time taken from the per-title credit category.
        is_cast_or_director = titles_principals['category'].isin(['actor', 'actress', 'director'])
        titles_principals = titles_principals.loc[is_cast_or_director, ['tconst', 'nconst']]

    print("Filtered titles_principals with shape:", titles_principals.shape)

    if 'numVotes' in ratings.columns:
        # Unparseable vote counts or ratings become NaN and are dropped by the comparisons.
        num_votes = pd.to_numeric(ratings['numVotes'], errors='coerce')
        average_rating = pd.to_numeric(ratings['averageRating'], errors='coerce')
        ratings = ratings.loc[(num_votes > 1000) & (average_rating >= 7.0), ['tconst']]

    print("Filtered ratings with shape:", ratings.shape)

    print("All datasets filtered successfully.")


In [5]:
def create_best_collabs(output_path="data/collabs.csv") -> None:
    """
    Build the table of credits on well-rated US movies and save it as a CSV file.

    Runs filter_datasets() first, then inner-joins the filtered credits (titles_principals) with
    titles_basics, ratings, alternatives and names_basics, so only credits whose title and person
    survive every filter remain. The result has the columns tconst, nconst, title and name.
    :param output_path: where to write the CSV file; missing parent directories are created.
        The default uses a forward slash so that it resolves to the same location on every platform.
    :return: None. The merged frame is printed (shape and first rows) and written to disk, not returned.
    """
    filter_datasets()

    # alternatives holds one row per US entry of a title; reduce it to unique keys first so the
    # join does not multiply rows that would only be removed again by drop_duplicates().
    us_titles = alternatives[['tconst']].drop_duplicates()

    # Join the credits with titles, ratings, US availability and finally the people's names.
    merged_df = (titles_principals
                 .merge(titles_basics, on="tconst")
                 .merge(ratings, on="tconst")
                 .merge(us_titles, on="tconst")
                 .merge(names_basics, on="nconst", how="inner")
                 # A person credited more than once on a title would otherwise appear repeatedly.
                 .drop_duplicates()
                 .rename(columns={'primaryTitle': 'title', 'primaryName': 'name'}))
    print("Merged with shape:", merged_df.shape)
    print(merged_df.head())

    destination = Path(output_path)
    # to_csv does not create directories, so make sure the target folder exists.
    destination.parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_csv(destination, index=False)
