In [4]:
import pandas as pd

# Directory the raw title.* / name.* TSV files are read from (relative path).
# The trailing slash matters: file paths are built by plain f-string concatenation.
RAW_PATH = "../datasets/raw_data/"
# Directory every filtered CSV produced by this notebook is written to (same trailing-slash rule).
DATA_PATH = "../datasets/filtered_data/"

# Define a function to process chunks of the file
def filter_movies_chunk(chunk, min_votes=None):
    """Return the rows of ``chunk`` whose ``titleType`` is ``'movie'``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``titleType`` column.
    min_votes : int or None, optional
        Minimum vote count. When it is not ``None`` *and* ``chunk`` has a
        ``numVotes`` column, only rows with ``numVotes >= min_votes`` are
        kept in addition to the title-type filter. When it is ``None``, or
        the chunk has no ``numVotes`` column, only the title-type filter is
        applied, which is what every existing call relies on.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    # Filter basics to include only movies
    movies_chunk = chunk[chunk['titleType'] == 'movie']

    # Previously `min_votes` was accepted but never used; honor it whenever
    # the vote counts are actually available in this chunk.
    if min_votes is not None and 'numVotes' in movies_chunk.columns:
        movies_chunk = movies_chunk[movies_chunk['numVotes'] >= min_votes]

    return movies_chunk

# Read title.basics chunk by chunk and keep only the 'movie' rows of each chunk
chunksize = 10 ** 6  # Adjust the chunk size based on your memory capacity
basics_reader = pd.read_csv(
    f'{RAW_PATH}title.basics.tsv',
    sep='\t',
    chunksize=chunksize,
    low_memory=False,
)
filtered_chunks = [
    filter_movies_chunk(chunk, min_votes=None) for chunk in basics_reader
]

# Stitch the per-chunk results into one frame with a fresh 0..n-1 index
movies = pd.concat(filtered_chunks, ignore_index=True)

# The ratings file is read in a single call (no chunking)
ratings = pd.read_csv(f'{RAW_PATH}title.ratings.tsv', sep='\t', low_memory=False)

# Join movies with their ratings on the shared 'tconst' key
merged = movies.merge(ratings, on='tconst')

# Keep only the movies that reach the vote threshold
min_votes = 1000
has_enough_votes = merged['numVotes'] >= min_votes
filtered_movies = merged.loc[has_enough_votes]

# Write the result to disk, leaving the DataFrame index out of the file
filtered_movies.to_csv(f'{DATA_PATH}filtered_movies.csv', index=False)


In [6]:
import pandas as pd

# Define chunk processing function
def filter_principals_chunk(chunk, movie_ids):
    """Keep the rows of ``chunk`` whose ``tconst`` appears in ``movie_ids``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``tconst`` column.
    movie_ids : collection
        The ``tconst`` values to keep; passed straight to ``Series.isin``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    belongs_to_kept_movie = chunk['tconst'].isin(movie_ids)
    return chunk.loc[belongs_to_kept_movie]

# Reload the filtered movies CSV written by the earlier cell. The `filter`
# helper defined below reads this module-level frame to get its movie IDs.
filtered_movies = pd.read_csv(f'{DATA_PATH}filtered_movies.csv')


def filter(name):
    """Write the rows of ``title.<name>.tsv`` that belong to the filtered movies.

    Reads ``{RAW_PATH}title.{name}.tsv`` in chunks, keeps the rows whose
    ``tconst`` occurs in the module-level ``filtered_movies`` frame, and
    saves the combined result to ``{DATA_PATH}filtered_{name}.csv``.

    Note: this function's name shadows Python's built-in ``filter`` in the
    namespace where it is defined.

    Parameters
    ----------
    name : str
        Middle part of the dataset file name, e.g. ``"principals"``.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # IDs of the movies that survived the earlier filtering step
    wanted_ids = set(filtered_movies['tconst'])

    # Adjust the chunk size based on your memory capacity
    rows_per_chunk = 10 ** 6
    reader = pd.read_csv(
        f'{RAW_PATH}title.{name}.tsv',
        sep='\t',
        chunksize=rows_per_chunk,
        low_memory=False,
    )

    # Filter each chunk as it is read, then glue the survivors together
    kept_pieces = [filter_principals_chunk(piece, wanted_ids) for piece in reader]
    combined = pd.concat(kept_pieces, ignore_index=True)

    # Write the filtered rows without the DataFrame index
    combined.to_csv(f'{DATA_PATH}filtered_{name}.csv', index=False)


In [8]:
# Keep only the title.ratings rows of the filtered movies; writes filtered_ratings.csv under DATA_PATH
filter("ratings")

In [None]:
# Keep only the title.crew rows of the filtered movies; writes filtered_crew.csv under DATA_PATH
filter("crew")

In [None]:
# Keep only the title.principals rows of the filtered movies; writes filtered_principals.csv under DATA_PATH
filter("principals")

In [12]:
import pandas as pd

# Read back the principals rows that were kept for the filtered movies
filtered_principals = pd.read_csv(f'{DATA_PATH}filtered_principals.csv')

# Cast rows only: the category must be either 'actor' or 'actress'
is_cast_row = filtered_principals['category'].isin(['actor', 'actress'])
actors_actresses = filtered_principals[is_cast_row]

# Narrow the frame down to the four columns that are kept
cast_columns = ['tconst', 'ordering', 'nconst', 'characters']
actors_actresses_filtered = actors_actresses[cast_columns]

# Write the cast table to disk, leaving the DataFrame index out of the file
actors_actresses_filtered.to_csv(f'{DATA_PATH}actors_actresses.csv', index=False)


In [14]:
import pandas as pd

# Load the actors/actresses CSV written by the previous cell
actors_actresses = pd.read_csv(f'{DATA_PATH}actors_actresses.csv')

# Collect the nconst values into a set (duplicates collapse); it is used
# below to select the matching rows of name.basics
actor_actress_ids = set(actors_actresses['nconst'])

# Define chunk processing function
def filter_name_basics_chunk(chunk, actor_actress_ids):
    """Select the rows of ``chunk`` whose ``nconst`` is in ``actor_actress_ids``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with an ``nconst`` column.
    actor_actress_ids : collection
        The ``nconst`` values to keep; passed straight to ``Series.isin``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    return chunk.loc[chunk['nconst'].isin(actor_actress_ids)]

# Read name.basics chunk by chunk, keeping only the people in actor_actress_ids
chunksize = 10 ** 6  # Adjust the chunk size based on your memory capacity
name_basics_reader = pd.read_csv(
    f'{RAW_PATH}name.basics.tsv',
    sep='\t',
    chunksize=chunksize,
    low_memory=False,
)
filtered_name_basics_chunks = [
    filter_name_basics_chunk(chunk, actor_actress_ids)
    for chunk in name_basics_reader
]

# Combine the per-chunk results into one frame with a fresh 0..n-1 index
filtered_name_basics = pd.concat(filtered_name_basics_chunks, ignore_index=True)

# Write the result to disk, leaving the DataFrame index out of the file
filtered_name_basics.to_csv(f'{DATA_PATH}filtered_name_basics.csv', index=False)
