In [2]:
import pandas as pd

# Read the keyword table and the poster table that decides which movies are kept.
keywords_df = pd.read_csv("keywords.csv")
movies_df = pd.read_csv("posters.csv")

# Cast both join keys to int so the membership test compares like-typed values.
movies_df = movies_df.astype({"tmdb_id": int})
keywords_df = keywords_df.astype({"id": int})

# Keep only the keyword rows whose id appears among the posters' tmdb_id values.
has_poster = keywords_df["id"].isin(movies_df["tmdb_id"])
filtered_keywords = keywords_df.loc[has_poster]

# Write the surviving rows to keyword.csv, leaving out the DataFrame index.
filtered_keywords.to_csv("keyword.csv", index=False)


In [3]:
import pandas as pd
import ast
import uuid

# Read credits.csv; its 'cast' and 'crew' cells are parsed below as Python list literals.
df = pd.read_csv("credits.csv")

# Keys copied out of every cast / crew dict, listed in output-column order.
cast_fields = ('cast_id', 'character', 'credit_id', 'gender', 'id',
               'name', 'order', 'profile_path')
crew_fields = ('credit_id', 'department', 'gender', 'id', 'job',
               'name', 'profile_path')


def _parse_people(cell):
    """Return the literal stored in ``cell``, or an empty list when the cell is null."""
    if pd.notnull(cell):
        return ast.literal_eval(cell)
    return []


def _to_record(person, fields, tmdb_id):
    """Build one flat row: a fresh uuid4 string, the selected fields, then ``tmdb_id``."""
    record = {'uuid': str(uuid.uuid4())}  # generate a unique id per row
    for field in fields:
        record[field] = person.get(field)  # keys absent from the dict become None
    record['tmdb_id'] = tmdb_id
    return record


# Accumulators for the flattened cast and crew rows.
cast_data = []
crew_data = []

# Walk the credits rows, turning every person entry into its own record
# tagged with the 'id' value of the row it came from.
for _, row in df.iterrows():
    tmdb_id = row['id']
    cast_list = _parse_people(row['cast'])
    crew_list = _parse_people(row['crew'])
    cast_data.extend(_to_record(member, cast_fields, tmdb_id) for member in cast_list)
    crew_data.extend(_to_record(member, crew_fields, tmdb_id) for member in crew_list)

# Turn both record lists into DataFrames and write them out without the index.
cast_df = pd.DataFrame(cast_data)
crew_df = pd.DataFrame(crew_data)
cast_df.to_csv("parsed_cast.csv", index=False)
crew_df.to_csv("parsed_crew.csv", index=False)


In [4]:
import pandas as pd

# Read the flattened cast records and the poster table.
cast_df = pd.read_csv("parsed_cast.csv")
posters_df = pd.read_csv("posters.csv")

# Compare the ids as strings so both sides share a single dtype.
cast_df = cast_df.astype({'tmdb_id': str})
posters_df = posters_df.astype({'tmdb_id': str})

# Retain only the cast rows whose tmdb_id also occurs in the poster table.
in_posters = cast_df['tmdb_id'].isin(posters_df['tmdb_id'])
filtered_cast_df = cast_df.loc[in_posters]

# Write the retained rows to cast.csv, leaving out the index column.
filtered_cast_df.to_csv("cast.csv", index=False)


In [5]:
import pandas as pd

# Read the flattened crew records together with the poster table.
crew_df = pd.read_csv("parsed_crew.csv")
posters_df = pd.read_csv("posters.csv")

# String-cast the key on both frames so the membership test sees one dtype.
crew_df = crew_df.astype({'tmdb_id': str})
posters_df = posters_df.astype({'tmdb_id': str})

# A crew row survives only when its tmdb_id is listed in the poster table.
crew_has_poster = crew_df['tmdb_id'].isin(posters_df['tmdb_id'])
filtered_crew_df = crew_df.loc[crew_has_poster]

# Write the surviving crew rows to crew.csv without the index.
filtered_crew_df.to_csv("crew.csv", index=False)


In [6]:
import pandas as pd

# Load the links table and the poster table.
links_df = pd.read_csv("links.csv")
posters_df = pd.read_csv("posters.csv")

# Store tmdbId with pandas' nullable integer dtype: integer-valued ids become
# integers while missing values stay <NA>. Filling the gaps with 0 would
# fabricate an id that could match a poster whose tmdb_id is 0 and would leave
# fake ids behind in links_df.
links_df['tmdbId'] = links_df['tmdbId'].astype('Int64')

# Keep only the links whose tmdbId occurs in posters_df['tmdb_id']; rows with a
# missing tmdbId are reported as not-present by isin and are dropped here.
filtered_links_df = links_df[links_df['tmdbId'].isin(posters_df['tmdb_id'])]

# Save the filtered DataFrame to link.csv, leaving out the index.
filtered_links_df.to_csv("link.csv", index=False)


In [8]:
import pandas as pd

# Read the ratings table and link.csv.
ratings_df = pd.read_csv("ratings.csv")
links_df = pd.read_csv("link.csv")

# Keep a rating only when its movieId is also listed in links_df.
rated_and_linked = ratings_df['movieId'].isin(links_df['movieId'])
filtered_ratings_df = ratings_df.loc[rated_and_linked]

# Write the kept ratings to rate.csv, leaving out the index.
filtered_ratings_df.to_csv("rate.csv", index=False)
