In [1]:
# Import dependencies
import ast

import pandas as pd

In [None]:
# Read the CSV files (low_memory=False makes pandas parse each file in one
# pass instead of in chunks)
movies_metadata_csv = pd.read_csv("src/movies_metadata.csv", low_memory=False)
credits_part1_csv = pd.read_csv("src/credits_part1.csv", low_memory=False)
credits_part2_csv = pd.read_csv("src/credits_part2.csv", low_memory=False)

# Stack the two credits files into one DataFrame and keep the first row seen
# for each id. ignore_index=True gives the combined frame a fresh 0..n-1 index
# rather than repeating the row labels of both parts.
credits_csv = (
    pd.concat([credits_part1_csv, credits_part2_csv], ignore_index=True)
    .drop_duplicates(subset='id')
)

# Clean the metadata ids: values that are not numeric become NaN and those
# rows are dropped, then only the first row per id is kept.
movies_metadata_csv = (
    movies_metadata_csv
    .assign(id=pd.to_numeric(movies_metadata_csv["id"], errors="coerce"))
    .dropna(subset=["id"])
    .drop_duplicates(subset='id')
)

# Left-merge so every remaining movie is kept even when it has no credits row,
# then store id as an integer and use it as the index.
merged_df = (
    pd.merge(movies_metadata_csv, credits_csv, on='id', how='left')
    .astype({'id': 'int'})
    .set_index('id')
)

# Keep English-language movies released on or after 1980-01-01.
# release_date has not been parsed yet at this point, so this is a text
# comparison (assumes the dates are ISO "YYYY-MM-DD" strings - TODO confirm).
is_english = merged_df["original_language"] == "en"
released_since_1980 = merged_df["release_date"] >= "1980-01-01"

# .copy() makes the filtered result an independent DataFrame; the columns are
# reassigned further down and doing that on a filtered selection is what
# raises SettingWithCopyWarning.
merged_df = merged_df[is_english & released_since_1980].copy()

# Define function to convert genres into readable list
def get_genre_names(genre_str):
    """Return the genre names held in a stringified list of genre dicts.

    Args:
        genre_str: Text of a Python literal list of dicts that each have a
            'name' key, e.g. "[{'id': 18, 'name': 'Drama'}]".

    Returns:
        A list with the 'name' value of every dict, in the original order.
        Non-string input (such as the NaN pandas uses for a missing cell)
        gives an empty list.

    Raises:
        ValueError, SyntaxError: If the text is not a valid Python literal.
        KeyError: If an entry has no 'name' key.
    """
    # A missing value is not text and cannot be parsed; treat it as "no genres".
    if not isinstance(genre_str, str):
        return []
    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute code the way eval() would.
    return [genre['name'] for genre in ast.literal_eval(genre_str)]

# Turn every raw genres string into its list of genre names, then write the
# result back into the 'genres' column through .loc[]
genre_name_lists = merged_df['genres'].apply(get_genre_names)
merged_df.loc[:, 'genres'] = genre_name_lists

# Budget: non-numeric entries become NaN, NaN becomes 0, and the column is
# stored as integers
budget_as_number = pd.to_numeric(merged_df['budget'], errors='coerce')
merged_df['budget'] = budget_as_number.fillna(0).astype(int)

# Release date: parse to datetime; values that cannot be parsed become NaT
parsed_release_date = pd.to_datetime(merged_df['release_date'], errors='coerce')
merged_df['release_date'] = parsed_release_date

# Display
merged_df