In [21]:
#import dependencies
import ast

import pandas as pd

In [None]:
# Read the CSV files
movies_metadata_csv = pd.read_csv("src/movies_metadata.csv", low_memory=False)
credits_part1_csv = pd.read_csv("src/credits_part1.csv", low_memory=False)
credits_part2_csv = pd.read_csv("src/credits_part2.csv", low_memory=False)
# Only a fixed subset of the OMDb columns is kept
omdb_data_csv = pd.read_csv("src/omdb_data.csv", low_memory=False).loc[
    :, ["BoxOffice", "imdbID", "Rated", "Awards", "imdbRating", "imdbVotes"]
]

# Combine the two credits CSVs into a single DataFrame and drop duplicate ids
credits_csv = pd.concat([credits_part1_csv, credits_part2_csv]).drop_duplicates(subset='id')

# Clean and prepare movies metadata: convert ids to numeric, drop NaN ids, and remove duplicates
# (errors="coerce" turns non-numeric ids into NaN so dropna can discard those rows)
movies_metadata_csv = (
    movies_metadata_csv
    .assign(id=pd.to_numeric(movies_metadata_csv["id"], errors="coerce"))
    .dropna(subset=["id"])
    .drop_duplicates(subset='id')
)

# Merge movies metadata with credits on 'id', convert id to int, and set 'id' as the index
merged_df = (
    pd.merge(movies_metadata_csv, credits_csv, on='id', how='left')
    .astype({'id': 'int'})
    .set_index('id')
)

# Convert the 'release_date' column to datetime *before* filtering, so the
# cutoff below is a real date comparison rather than a lexicographic string
# comparison; values that cannot be parsed become NaT.
merged_df['release_date'] = pd.to_datetime(merged_df['release_date'], errors='coerce')

# Filter for English language films and release dates on or after 1980-01-01.
# NaT compares False, so rows without a usable date are excluded.
# .copy() makes the result an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
merged_df = merged_df[
    (merged_df["original_language"] == "en")
    & (merged_df["release_date"] >= pd.Timestamp("1980-01-01"))
].copy()
# Define function to convert genres into a readable list
def get_genre_names(genre_str):
    """Extract the genre names from a stringified list of genre records.

    Args:
        genre_str: Text holding a Python-literal list of dicts that each have
            a ``'name'`` key, e.g. ``"[{'name': 'Comedy'}]"``. ``None`` and
            NaN (a missing cell) are treated as "no genres".

    Returns:
        A list with the ``'name'`` value of every record, in original order.

    Raises:
        ValueError, SyntaxError: If ``genre_str`` is not a valid Python literal.
    """
    # NaN is the only value that does not equal itself.
    if genre_str is None or genre_str != genre_str:
        return []
    # literal_eval accepts only literals, so text coming from the CSV cannot
    # execute arbitrary code the way eval() would.
    return [genre['name'] for genre in ast.literal_eval(genre_str)]

# Apply the function to modify the genres column, and convert the 'budget'
# column to numeric (NaN -> 0) as integers. assign() returns a new frame, so
# this never writes into a slice of another DataFrame.
merged_df = merged_df.assign(
    genres=merged_df['genres'].apply(get_genre_names),
    budget=pd.to_numeric(merged_df['budget'], errors='coerce').fillna(0).astype(int),
)

# Rename 'imdbID' to 'imdb_id' for merging with omdb_data
omdb_data_csv = omdb_data_csv.rename(columns={"imdbID": "imdb_id"})

# OMDb rows without an id cannot identify a movie; drop them so they are never
# joined (pandas matches NaN keys to each other), then keep one row per id.
omdb_data_csv = (
    omdb_data_csv
    .dropna(subset=['imdb_id'])
    .drop_duplicates(subset='imdb_id', keep='first')
)

# Drop duplicate movies by 'imdb_id', but keep every movie whose imdb_id is
# missing: duplicated() treats all NaN values as equal, which would otherwise
# collapse all of those movies into a single row.
merged_df = merged_df[
    merged_df['imdb_id'].isna()
    | ~merged_df.duplicated(subset='imdb_id', keep='first')
]

# Merge merged_df with omdb_data_csv on 'imdb_id'. A column-based merge
# discards the left index, so move 'id' into a column for the merge and
# restore it as the index afterwards.
merged_df = (
    merged_df.reset_index()
    .merge(omdb_data_csv, on='imdb_id', how='left')
    .set_index('id')
)

# Display the final merged DataFrame
merged_df
