# In[57]:
#import dependencies
import pandas as pd

# In[58]:
# Read the CSV inputs. low_memory=False makes pandas parse each file in one
# pass so column dtypes are inferred consistently; this uses MORE memory
# than the default chunked parsing, not less.
movies_metadata_csv = pd.read_csv("src/movies_metadata.csv", low_memory=False)

# Clean the movies metadata:
#   * coerce 'id' to a number and 'release_date' to a datetime
#     (values that cannot be parsed become NaN / NaT),
#   * drop rows whose 'id' could not be parsed,
#   * keep only the first row for each 'id'.
movies_metadata_csv = (
    movies_metadata_csv
    .assign(
        id=pd.to_numeric(movies_metadata_csv["id"], errors="coerce"),
        release_date=pd.to_datetime(movies_metadata_csv['release_date'], errors='coerce')
    )
    .dropna(subset=["id"])
    .drop_duplicates(subset='id')
)

# OMDB data: rename the key column so it matches the movies metadata, then
# keep the first row per 'imdb_id'.
omdb_data_csv = (
    pd.read_csv("src/omdb_data.csv", low_memory=False)
    .rename(columns={"imdbID": "imdb_id"})
    # pandas merges match null keys against each other, so an OMDB row with a
    # missing imdb_id would be joined onto every movie that has no imdb_id.
    # Such rows can never be joined meaningfully, so drop them up front.
    .dropna(subset=['imdb_id'])
    .drop_duplicates(subset='imdb_id', keep='first')
)

# Credits are split across two files; stack them and keep one row per 'id'.
credits_csv = pd.concat([
    pd.read_csv("src/credits_part1.csv", low_memory=False),
    pd.read_csv("src/credits_part2.csv", low_memory=False)
]).drop_duplicates(subset='id')

# Left-join credits (on 'id') and OMDB data (on 'imdb_id') onto the movies
# metadata so every cleaned metadata row is kept, cast 'id' to int (safe
# because rows with a missing id were dropped above), and index by 'imdb_id'.
merged_df = (
    pd.merge(movies_metadata_csv, credits_csv, on='id', how='left')
    .merge(omdb_data_csv, on='imdb_id', how='left')
    .astype({'id': 'int'})
    .set_index('imdb_id')
)

# Clean specific columns in the merged DataFrame:
#   * Year, BoxOffice and imdbVotes are parsed as numbers; anything that
#     cannot be parsed becomes NaN.
#   * cast / crew values equal to the literal string "[]" are blanked out.
#   * Rows missing Year, cast, crew or Language are then dropped.
merged_df = (
    merged_df
    .assign(
        Year=pd.to_numeric(merged_df['Year'], errors='coerce'),
        # regex=False: '$' is a regex anchor and the default of `regex`
        # differs between pandas versions, so request literal replacement.
        BoxOffice=pd.to_numeric(
            merged_df['BoxOffice']
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False),
            errors='coerce',
        ),
        imdbVotes=pd.to_numeric(
            merged_df['imdbVotes'].str.replace(',', '', regex=False),
            errors='coerce',
        ),
        # Use mask() rather than replace("[]", None): with value=None,
        # Series.replace is version-dependent and on older pandas pad-fills
        # from the previous row instead of producing a missing value.
        cast=merged_df['cast'].mask(merged_df['cast'] == "[]"),
        crew=merged_df['crew'].mask(merged_df['crew'] == "[]"),
    )
    .dropna(subset=['Year', 'cast', 'crew', 'Language'])
)

# Total Awards: the sum of every whitespace-separated token in 'Awards' that
# consists only of digits. Non-string entries contribute 0.
def _sum_award_counts(awards_text):
    """Return the sum of the all-digit tokens in *awards_text* (0 if not a str)."""
    if not isinstance(awards_text, str):
        return 0
    return sum(int(token) for token in awards_text.split() if token.isdigit())


merged_df['Total Awards'] = list(map(_sum_award_counts, merged_df['Awards']))

# Decade (int): floor-divide the year by 10, then scale back up,
# so e.g. 1987 -> 1980.
merged_df['Decade'] = merged_df['Year'].floordiv(10).mul(10).astype(int)

# Season_of_Year: look up the release month in a month -> season table built
# from the season -> months listing below.
merged_df['Season_of_Year'] = merged_df['release_date'].dt.month.map({
    month: season
    for season, months in (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for month in months
})

# BoxOffice_millions: box office expressed in millions.
merged_df['BoxOffice_millions'] = merged_df['BoxOffice'].div(1_000_000)

# Keep only rows that satisfy every criterion: English original language,
# box office above 1,000,000, released on/after 1980-01-01, Year before 2018,
# an IMDb rating present, Type 'movie', and a rating in the allowed list.
keep_rows = (
    merged_df["original_language"].eq("en")
    & merged_df["BoxOffice"].gt(1000000)
    & (merged_df["release_date"] >= "1980-01-01")
    & merged_df["Year"].lt(2018)
    & merged_df['imdbRating'].notna()
    & merged_df["Type"].eq('movie')
    & merged_df["Rated"].isin([
        "R", "Not Rated", "PG-13", "PG",
        "Unrated", "TV-14", "TV-MA", "G",
        "TV-PG", "TV-G"
    ])
)
merged_df = merged_df.loc[keep_rows]

# Remove the columns that are not needed in the exported dataset.
merged_df = merged_df.drop(columns=[
    'adult', 'belongs_to_collection', 'genres', 'homepage',
    'original_title', 'popularity', 'production_companies',
    'production_countries', 'Released', 'spoken_languages',
    'status', 'tagline', 'video', 'Title', 'Runtime',
    'Plot', 'poster_path', 'Ratings', 'Metascore',
    'DVD', 'Production', 'Website', 'Response',
    'Season', 'Episode', 'seriesID', 'totalSeasons',
    'vote_average', 'vote_count', 'budget', 'revenue', 'Type', 'Poster', 'original_language',
    'overview', 'Director', 'Writer', 'Actors'
])

# Write the final DataFrame to disk.
merged_df.to_csv("merged_df.csv")
