In [1]:
#import dependencies
import pandas as pd
import json

In [2]:
# Load the four raw tables. low_memory=False asks pandas not to parse each
# file in internal chunks, which avoids mixed-type column inference at the
# cost of more memory while reading.
(
    movies_metadata_csv,
    credits_part1_csv,
    credits_part2_csv,
    omdb_data_csv,
) = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "src/movies_metadata.csv",
        "src/credits_part1.csv",
        "src/credits_part2.csv",
        "src/omdb_data.csv",
    )
)

# Stack the two credits shards, then keep a single row per 'id'
credits_csv = pd.concat([credits_part1_csv, credits_part2_csv])
credits_csv = credits_csv.drop_duplicates(subset='id')

# Align the OMDB key column with the other tables ('imdbID' -> 'imdb_id'),
# then keep only the first row seen for each 'imdb_id'
omdb_data_csv = (
    omdb_data_csv
    .rename(columns={"imdbID": "imdb_id"})
    .drop_duplicates(subset='imdb_id', keep='first')
)

# Tidy the movies metadata in one pass:
#   1. coerce 'id' to a number (unparseable values become NaN)
#   2. coerce 'release_date' to datetime (unparseable values become NaT)
#   3. discard rows whose 'id' could not be parsed
#   4. keep one row per 'id'
movies_metadata_csv = (
    movies_metadata_csv
    .assign(
        id=lambda frame: pd.to_numeric(frame["id"], errors="coerce"),
        release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'),
    )
    .loc[lambda frame: frame["id"].notna()]
    .drop_duplicates(subset='id')
)

# Left-join the credits onto the cleaned metadata by 'id', cast 'id' to an
# integer and move it into the index
merged_df = movies_metadata_csv.merge(credits_csv, on='id', how='left')
merged_df = merged_df.astype({'id': 'int'}).set_index('id')

# Left-join the OMDB rows by 'imdb_id'.
# NOTE(review): a column-on-column merge ignores the existing index, so the
# 'id' index set above is not carried into the result - confirm that dropping
# 'id' here is intended.
merged_df = merged_df.merge(omdb_data_csv, on='imdb_id', how='left')

# Convert BoxOffice text to a number after stripping '$' and ',' characters;
# anything that still cannot be parsed becomes NaN.
# regex=False is passed explicitly because '$' is a regex metacharacter and
# the default value of `regex` differs between pandas versions.
merged_df['BoxOffice'] = pd.to_numeric(
    merged_df['BoxOffice']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
)

# Treat the literal string "[]" in 'cast' and 'crew' as missing.
# The mapping form is used on purpose: older pandas releases read a bare
# replace("[]", None) as "no replacement value supplied" and fill from the
# previous row instead of writing None.
merged_df[['cast', 'crew']] = merged_df[['cast', 'crew']].replace({"[]": None})

# Strip thousands separators from imdbVotes and convert it to a number
merged_df['imdbVotes'] = pd.to_numeric(
    merged_df['imdbVotes'].str.replace(',', '', regex=False),
    errors='coerce',
)

# Sum of the numbers in Awards:
# extract the numeric tokens from the text and add them up with a small helper
def sum_awards(awards) -> int:
    """Add up every standalone whole number in an awards description.

    The text is split on whitespace; each token made up solely of decimal
    digits is converted with ``int`` and added to the total. Tokens that mix
    digits with other characters (for example ``"3."``) are ignored.

    Args:
        awards: Awards text. Any non-string value (for example NaN or None)
            is accepted and counts as zero.

    Returns:
        The sum of the numeric tokens, or 0 when ``awards`` is not a string
        or contains no numeric token.
    """
    if not isinstance(awards, str):
        return 0
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as '²' that int() cannot parse, which would raise ValueError.
    return sum(int(token) for token in awards.split() if token.isdecimal())

# Derive a per-film tally from the free-text Awards column
merged_df['Total Awards'] = merged_df['Awards'].map(sum_awards)

# Keep a row only when every condition below holds:
#   - the original language is English
#   - BoxOffice is above 1 million (rows with a missing BoxOffice fail this test)
#   - the release date is 1980-01-01 or later
#   - an imdbRating is present
#   - the OMDB Type is 'movie'
#   - the Rated value is one of the listed certificates
merged_df = merged_df.loc[
    lambda frame: (
        frame["original_language"].eq("en")
        & frame["BoxOffice"].gt(1000000)
        & (frame["release_date"] >= "1980-01-01")
        & frame['imdbRating'].notna()
        & frame["Type"].eq('movie')
        & frame["Rated"].isin([
            "R", "Not Rated", "PG-13", "PG",
            "Unrated", "TV-14", "TV-MA", "G",
            "TV-PG", "TV-G",
        ])
    )
]

# Remove the columns the analysis does not use, then key the frame by 'imdb_id'
merged_df = (
    merged_df
    .drop(columns=[
        'adult', 'belongs_to_collection', 'genres', 'homepage',
        'original_title', 'popularity', 'production_companies',
        'production_countries', 'Released', 'spoken_languages',
        'status', 'tagline', 'video', 'Title', 'Runtime',
        'Plot', 'poster_path', 'Ratings', 'Metascore',
        'DVD', 'Production', 'Website', 'Response',
        'Season', 'Episode', 'seriesID', 'totalSeasons',
        'vote_average', 'vote_count', 'budget', 'revenue',
    ])
    .set_index('imdb_id')
)

# Show the finished frame (append .to_csv("merged_df.csv") to write it out)
merged_df


Unnamed: 0_level_0,original_language,overview,release_date,runtime,title,cast,crew,Year,Rated,Genre,...,Actors,Language,Country,Awards,Poster,imdbRating,imdbVotes,Type,BoxOffice,Total Awards
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,81.0,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1995,G,"Animation, Adventure, Comedy",...,"Tom Hanks, Tim Allen, Don Rickles",English,United States,Nominated for 3 Oscars. 29 wins & 24 nominatio...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,8.3,1084764.0,movie,223225679.0,56
tt0113497,en,When siblings Judy and Peter discover an encha...,1995-12-15,104.0,Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",1995,PG,"Adventure, Comedy, Family",...,"Robin Williams, Kirsten Dunst, Bonnie Hunt","English, French",United States,4 wins & 11 nominations,https://m.media-amazon.com/images/M/MV5BZTk2Zm...,7.1,382657.0,movie,100499940.0,15
tt0113228,en,A family wedding reignites the ancient feud be...,1995-12-22,101.0,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",1995,PG-13,"Comedy, Romance",...,"Walter Matthau, Jack Lemmon, Ann-Margret","English, Italian, German",United States,2 wins & 2 nominations,https://m.media-amazon.com/images/M/MV5BMDkwYT...,6.7,30011.0,movie,71518503.0,4
tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,127.0,Waiting to Exhale,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",1995,R,"Comedy, Drama, Romance",...,"Whitney Houston, Angela Bassett, Loretta Devine",English,United States,9 wins & 10 nominations,https://m.media-amazon.com/images/M/MV5BZWU4Nz...,6.0,12462.0,movie,67052156.0,19
tt0113041,en,Just when George Banks has recovered from his ...,1995-02-10,106.0,Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",1995,PG,"Comedy, Family, Romance",...,"Steve Martin, Diane Keaton, Martin Short",English,United States,1 win & 2 nominations,https://m.media-amazon.com/images/M/MV5BOTMwNz...,6.1,42152.0,movie,76594107.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt5362988,en,An FBI agent teams with the town's veteran gam...,2017-08-03,111.0,Wind River,"[{'cast_id': 9, 'character': 'Cory Lambert', '...","[{'credit_id': '572815d0c3a3687a00001314', 'de...",2017,R,"Crime, Drama, Mystery",...,"Elizabeth Olsen, Jeremy Renner, Graham Greene",English,"United Kingdom, France, United States",17 wins & 27 nominations,https://m.media-amazon.com/images/M/MV5BMTUyMj...,7.7,284501.0,movie,33800859.0,44
tt5322012,en,A teenage girl discovers a box with magical po...,2017-07-07,90.0,Wish Upon,"[{'cast_id': 0, 'character': 'Claire', 'credit...","[{'credit_id': '589beb06c3a3684ce50027ab', 'de...",2017,PG-13,"Drama, Fantasy, Horror",...,"Joey King, Ryan Phillippe, Ki Hong Lee",English,"United States, Canada",2 nominations,https://m.media-amazon.com/images/M/MV5BMGY5YT...,5.1,23251.0,movie,14301505.0,2
tt3564472,en,Four girlfriends take a trip to New Orleans fo...,2017-07-21,122.0,Girls Trip,"[{'cast_id': 6, 'character': 'Ryan Pierce', 'c...","[{'credit_id': '597a1749c3a3686868015efc', 'de...",2017,R,"Adventure, Comedy, Drama",...,"Regina Hall, Queen Latifah, Jada Pinkett Smith",English,"United States, Canada, China",13 wins & 42 nominations,https://m.media-amazon.com/images/M/MV5BMjMwNT...,6.2,41022.0,movie,115171585.0,55
tt5390504,en,A police raid in Detroit in 1967 results in on...,2017-07-28,143.0,Detroit,"[{'cast_id': 1, 'character': 'Melvin Dismukes'...","[{'credit_id': '5790cb97c3a3685b2600abfb', 'de...",2017,R,"Crime, Drama, History",...,"John Boyega, Anthony Mackie, Algee Smith",English,United States,4 wins & 21 nominations,https://m.media-amazon.com/images/M/MV5BMTg4MD...,7.3,56404.0,movie,16790139.0,25
