In [5]:
import pandas as pd
import numpy as np
import warnings
import ast

# Suppress every warning raised after this point (no category/module filter is given).
# NOTE(review): a blanket "ignore" also hides pandas warnings such as
# SettingWithCopyWarning and deprecation notices — consider narrowing the filter
# (e.g. with category=...) so real problems stay visible.
warnings.filterwarnings("ignore")

In [6]:
def convert_to_list(text):
    """Extract the ``name`` of every entry in a stringified list of dicts.

    Parameters
    ----------
    text : str
        A Python/JSON-like literal such as
        ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    list
        The ``name`` values in their original order. An empty list is
        returned when ``text`` cannot be parsed or does not evaluate to a
        list/tuple. Entries that are not dicts carrying a ``name`` key are
        skipped instead of raising.
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable input (malformed strings, non-string values such as NaN)
        # is treated as "no entries".
        return []

    if not isinstance(parsed, (list, tuple)):
        # A bare scalar or a single dict is not a collection of entries.
        return []

    return [
        item['name']
        for item in parsed
        if isinstance(item, dict) and 'name' in item
    ]

# Extract director from crew
def get_director(crew_list):
    """Return the name of the first crew member whose job is ``'Director'``.

    Parameters
    ----------
    crew_list : str
        Stringified list of crew dicts; each entry is expected to carry
        ``job`` and ``name`` keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when the input cannot be parsed,
        is not a list/tuple, or contains no usable director entry.
    """
    try:
        crew = ast.literal_eval(crew_list)
    except (ValueError, SyntaxError, TypeError):
        return np.nan

    if not isinstance(crew, (list, tuple)):
        return np.nan

    for member in crew:
        # Skip malformed entries (non-dicts, missing 'job'/'name') rather than
        # letting a KeyError/TypeError abort the whole column conversion.
        if (
            isinstance(member, dict)
            and member.get('job') == 'Director'
            and 'name' in member
        ):
            return member['name']
    return np.nan

In [7]:
# Read the raw credits table, then read the movie metadata and left-join the
# credits onto it (movies.id == credits.movie_id) so every movie row is kept.
credits = pd.read_csv('../data/raw/tmdb_5000_credits.csv', low_memory=False)
movies = pd.read_csv('../data/raw/tmdb_5000_movies.csv', low_memory=False).merge(
    credits,
    how='left',
    left_on='id',
    right_on='movie_id',
)

# Preview the merged frame
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title_x,vote_average,vote_count,movie_id,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
# 'movie_id' is the credits-side join key and mirrors 'id' after the merge.
# errors='ignore' (and no inplace mutation) keeps this statement from raising
# KeyError when the column has already been removed.
movies = movies.drop(columns=['movie_id'], errors='ignore')
print("Available columns:", movies.columns.tolist())

# Keep only the columns used downstream. .copy() yields an independent frame,
# so the column assignments that follow do not write to a slice of the merged
# frame (pandas SettingWithCopyWarning).
movies = movies[['id', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity',
                 'release_date', 'vote_average', 'vote_count', 'budget', 'revenue',
                 'runtime', 'status', 'tagline', 'title_x']].copy()

Available columns: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average', 'vote_count', 'title_y', 'cast', 'crew']


In [9]:
# Turn the stringified list-of-dict columns into plain lists of names
for list_column in ('genres', 'keywords'):
    movies[list_column] = movies[list_column].apply(convert_to_list)

# Pull the director's name out of the crew records, then discard the raw column
movies['director'] = movies['crew'].apply(get_director)
movies.drop('crew', axis=1, inplace=True)

# Handle missing values

# Free-text columns: a missing value becomes an empty string.
for text_column in ('overview', 'tagline'):
    movies[text_column] = movies[text_column].fillna('')

# Cast: a missing value becomes an empty list literal before the names are extracted.
movies['cast'] = movies['cast'].fillna('[]').apply(convert_to_list)

# Dates: values that cannot be parsed become NaT (errors='coerce').
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

# Plain numeric columns: fill gaps with the column median.
for numeric_column in ('popularity', 'vote_average', 'vote_count', 'runtime'):
    movies[numeric_column] = movies[numeric_column].fillna(movies[numeric_column].median())

# Budget / revenue: a value of 0 is treated as "not reported". The median is
# taken AFTER the zeros are masked, so the placeholder zeros do not drag the
# fill value down (the previous code computed it on the unmasked column).
for money_column in ('budget', 'revenue'):
    reported = movies[money_column].replace(0, np.nan)
    movies[money_column] = reported.fillna(reported.median())


# Remove rows with critical missing information (no title, or a release date
# that is missing / could not be parsed), expose the title under its final
# name, and renumber the rows. Renaming first means the step also works when
# 'title_x' has already been renamed to 'title'.
movies = (
    movies
    .rename(columns={'title_x': 'title'})
    .dropna(subset=['title', 'release_date'])
    .reset_index(drop=True)
)

# Display the first few rows of the cleaned dataset
movies.head()

Unnamed: 0,id,overview,genres,keywords,cast,popularity,release_date,vote_average,vote_count,budget,revenue,runtime,status,tagline,title,director
0,19995,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",150.437577,2009-12-10,7.2,11800,237000000.0,2787965000.0,162.0,Released,Enter the World of Pandora.,Avatar,James Cameron
1,285,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",139.082615,2007-05-19,6.9,4500,300000000.0,961000000.0,169.0,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,Gore Verbinski
2,206647,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",107.376788,2015-10-26,6.3,4466,245000000.0,880674600.0,148.0,Released,A Plan No One Escapes,Spectre,Sam Mendes
3,49026,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",112.31295,2012-07-16,7.6,9106,250000000.0,1084939000.0,165.0,Released,The Legend Ends,The Dark Knight Rises,Christopher Nolan
4,49529,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",43.926995,2012-03-07,6.1,2124,260000000.0,284139100.0,132.0,Released,"Lost in our world, found in another.",John Carter,Andrew Stanton
