In [1]:
import pandas as pd
import json
import ast  # For safely evaluating literal string representations of lists/dicts

def parse_json_column(dataframe, column_name):
    """Decode a column of serialized lists/dicts into Python objects.

    Every cell is decoded on its own: strict JSON is tried first and, if that
    fails, the text is read as a Python literal with ``ast.literal_eval``
    (which only evaluates literals, never arbitrary code). Choosing the parser
    per cell matters because a single Python-style cell (single quotes,
    ``None``) must not push JSON-style cells (``null``, ``true``) through
    ``literal_eval``, which rejects them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to decode. It is not modified.
    column_name : str
        Name of the column whose cells hold the serialized values.

    Returns
    -------
    pandas.Series
        One decoded value per input cell, on the same index. Missing cells
        become ``[]``; cells that already hold a list or dict are returned
        unchanged.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``dataframe``.
    ValueError, SyntaxError
        If a cell is neither valid JSON nor a valid Python literal.
    """
    def _parse_cell(value):
        # Already-decoded containers pass straight through. Testing them first
        # also keeps pd.isna away from lists, where it returns an array.
        if isinstance(value, (list, dict)):
            return value
        if pd.isna(value):
            return []
        try:
            return json.loads(value)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass; TypeError covers
            # non-string cells. Fall back to a Python literal for this cell only.
            return ast.literal_eval(value)

    return dataframe[column_name].apply(_parse_cell)

In [3]:
# Read both raw CSV exports (paths are relative to the working directory)
credits_df, movies_df = (
    pd.read_csv(csv_name)
    for csv_name in ('tmdb_5000_credits.csv', 'tmdb_5000_movies.csv')
)

In [4]:
def transform_credits(credits_df):
    """Split the credits frame into cast, crew and persons tables.

    The ``cast`` and ``crew`` columns are decoded with ``parse_json_column``
    and every list entry becomes one row tagged with the row's ``movie_id``.
    ``credits_df`` itself is left untouched (no helper columns are added).

    Parameters
    ----------
    credits_df : pandas.DataFrame
        Must provide the columns ``movie_id``, ``cast`` and ``crew``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cast_table_df, crew_table_df, persons_table)`` where

        * ``cast_table_df`` has columns ``movie_id, cast_id, character,
          credit_id, id, order``;
        * ``crew_table_df`` has columns ``movie_id, credit_id, department,
          id, job``;
        * ``persons_table`` has columns ``person_id, name, gender`` with one
          row per distinct ``person_id``. Cast rows come first, so a person
          who appears in both lists keeps the values seen in the cast entry.

        All three frames keep these columns even when there are no rows.
    """
    cast_fields = ['cast_id', 'character', 'credit_id', 'gender', 'id', 'name', 'order']
    crew_fields = ['credit_id', 'department', 'gender', 'id', 'job', 'name']

    def _explode(column_name, fields):
        # One output row per list entry of `column_name`, keyed by movie_id.
        parsed = parse_json_column(credits_df, column_name)
        rows = [
            {'movie_id': movie_id, **{field: member.get(field) for field in fields}}
            for movie_id, members in zip(credits_df['movie_id'], parsed)
            for member in members
        ]
        # Explicit columns keep the schema stable when `rows` is empty;
        # a bare pd.DataFrame([]) would have no columns to select from later.
        return pd.DataFrame(rows, columns=['movie_id', *fields])

    cast_full = _explode('cast', cast_fields)
    crew_full = _explode('crew', crew_fields)

    # Persons: distinct (id, name, gender) from each side, then one row per id.
    person_cols = ['id', 'name', 'gender']
    persons_table = (
        pd.concat([
            cast_full[person_cols].drop_duplicates(),
            crew_full[person_cols].drop_duplicates(),
        ])
        .rename(columns={'id': 'person_id'})
        .drop_duplicates(subset=['person_id'])
    )

    # Person attributes now live in persons_table; keep only the id in the
    # cast/crew tables.
    cast_table_df = cast_full.drop(columns=['gender', 'name'])
    crew_table_df = crew_full.drop(columns=['gender', 'name'])

    return cast_table_df, crew_table_df, persons_table

In [5]:
def transform_movies(movies_df):
    """Normalise the movies frame into a main table plus relation/lookup tables.

    The list-valued columns ``genres``, ``keywords``, ``production_companies``,
    ``production_countries`` and ``spoken_languages`` are decoded with
    ``parse_json_column``. Each yields a relation table (movie id -> item id)
    and a lookup table of the distinct (item id, item name) pairs.
    ``movies_df`` itself is left untouched (no helper columns are added).

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Must provide ``id`` and the scalar columns copied into the main table
        (``title``, ``budget``, ``homepage``, ``original_language``,
        ``original_title``, ``overview``, ``popularity``, ``release_date``,
        ``revenue``, ``runtime``, ``status``, ``tagline``, ``vote_average``,
        ``vote_count``). Any of the five list-valued columns may be absent;
        its two tables are then empty but still carry their columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(movies_table, genres_table_df, unique_genres, keywords_table_df,
        unique_keywords, production_companies_df, unique_companies,
        production_countries_df, unique_countries, languages_df,
        unique_languages)``.

    Raises
    ------
    KeyError
        If ``id`` or one of the scalar columns is missing.
    """
    def _split_relation(column, source_key, id_field, name_field):
        # Build (relation, lookup) for one list-valued column.
        rows = []
        if column in movies_df.columns:
            parsed = parse_json_column(movies_df, column)
            for movie_id, items in zip(movies_df['id'], parsed):
                # Cells that did not decode to a list are skipped.
                if isinstance(items, list):
                    rows.extend(
                        {
                            'movie_id': movie_id,
                            id_field: item.get(source_key),
                            name_field: item.get('name'),
                        }
                        for item in items
                    )
        # Explicit columns keep the schema stable when there are no rows;
        # a bare pd.DataFrame([]) would make the selections below fail.
        pairs = pd.DataFrame(rows, columns=['movie_id', id_field, name_field])
        lookup = pairs[[id_field, name_field]].drop_duplicates()
        relation = pairs.drop(columns=[name_field])
        return relation, lookup

    # Main movies table: scalar columns only, with `id` exposed as `movie_id`.
    movies_table = movies_df[[
        'id', 'title', 'budget', 'homepage', 'original_language',
        'original_title', 'overview', 'popularity', 'release_date',
        'revenue', 'runtime', 'status', 'tagline', 'vote_average', 'vote_count'
    ]].rename(columns={'id': 'movie_id'})

    genres_table_df, unique_genres = _split_relation(
        'genres', 'id', 'genre_id', 'genre_name')
    keywords_table_df, unique_keywords = _split_relation(
        'keywords', 'id', 'keyword_id', 'keyword_name')
    production_companies_df, unique_companies = _split_relation(
        'production_companies', 'id', 'company_id', 'company_name')
    production_countries_df, unique_countries = _split_relation(
        'production_countries', 'iso_3166_1', 'country_iso', 'country_name')
    languages_df, unique_languages = _split_relation(
        'spoken_languages', 'iso_639_1', 'language_iso', 'language_name')

    return (movies_table, genres_table_df, unique_genres, keywords_table_df, unique_keywords,
            production_companies_df, unique_companies, production_countries_df, unique_countries,
            languages_df, unique_languages)

In [7]:
# Normalise the movies frame into its main, relation and lookup tables
(
    movies_table,
    genres_table_df, unique_genres,
    keywords_table_df, unique_keywords,
    production_companies_df, unique_companies,
    production_countries_df, unique_countries,
    languages_df, unique_languages,
) = transform_movies(movies_df)

# Split the credits frame into cast, crew and persons tables
cast_table_df, crew_table_df, persons_table = transform_credits(credits_df)

In [None]:
# Optional export: uncomment the lines below to write each table to its own CSV file

# movies_table.to_csv('movies_details.csv', index=False)

# cast_table_df.to_csv('cast.csv', index=False)
# crew_table_df.to_csv('crew.csv', index=False)
# persons_table.to_csv('persons.csv', index=False)

# genres_table_df.to_csv('movie_genres.csv', index=False)
# unique_genres.to_csv('genres.csv', index=False)

# keywords_table_df.to_csv('movie_keywords.csv', index=False)
# unique_keywords.to_csv('keywords.csv', index=False)

# production_companies_df.to_csv('movie_production_companies.csv', index=False)
# unique_companies.to_csv('production_companies.csv', index=False)

# production_countries_df.to_csv('movie_production_countries.csv', index=False)
# unique_countries.to_csv('production_countries.csv', index=False)

# languages_df.to_csv('movie_languages.csv', index=False)
# unique_languages.to_csv('languages.csv', index=False)