# Setup

In [1]:
# import glob
# import numpy as np
# import os
import pandas as pd
from typing import Iterable

In [2]:
# # Get all CSV file paths in the folder
# csv_files = glob.glob(os.path.join("Data", '*.csv'))

# # Combine them
# combined_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# # Save the result
# combined_df.to_csv('movies.csv', index=False)

# print(f"Combined {len(csv_files)} files into 'movies.csv'.")

In [2]:
# Genre names. Each one is used both as the stem of a CSV file name under
# `Data/` and as the name of the global variable holding that genre's DataFrame.
categories = {
    'action', 'adventure', 'animation', 'biography',
    'crime', 'family', 'fantasy', 'film_noir',
    'history', 'horror', 'mystery', 'romance',
    'scifi', 'sports', 'thriller', 'war',
}

In [None]:
# Load every genre's CSV file into a global DataFrame named after the genre.
# The loop is the generic form of writing one line per genre by hand:
#     action = pd.read_csv('Data/action.csv')
#     adventure = pd.read_csv('Data/adventure.csv')
#     ...
#     war = pd.read_csv('Data/war.csv')
for category in categories:
    # globals() lets the variable name be taken from the string `category`.
    globals()[category] = pd.read_csv(f"Data/{category}.csv")

# Do not leave the loop variable behind in the notebook namespace.
del category

In [5]:
# globals()['action'].head(10)

# Data Preparation

In [4]:
def clean_and_split_columns(df: pd.DataFrame, splits: Iterable, make_sets: bool) -> None:
    """
    Cleans and splits specified columns in a DataFrame.

    For every column `col` in `splits`:
      1. a new column `col + 's'` is created that holds the comma-separated values
         of `col` as a list of stripped strings (`None` where `col` is null);
      2. for `director_ids` / `star_ids`, each list element is reduced from a name
         URL such as `'/name/nm0000001/'` to the bare id `'nm0000001'`;
      3. if `make_sets` is `True`, the values are added to a module-level set named
         `col + 's'` (e.g. `genres`);
      4. the original column `col` is dropped.

    Parameters:
        df: The DataFrame to process.
        splits: Names of the columns to split and clean.
        make_sets: If `True`, collects the unique values of each split column in a
            global set. The set is created on first use and reused afterwards, so
            values accumulate across calls (i.e. across several DataFrames).

    Returns:
        None: The function modifies the DataFrame (and the global sets) in place.
    """

    def split_multiples(df: pd.DataFrame, col: str) -> None:
        """
        Splits the values in a column by `,` and stores them in a new column, as a list.

        Parameters:
            df: The DataFrame to process.
            col: The column to split.

        Returns:
            None: The function modifies the DataFrame in place.

        Raises:
            KeyError: If `col` is not a column of `df`.
        """

        def to_list(value):
            # Null cells stay null (None) instead of becoming an empty list.
            if pd.notnull(value):
                return [part.strip() for part in value.split(',')]
            return None

        df[col + 's'] = df[col].apply(to_list)

    def extract_id(url: str) -> str:
        """
        Returns the second-to-last `/`-separated piece of `url`,
        i.e. `'/name/nm0000001/'` becomes `'nm0000001'`.

        A value that contains no `/` at all (e.g. an empty string left over from a
        trailing comma) is returned stripped instead of raising an IndexError.
        """

        pieces = url.split('/')
        if len(pieces) < 2:
            return url.strip()
        return pieces[-2].strip()

    def trim_name_urls(df: pd.DataFrame, col: str) -> None:
        """
        Trims name URLs to just the important part.
        This is used for director and star names.

        Parameters:
            df: The DataFrame to process.
            col: The (already split) column to trim.

        Returns:
            None: The function modifies the DataFrame in place.
        """

        def trim(cell):
            # Only non-empty lists are rewritten; None / empty cells pass through.
            if isinstance(cell, list) and len(cell) > 0:
                return [extract_id(url) for url in cell]
            return cell

        df[col] = df[col].apply(trim)

    def track_uniques(df: pd.DataFrame, col: str, unique_set: set) -> None:
        """
        Tracks unique values in a split column and stores them in a set.

        Parameters:
            df: The DataFrame to process.
            col: The original column name; values are read from `col + 's'`.
            unique_set: A set to store unique values.

        Returns:
            None: The function modifies the set in place.
        """

        for items in df[col + 's'].dropna():
            unique_set.update(items)

    ##########

    for col in splits:
        col_s = col + 's'

        try:
            # Split the column values by ',' and store in a new column
            split_multiples(df, col)
        except KeyError:
            # Handle the case where the column might not exist in the DataFrame (i.e. this function has already been performed)
            print(f"Column '{col}' not found in DataFrame.")

            if make_sets:
                # Make sure the set exists to avoid errors later on
                globals().setdefault(col_s, set())
            continue

        # Trim name URLs for director and star names if applicable
        if col_s in ('director_ids', 'star_ids'):
            trim_name_urls(df, col_s)

        if make_sets:
            # Reuse an existing set instead of replacing it: this function is called
            # once per DataFrame, and resetting the set on every call would leave
            # only the values of the last DataFrame processed.
            track_uniques(df, col, globals().setdefault(col_s, set()))

        df.drop(columns=[col], inplace=True) # Drop the original column to save space

In [5]:
def prepare(df: pd.DataFrame, drops: Iterable = None, make_sets: bool = True) -> None:
    """
    Performs all clean-up / preprocessing steps on the DataFrame.

    Steps, in order:
      1. rows in which every value is missing are removed;
      2. the columns named in `drops` are removed;
      3. the remaining comma-separated columns are split into lists by
         `clean_and_split_columns`.

    If any column in `drops` is missing, a message is printed and the function
    returns without performing step 3 (the DataFrame is assumed to have been
    preprocessed already).

    Parameters:
        df: The DataFrame to process.
        drops: Columns to drop from the DataFrame. May be any iterable of column
            names, a single column name, or `None` to drop nothing.
        make_sets: If `True`, creates a set of unique values for each column.

    Returns:
        None: The function modifies the DataFrame in place.
    """

    # Normalise `drops` to a set so that the set difference below works for
    # lists, tuples, etc., and so that the default (None) means "drop nothing".
    if drops is None:
        drop_set = set()
    elif isinstance(drops, str):
        drop_set = {drops} # A bare string is one column name, not a sequence of characters
    else:
        drop_set = set(drops)

    df.dropna(inplace=True, how='all') # Drop rows where all elements are None

    if drop_set:
        try:
            df.drop(columns=list(drop_set), inplace=True) # Drop specified columns
        except KeyError:
            # Handle the case where the columns might not exist in the DataFrame.
            # Only KeyError is caught, so unrelated errors are no longer hidden.
            print(f"Columns {drops} not found in DataFrame.\nHas preprocessing already been performed?")
            return

    # NOTE: splits is a set of columns to split by ','.
        # 'director' and 'star' columns are likely not useful,
        # as they are just text (meaningless to a model, likely),
        # so if they are included in drops, they will be ignored here.
    splits = {
        'genre',
        'director',
        'director_id',
        'star',
        'star_id',
    } - drop_set

    clean_and_split_columns(df, splits, make_sets)

In [6]:
# Columns removed from every genre DataFrame before the splitting step.
drops = {'description', 'director', 'star'}

for category in categories:
    # Each genre's DataFrame lives in a global variable named after the genre.
    prepare(globals()[category], drops=drops, make_sets=True)

# Remove the loop helpers from the notebook namespace.
del category, drops

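# NOTE: at this point, each category's DataFrame has had:
# - rows in which every value is missing removed,
# - the `description`, `director` and `star` columns dropped,
# - each remaining comma-separated column split into a list column
#   (named with an added 's', e.g. `genre` -> `genres`) and the original column removed.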

In [9]:
# NOTE: testing

# type(globals()['action']['genres'][0])
# print(globals()['genres'])