# Cleaning

In [None]:
import pandas as pd
import numpy as np

# Load the raw titles CSV into the working DataFrame that every later cell
# cleans. The path is relative, so it resolves against the notebook's
# current working directory.
disney_df_preprocessed = pd.read_csv('../data/raw/disney_plus_titles.csv')

### Date Added

In [None]:
# Impute missing 'date_added' values with the most frequent 'date_added'
# among titles that share the same 'release_year'.
# The null mask is taken once, before any row is filled; each year is handled
# in exactly one iteration, so the mask stays accurate for that year's rows.
missing_date_mask = disney_df_preprocessed['date_added'].isna()
years_to_fill = disney_df_preprocessed.loc[missing_date_mask, 'release_year'].unique()

for release_year in years_to_fill:
    same_year_mask = disney_df_preprocessed['release_year'] == release_year
    # mode() ignores nulls and returns its candidates sorted, so the first
    # entry is a deterministic pick when several dates tie.
    year_modes = disney_df_preprocessed.loc[same_year_mask, 'date_added'].mode()
    if year_modes.empty:
        # No title of this year has a known 'date_added' (or the year itself
        # is NaN, so nothing matched): there is nothing to impute from.
        # Leave the nulls in place instead of failing on an empty result.
        continue
    disney_df_preprocessed.loc[same_year_mask & missing_date_mask, 'date_added'] = year_modes.iat[0]

In [None]:
# Parse the 'date_added' column with pandas' datetime parser and store the
# parsed values back in the same column.
raw_date_added = disney_df_preprocessed['date_added']
disney_df_preprocessed['date_added'] = pd.to_datetime(raw_date_added)

### Rating

In [None]:
# Remove, in place, every row whose 'rating' is missing. The index is not
# reset, so surviving rows keep their original labels.
disney_df_preprocessed.dropna(
    axis=0,
    how='any',
    subset=['rating'],
    inplace=True,
)

### Country

In [None]:
# Impute missing 'country' values from the director: a title with no country
# receives the most frequent country among that director's other titles.
missing_country_mask = disney_df_preprocessed['country'].isna()

# Directors with at least one title whose country is known. Filling only ever
# touches directors already in this set, so it is safe to build it once here
# rather than recomputing it on every iteration.
directors_with_country = set(
    disney_df_preprocessed.loc[~missing_country_mask, 'director'].dropna()
)

# Rows without a director cannot be matched to anything, so they are skipped
# explicitly.
directors_to_fill = (
    disney_df_preprocessed.loc[missing_country_mask, 'director'].dropna().unique()
)

for director_name in directors_to_fill:
    if director_name not in directors_with_country:
        # No title by this director has a known country to borrow.
        continue
    same_director_mask = disney_df_preprocessed['director'] == director_name
    # mode() ignores nulls; the membership check above guarantees at least one
    # known country, so the result is never empty.
    top_country = disney_df_preprocessed.loc[same_director_mask, 'country'].mode().iat[0]
    disney_df_preprocessed.loc[same_director_mask & missing_country_mask, 'country'] = top_country

In [None]:
# Impute missing 'country' values from the cast: a title with no country
# receives the most frequent country among other titles whose 'cast' value is
# exactly the same.
missing_country_mask = disney_df_preprocessed['country'].isna()

# 'cast' values with at least one title whose country is known. Filling only
# ever touches values already in this set, so it is safe to build it once here
# rather than recomputing it on every iteration.
casts_with_country = set(
    disney_df_preprocessed.loc[~missing_country_mask, 'cast'].dropna()
)

# Rows without a cast cannot be matched to anything, so they are skipped
# explicitly.
casts_to_fill = (
    disney_df_preprocessed.loc[missing_country_mask, 'cast'].dropna().unique()
)

for cast_value in casts_to_fill:
    if cast_value not in casts_with_country:
        # No title with this cast has a known country to borrow.
        continue
    same_cast_mask = disney_df_preprocessed['cast'] == cast_value
    # mode() ignores nulls; the membership check above guarantees at least one
    # known country, so the result is never empty.
    top_country = disney_df_preprocessed.loc[same_cast_mask, 'country'].mode().iat[0]
    disney_df_preprocessed.loc[same_cast_mask & missing_country_mask, 'country'] = top_country

In [None]:
# Any country that is still missing gets an explicit placeholder.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning in pandas 2.x and, with Copy-on-Write enabled, only
# modifies a temporary object and leaves the DataFrame unchanged.
disney_df_preprocessed['country'] = disney_df_preprocessed['country'].fillna('Unknown Country')

### Cast & Director

In [None]:
# Replace missing cast / director entries with explicit placeholders.
# The results are assigned back to the columns instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning in pandas 2.x and, with Copy-on-Write enabled, only
# modifies a temporary object and leaves the DataFrame unchanged.
disney_df_preprocessed['cast'] = disney_df_preprocessed['cast'].fillna('Unknown Actors')
disney_df_preprocessed['director'] = disney_df_preprocessed['director'].fillna('Unknown Director')

# From this point on the column is called 'actors'.
disney_df_preprocessed = disney_df_preprocessed.rename(columns={'cast': 'actors'})

### Duration

In [None]:
# Remove the literal ' min' text from 'duration'. Values that do not contain
# it are left unchanged, and the column keeps holding strings.
# regex=False makes the literal match explicit; the default value of `regex`
# has differed between pandas versions.
disney_df_preprocessed['duration'] = disney_df_preprocessed['duration'].str.replace(' min', '', regex=False)

# Overall

In [None]:
# Count the missing values in each column to check the result of the cleaning above.
disney_df_preprocessed.isna().sum()

In [None]:
# Show the dtype of every column.
disney_df_preprocessed.dtypes

In [None]:
# Preview the first rows of the cleaned DataFrame (head() with no argument returns 5 rows).
disney_df_preprocessed.head()

# Save to Processed Data

In [None]:
# Write the cleaned DataFrame to the processed-data folder as a
# comma-separated file with a header row and without the index column.
# Comma separation and the header row are to_csv defaults.
disney_df_preprocessed.to_csv(
    '../data/processed/processed_disney.csv',
    index=False,
)