# Cleaning

In [2]:
import pandas as pd
import numpy as np

# Read the raw titles CSV into the working DataFrame used by every cell below
disney_df_preprocessed = pd.read_csv(
    filepath_or_buffer='../data/raw/disney_plus_titles.csv',
)

### Date Added

In [3]:
# Fill each missing 'date_added' with the most frequent 'date_added' among the
# titles that share its 'release_year'.
missing_date_added = disney_df_preprocessed['date_added'].isnull()
# Only release years that actually contain a missing 'date_added' need a lookup
for release_year in disney_df_preprocessed.loc[missing_date_added, 'release_year'].unique():
    same_year = disney_df_preprocessed['release_year'] == release_year
    year_modes = disney_df_preprocessed.loc[same_year, 'date_added'].mode()
    # mode() ignores nulls, so it is empty when every 'date_added' of this year is
    # missing; indexing it would raise IndexError, so those rows are left null
    # instead of aborting the whole cell.
    if year_modes.empty:
        continue
    # mode() returns its values sorted, so a tie resolves to the first sorted value
    disney_df_preprocessed.loc[same_year & missing_date_added, 'date_added'] = year_modes.iloc[0]

In [4]:
# Parse the 'date_added' values into pandas datetimes
date_added_text = disney_df_preprocessed['date_added']
disney_df_preprocessed['date_added'] = pd.to_datetime(date_added_text)

### Rating

In [5]:
# Discard, in place, every title whose 'rating' is missing
disney_df_preprocessed.dropna(
    axis='index',
    how='any',
    subset=['rating'],
    inplace=True,
)

### Country

In [6]:
# Infer a missing 'country' from the other titles of the same 'director'
country_missing = disney_df_preprocessed['country'].isnull()
# Only directors with at least one title lacking a country need a lookup
for director in disney_df_preprocessed.loc[country_missing, 'director'].unique():
    by_director = disney_df_preprocessed['director'] == director
    known_countries = disney_df_preprocessed.loc[by_director, 'country'].dropna()
    # Nothing to infer from when this director has no title with a known country
    # (a missing director never matches any row, so it also ends up here)
    if known_countries.empty:
        continue
    # Most frequent known country; mode() sorts its result, so ties take the first value
    most_common_country = known_countries.mode().iloc[0]
    disney_df_preprocessed.loc[by_director & country_missing, 'country'] = most_common_country

In [7]:
# Infer a still-missing 'country' from the other titles with the identical 'cast' entry
country_is_null = disney_df_preprocessed['country'].isnull()
# Visit each distinct 'cast' value that appears on a title without a country
for cast_entry in disney_df_preprocessed.loc[country_is_null, 'cast'].unique():
    same_cast = disney_df_preprocessed['cast'] == cast_entry
    # Countries already known for this exact cast entry (empty when there are none,
    # which is also the case for a missing cast because it never matches any row)
    candidate_countries = disney_df_preprocessed.loc[same_cast & ~country_is_null, 'country']
    if len(candidate_countries) > 0:
        # mode() sorts its result, so a tie resolves to the first sorted country
        disney_df_preprocessed.loc[same_cast & country_is_null, 'country'] = candidate_countries.mode().iloc[0]

In [8]:
# Label every title whose 'country' is still missing with an explicit placeholder.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on the selected column: with pandas Copy-on-Write (the default from pandas 3.0)
# that chained in-place call only changes a temporary object and leaves the
# DataFrame untouched, and recent pandas 2.x releases already warn about it.
disney_df_preprocessed['country'] = disney_df_preprocessed['country'].fillna('Unknown Country')

### Cast & Director

In [9]:
# Replace missing 'cast' / 'director' values with explicit placeholders.
# Each result is assigned back to its column instead of calling an in-place method on
# the selected column: with pandas Copy-on-Write (the default from pandas 3.0) such a
# chained in-place call only changes a temporary object and leaves the DataFrame
# untouched. fillna is used so missing values are handled the same way as for 'country'.
disney_df_preprocessed['cast'] = disney_df_preprocessed['cast'].fillna('Unknown Actors')
disney_df_preprocessed['director'] = disney_df_preprocessed['director'].fillna('Unknown Director')
# Expose the 'cast' column under the name 'actors' from here on
disney_df_preprocessed = disney_df_preprocessed.rename(columns={'cast': 'actors'})

### Duration

In [10]:
# Remove the ' min' unit suffix from 'duration'; values without it stay unchanged
# and the column keeps holding strings
duration_text = disney_df_preprocessed['duration']
disney_df_preprocessed['duration'] = duration_text.str.replace(' min', '')

# Overall

In [11]:
# Count the missing values remaining in each column
disney_df_preprocessed.isnull().sum()

show_id         0
type            0
title           0
director        0
actors          0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [12]:
# Show the dtype of every column after cleaning
disney_df_preprocessed.dtypes

show_id                 object
type                    object
title                   object
director                object
actors                  object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

In [13]:
# Preview the first five cleaned rows
disney_df_preprocessed.head(n=5)

Unnamed: 0,show_id,type,title,director,actors,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",Unknown Country,2021-11-26,2016,TV-G,23,"Animation, Family",Join Mickey and the gang as they duck the halls!
1,s2,Movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",Unknown Country,2021-11-26,1988,PG,91,Comedy,Santa Claus passes his magic bag to a new St. ...
2,s3,Movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",United States,2021-11-26,2011,TV-G,23,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.
3,s4,Movie,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa...",United States,2021-11-26,2021,TV-PG,41,Musical,"This is real life, not just fantasy!"
5,s6,Movie,Becoming Cousteau,Liz Garbus,"Jacques Yves Cousteau, Vincent Cassel",United States,2021-11-24,2021,PG-13,94,"Biographical, Documentary",An inside look at the legendary life of advent...


# Save to Processed Data

In [14]:
# Write the cleaned frame to CSV, keeping the header row and omitting the row index
disney_df_preprocessed.to_csv(
    path_or_buf='../data/processed/processed_disney.csv',
    sep=',',
    header=True,
    index=False,
)