# Cleaning

In [1]:
import pandas as pd
import numpy as np

# Load the raw titles file; its fields are separated by semicolons.
netflix_df_preprocessed = pd.read_csv(
    '../data/raw/netflix_titles.csv',
    sep=';',
)

### Release Year

In [2]:
# Parse 'release_year' as numbers; anything that cannot be parsed becomes NaN.
release_year_numeric = pd.to_numeric(
    netflix_df_preprocessed['release_year'], errors='coerce')

# Keep only the rows with a usable year and store the column as int64.
netflix_df_preprocessed = (
    netflix_df_preprocessed
    .assign(release_year=release_year_numeric)
    .dropna(subset=['release_year'])
    .astype({'release_year': 'int64'})
)

### Rating

In [3]:
# Discard every row whose 'rating', read as text, contains 'min'.
rating_text = netflix_df_preprocessed['rating'].astype(str)
rating_has_min = rating_text.str.contains('min')
netflix_df_preprocessed = netflix_df_preprocessed[~rating_has_min]

In [4]:
# Keep only the rows that have a 'rating' value.
rating_present = netflix_df_preprocessed['rating'].notna()
netflix_df_preprocessed = netflix_df_preprocessed[rating_present]

### Date Added

In [5]:
# Impute missing 'date_added' with the most frequent 'date_added' among the
# titles that share the same 'release_year'.
missing_date_mask = netflix_df_preprocessed['date_added'].isna()

# Only the release years that actually have a missing 'date_added' need work.
for year in netflix_df_preprocessed.loc[missing_date_mask, 'release_year'].unique():
    same_year_mask = netflix_df_preprocessed['release_year'] == year
    year_modes = netflix_df_preprocessed.loc[same_year_mask, 'date_added'].mode()
    if year_modes.empty:
        # Every title of this year lacks 'date_added', so there is no mode to
        # borrow; leave those rows missing instead of failing on an empty result.
        continue
    # mode() returns its values sorted, so the first entry is a deterministic pick
    # when several dates tie. Only the rows that were missing are written.
    netflix_df_preprocessed.loc[same_year_mask & missing_date_mask, 'date_added'] = year_modes.iloc[0]

In [6]:
# Convert the 'date_added' column to pandas datetimes.
date_added_parsed = pd.to_datetime(netflix_df_preprocessed['date_added'])
netflix_df_preprocessed['date_added'] = date_added_parsed

### Country

In [7]:
# Impute missing 'country' from the director: a title without a country gets
# the most frequent country among the other titles by the same 'director'.
country_missing = netflix_df_preprocessed['country'].isna()

# Directors that have at least one title with a known country. Computed once,
# because filling one director's rows never changes another director's rows.
# Missing directors are dropped so that they can never be matched.
directors_with_country = set(
    netflix_df_preprocessed.loc[~country_missing, 'director'].dropna())

for director in netflix_df_preprocessed.loc[country_missing, 'director'].dropna().unique():
    if director not in directors_with_country:
        # No title by this director has a country to borrow from.
        continue
    by_director = netflix_df_preprocessed['director'] == director
    # The membership check above guarantees a non-empty mode; ties resolve to
    # the first value in mode()'s sorted output.
    top_country = netflix_df_preprocessed.loc[by_director, 'country'].mode().iloc[0]
    # Fill only the rows of this director whose country was missing.
    netflix_df_preprocessed.loc[by_director & country_missing, 'country'] = top_country

In [8]:
# Impute missing 'country' from the cast: a title without a country gets the
# most frequent country among the other titles with the exact same 'cast' value.
country_missing = netflix_df_preprocessed['country'].isna()

# Cast values that appear on at least one title with a known country. Computed
# once, because filling one cast's rows never changes another cast's rows.
# Missing cast values are dropped so that they can never be matched.
casts_with_country = set(
    netflix_df_preprocessed.loc[~country_missing, 'cast'].dropna())

for cast in netflix_df_preprocessed.loc[country_missing, 'cast'].dropna().unique():
    if cast not in casts_with_country:
        # No title with this cast has a country to borrow from.
        continue
    by_cast = netflix_df_preprocessed['cast'] == cast
    # The membership check above guarantees a non-empty mode; ties resolve to
    # the first value in mode()'s sorted output.
    top_country = netflix_df_preprocessed.loc[by_cast, 'country'].mode().iloc[0]
    # Fill only the rows with this cast whose country was missing.
    netflix_df_preprocessed.loc[by_cast & country_missing, 'country'] = top_country

In [9]:
# Label the countries that are still missing. The result is assigned back to the
# column: calling fillna(..., inplace=True) on a column selection is chained
# assignment, which warns on recent pandas and leaves the frame unchanged under
# Copy-on-Write.
netflix_df_preprocessed['country'] = netflix_df_preprocessed['country'].fillna('Unknown Country')

### Cast & Director

In [10]:
# Replace missing 'cast' / 'director' values with explicit placeholders. The
# results are assigned back to the columns: an in-place replace on a column
# selection is chained assignment, which warns on recent pandas and leaves the
# frame unchanged under Copy-on-Write.
netflix_df_preprocessed['cast'] = netflix_df_preprocessed['cast'].fillna('Unknown Actors')
netflix_df_preprocessed['director'] = netflix_df_preprocessed['director'].fillna('Unknown Director')

# From here on the column is called 'actors'.
netflix_df_preprocessed = netflix_df_preprocessed.rename(columns={'cast': 'actors'})

### Duration

In [11]:
# Remove every ' min' occurrence from 'duration'; values without it are unchanged.
duration_text = netflix_df_preprocessed['duration'].str
netflix_df_preprocessed['duration'] = duration_text.replace(' min', '')

# Overall

In [12]:
# Count the missing values left in each column after the cleaning steps above.
netflix_df_preprocessed.isna().sum()

show_id         0
type            0
title           0
director        0
actors          0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [13]:
# Show the dtype of each column of the cleaned frame.
netflix_df_preprocessed.dtypes

show_id                 object
type                    object
title                   object
director                object
actors                  object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

In [14]:
# Preview the first rows of the cleaned frame.
netflix_df_preprocessed.head()

Unnamed: 0,show_id,type,title,director,actors,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown Actors,United States,2021-09-25,2020,PG-13,90,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown Director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",France,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown Director,Unknown Actors,Unknown Country,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown Director,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Save to Processed Data

In [15]:
# Write the cleaned frame as a semicolon-separated CSV, with a header row and
# without the index column.
netflix_df_preprocessed.to_csv(
    '../data/processed/processed_netflix.csv',
    sep=';',
    index=False,
    header=True,
)