In [6]:
import numpy as np 
import pandas as pd

In [7]:
# Read the raw Netflix titles CSV into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine; adjust it
# when running the notebook elsewhere.
source_csv = r"C:/netflix_titles.csv"
df = pd.read_csv(source_csv)

In [8]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [10]:
# Normalise the column labels: trim surrounding whitespace, lowercase them,
# and use underscores in place of spaces.
trimmed_labels = df.columns.str.strip()
lowercase_labels = trimmed_labels.str.lower()
df.columns = lowercase_labels.str.replace(' ', '_')

In [11]:
# Count the missing entries in every column before any cleaning is applied;
# the summary cell reports these as the "before" figures.
null_values = df.isna().sum()

In [13]:
# Replace missing 'director' and 'cast' entries with the placeholder "Unknown"
for text_column in ('director', 'cast'):
    df[text_column] = df[text_column].fillna('Unknown')

In [14]:
# Discard every row that lacks a 'title', 'type', or 'country' value
required_columns = ['title', 'type', 'country']
df.dropna(subset=required_columns, inplace=True)

In [16]:
# Drop rows that are exact copies of an earlier row (the first occurrence stays)
df.drop_duplicates(keep='first', inplace=True)


In [17]:
# Standardizing text values: trim whitespace and apply consistent casing.
df['type'] = df['type'].str.strip().str.title()
df['country'] = df['country'].str.strip().str.title()

# astype(str) turns a missing rating into the literal text 'nan' (then 'NAN'),
# which hides it from isnull() and creates a bogus rating category.
# Normalise the text, then restore NaN wherever the rating was missing.
rating_was_present = df['rating'].notna()
df['rating'] = (
    df['rating'].astype(str).str.strip().str.upper().where(rating_was_present)
)


In [19]:
# Converting 'date_added' to datetime, then to a standardized DD-MM-YYYY string.
# Surrounding whitespace is stripped before parsing: with errors='coerce' a
# value that fails to parse silently becomes NaT, and stray padding can be
# enough to make a date fail when pandas applies one inferred format to the
# whole column.
# NOTE(review): assumes 'date_added' is still a text column here, as loaded
# from the CSV.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')
df['date_added'] = df['date_added'].dt.strftime('%d-%m-%Y')


In [20]:
# Cast 'release_year' to an integer dtype
release_year_as_int = df['release_year'].astype(int)
df['release_year'] = release_year_as_int

In [22]:
# Sorting by date_added in descending (newest first) order.
# 'date_added' holds 'DD-MM-YYYY' text at this point, so a plain sort would
# compare strings and order rows by day-of-month instead of chronologically.
# The key parses the text back into dates for the comparison only; the stored
# column values are left unchanged. Rows without a date sort last.
df.sort_values(
    by='date_added',
    ascending=False,
    key=lambda dates: pd.to_datetime(dates, format='%d-%m-%Y', errors='coerce'),
    inplace=True,
)


In [27]:
# Write the cleaned data to disk, leaving out the DataFrame index
df.to_csv(path_or_buf="netflix_titles_cleaned.csv", index=False)


In [25]:
# Collect a before/after overview of the cleaning performed so far
null_counts_after = df.isnull().sum()

summary = {}
summary["null_values_before"] = null_values.to_dict()
summary["null_values_after"] = null_counts_after.to_dict()
summary["rows_after_cleaning"] = len(df)
summary["columns"] = df.columns.tolist()