In [2]:
import pandas as pd

# 1. Load the Dataset
# Read the raw CSV (path is relative to the notebook's working directory) and
# report its dimensions plus the per-column null counts before any cleaning.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')

# Each print emits the banner and the value on separate lines via sep="\n".
print("--- Initial Shape ---", df.shape, sep="\n")
print("\n--- Initial Missing Values ---", df.isna().sum(), sep="\n")


--- Initial Shape ---
(8807, 12)

--- Initial Missing Values ---
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [3]:
# 2. Handling Missing Values
# Strategy:
# - 'director', 'cast', 'country': too valuable to drop, so fill with a placeholder label.
# - 'date_added': essential for time analysis, so drop the (very few) rows missing it.
# - 'duration': first recover values that were entered in 'rating' by mistake,
#   then fall back to the '0 min' placeholder for anything still missing.
# - 'rating': fill with the most frequent value (mode).

# Recover misplaced durations BEFORE any filling.
# NOTE(review): copies of this dataset are reported to contain a few rows where the
# runtime (e.g. "74 min") sits in 'rating' and 'duration' is empty -- confirm against
# the raw file. The check is data-driven, so it is a no-op when no such rows exist.
rating_is_duration = df['rating'].str.fullmatch(r'\d+\s*(?:min|Seasons?)', na=False)
misplaced_duration = df['duration'].isna() & rating_is_duration
df.loc[misplaced_duration, 'duration'] = df.loc[misplaced_duration, 'rating']
# Blank the bogus rating so it neither survives cleaning nor skews the mode below.
df.loc[misplaced_duration, 'rating'] = None

# Drop rows where critical data (date_added) is missing.
# Reassignment instead of inplace=True keeps the step explicit and chainable.
df = df.dropna(subset=['date_added'])

# Per-column fill values; columns not listed here are left untouched.
fill_values = {
    'director': 'Unknown',
    'cast': 'Unknown',
    'country': 'Not Given',
    # Placeholder only: '0 min' is not a real runtime, so exclude it from
    # any numeric duration statistics downstream.
    'duration': '0 min',
}

# The mode is computed after the date_added drop, on the rows that are kept.
# Guard against an all-null column, where mode() returns an empty Series.
rating_mode = df['rating'].mode()
if not rating_mode.empty:
    fill_values['rating'] = rating_mode.iloc[0]

# DataFrame.fillna with a dict returns a new frame with only these columns filled.
df = df.fillna(fill_values)

In [4]:
# 3. Handling Duplicates
# 'show_id' is treated as the unique identifier: when the same id appears more
# than once, only its first occurrence is kept and the frame is updated in place.
df.drop_duplicates(subset=['show_id'], keep='first', inplace=True)


In [5]:
# 4. Standardization & Formatting
# 'date_added' arrives as text (typically like "September 25, 2021").
# Trim stray surrounding whitespace first, then parse the column to datetime.
# NOTE: this relies on the column still being text, so run it once per fresh load.
df['date_added'] = df['date_added'].str.strip().pipe(pd.to_datetime)

In [6]:
# 5. Clean Column Headers
# Normalise every header to lowercase with underscores in place of spaces.
df.columns = [header.lower().replace(' ', '_') for header in df.columns]

In [7]:
# 6. Verify Data Types
# Show the dtype of every column so the datetime conversion can be confirmed by eye.
print("\n--- Post-Cleaning Data Types ---", df.dtypes, sep="\n")


--- Post-Cleaning Data Types ---
show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


In [8]:
# 7. Final Check
# Report the cleaned frame's dimensions and the remaining null count per column.
print("\n--- Final Shape ---", df.shape, sep="\n")
print("\n--- Final Missing Values ---", df.isna().sum(), sep="\n")


--- Final Shape ---
(8797, 12)

--- Final Missing Values ---
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [9]:
# 8. Export Cleaned Data
# Write the cleaned frame to CSV without the index column, then confirm on stdout.
df.to_csv(path_or_buf='netflix_cleaned.csv', index=False)
print("\nSuccess! Cleaned dataset saved as 'netflix_cleaned.csv'.")


Success! Cleaned dataset saved as 'netflix_cleaned.csv'.
