In [2]:
# Netflix titles cleaning script.
#
# Loads "netflix_titles.csv", fills or drops missing values, removes duplicate
# rows, tidies text columns, parses 'date_added' into a datetime (plus derived
# year/month columns) and writes the result to "cleaned_netflix.csv".

# 1. Import Libraries
import pandas as pd

# 2. Load Dataset
df = pd.read_csv("netflix_titles.csv")
print("Initial Shape:", df.shape)
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
df.info()

# 3. Clean Column Names
# Normalised before any column is accessed by name, so the lookups below do
# not depend on the header's original casing or spacing.
df.columns = (
    df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
)

# 4. Handle Missing Values
print("\nMissing Values Before Cleaning:\n", df.isnull().sum())
# Placeholder text for categorical columns where a missing value is tolerable.
placeholders = {
    'director': "Unknown",
    'cast': "Not Specified",
    'country': "Not Specified",
    'rating': "Unrated",
}
df = df.fillna(value=placeholders)
# Drop rows where 'date_added' is missing (very few)
df = df.dropna(subset=['date_added'])
print("\nMissing Values After Cleaning:\n", df.isnull().sum())

# 5. Remove Duplicates
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"\nDuplicates Removed: {before - after}")

# 6. Standardize Text Columns
# 'type' is only stripped: str.title() would rewrite "TV Show" as "Tv Show".
df['type'] = df['type'].str.strip()
df['country'] = df['country'].str.strip().str.title()
df['listed_in'] = df['listed_in'].str.strip()

# 7. Fix Date Formatting
# Strip first: values padded with whitespace do not match the format pandas
# infers from the other rows and would be silently coerced to NaT.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')
unparsed = int(df['date_added'].isna().sum())
if unparsed:
    # Surface coerced values instead of letting them pass unnoticed.
    print(f"\nWarning: {unparsed} 'date_added' values could not be parsed")
# Extract year and month for analysis. Nullable Int64 keeps them integral
# (no "2021.0" in the CSV) even if some dates are NaT.
df['year_added'] = df['date_added'].dt.year.astype('Int64')
df['month_added'] = df['date_added'].dt.month.astype('Int64')

# 8. Fix Data Types
df['release_year'] = df['release_year'].astype(int)

# 9. Save Cleaned Dataset
df.to_csv("cleaned_netflix.csv", index=False)
print("\nFinal Shape:", df.shape)
print("\nCleaned Dataset Saved as 'cleaned_netflix.csv'")


Initial Shape: (8807, 12)
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  