In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
from collections import defaultdict

In [5]:

# Read the raw titles CSV into a DataFrame.
df = pd.read_csv('netflix_titles.csv')

# 1. Trim leading/trailing whitespace from every object-dtype (text) column,
#    one column at a time.
str_cols = df.select_dtypes(include='object').columns
for text_col in str_cols:
    df[text_col] = df[text_col].str.strip()

# 2. Parse `date_added` as datetimes; errors='coerce' turns values that
#    cannot be parsed into NaT instead of raising.
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# 3. Coerce both duration columns to integers, treating missing or
#    non-numeric entries as 0.
# NOTE(review): this expects `duration_minutes` and `duration_seasons` to
# already exist in the CSV — confirm against the file's header.
for duration_col in ('duration_minutes', 'duration_seasons'):
    numeric_duration = pd.to_numeric(df[duration_col], errors='coerce')
    df[duration_col] = numeric_duration.fillna(0).astype(int)

# 4. Store `release_year` as nullable Int64 so missing or non-numeric years
#    are kept as <NA> rather than being filled in.
release_year_numeric = pd.to_numeric(df['release_year'], errors='coerce')
df['release_year'] = release_year_numeric.astype('Int64')

# 5. Create unified duration column: minutes for rows whose `type` is
#    'Movie', season count for every other row (including rows whose `type`
#    is missing, since the equality test is False for them).
#    np.where performs the selection in one vectorized pass instead of
#    invoking a Python function once per row.
df['content_duration'] = np.where(
    df['type'] == 'Movie',
    df['duration_minutes'],
    df['duration_seasons'],
)

# 6. Split the parsed `date_added` values into separate calendar-year and
#    calendar-month columns via the datetime accessor.
added_parts = df['date_added'].dt
df['year_added'] = added_parts.year
df['month_added'] = added_parts.month

# 7. Keep only rows that have a show_id, and only the first row for each
#    distinct show_id. The non-inplace calls are chained so `df` is rebound
#    exactly once and the frame is never mutated in place.
df = (
    df.dropna(subset=['show_id'])
      .drop_duplicates(subset='show_id')
)

# 8. Save cleaned data to a new CSV (the row index is not written).
df.to_csv('netflix_titles_cleaned.csv', index=False)

print("✅ Data cleaned and saved as 'netflix_titles_cleaned.csv'")


✅ Data cleaned and saved as 'netflix_titles_cleaned.csv'
