In [None]:
# STEP 1: Upload CSV (Google Colab only)
# `google.colab` is imported here, so this cell is expected to run inside a
# Colab runtime. The following cell reads 'netflix_titles.csv' by relative
# path, so the uploaded file should carry exactly that name.
from google.colab import files
# NOTE(review): per the Colab docs, upload() opens a file picker and returns
# the uploaded files keyed by filename — confirm. `uploaded` is not referenced
# again in the visible cells.
uploaded = files.upload()

In [None]:
import pandas as pd

# Text columns whose missing entries are replaced with the label 'Unknown'.
TEXT_FILL_COLUMNS = ('director', 'cast', 'country', 'rating')
# Date assigned to rows that have no `date_added` value at all.
MISSING_DATE_PLACEHOLDER = '01-Jan-2000'


def clean_titles(raw):
    """Return a cleaned copy of the raw Netflix titles table.

    Steps, in order:
      1. Normalize column names (lower-case, spaces -> underscores) *before*
         any column is looked up, so the lookups below work regardless of
         the header's casing.
      2. Replace missing director/cast/country/rating values with 'Unknown'.
      3. Parse `date_added` to datetime; rows with no date get the
         placeholder date, unparseable text becomes NaT.
      4. Trim `rating` (also upper-cased) and `country`.
      5. Cast `release_year` to int.

    Args:
        raw: DataFrame containing at least the columns director, cast,
            country, rating, date_added and release_year (any casing).
            It is not modified.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: if one of the required columns is absent.
        ValueError: if `release_year` contains values that cannot be cast
            to int (e.g. NaN).
    """
    cleaned = raw.copy()
    cleaned.columns = cleaned.columns.str.lower().str.replace(' ', '_')

    # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
    # the in-place form acts on a temporary Series, which pandas deprecates
    # and which leaves the frame untouched under Copy-on-Write.
    for column in TEXT_FILL_COLUMNS:
        cleaned[column] = cleaned[column].fillna('Unknown')

    # Parse only the real date strings, then stamp the placeholder onto the
    # rows that were missing. Mixing the placeholder *string* into the column
    # before parsing breaks pandas' single-format inference and, combined with
    # errors='coerce', silently turns values into NaT. Stripping first keeps
    # entries with stray leading/trailing whitespace parseable.
    raw_dates = cleaned['date_added']
    missing_date = raw_dates.isna()
    date_text = raw_dates.where(missing_date, raw_dates.astype(str).str.strip())
    parsed_dates = pd.to_datetime(date_text, errors='coerce')
    cleaned['date_added'] = parsed_dates.mask(
        missing_date, pd.Timestamp(MISSING_DATE_PLACEHOLDER)
    )

    # Standardize text fields
    cleaned['rating'] = cleaned['rating'].astype(str).str.strip().str.upper()
    cleaned['country'] = cleaned['country'].astype(str).str.strip()

    # Convert to correct types
    cleaned['release_year'] = cleaned['release_year'].astype(int)
    return cleaned


# Always true when run as a notebook cell or script, so `df` is still defined
# at top level for any later cells; the guard only keeps the file I/O from
# running when this code is imported.
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv('netflix_titles.csv')
    print("✅ Loaded dataset with shape:", df.shape)

    # Check initial missing values
    print("\n🔍 Missing values before cleaning:")
    print(df.isnull().sum())

    # Drop duplicate rows
    df = df.drop_duplicates()
    print("\n✅ Duplicates removed. New shape:", df.shape)

    # Fill missing values, parse dates, standardize names/text/types
    df = clean_titles(df)

    # Display cleaned data sample
    print("\n📊 Sample of cleaned data:")
    print(df.head())

    # Save cleaned dataset
    df.to_csv('cleaned_netflix_titles.csv', index=False)
    print("\n✅ Cleaned dataset saved as 'cleaned_netflix_titles.csv'")