In [3]:
import pandas as pd
import numpy as np

# Source and destination of the cleaning run. The input path is specific to
# one machine; change it here rather than inside the pipeline.
INPUT_CSV = "C:\\Users\\dodda\\OneDrive\\Desktop\\DA Internship\\netflix_titles.csv"
OUTPUT_CSV = "netflix_titles_cleaned.csv"


def _is_text(col):
    """Return True when *col* stores text (object or pandas string dtype)."""
    return pd.api.types.is_object_dtype(col.dtype) or isinstance(
        col.dtype, pd.StringDtype
    )


def _as_text(col):
    """Return *col* cast to strings while keeping missing values missing.

    A plain ``astype(str)`` turns NaN into the literal text "nan", which then
    leaks into the cleaned columns (and even matches letter-based regexes).
    Masking with the original null positions avoids that.
    """
    return col.astype(str).where(col.notna())


def clean_netflix_titles(raw):
    """Return a cleaned copy of the Netflix titles table.

    Args:
        raw: DataFrame as loaded from ``netflix_titles.csv``. It must contain
            the columns ``date_added``, ``release_year``, ``rating``,
            ``duration``, ``country``, ``listed_in`` and ``type`` (names are
            matched after normalisation to snake_case).

    Returns:
        A new DataFrame with normalised column names, exact duplicate rows
        removed, trimmed text, and the derived columns ``date_added_parsed``,
        ``added_year``, ``added_month``, ``duration_num`` and
        ``duration_unit``. Missing values stay missing instead of becoming
        the string "nan". ``raw`` itself is not modified.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    out = raw.copy()

    # 1. Standardize column names
    out.columns = [str(c).strip().lower().replace(" ", "_") for c in out.columns]

    # 2. Remove duplicates
    out = out.drop_duplicates()

    # 3. Trim whitespace; values that are blank after trimming become missing
    for name in out.columns:
        if not _is_text(out[name]):
            continue
        text = _as_text(out[name]).str.strip()
        out[name] = text.where(text.isna() | (text != ""))

    # 4. Parse date_added (unparseable dates become NaT)
    out["date_added_parsed"] = pd.to_datetime(out["date_added"], errors="coerce")
    out["added_year"] = out["date_added_parsed"].dt.year
    out["added_month"] = out["date_added_parsed"].dt.month

    # 5. Fix release_year (nullable integer so bad values can be <NA>)
    out["release_year"] = pd.to_numeric(
        out["release_year"], errors="coerce"
    ).astype("Int64")

    # 6. Clean rating: drop inner whitespace, e.g. "PG - 13" -> "PG-13"
    out["rating"] = _as_text(out["rating"]).str.replace(r"\s+", "", regex=True)

    # 7. Split duration into its number and its unit (min, season, seasons, ...)
    duration = _as_text(out["duration"])
    out["duration"] = duration
    out["duration_num"] = duration.str.extract(r"(\d+)", expand=False).astype(float)
    out["duration_unit"] = duration.str.extract(
        r"([A-Za-z]+)", expand=False
    ).str.lower()

    # 8. Normalize category columns: exactly one ", " between list items
    for name in ("country", "listed_in"):
        out[name] = _as_text(out[name]).str.replace(r"\s*,\s*", ", ", regex=True)
    out["type"] = _as_text(out["type"]).str.title()

    return out


# The guard is true when this runs as a notebook cell or a script, so `df`,
# the exported file and the message are produced exactly as before; it only
# keeps the file I/O from firing when the helpers are imported elsewhere.
if __name__ == "__main__":
    df = clean_netflix_titles(pd.read_csv(INPUT_CSV))

    # 9. Export cleaned dataset
    df.to_csv(OUTPUT_CSV, index=False)

    print(f"Cleaning complete! File saved as {OUTPUT_CSV}")


Cleaning complete! File saved as netflix_titles_cleaned.csv
