# Data Cleaning and Preprocessing


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv(r"C:\Users\bollejayanthsriteja\OneDrive\Desktop\Data Cleaning and Pre-Processing\netflix_titles.csv.csv")

# 1. Handle missing values
df['country'] = df['country'].fillna(df['country'].mode()[0])  # Fill missing countries with the mode
df['date_added'] = df['date_added'].fillna('Unknown')  # Fill missing dates with 'Unknown'
df['rating'] = df['rating'].fillna('Not Rated')  # Fill missing ratings with 'Not Rated'

# For other columns like 'director' and 'cast', we can drop them if not critical for analysis
df = df.drop(columns=['director', 'cast'])

# 2. Remove duplicate rows
df = df.drop_duplicates()

# 3. Standardize text values
df['type'] = df['type'].map({'Movie': 0, 'TV Show': 1})
df['country'] = df['country'].str.strip().str.title()  # Normalize country names

# 4. Convert date formats to be consistent
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce', format='%B %d, %Y')

# 5. Rename column headers
df.columns = [col.lower().replace(" ", "_") for col in df.columns]

# 6. Check and fix data types
# Parse 'duration' to extract numerical values
def parse_duration(duration):
    if isinstance(duration, str) and "Season" in duration:
        return int(duration.split()[0])
    elif isinstance(duration, str) and "min" in duration:
        return int(duration.split()[0])
    return np.nan

df['duration_cleaned'] = df['duration'].apply(parse_duration)
df['duration_cleaned'] = df['duration_cleaned'].fillna(df['duration_cleaned'].median())

# Convert 'release_year' to numerical if not already
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')

# Scale numerical columns
scaler = StandardScaler()
df[['release_year', 'duration_cleaned']] = scaler.fit_transform(df[['release_year', 'duration_cleaned']])

# Summary of changes
print("Cleaned dataset shape:", df.shape)
print("Missing values summary:\n", df.isnull().sum())
print("Sample of cleaned data:\n", df.head())

# Save the cleaned dataset
df.to_csv("cleaned_netflix_titles.csv", index=False)

print("✅ Data cleaning complete!")

Cleaned dataset shape: (8807, 11)
Missing values summary:
 show_id              0
type                 0
title                0
country              0
date_added          98
release_year         0
rating               0
duration             3
listed_in            0
description          0
duration_cleaned     0
dtype: int64
Sample of cleaned data:
   show_id  type                  title        country date_added  \
0      s1     0   Dick Johnson Is Dead  United States 2021-09-25   
1      s2     1          Blood & Water   South Africa 2021-09-24   
2      s3     1              Ganglands  United States 2021-09-24   
3      s4     1  Jailbirds New Orleans  United States 2021-09-24   
4      s5     1           Kota Factory          India 2021-09-24   

   release_year rating   duration  \
0      0.659930  PG-13     90 min   
1      0.773324  TV-MA  2 Seasons   
2      0.773324  TV-MA   1 Season   
3      0.773324  TV-MA   1 Season   
4      0.773324  TV-MA  2 Seasons   

                  