In [3]:
# Step 1: Import pandas and load the dataset
import pandas as pd

# Load the CSV file
df = pd.read_csv("netflix_titles_nov_2019.csv")

# Display the first few rows
df.head()


Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type
0,81193313,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...",South Korea,"November 30, 2019",2019,TV-14,1 Season,"International TV Shows, Korean TV Shows, Roman...",Brought together by meaningful meals in the pa...,TV Show
1,81197050,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,,"November 30, 2019",2019,TV-G,67 min,"Documentaries, International Movies","From Sierra de las Minas to Esquipulas, explor...",Movie
2,81213894,The Zoya Factor,Abhishek Sharma,"Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, ...",India,"November 30, 2019",2019,TV-14,135 min,"Comedies, Dramas, International Movies",A goofy copywriter unwittingly convinces the I...,Movie
3,81082007,Atlantics,Mati Diop,"Mama Sane, Amadou Mbow, Ibrahima Traore, Nicol...","France, Senegal, Belgium","November 29, 2019",2019,TV-14,106 min,"Dramas, Independent Movies, International Movies","Arranged to marry a rich man, young Ada is cru...",Movie
4,80213643,Chip and Potato,,"Abigail Oliver, Andrea Libman, Briana Buckmast...","Canada, United Kingdom",,2019,TV-Y,2 Seasons,Kids' TV,"Lovable pug Chip starts kindergarten, makes ne...",TV Show


In [4]:
# Step 2: Identify and handle missing values

# Check for missing values
print("Missing values before handling:")
print(df.isnull().sum())

# Fill missing values with 'Unknown'
df.fillna('Unknown', inplace=True)

# Confirm missing values handled
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values before handling:
show_id            0
title              0
director        1901
cast             556
country          427
date_added       642
release_year       0
rating            10
duration           0
listed_in          0
description        0
type               0
dtype: int64

Missing values after handling:
show_id         0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
type            0
dtype: int64


In [5]:
# Step 3: Remove duplicate rows

# Check for duplicates
print("Number of duplicate rows:", df.duplicated().sum())

# Drop duplicate rows
df = df.drop_duplicates()

# Confirm duplicates removed
print("Duplicate rows after removal:", df.duplicated().sum())


Number of duplicate rows: 0
Duplicate rows after removal: 0


In [6]:
# Step 4: Standardize text values (e.g., countries)

# Convert to lowercase and strip whitespace
df['country'] = df['country'].str.lower().str.strip()

# Replace inconsistent names (example)
df['country'] = df['country'].replace({
    'united states': 'usa',
    'united kingdom': 'uk'
})

# View unique countries
print("Unique countries (after standardization):")
print(df['country'].unique())


Unique countries (after standardization):
['south korea' 'unknown' 'india' 'france, senegal, belgium'
 'canada, united kingdom' 'nigeria' 'france' 'south africa'
 'united states, spain, colombia, mexico' 'usa' 'japan' 'brazil'
 'germany, canada, united states' 'canada, norway' 'poland' 'mexico'
 'china' 'united states, united arab emirates' 'united states, japan'
 'spain' 'singapore, united states' 'united states, canada'
 'united kingdom, germany, canada, united states' 'australia, france'
 'germany' 'germany, united kingdom' 'italy, united states'
 'united states, new zealand, united kingdom'
 'united kingdom, germany, united states' 'united states, germany'
 'argentina' 'taiwan' 'united kingdom, united states'
 'finland, germany, belgium' 'united kingdom, france'
 'united states, spain, chile, peru' 'colombia' 'united states, ireland'
 'india, united states' 'thailand' 'turkey'
 'argentina, united states, mexico' 'uk'
 'united states, united kingdom, canada' 'china, hong kong'
 'can

In [7]:
# Step 5: Convert date formats to a consistent type

# Convert 'date_added' to datetime format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# Check for conversion issues
print("Null values in 'date_added' after conversion:", df['date_added'].isnull().sum())


Null values in 'date_added' after conversion: 642


In [8]:
# Step 6: Rename column headers (lowercase and underscores)

df.columns = df.columns.str.lower().str.replace(' ', '_')

# Confirm column names
print("Cleaned column headers:")
print(df.columns)


Cleaned column headers:
Index(['show_id', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'type'],
      dtype='object')


In [9]:
# Step 7: Check and fix data types

# View data types
print("Data types before conversion:")
print(df.dtypes)

# Convert 'release_year' to integer
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')

# View data types again
print("Data types after conversion:")
print(df.dtypes)


Data types before conversion:
show_id                  int64
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
type                    object
dtype: object
Data types after conversion:
show_id                  int64
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
type                    object
dtype: object


In [10]:
# Final Step: Save the cleaned dataset to a new CSV

df.to_csv("cleaned_netflix_titles.csv", index=False)
print("Cleaned dataset saved as 'cleaned_netflix_titles.csv'")


Cleaned dataset saved as 'cleaned_netflix_titles.csv'
