# Import libraries

In [1]:
import pandas as pd

# load the dataset

In [2]:
# Load the raw Netflix catalogue into a DataFrame.
# NOTE(review): the path looks like a Google Drive mount inside Colab —
# presumably Drive has to be mounted before this cell runs; confirm.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/data/netflix_titles.csv')

# Display the first few rows of the dataset

In [3]:
# Heading printed ahead of the raw-data preview.
print("Initial Dataset:")

Initial Dataset:


In [4]:
# Preview the top five raw rows (five is also head()'s default).
print(df.head(n=5))

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

# Step 1 Identify and handle missing values

In [None]:
# check for missing values

In [5]:
# Per-column count of missing entries; isna() is the canonical name of isnull().
missing_values = df.isna().sum()

In [6]:
# Heading (preceded by a blank line) for the per-column missing-value counts.
print("\nMissing Values in Each Column:")


Missing Values in Each Column:


In [7]:
# Show the per-column missing-value counts computed from df.isnull().sum().
print(missing_values)

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


# Handling missing values

In [None]:
# Fill missing values for 'director' and 'cast' with 'Unknown'

In [8]:
# Replace absent director names with the 'Unknown' placeholder.
df.loc[df['director'].isna(), 'director'] = 'Unknown'

In [9]:
# Replace absent cast lists with the 'Unknown' placeholder.
df.loc[df['cast'].isna(), 'cast'] = 'Unknown'

In [None]:
# fill missing values for 'country' with 'Not Specified'

In [38]:
# Replace absent countries with the 'Not Specified' placeholder.
df.loc[df['country'].isna(), 'country'] = 'Not Specified'

# Step 2 Remove Duplicate Rows

In [None]:
# check for duplicates

In [11]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
duplicates = df.duplicated(keep='first').sum()

In [12]:
# Report the duplicate count; print() joins the two arguments with a single space.
print("\nNumber of Duplicate Rows:", duplicates)


Number of Duplicate Rows: 0


In [None]:
# Remove duplicates

In [13]:
# Keep only rows that are not exact repeats of an earlier row.
df = df.loc[~df.duplicated()]

# Step 3 Standardize text values

In [None]:
# Standardizing 'rating' values to lowercase

In [14]:
# Normalise rating labels: trim surrounding whitespace, then lowercase.
# Missing ratings are left as NaN by the .str accessors.
df['rating'] = df['rating'].str.strip().str.lower()

# Step 4 Convert data formats

In [15]:
# convert 'date_added' column to datetime format

In [20]:
# Parse 'date_added' (e.g. "September 25, 2021") into datetime64.
# The strict format does not tolerate padding, so surrounding whitespace is
# stripped from the text first; missing entries stay missing and become NaT.
# The dtype guard keeps this cell safe to re-run once the column is converted.
date_added_values = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added_values):
    date_added_values = date_added_values.str.strip()
df['date_added'] = pd.to_datetime(date_added_values, format='%B %d, %Y')

# Step 5 Rename Column Headers

In [21]:
# Force every header to lowercase snake_case. The column names shown in the
# earlier missing-value output already follow that form, so this is expected
# to leave them unchanged here.
df.columns = [header.lower().replace(' ', '_') for header in df.columns]

# Step 6 check and fix data types

In [22]:
# Ensure 'release_year' is an integer

In [23]:
# Cast 'release_year' to the platform integer dtype; other columns are untouched.
df = df.astype({'release_year': int})

# Step 7: Handle inconsistent data formats

In [24]:
# Example: Standardizing 'type' values

In [25]:
# Normalise the title type labels: trim surrounding whitespace, then lowercase.
df['type'] = df['type'].str.strip().str.lower()

# Step 8: Check data quality

In [26]:
# Example: Check for valid ratings

In [27]:
# Ratings accepted by the quality check, written lowercase to match the
# normalised 'rating' column. The list covers the full MPA film scale and the
# full TV Parental Guidelines scale, plus the 'nr'/'ur' (not rated / unrated)
# labels, so that legitimately rated titles are not discarded by the filter.
valid_ratings = [
    # MPA film ratings
    'g', 'pg', 'pg-13', 'r', 'nc-17',
    # TV Parental Guidelines
    'tv-y', 'tv-y7', 'tv-y7-fv', 'tv-g', 'tv-pg', 'tv-14', 'tv-ma',
    # Explicitly unrated titles
    'nr', 'ur',
]

In [28]:
# Keep only rows whose rating is on the whitelist. Rows with a missing rating
# are dropped as well, because isin() reports False for NaN here.
df = df.loc[df['rating'].isin(valid_ratings)]

# Step 9: Document changes

In [29]:
# Checklist of the cleaning stages in this notebook; every stage is flagged True.
summary = dict.fromkeys(
    [
        "missing_values_handled",
        "duplicates_removed",
        "text_values_standardized",
        "date_format_converted",
        "column_headers_renamed",
        "data_types_fixed",
        "valid_ratings_checked",
    ],
    True,
)

In [30]:
# Heading (preceded by a blank line) for the checklist of cleaning stages.
print("\nSummary of Changes:")


Summary of Changes:


In [32]:
# Emit one "stage: flag" line per checklist entry, in insertion order.
for stage, flag in summary.items():
    print(stage, flag, sep=": ")

missing_values_handled: True
duplicates_removed: True
text_values_standardized: True
date_format_converted: True
column_headers_renamed: True
data_types_fixed: True
valid_ratings_checked: True


In [33]:
# Display the cleaned dataset

In [34]:
# Heading (preceded by a blank line) for the cleaned-data preview.
print("\nCleaned Dataset:")


Cleaned Dataset:


In [35]:
# Preview the top five cleaned rows (five is also head()'s default).
print(df.head(n=5))

  show_id     type                  title         director  \
0      s1    movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  tv show          Blood & Water          Unknown   
2      s3  tv show              Ganglands  Julien Leclercq   
3      s4  tv show  Jailbirds New Orleans          Unknown   
4      s5  tv show           Kota Factory          Unknown   

                                                cast        country  \
0                                            Unknown  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...  Not Specified   
3                                            Unknown  Not Specified   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

  date_added  release_year rating   duration  \
0 2021-09-25          2020  pg-13     90 min   
1 2021-09-24          2021  tv-ma  2 Seasons   
2 2021-09-24          2021  tv-ma   1 Season   
3 2021-0

In [36]:
# Save the cleaned dataset to a new CSV file

In [37]:
# Write the cleaned frame to the working directory, leaving out the row index.
df.to_csv(path_or_buf='cleaned_netflix_titles.csv', index=False)