In [1]:
# Netflix Titles - Data Cleaning and Preprocessing

# Step 1: Import required libraries
import pandas as pd
import numpy as np


In [2]:

# Step 2: Load the dataset
df = pd.read_csv('netflix_titles.csv')

In [3]:
# Step 3: Initial overview
print("Initial Dataset Info:")
print(df.info())
print("\nMissing Values per Column:")
print(df.isnull().sum())

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None

Missing Values per Column:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration

In [4]:

# Step 4: Handle missing values
df['director'].fillna('Unknown', inplace=True)
df['cast'].fillna('Not Available', inplace=True)
df['country'].fillna('Unknown', inplace=True)
df['date_added'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('Not Available', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [5]:
# Number of duplicate rows
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")


Number of duplicate rows: 0


In [6]:

# Step 5: Remove duplicate rows
df.drop_duplicates(inplace=True)

In [7]:
# Step 6: Standardize text fields
df['type'] = df['type'].str.strip().str.title()
df['country'] = df['country'].str.strip().str.title()
df['rating'] = df['rating'].str.strip().str.upper()


In [8]:

# Step 7: Convert 'date_added' to datetime format (only where not "Unknown")
df['date_added'] = df['date_added'].replace('Unknown', np.nan)
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

In [9]:

# Step 8: Rename column headers to lowercase and replace spaces with underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [10]:

# Step 9: Fix data types
df['release_year'] = df['release_year'].astype(int)


In [11]:
df.dropna(inplace=True)


In [12]:
# Step 10: Final cleaned data info
print("\nCleaned Dataset Info:")
print(df.info())
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())


Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 8702 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8702 non-null   object        
 1   type          8702 non-null   object        
 2   title         8702 non-null   object        
 3   director      8702 non-null   object        
 4   cast          8702 non-null   object        
 5   country       8702 non-null   object        
 6   date_added    8702 non-null   datetime64[ns]
 7   release_year  8702 non-null   int64         
 8   rating        8702 non-null   object        
 9   duration      8702 non-null   object        
 10  listed_in     8702 non-null   object        
 11  description   8702 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 883.8+ KB
None

Missing Values After Cleaning:
show_id         0
type            0
title           0
director    

In [13]:

# Step 11: Save the cleaned dataset
df.to_csv('netflix_titles_cleaned.csv', index=False)

In [14]:
# Step 12: Summary of changes
summary = {
    "Rows in original dataset": df.shape[0],
    "Nulls handled": True,
    "Duplicates removed": True,
    "Text columns standardized": ["type", "country", "rating"],
    "Dates formatted": "date_added (datetime)",
    "Column names renamed": True,
    "Data types fixed": ["release_year", "date_added"],
}


In [15]:

print("\n Summary of Cleaning:")
for key, val in summary.items():
    print(f"- {key}: {val}")



 Summary of Cleaning:
- Rows in original dataset: 8702
- Nulls handled: True
- Duplicates removed: True
- Text columns standardized: ['type', 'country', 'rating']
- Dates formatted: date_added (datetime)
- Column names renamed: True
- Data types fixed: ['release_year', 'date_added']
