In [6]:
# Data manipulation
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the dataset from URL
url = "https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/netflix_titles.csv"
print("📥 Extracting data from URL...")
df = pd.read_csv(url)

# Preview the data
print(f"✅ Dataset loaded. Shape: {df.shape}")
df.head()

📥 Extracting data from URL...
✅ Dataset loaded. Shape: (6234, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [9]:
# Basic info
df.info()

# Null values
print("\n🧹 Missing Values:")
print(df.isnull().sum())

# Summary stats
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB

🧹 Missing Values:
show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
de

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
count,6234.0,6234,6234,4265,5664,5758,6223,6234.0,6224,6234,6234,6234
unique,,2,6172,3301,5469,554,1524,,14,201,461,6226
top,,Movie,The Silence,"Raúl Campos, Jan Suter",David Attenborough,United States,"January 1, 2020",,TV-MA,1 Season,Documentaries,A surly septuagenarian gets another chance at ...
freq,,4265,3,18,18,2032,122,,2027,1321,299,3
mean,76703680.0,,,,,,,2013.35932,,,,
std,10942960.0,,,,,,,8.81162,,,,
min,247747.0,,,,,,,1925.0,,,,
25%,80035800.0,,,,,,,2013.0,,,,
50%,80163370.0,,,,,,,2016.0,,,,
75%,80244890.0,,,,,,,2018.0,,,,


In [10]:
# Drop rows missing key info
df.dropna(subset=['title', 'type'], inplace=True)

# Fill missing values in categorical columns
df['country'].fillna('Unknown', inplace=True)
df['rating'].fillna('Not Rated', inplace=True)
df['director'].fillna('No Info', inplace=True)

# Convert date_added to datetime
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

In [11]:
# Extract year and month from date_added
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month

# Extract duration details (duration_int and duration_type)
df[['duration_int', 'duration_type']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(df['duration_int'], errors='coerce')

In [12]:
# Encode 'type' and 'rating'
label_encoders = {}
for col in ['type', 'rating']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print("✅ Encoded categorical columns.")

✅ Encoded categorical columns.


In [13]:
# Scale 'year_added'
scaler = StandardScaler()
df['year_added_scaled'] = scaler.fit_transform(df[['year_added']].fillna(0))

print("✅ Scaled numerical features.")

✅ Scaled numerical features.


In [14]:
# Drop duplicates
df.drop_duplicates(inplace=True)

# Drop unnecessary columns (optional)
df.drop(columns=['show_id', 'description'], inplace=True)

# Preview cleaned data
df.head()

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,year_added,month_added,duration_int,duration_type,year_added_scaled
0,0,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09-09,2019,10,90 min,"Children & Family Movies, Comedies",2019.0,9.0,90,min,0.054042
1,0,Jandino: Whatever it Takes,No Info,Jandino Asporaat,United Kingdom,2016-09-09,2016,9,94 min,Stand-Up Comedy,2016.0,9.0,94,min,0.018624
2,1,Transformers Prime,No Info,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,2018-09-08,2013,13,1 Season,Kids' TV,2018.0,9.0,1,Season,0.042236
3,1,Transformers: Robots in Disguise,No Info,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,2018-09-08,2016,12,1 Season,Kids' TV,2018.0,9.0,1,Season,0.042236
4,0,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,2017-09-08,2017,7,99 min,Comedies,2017.0,9.0,99,min,0.03043


In [15]:
# Save the processed dataset to CSV
output_file = "processed_netflix_titles.csv"
df.to_csv(output_file, index=False)

print(f"💾 Processed data saved as '{output_file}'")

💾 Processed data saved as 'processed_netflix_titles.csv'
