In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw dataset from the working directory into the notebook's frame.
df = pd.read_csv(filepath_or_buffer='netflix.csv')


In [3]:
# First look at the freshly loaded frame.
print(df.head())          # first 5 rows
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line to the output.
df.info()                 # column dtypes and non-null counts
print(df.describe())      # summary statistics for numerical columns
print(df.isnull().sum())  # number of missing values per column


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [4]:
# Remove only rows and columns that carry no information at all.
# A blanket dropna() would throw away every title that has any single
# missing field (e.g. no director), which would leave nothing for the
# fillna() steps further down to fill and would make a following
# column-wise dropna a no-op.
df = df.dropna(how='all')          # rows in which every field is missing
df = df.dropna(axis=1, how='all')  # columns in which every value is missing


In [5]:
# Missing credits are labelled with a placeholder instead of staying NaN.
for credit_column in ('director', 'cast'):
    df[credit_column] = df[credit_column].fillna('Unknown')


In [6]:
# Keep every known country; substitute a placeholder where it is missing.
country = df['country']
df['country'] = country.where(country.notna(), 'Not Specified')


In [7]:
# Parse `date_added` into datetimes; anything unparseable becomes NaT.
date_added = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Stray leading/trailing whitespace can stop a value from matching the
    # date format pandas infers, and errors='coerce' would then silently turn
    # an otherwise valid date into NaT. Strip it before parsing. The dtype
    # guard keeps the cell safe to re-run once the column is already parsed.
    date_added = date_added.str.strip()
df['date_added'] = pd.to_datetime(date_added, errors='coerce')


In [8]:
# Missing ratings take the most frequent rating in the column;
# missing durations get a placeholder label.
most_common_rating = df['rating'].mode()[0]
df['rating'] = df['rating'].fillna(most_common_rating)

duration = df['duration']
df['duration'] = duration.fillna('Unknown')


In [9]:
# Normalise column labels: trim surrounding whitespace, lower-case them,
# and replace spaces with underscores.
df.columns = df.columns.map(
    lambda label: label.strip().lower().replace(' ', '_')
)


In [11]:
# Runtime in minutes as a float column.
# Only values written as "<n> min" are captured. Values in other units, such
# as "2 Seasons", become NaN instead of having their season count stored as
# if it were a number of minutes.
df['duration_minutes'] = (
    df['duration']
    .str.extract(r'(\d+)\s*min', expand=False)  # expand=False -> a Series
    .astype(float)
)


In [12]:
# Keep only rows whose release year lies inside the 1.5 * IQR fences
# (both fences inclusive).
release_year = df['release_year']
q1, q3 = release_year.quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

df = df[release_year.between(lower_bound, upper_bound)]


In [13]:
# Discard rows that exactly repeat an earlier row; the first occurrence stays.
is_repeat = df.duplicated()
df = df.loc[~is_repeat]


In [14]:
# Break the comma-separated `listed_in` text into a list of entries per row.
listed_in = df['listed_in']
df['categories'] = listed_in.str.split(pat=', ')


In [15]:
# Expose the year and month of `date_added` as their own columns.
added_on = df['date_added'].dt
df['year_added'] = added_on.year
df['month_added'] = added_on.month


In [16]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='cleaned_netflix_data.csv', index=False)
