In [68]:
import pandas as pd
import numpy as np

In [69]:
netflixdf = pd.read_csv('netflix_titles.csv')

In [70]:
print(netflixdf.head())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [71]:
print(f'Dataset size: {netflixdf.shape}')

Dataset size: (8807, 12)


In [72]:
print(netflixdf.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None


In [73]:
missing_vals = netflixdf.isnull().sum()
missing_pct = (missing_vals / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Values': missing_vals,
    'Percentage %': missing_pct.round(2)
})

missing_df[missing_df['Missing Values'] > 0].sort_values(by='Missing Values', ascending=False)

Unnamed: 0,Missing Values,Percentage %
director,2634,49.4
country,831,15.59
cast,825,15.47
date_added,10,0.19
rating,4,0.08
duration,3,0.06


In [74]:
netflixdf['director'] = netflixdf.apply(
    lambda x: "Various Directors" if pd.isna(x['director']) and x['type'] == 'TV Show' else
             ("Unknown Director" if pd.isna(x['director']) else x['director']),
    axis=1
)

In [75]:
netflixdf['country'] = netflixdf['country'].fillna('Unknown')

In [76]:
netflixdf['cast'] = netflixdf['cast'].fillna('Not Listed')

In [77]:
netflixdf['rating'] = netflixdf['rating'].fillna('NR')

In [78]:
netflixdf['description'] = netflixdf['description'].fillna('No description available')

In [79]:
netflixdf['date_added'] = pd.to_datetime(netflixdf['date_added'].str.strip(),
                                            format='%B %d, %Y',
                                            errors='coerce')

In [80]:
duration_pattern = r'^\d+ min$'
incorrect_ratings = netflixdf[netflixdf['rating'].str.contains(duration_pattern, na=False)]

if len(incorrect_ratings) > 0:
    print(f"Found {len(incorrect_ratings)} records with duration values in rating field")
    for idx in incorrect_ratings.index:
        if pd.isna(netflixdf.loc[idx, 'duration']):
            netflixdf.loc[idx, 'duration'] = netflixdf.loc[idx, 'rating']
        netflixdf.loc[idx, 'rating'] = 'NR'

Found 3 records with duration values in rating field


In [81]:
print(netflixdf.head())

  show_id     type                  title           director  \
0      s1    Movie   Dick Johnson Is Dead    Kirsten Johnson   
1      s2  TV Show          Blood & Water  Various Directors   
2      s3  TV Show              Ganglands    Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans  Various Directors   
4      s5  TV Show           Kota Factory  Various Directors   

                                                cast        country  \
0                                         Not Listed  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...        Unknown   
3                                         Not Listed        Unknown   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

  date_added  release_year rating   duration  \
0 2021-09-25          2020  PG-13     90 min   
1 2021-09-24          2021  TV-MA  2 Seasons   
2 2021-09-24          2021  TV-MA   1 Season

In [82]:
# data quality in percentage

# Calculate the percentage of non-missing values for each column
data_quality_pct = (1 - netflixdf.isnull().sum() / len(netflixdf)) * 100

# Display the data quality percentage for each column
print(data_quality_pct)

# Calculate the overall data quality (average percentage across all columns)
overall_data_quality = data_quality_pct.mean()
print(f"\nOverall data quality: {overall_data_quality:.2f}%")


show_id         100.000000
type            100.000000
title           100.000000
director        100.000000
cast            100.000000
country         100.000000
date_added       99.886454
release_year    100.000000
rating          100.000000
duration        100.000000
listed_in       100.000000
description     100.000000
dtype: float64

Overall data quality: 99.99%
