In [98]:
import pandas as pd
# Read the Netflix titles CSV (path is relative to the working directory)
# into `df`, then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [99]:
# Descriptive columns: a missing value is replaced by the placeholder
# 'Unknown' so the row is kept.
for column in ('director', 'cast', 'country'):
    df[column] = df[column].fillna('Unknown')

# Rows lacking 'date_added' or 'rating' are removed from `df` in place.
df.dropna(axis=0, how='any', subset=['date_added', 'rating'], inplace=True)

In [100]:
# Remove rows without a 'duration' (in place), then report how many nulls
# remain in each column.
df.dropna(axis=0, subset=['duration'], inplace=True)
remaining_nulls = df.isna().sum()
print("\nMissing values after handling duration:")
print(remaining_nulls)


Missing values after handling duration:
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [101]:
# Normalise the free-text columns so that equal values compare equal
# regardless of stray surrounding whitespace or letter case.
text_columns = ['type', 'title', 'director', 'cast', 'country', 'rating', 'duration', 'listed_in', 'description']
for col in text_columns:
    if df[col].dtype == 'object': # Only process object type columns
        # Strip leading/trailing spaces
        cleaned = df[col].str.strip()
        # Lowercase every column except 'title'. Titles keep their original
        # casing: str.title() capitalises the letter after every non-letter
        # and lowercases the rest, which corrupts real titles
        # (e.g. "Don't" -> "Don'T", "LA" -> "La").
        if col != 'title':
            cleaned = cleaned.str.lower()
        df[col] = cleaned

print("Unique values in 'type' after standardization:")
print(df['type'].unique())
print("\nSample rows from 'title' after standardization:")
print(df['title'].head())

Unique values in 'type' after standardization:
['movie' 'tv show']

Sample rows from 'title' after standardization:
0     Dick Johnson Is Dead
1            Blood & Water
2                Ganglands
3    Jailbirds New Orleans
4             Kota Factory
Name: title, dtype: object


In [102]:
# Each entry pairs the heading to print with a deferred preview of one column.
# The previews are lambdas so every column is read only when its turn comes,
# keeping the heading/preview output strictly in the order listed here.
column_previews = [
    # Sample of unique directors
    ("\nUnique values in 'director' after standardization:", lambda: df['director'].unique()[:10]),
    # Sample of cast entries
    ("\nSample rows from 'cast' after standardization:", lambda: df['cast'].head()),
    # Sample of unique countries
    ("\nUnique values in 'country' after standardization:", lambda: df['country'].unique()[:10]),
    # All unique ratings
    ("\nUnique values in 'rating' after standardization:", lambda: df['rating'].unique()),
    # Sample of unique durations
    ("\nUnique values in 'duration' after standardization:", lambda: df['duration'].unique()[:10]),
    # Sample of listed_in entries
    ("\nSample rows from 'listed_in' after standardization:", lambda: df['listed_in'].head()),
    # Sample of description entries
    ("\nSample rows from 'description' after standardization:", lambda: df['description'].head()),
]

for heading, make_preview in column_previews:
    print(heading)
    print(make_preview())


Unique values in 'director' after standardization:
['kirsten johnson' 'unknown' 'julien leclercq' 'mike flanagan'
 'robert cullen, josé luis ucha' 'haile gerima' 'andy devonshire'
 'theodore melfi' 'kongkiat komesiri' 'christian schwochow']

Sample rows from 'cast' after standardization:
0                                              unknown
1    ama qamata, khosi ngema, gail mabalane, thaban...
2    sami bouajila, tracy gotoas, samuel jouy, nabi...
3                                              unknown
4    mayur more, jitendra kumar, ranjan raj, alam k...
Name: cast, dtype: object

Unique values in 'country' after standardization:
['united states' 'south africa' 'unknown' 'india'
 'united states, ghana, burkina faso, united kingdom, germany, ethiopia'
 'united kingdom' 'germany, czech republic' 'mexico' 'turkey' 'australia']

Unique values in 'rating' after standardization:
['pg-13' 'tv-ma' 'pg' 'tv-14' 'tv-pg' 'tv-y' 'tv-y7' 'r' 'tv-g' 'g'
 'nc-17' 'nr' 'tv-y7-fv' 'ur']

Unique val

In [103]:
# Convert 'date_added' from text to datetime.
#
# 'date_added' is not one of the text columns that were whitespace-stripped
# earlier, so trim it here first: a stray leading/trailing space can keep an
# otherwise valid date from matching the parser's inferred format, and
# errors='coerce' would then silently turn it into NaT.
date_added_raw = df['date_added']
if date_added_raw.dtype == 'object':  # still text; skipped if this cell is re-run on datetimes
    date_added_raw = date_added_raw.str.strip()

date_added_parsed = pd.to_datetime(date_added_raw, errors='coerce')

# errors='coerce' hides parse failures, so surface how many non-null inputs
# ended up as NaT instead of losing them unnoticed.
unparsed_count = int((date_added_parsed.isna() & date_added_raw.notna()).sum())
if unparsed_count:
    print(f"Warning: {unparsed_count} 'date_added' value(s) could not be parsed and are now NaT.")

df['date_added'] = date_added_parsed
print("Data type of 'date_added' after conversion:", df['date_added'].dtype)
print("\nFirst few rows of the DataFrame with converted 'date_added':")
display(df[['date_added']].head())

Data type of 'date_added' after conversion: datetime64[ns]

First few rows of the DataFrame with converted 'date_added':


Unnamed: 0,date_added
0,2021-09-25
1,2021-09-24
2,2021-09-24
3,2021-09-24
4,2021-09-24


In [104]:
# Identity mapping: every listed column is mapped to its own current name,
# so the rename below leaves the column labels unchanged.
new_column_names = {
    name: name
    for name in (
        'show_id',
        'type',
        'title',
        'director',
        'cast',
        'country',
        'date_added',
        'release_year',
        'rating',
        'duration',
        'listed_in',
        'description',
    )
}

df.rename(columns=new_column_names, inplace=True)

print("Updated column names:")
print(list(df.columns))

Updated column names:
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description']


In [105]:
# Show the column dtypes before and after casting 'release_year' to int.
print("Current data types:")
print(df.dtypes)

# Cast 'release_year' to the built-in int type and store the result back.
release_year_as_int = df['release_year'].astype(int)
df['release_year'] = release_year_as_int

print("\nData types after conversion:")
print(df.dtypes)

Current data types:
show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

Data types after conversion:
show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


In [106]:
# Preview the first five rows of `df` after the cleaning steps above.
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,Dick Johnson Is Dead,kirsten johnson,unknown,united states,2021-09-25,2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm..."
1,s2,tv show,Blood & Water,unknown,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t..."
2,s3,tv show,Ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",unknown,2021-09-24,2021,tv-ma,1 season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...
3,s4,tv show,Jailbirds New Orleans,unknown,unknown,unknown,2021-09-24,2021,tv-ma,1 season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,Kota Factory,unknown,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...
