# **Dataset Load**

In [2]:
import pandas as pd

# Load the cleaned Netflix catalogue into `df` and preview the first rows.
# NOTE(review): the path is absolute and environment-specific (it looks like a
# Colab upload, and the "(4)" suggests a repeated download) — confirm the file
# exists at this location before running elsewhere.
df = pd.read_csv("/content/cleaned_netflix_data (4).csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Not Available,United States,2021-09-25,2020.0,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021.0,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021.0,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown,Not Available,Unknown,2021-09-24,2021.0,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021.0,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# **FEATURE ENGINEERING**

**Content Age**


In [5]:
# Fixed reference year against which each title's age is measured.
current_year = 2024

# Age in years = reference year - release year (expressed as negate-then-add).
# Rows with a missing release_year stay missing.
df['content_age'] = -df['release_year'] + current_year


**Movie**


In [6]:
# Binary flag: 1 when `type` is exactly 'Movie', 0 for every other value.
# Missing values compare unequal to 'Movie', so they also become 0.
# A vectorized comparison is used instead of a row-by-row lambda.
df['is_movie'] = (df['type'] == 'Movie').astype('int64')


**Recent Content**


In [7]:
# Titles released in or after this year are flagged as recent.
RECENT_YEAR_THRESHOLD = 2018

# Binary flag: 1 when release_year >= threshold, otherwise 0.
# A missing release_year compares False and therefore yields 0.
# A vectorized comparison is used instead of a row-by-row lambda.
df['is_recent'] = (df['release_year'] >= RECENT_YEAR_THRESHOLD).astype('int64')

**Duration**

In [8]:
# Pull the leading number out of the free-text `duration` column as a float.
# The pattern is a raw string so that `\d` is not treated as an (invalid) string
# escape, which is what triggered the warning on the original line.
# `expand=False` makes `extract` return a Series for the single capture group.
# NOTE(review): the number is taken regardless of unit, so a TV show with
# "2 Seasons" gets 2.0 in this column even though it is named *_minutes —
# confirm that mixing minutes and season counts is intended.
df['duration_minutes'] = (
    df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
)

  df['duration_minutes'] = df['duration'].str.extract('(\d+)').astype(float)


**Rating Group**

In [9]:
# Ratings aimed at children (film and TV variants).
_KIDS_RATINGS = ('G', 'TV-Y', 'TV-Y7', 'TV-Y7-FV', 'TV-G')
# Ratings aimed at older children / teenagers (film and TV variants).
_TEEN_RATINGS = ('PG', 'PG-13', 'TV-PG', 'TV-14')


def rating_group(r):
    """Map a maturity rating to a coarse audience group.

    Parameters
    ----------
    r : object
        A rating label such as 'PG-13' or 'TV-MA'. Any value is accepted.

    Returns
    -------
    str
        'Kids' for child-oriented ratings, 'Teen' for parental-guidance /
        teen ratings, and 'Adult' for everything else. The fallback covers
        mature ratings (e.g. 'R', 'TV-MA') as well as unrecognized or
        missing values, matching the previous default.
    """
    # The TV equivalents (TV-Y7, TV-G, TV-PG, TV-14, ...) are listed explicitly;
    # previously they fell through to 'Adult'.
    if r in _KIDS_RATINGS:
        return 'Kids'
    if r in _TEEN_RATINGS:
        return 'Teen'
    return 'Adult'

# Derive the audience group for every row by passing each `rating` value
# through rating_group().
df['rating_group'] = df['rating'].apply(rating_group)

In [10]:
# Preview the frame to check the newly added feature columns.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,content_age,is_movie,is_recent,duration_minutes,rating_group
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Not Available,United States,2021-09-25,2020.0,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",4.0,1,1,90.0,Teen
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021.0,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",3.0,0,1,2.0,Adult
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021.0,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,3.0,0,1,1.0,Adult
3,s4,TV Show,Jailbirds New Orleans,Unknown,Not Available,Unknown,2021-09-24,2021.0,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",3.0,0,1,1.0,Adult
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021.0,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,3.0,0,1,2.0,Adult


# **ENCODING**

In [11]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of rating-group labels, then replace each label with its
# integer code. The fitted encoder stays available as `le`.
# NOTE(review): the codes follow the encoder's label ordering (the preview shows
# Adult -> 0, Teen -> 2), not an audience-age ordering — confirm this is
# acceptable for downstream use.
le = LabelEncoder()
le.fit(df['rating_group'])
df['rating_group_encoded'] = le.transform(df['rating_group'])

In [12]:
# Show each rating group next to its integer code to verify the encoding.
df[['rating_group','rating_group_encoded']].head()

Unnamed: 0,rating_group,rating_group_encoded
0,Teen,2
1,Adult,0
2,Adult,0
3,Adult,0
4,Adult,0


# **SCALING**

In [15]:
from sklearn.preprocessing import StandardScaler

# Fit a standard scaler on content_age, then store the standardized values in a
# new column. The fitted scaler stays available as `scaler`.
scaler = StandardScaler()
scaler.fit(df[['content_age']])
df['content_age_scaled'] = scaler.transform(df[['content_age']])


In [16]:
# Show raw and scaled content age side by side for the first rows.
df[['content_age','content_age_scaled']].head()

Unnamed: 0,content_age,content_age_scaled
0,4.0,-0.975877
1,3.0,-1.240022
2,3.0,-1.240022
3,3.0,-1.240022
4,3.0,-1.240022


# **BEFORE vs AFTER**

In [17]:
# Summary statistics before and after scaling; the scaled column should show a
# mean near 0 and a standard deviation near 1.
df[['content_age','content_age_scaled']].describe()

Unnamed: 0,content_age,content_age_scaled
count,8088.0,8088.0
mean,7.694486,-2.811247e-17
std,3.786044,1.000062
min,3.0,-1.240022
25%,5.0,-0.711733
50%,7.0,-0.1834444
75%,9.0,0.3448442
max,20.0,3.250431


# **SAVE CSV**

In [18]:
# Write the feature-engineered frame to a CSV in the current working directory.
# index=False keeps the row index out of the file.
df.to_csv("netflix_week4_feature_engineered.csv", index=False)