**Assignment 2**

**1. Dataset Selection**

In [24]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("/content/netflix_titles.csv")

# Show first 5 rows
print(df.head())

# Check dataset info
print(df.info())

# Check shape (rows, columns)
print(df.shape)


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

**2.Data cleaning and preprocessing**

**2.1 Handling Missing Values**

In [25]:
# Check missing values
print(df.isnull().sum())

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [26]:
# Fill missing categorical values with "Unknown"
df.fillna("Unknown", inplace=True)

print(df.isnull().sum())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


**2.2 Fix Incorrect Data Types**

In [27]:
# Convert date_added to date format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

print(df.dtypes)


show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


**2.3 Remove Duplicate Records**

In [28]:
print("Before:", df.shape)

df = df.drop_duplicates()

print("After:", df.shape)


Before: (8807, 12)
After: (8807, 12)


**2.4 Handle Outliers**

In [29]:
print(df['release_year'].describe())


count    8807.000000
mean     2014.180198
std         8.819312
min      1925.000000
25%      2013.000000
50%      2017.000000
75%      2019.000000
max      2021.000000
Name: release_year, dtype: float64


**2.5 Drop Irrelevant Columns**

In [30]:
df.drop(columns=['show_id'], inplace=True)

print(df.columns)


Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


**3. Categorical Variable Handling**

In [31]:
from sklearn.preprocessing import LabelEncoder

**3.1 Label Encoding**

In [32]:
df = pd.read_csv("netflix_titles.csv")
df.fillna("Unknown", inplace=True)

le = LabelEncoder()

df['type_encoded'] = le.fit_transform(df['type'])

print(df[['type', 'type_encoded']].head())


      type  type_encoded
0    Movie             0
1  TV Show             1
2  TV Show             1
3  TV Show             1
4  TV Show             1


**3.2 One-Hot Encoding**

In [33]:
df_onehot = pd.get_dummies(df, columns=['rating'])

print(df_onehot.head())


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water          Unknown   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans          Unknown   
4      s5  TV Show           Kota Factory          Unknown   

                                                cast        country  \
0                                            Unknown  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...        Unknown   
3                                            Unknown        Unknown   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year   duration  \
0  September 25, 2021          2020     90 min   
1  September 24, 2021          2021  2 Seasons   
2  September 24, 2021          2021   1 Season   


**3.3 Ordinal Encoding**

In [34]:
rating_order = {
    'G': 1,
    'PG': 2,
    'PG-13': 3,
    'R': 4,
    'NC-17': 5
}

df['rating_ordinal'] = df['rating'].map(rating_order)

print(df[['rating', 'rating_ordinal']].head())


  rating  rating_ordinal
0  PG-13             3.0
1  TV-MA             NaN
2  TV-MA             NaN
3  TV-MA             NaN
4  TV-MA             NaN


**3.4 Frequency Encoding**

In [35]:
freq = df['country'].value_counts()

df['country_freq'] = df['country'].map(freq)

print(df[['country', 'country_freq']].head())


         country  country_freq
0  United States          2818
1   South Africa            30
2        Unknown           831
3        Unknown           831
4          India           972


**3.5 Target Encoding**

In [36]:
target_mean = df.groupby('country')['type_encoded'].mean()

df['country_target'] = df['country'].map(target_mean)

print(df[['country', 'country_target']].head())


         country  country_target
0  United States        0.269695
1   South Africa        0.200000
2        Unknown        0.470517
3        Unknown        0.470517
4          India        0.081276


**4. Feature Scaling **

In [37]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, MaxAbsScaler

df = pd.read_csv("netflix_titles.csv")


**4.1 Min-Max Scaling**

In [38]:
scaler = MinMaxScaler()
df[['release_year']] = scaler.fit_transform(df[['release_year']])

print(df[['release_year']].head())


   release_year
0      0.989583
1      1.000000
2      1.000000
3      1.000000
4      1.000000


**4.2 Z-score (Standardization)**

In [39]:
scaler = StandardScaler()
df[['release_year']] = scaler.fit_transform(df[['release_year']])

print(df[['release_year']].head())


   release_year
0      0.659930
1      0.773324
2      0.773324
3      0.773324
4      0.773324


**4.3 Max Absolute Scaling**

In [40]:
scaler = MaxAbsScaler()
df[['release_year']] = scaler.fit_transform(df[['release_year']])

print(df[['release_year']].head())


   release_year
0      0.065259
1      0.076472
2      0.076472
3      0.076472
4      0.076472


**4.4 Vector Normalization**

In [41]:
normalizer = Normalizer()
df[['release_year']] = normalizer.fit_transform(df[['release_year']])

print(df[['release_year']].head())


   release_year
0           1.0
1           1.0
2           1.0
3           1.0
4           1.0


**5.  Train/Test split**

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# The DataFrame `df` was reloaded in a previous cell, erasing previous transformations.
# Re-create 'type_encoded' column for the current df.
le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])

X = df[['release_year']]
y = df[['type_encoded']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)

(7045, 1) (1762, 1)
