<a href="https://colab.research.google.com/github/Codechickdev/ML_Projects/blob/main/Netflix_Movies_and_Shows_(_Best_Movies_).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Netflix Movies and Shows

## ToDo
- [x] Install Packages
- [x] Import Packages
- [x] Download and Load Dataset
- [x] Data Analysis
- [x] Model Training
- [x] Hyperparameter Tuning
- [x] Evaluation

## Install Packages

In [1]:
%%bash

pip install opendatasets xgboost --quiet

## Import Packages

In [2]:
import os
import numpy as np
import pandas as pd

import opendatasets as od

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('darkgrid')

import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

## Download and Load Dataset

In [3]:
# Kaggle page of the dataset; handed to od.download() when no local copy exists.
URL = 'https://www.kaggle.com/datasets/thedevastator/the-ultimate-netflix-tv-shows-and-movies-dataset'

In [4]:
# Fetch the dataset only when its folder is not already on disk.
if not os.path.exists('./the-ultimate-netflix-tv-shows-and-movies-dataset'):
    od.download(URL)
else:
    print("Dataset already Exists")

Downloading the-ultimate-netflix-tv-shows-and-movies-dataset.zip to ./the-ultimate-netflix-tv-shows-and-movies-dataset


100%|██████████| 1.81M/1.81M [00:00<00:00, 59.8MB/s]







In [5]:
# Location of the "Best Movies" CSV inside the downloaded dataset folder.
DATASET_PATH = os.path.join(
    './the-ultimate-netflix-tv-shows-and-movies-dataset',
    'Best Movies Netflix.csv',
)
DATASET_PATH

'./the-ultimate-netflix-tv-shows-and-movies-dataset/Best Movies Netflix.csv'

In [6]:
# Read the CSV, using its 'index' column as the row index, then preview
# the first five rows (the default for head()).
df = pd.read_csv(
    DATASET_PATH,
    index_col = 'index',
)
df.head()

Unnamed: 0_level_0,TITLE,RELEASE_YEAR,SCORE,NUMBER_OF_VOTES,DURATION,MAIN_GENRE,MAIN_PRODUCTION
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,David Attenborough: A Life on Our Planet,2020,9.0,31180,83,documentary,GB
1,Inception,2010,8.8,2268288,148,scifi,GB
2,Forrest Gump,1994,8.8,1994599,142,drama,US
3,Anbe Sivam,2003,8.7,20595,160,comedy,IN
4,Bo Burnham: Inside,2021,8.7,44074,87,comedy,US


## Data Analysis

In [7]:
# Snapshot of the freshly loaded frame, taken before later cells reassign `df`.
df_copy = df.copy()

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,RELEASE_YEAR,SCORE,NUMBER_OF_VOTES,DURATION
count,387.0,387.0,387.0,387.0
mean,2011.023256,7.509044,136520.6,123.395349
std,10.813874,0.441906,234211.4,28.371632
min,1954.0,6.9,10139.0,28.0
25%,2008.0,7.1,20512.5,103.5
50%,2014.0,7.4,45200.0,122.0
75%,2018.0,7.8,153485.5,139.0
max,2022.0,9.0,2268288.0,229.0


In [9]:
# Number of missing values in each column.
df.isna().sum()

TITLE              0
RELEASE_YEAR       0
SCORE              0
NUMBER_OF_VOTES    0
DURATION           0
MAIN_GENRE         0
MAIN_PRODUCTION    0
dtype: int64

In [10]:
# Correlate numeric columns only: `df` still holds string columns here, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older versions
# silently dropped those columns, so the plotted matrix is unchanged).
px.imshow(df.select_dtypes(include = np.number).corr(), title = 'Correlation Matrix', text_auto = True)

In [11]:
# Same heatmap with rows ordered by their correlation with SCORE, strongest first.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# string columns in pandas >= 2.0.
px.imshow(df.select_dtypes(include = np.number).corr().sort_values(by = 'SCORE', ascending = False), title = 'Correlation Matrix by Target Value', text_auto = True)

In [12]:
# Number of rows per release year, drawn as a bar chart.
px.bar(df['RELEASE_YEAR'].value_counts(), title = 'Bar Plot of Year')

In [13]:
# SCORE plotted against RELEASE_YEAR, one point per row.
px.scatter(x = df['RELEASE_YEAR'], y = df['SCORE'], title = 'Scatter plot of Score and Year')

In [14]:
# Histogram over RELEASE_YEAR weighted by SCORE; update_layout() returns the
# figure, so the bar gap is set and the figure shown in one chained call.
fig = px.histogram(
    x = df['RELEASE_YEAR'],
    y = df['SCORE'],
    title = 'Histogram Chart of Year and Score',
)
fig.update_layout(bargap = 0.2).show()

In [21]:
# Remove the TITLE column from the working frame. errors = 'ignore' keeps the
# cell idempotent: re-running it after TITLE is already gone no longer raises
# a KeyError.
df = df.drop(columns = 'TITLE', errors = 'ignore')

In [22]:
# Split the column names by dtype: object (string) columns vs. numeric columns.
categoricalList = df.select_dtypes(include = 'object').columns.tolist()
numericList = df.select_dtypes(include = 'number').columns.tolist()

In [23]:
# Display the names of the object-dtype columns.
categoricalList

['MAIN_GENRE', 'MAIN_PRODUCTION']

In [28]:
# Display the names of the numeric columns.
numericList

['RELEASE_YEAR', 'SCORE', 'NUMBER_OF_VOTES', 'DURATION']

In [36]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode categorical columns of a DataFrame.

    The categories are learned in ``fit`` and re-used by ``transform``, so
    the same fitted instance can encode other frames consistently.

    Parameters
    ----------
    columns : list of str, optional
        Columns to encode. When omitted, the notebook-level
        ``categoricalList`` is used (looked up at fit time).

    Attributes
    ----------
    columns_ : list of str
        The columns that were actually encoded.
    encoder_ : OneHotEncoder
        The fitted scikit-learn encoder.
    """

    def __init__(self, columns = None):
        self.columns = columns

    def fit(self, X, y = None):
        """Learn the categories of the selected columns of ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        self.columns_ = list(categoricalList if self.columns is None else self.columns)
        # Keep the encoder's default sparse output and densify in transform():
        # this sidesteps the `sparse` -> `sparse_output` keyword rename
        # between scikit-learn releases.
        self.encoder_ = OneHotEncoder(handle_unknown = 'ignore')
        self.encoder_.fit(X[self.columns_])
        return self

    def transform(self, X):
        """Add one indicator column per learned category to ``X``.

        The original categorical columns are kept. ``X`` is modified in
        place and also returned, so existing cells that keep reading the
        frame through its original variable still see the new columns.
        Categories not seen during ``fit`` encode as all zeros.
        """
        # `get_feature_names` was replaced by `get_feature_names_out`;
        # support whichever the installed scikit-learn provides.
        if hasattr(self.encoder_, 'get_feature_names_out'):
            encoded_cols = list(self.encoder_.get_feature_names_out(self.columns_))
        else:
            encoded_cols = list(self.encoder_.get_feature_names(self.columns_))
        X[encoded_cols] = self.encoder_.transform(X[self.columns_]).toarray()
        return X

In [37]:
class Scaler(BaseEstimator, TransformerMixin):
    """Min-max scale numeric columns of a DataFrame to the [0, 1] range.

    The per-column minimum and maximum are learned in ``fit`` and re-used by
    ``transform``, so the same fitted instance can scale other frames with
    identical parameters.

    Parameters
    ----------
    columns : list of str, optional
        Columns to scale. When omitted, the notebook-level ``numericList``
        is used (looked up at fit time).

    Attributes
    ----------
    columns_ : list of str
        The columns that were actually scaled.
    scaler_ : MinMaxScaler
        The fitted scikit-learn scaler.
    """

    def __init__(self, columns = None):
        self.columns = columns

    def fit(self, X, y = None):
        """Learn the min/max of the selected columns of ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        self.columns_ = list(numericList if self.columns is None else self.columns)
        self.scaler_ = MinMaxScaler()
        self.scaler_.fit(X[self.columns_])
        return self

    def transform(self, X):
        """Replace the selected columns of ``X`` with their scaled values.

        ``X`` is modified in place and also returned, so existing cells that
        keep reading the frame through its original variable still see the
        scaled values.
        """
        X[self.columns_] = self.scaler_.transform(X[self.columns_])
        return X

In [None]:
# Build both transformers, then one-hot encode first and min-max scale second.
# NOTE(review): numericList contains 'SCORE', which the correlation cell labels
# as the target value - confirm that scaling it together with the features is
# intended.
encoder, scaler = Encoder(), Scaler()
y = scaler.fit_transform(encoder.fit_transform(df))
y