# In this project, you are required to do the following:

Understanding what type of content is available in different countries

Determining whether Netflix has been increasingly focusing on TV rather than movies in recent years.

Clustering similar content by matching text-based features

In [None]:
!pip install gap-stat

In [126]:
# Necessary imports
import re

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from gap_statistic import OptimalK
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [112]:
# Load the dataset
# Read the CSV export into a DataFrame; the path is kept in its own variable
# so the file location is easy to spot and change.
csv_path = '/content/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv'
data = pd.read_csv(csv_path)

In [113]:
# Preview the first five rows of the loaded dataset
data.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


# MANIPULATION

In [114]:
# Count the missing values in every column
data.isna().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [116]:
# Print the column names, non-null counts and dtypes of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [117]:
# Handling missing values
# Replace NaN in these four text columns with the literal marker 'unknown'.
# Other columns (e.g. 'rating') are not touched by this step.
columns_to_fill = ['director', 'cast', 'country', 'date_added']
data.loc[:, columns_to_fill] = data.loc[:, columns_to_fill].fillna('unknown')

# Understanding what type of content is available in different countries

In [120]:
# Data preprocessing for visualization
working_df = data  # alias only: both names refer to the same DataFrame object

# A title can list several countries separated by ', '; split the string into a
# list and explode it so that each (title, country) pair gets its own row.
# NOTE(review): values with irregular separators (e.g. a trailing comma) would
# not be split cleanly by the exact ', ' delimiter — verify against the data.
country_lists = working_df['country'].str.split(', ')
working_df_split = working_df.assign(country=country_lists).explode('country')

# Count titles per (country, type) and pivot the types into columns,
# filling combinations that never occur with 0.
titles_per_country_type = working_df_split.groupby(['country', 'type']).size()
content_type_among_country = titles_per_country_type.unstack(fill_value=0).reset_index()

In [121]:
# Preview the first five rows of the per-country counts
content_type_among_country.head(n=5)

type,country,Movie,TV Show
0,Afghanistan,1,0
1,Albania,1,0
2,Algeria,2,0
3,Angola,1,0
4,Argentina,64,18


In [123]:
# Visualization 1: Choropleth Map
# Countries are shaded by their movie count; the TV-show count is exposed
# through the hover tooltip only.
hover_columns = {'Movie': True, 'TV Show': True}
display_labels = {'Movie': 'Number of Movies', 'TV Show': 'Number of TV Shows'}

fig = px.choropleth(
    content_type_among_country,
    locations='country',
    locationmode='country names',
    color='Movie',
    hover_name='country',
    hover_data=hover_columns,
    color_continuous_scale='Darkmint',
    labels=display_labels,
    title='Number of Movies and TV Shows by Country',
    projection='natural earth',
)

# Draw country borders in black
fig.update_geos(showcountries=True, countrycolor="Black")

# Transparent map background, fixed figure size, and margins on every side
fig.update_layout(
    geo={'bgcolor': 'rgba(0,0,0,0)'},
    margin={'l': 0, 'r': 0, 't': 0, 'b': 50},
    width=1500,
    height=600,
)
fig.show()



# Has Netflix been increasingly focusing on TV rather than movies in recent years?

In [125]:
# Count titles per (release_year, type) and pivot the types into columns,
# filling year/type combinations that never occur with 0.
# The result is bound to `content_type_by_year`, the name the following cells read.
content_type_by_year = working_df.groupby(['release_year', 'type']).size().unstack(fill_value=0)

In [127]:
# Preview the first five release years of the per-year counts
content_type_by_year.head(n=5)

type,Movie,TV Show
release_year,Unnamed: 1_level_1,Unnamed: 2_level_1
1925,0,1
1942,2,0
1943,3,0
1944,3,0
1945,3,0


In [128]:
# Line chart: one trace per content type, with release year on the x-axis
release_year = content_type_by_year.index
fig = go.Figure()

# (column in content_type_by_year, legend label) for each trace, in drawing order
for type_column, legend_label in (('Movie', 'Movies'), ('TV Show', 'TV Shows')):
    fig.add_trace(
        go.Scatter(
            x=release_year,
            y=content_type_by_year[type_column],
            mode='lines',
            name=legend_label,
        )
    )

fig.update_layout(
    title='Count of Movies and TV Shows by Release Year',
    xaxis_title='Release Year',
    yaxis_title='Count',
)
fig.show()

# Clustering similar content by matching text-based features

In [129]:
# Data preprocessing for clustering
# Take an explicit copy of the two columns: a column is added to
# to_be_cluster later, and writing into a slice of working_df is what triggers
# pandas' SettingWithCopyWarning.
to_be_cluster = working_df[['show_id', 'description']].copy()

# Fetch the NLTK resources used for stop-word removal and lemmatization
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level helpers read by preprocess_text
ENGLISH_STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [130]:
# Text preprocessing function
def preprocess_text(text):
    """Normalise one description string before vectorisation.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, replace each run of digits with
    the placeholder ``NUMBER``, drop tokens found in ``ENGLISH_STOP_WORDS``,
    and lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    digits_masked = re.sub(r'\d+', 'NUMBER', without_punctuation)
    kept_tokens = (
        token for token in digits_masked.split()
        if token not in ENGLISH_STOP_WORDS
    )
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [131]:
# Applying text preprocessing
# Run preprocess_text on every description; the result is a Series of cleaned strings.
descriptions = to_be_cluster['description'].apply(preprocess_text)

In [132]:
# Vectorizing text data
# TfidfVectorizer is used with its default settings; fit_transform learns the
# vocabulary from the cleaned descriptions and returns the document-term matrix X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(descriptions)

In [95]:
# Finding optimal number of clusters
# Candidate cluster counts 1..10 evaluated by the gap statistic
# (np is numpy, imported in the notebook's import cell).
candidate_ks = np.arange(1, 11)
optimalK = OptimalK(parallel_backend='joblib')
# toarray() converts the TF-IDF matrix to a dense array before it is handed to
# OptimalK; this can be memory-hungry when the vocabulary is large.
n_clusters = optimalK(X.toarray(), cluster_array=candidate_ks)
print("Optimal number of clusters:", n_clusters)

In [134]:
# Clustering using KMeans
# random_state is fixed so that repeated runs give the same cluster labels.
kmeans = KMeans(n_clusters=n_clusters, n_init=5, max_iter=500, random_state=42)
kmeans.fit(X)

# Attach the labels by rebinding to_be_cluster to a new frame via assign().
# Writing the column in place targets what may be a slice of working_df and
# raises pandas' SettingWithCopyWarning; assign() also keeps the cell idempotent.
to_be_cluster = to_be_cluster.assign(cluster=kmeans.labels_)

# Keep only show_id and cluster for merging back onto the full dataset
clustered_data = to_be_cluster.drop(columns='description')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [135]:
# Merging clustered data with original dataset
# Left join on show_id: every row of working_df is kept and gains a 'cluster' column.
clusterdf = pd.merge(working_df, clustered_data, on='show_id', how='left')

In [136]:
# Displaying the final clustered data
# Show the first five rows, including the new 'cluster' column
clusterdf.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,cluster
0,s1,TV Show,3%,unknown,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...,8
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...,3
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow...",8
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi...",8
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...,8
