In [0]:
# Colab-only setup: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive') # the dataset CSV is later read from below this mount point

In [0]:
import pandas as pd # importing pandas
import numpy as np # importing numpy

In [0]:
# Path of the Netflix movies/TV-shows CSV on the mounted Drive.
csv_path = '/content/drive/MyDrive/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv'
df = pd.read_csv(csv_path)  # load the raw table into a DataFrame

In [0]:
# Show the freshly loaded table as the cell output.
df

In [0]:
# Column dtypes and non-null counts before any rows are dropped.
df.info()

In [0]:
# Keep only rows that have a value in every column.
df = df.dropna()

In [0]:
# Summary again, to confirm the effect of dropping rows with nulls.
df.info()


In [0]:


# Same summary once more (this cell repeats the previous one).
df.info()

**Exploratory Data Analysis**

In [0]:
import seaborn as sns # importing seaborn


In [0]:
# Top five countries by number of titles, drawn as a bar chart.
df_c = df.loc[:, ['title', 'country']]  # only the two columns needed here
df_c_count_of_titles = (
    df_c.groupby(by='country')['title']
    .count()                                   # titles per country
    .to_frame()
    .reset_index()
    .sort_values(by='title', ascending=False)  # largest counts first
    .head(5)
    .rename(columns={'title': 'no_of_titles'})
)
sns.barplot(data=df_c_count_of_titles, x='country', y='no_of_titles')






In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count of titles per type for the 15 release years that occur most often.
top_years = df['release_year'].value_counts().index[0:15]
count = sns.countplot(data=df, x="release_year", hue='type', order=top_years)
count.set_title('Movie/TV shows released in top 15 year', size=15)
plt.xticks(rotation=90)  # year labels drawn vertically
plt.show()




In [0]:
# Distribution of titles over release year.
hist = sns.histplot(df['release_year']) # histogram of the release_year column
hist.set_title('histogram by released year', size=20) # title set on the object histplot returned

In [0]:
df_tdcr=pd.DataFrame(df[['country','rating','type','director']]) # working frame holding only the four columns used by the audience scatter plot
def Target_Audience(x): # map a raw rating code onto an audience group
  """Return the audience group for rating code ``x``.

  The result is 'Adult', 'Children', 'All' or 'Infants' for the codes listed
  below, and the string 'None' (not the ``None`` object) for anything else.
  """
  audience_groups = (
      (('R', 'TV-MA', 'NR', 'NC-17', 'UR'), 'Adult'),
      (('TV-14', 'PG-13', 'TV-Y7', 'TV-Y7-FV', 'TV-PG', 'PG'), 'Children'),
      (('TV-G', 'G'), 'All'),
      (('TV-Y',), 'Infants'),
  )
  for codes, audience in audience_groups:
    if x in codes:
      return audience
  return 'None'
# Replace each rating code by its audience group.
df_tdcr['rating'] = df_tdcr['rating'].apply(Target_Audience)

# Count director entries per (country, audience group, type), largest first.
dsj = (
    df_tdcr.groupby(by=['country', 'rating', 'type'])['director']
    .count()
    .to_frame()
    .reset_index()
    .sort_values(by='director', ascending=False)
)

# Scatter plot of the ten largest groups.
ax = sns.scatterplot(data=dsj.head(10), x='country', y='director', hue='rating', size='type')


In [0]:
# Show the cleaned table again before building the text features.
df

In [0]:
# Build one free-text field per title for the NLP steps that follow.
# The columns are joined with a single space. Plain `+` concatenation glues the
# last word of one column onto the first word of the next (the `type` value
# runs straight into the director's name), which creates tokens that exist in
# neither column. As with `+`, a missing value in any column yields a missing
# result, because str.cat is called without na_rep.
nlp_columns = ['director', 'cast', 'country', 'rating', 'listed_in', 'description']
df['nlp'] = df['type'].str.cat(df[nlp_columns], sep=' ')
df

In [0]:
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords # corpus reader that exposes the stop-word lists
nltk.download('stopwords') # fetch the stop-word corpus so stopwords.words() can be used below



In [0]:
stp_wrds=stopwords.words('english') # English stop words from the NLTK corpus
np.array(stp_wrds) # not assigned: wrapped in an array only to display the words as the cell output

In [0]:
# Remove English stop words word by word.
# The text is lower-cased and split on whitespace first. Iterating over the raw
# string yields single characters, so the membership test would only ever match
# one-letter stop words (e.g. 'a', 'i') and would delete those letters from
# inside every word instead of removing whole stop words.
stop_word_set = set(stp_wrds)  # set lookup instead of scanning the list for every token
df['no_stp_wrds'] = df['nlp'].apply(
    lambda x: ' '.join(word for word in x.lower().split() if word not in stop_word_set)
)
df['no_stp_wrds']  # punctuation is still present here; it is removed in the next step

In [0]:
# Show the table with the new text columns.
df

In [0]:
# Strip punctuation: drop every run of characters that is neither a word
# character nor whitespace.
# regex=True must be passed explicitly. From pandas 2.0 on, Series.str.replace
# treats the pattern as a literal string by default, in which case the literal
# text '[^\w\s]+' is searched for and the column is left unchanged.
df['nlp_no_punch'] = df['no_stp_wrds'].str.replace(r'[^\w\s]+', '', regex=True)
df['nlp_no_punch']  # not displayed: only the last expression of a cell is shown
df['no_stp_wrds'].iloc[0]  # first row *before* punctuation removal, shown for comparison



**Stemming**

In [0]:
from nltk.stem.snowball import SnowballStemmer # Snowball stemmer, used to reduce words to their stems
root_words= SnowballStemmer('english') # one shared English stemmer instance, used by stemming() below
def stemming(x): # stem every whitespace-separated token of a string
  """Return ``x`` with each whitespace-separated token replaced by its stem.

  Tokens are stemmed with the module-level ``root_words`` stemmer and joined
  back together with single spaces.
  """
  stems = map(root_words.stem, x.split())
  return " ".join(stems)





In [0]:
df['nlp_no_punch']=df['nlp_no_punch'].apply(stemming) # stem every row; the column is overwritten, so re-running this cell stems already-stemmed text

In [0]:
df['nlp_no_punch'].iloc[0] # first row of the stemmed text, as a spot check


**Vectorization**

In [0]:
from sklearn.feature_extraction.text import CountVectorizer # importing countvectorizer
from sklearn.feature_extraction.text import TfidfVectorizer # importing Term frequency and inverse document frequency

In [0]:
tfidf=TfidfVectorizer(stop_words='english',lowercase=False) # TF-IDF vectoriser using the built-in English stop-word list, with lower-casing switched off
tfidf.fit(df['nlp_no_punch']) # learn the vocabulary and IDF weights from the stemmed text

In [0]:
x=tfidf.vocabulary_.items() # (term, column index) pairs learned by the fitted vectoriser
len(x)# number of distinct terms in the vocabulary

In [0]:
array_=tfidf.transform(df['nlp_no_punch']).toarray() # TF-IDF matrix of the text column, converted to a dense array
array_ # not displayed: only the last expression of a cell is shown
array_.shape # dimensions of the dense matrix


In [0]:
from sklearn.decomposition import PCA # principal component analysis
pca=PCA() # no n_components given; this fit is used to inspect the explained-variance curve
pca.fit(array_) # fit on the dense TF-IDF matrix

In [0]:
import matplotlib.pyplot as plt

# Cumulative explained variance of the fitted PCA, with guide lines at
# 95% variance (horizontal) and 4000 components (vertical).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.axhline(y=0.95, color='red', linestyle=':')
plt.axvline(x=4000, color='purple', linestyle=':')
plt.title('PCA - Cumulative variance vs  components')
plt.xlabel(' components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [0]:
# Keep 4000 components (the cut-off marked on the cumulative-variance plot),
# capped at what the data can support: PCA rejects an n_components larger than
# min(n_samples, n_features), so a smaller or differently cleaned dataset would
# otherwise make this cell raise.
n_components = min(4000, *array_.shape)
pca = PCA(n_components=n_components)
pca.fit(array_)  # refit on the TF-IDF matrix with the reduced component count

In [0]:
f=pca.transform(array_) # project the TF-IDF matrix onto the fitted principal components; f is the input to every clustering model below
f.shape # dimensions of the reduced matrix

**Model1: Kmeans Clustering**

In [0]:
from sklearn.cluster import KMeans

# Elbow search: record the inertia of a K-means fit for every k from 1 to 19.
a = []
for n_clusters in range(1, 20):
  k_means_ = KMeans(n_clusters=n_clusters, init='k-means++', random_state=10).fit(f)
  a.append(k_means_.inertia_)
print(a)


In [0]:
plt.plot(range(1,20),a) # inertia recorded for each k from 1 to 19
plt.xlabel('clusters')
plt.ylabel('SSE')
plt.axvline(x=8,linestyle=':',color='red') # dotted marker at k=8, which the author judged to be the elbow point

In [0]:

from sklearn.metrics import silhouette_score

# Silhouette score of a K-means fit for every k from 2 to 19.
sil_avg = []
for i in range(2, 20):
  k_means_ = KMeans(n_clusters=i, init='k-means++', random_state=10).fit(f)
  labels = k_means_.labels_  # cluster assignment of every row of f
  sil_avg.append(silhouette_score(f, labels))

In [0]:
plt.plot(range(2,20), sil_avg) # silhouette score recorded for each k from 2 to 19
plt.xlabel('Number of clusters') # x-axis label
plt.ylabel('Silhouette score') # y-axis label
plt.title('Silhouette analysis For Optimal clusters') # plot title
plt.axvline(x=3,linestyle=':',color='red') # dotted vertical marker at k=3
plt.show()

**Clustering using Gaussian Mixture**

In [0]:
from sklearn.mixture import GaussianMixture # importing Gaussian Mixture


In [0]:
# AIC and BIC of a Gaussian Mixture fit for every component count from 1 to 19,
# keyed by the number of components.
aic, bic = {}, {}
for i in range(1, 20):
  gm = GaussianMixture(n_components=i, random_state=10).fit(f)
  aic[i], bic[i] = gm.aic(f), gm.bic(f)

In [0]:
# AIC and BIC against the number of mixture components.
plt.figure(figsize=(12,10))
plt.plot(list(aic.keys()),list(aic.values()),label='AIC') # AIC per component count
plt.plot(list(bic.keys()),list(bic.values()),label='BIC') # BIC per component count
plt.title('AIC and BIC from GMM') # plot title
plt.xlabel('Number of Clusters') # x-axis label
plt.ylabel('AIC and BIC values') # y-axis label
plt.axvline(x=2,linestyle=':',color='red') # dotted marker at 2 components, which the author read off as the lowest AIC/BIC
plt.legend() # the label= arguments above are only drawn when a legend is requested
plt.show()

**Clustering using hierarchical modeling**

In [0]:
import scipy.cluster.hierarchy as hr # importing hierarchial clustering model

In [0]:
hr_graph=hr.dendrogram(hr.linkage(f,method='ward')) # Ward linkage over the rows of f, drawn as a dendrogram
plt.ylabel('Euclidean_distance') # y-axis label
plt.show()

**After analysing the K-means sum-of-squared-errors and silhouette plots, the Gaussian Mixture AIC/BIC plot, and the hierarchical-clustering dendrogram,
I have come to the conclusion that 8 clusters would be a good fit for the data.**

In [0]:
 # Final model: K-means with the chosen 8 clusters, fitted on the PCA-reduced matrix.
 k_means_ = KMeans(n_clusters=8, init='k-means++', random_state=10).fit(f)
 # Store each title's cluster label next to its other columns.
 df['k_clusters'] = k_means_.labels_

In [0]:
import seaborn as sns

# Number of titles per cluster, split by type, with the count written on each bar.
ax = sns.countplot(data=df, x='k_clusters', hue='type')
for bar in ax.patches:
  height = bar.get_height()
  ax.annotate(f'{height:.0f}', (bar.get_x(), height))

**Creating a Recommendation system**

In [0]:
# Separate frame for the recommender. reset_index() gives it a fresh 0..n-1
# index and keeps the old index as a column.
rdf = df.copy().reset_index()
rdf

In [0]:
# Show the recommender frame again.
rdf

In [0]:
f # display the PCA-reduced feature matrix that the similarity scores are computed from

In [0]:
from sklearn.metrics.pairwise import cosine_similarity # pairwise cosine similarity
cos_sim=cosine_similarity(f) # square matrix: similarity of every row of f to every other row
cos_sim

In [0]:
def recommendations(movie, top_n=10):
    """Print the titles most similar to ``movie``.

    Similarity scores are read from the module-level ``cos_sim`` matrix. The
    index label of the matching ``rdf`` row is used as a position into both
    ``cos_sim`` and ``rdf``, so ``rdf`` is expected to carry a 0..n-1 index
    whose order matches the rows of ``cos_sim``.

    Parameters
    ----------
    movie
        Title to look up; compared with ``rdf['title']`` using ``==``. If
        several rows share the title, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``rdf`` has this title.
    """
    matches = rdf.index[rdf['title'] == movie]
    if len(matches) == 0:
        # Fail before printing the header, and say which title was not found.
        raise IndexError(f"title {movie!r} not found in rdf['title']")
    index = matches[0]

    print(f"If you liked '{movie}', you may also like: \n")
    # (position, similarity) pairs, most similar first.
    ranked = sorted(enumerate(cos_sim[index]), reverse=True, key=lambda x: x[1])
    # Drop the query row by position rather than assuming it sorts first: a row
    # with a tied score (for example a duplicate entry) can rank ahead of it,
    # which would otherwise put the queried title in its own recommendations.
    neighbours = [position for position, _ in ranked if position != index]
    for position in neighbours[:top_n]:
        print(rdf['title'].iloc[position])


In [0]:
# Example query: print the titles ranked most similar to this one.
recommendations('Golmaal: Fun Unlimited')