<a href="https://colab.research.google.com/github/S-Jyothika/AI-Final-Project/blob/main/Netflix_Content_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import nltk
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score

from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
df = pd.read_csv('/content/drive/MyDrive/AI Final Project/NetflixSimple.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [4]:
text_columns = ['description', 'listed_in', 'cast', 'director']
for col in text_columns:
    df[col] = df[col].fillna("")
df[text_columns].isnull().sum()

Unnamed: 0,0
description,0
listed_in,0
cast,0
director,0


In [5]:
df['content'] = (
    df['description'] + " " +
    df['listed_in'] + " " +
    df['cast'] + " " +
    df['director']
)
df[['title', 'content']].head()

Unnamed: 0,title,content
0,3%,In a future where the elite inhabit an island ...
1,7:19,After a devastating earthquake hits Mexico Cit...
2,23:59,"When an army recruit is found dead, his fellow..."
3,9,"In a postapocalyptic world, rag-doll robots hi..."
4,21,A brilliant group of students become card-coun...


In [6]:
stop_words = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)
df['clean_content'] = df['content'].apply(clean_text)
df[['title', 'clean_content']].head()

Unnamed: 0,title,clean_content
0,3%,future elite inhabit island paradise far crowd...
1,7:19,devastating earthquake hits mexico city trappe...
2,23:59,army recruit found dead fellow soldiers forced...
3,9,postapocalyptic world ragdoll robots hide fear...
4,21,brilliant group students become cardcounting e...


In [7]:
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['clean_content'])
tfidf_matrix.shape

(7787, 5000)

In [8]:
svd_components = min(100, tfidf_matrix.shape[1] - 1)
svd = TruncatedSVD(n_components=svd_components, random_state=42)
reduced_matrix = svd.fit_transform(tfidf_matrix)
reduced_matrix.shape

(7787, 100)

In [9]:
n_clusters = min(6, len(df))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(reduced_matrix)
n_clusters
display(df[['title', 'cluster']])

Unnamed: 0,title,cluster
0,3%,5
1,7:19,0
2,23:59,0
3,9,4
4,21,4
...,...,...
7782,Zozo,0
7783,Zubaan,0
7784,Zulu Man in Japan,3
7785,Zumbo's Just Desserts,5


In [10]:
if n_clusters > 1:
    sil_score = silhouette_score(reduced_matrix, df['cluster'])
    print("Silhouette Score:", sil_score)
else:
    print("Silhouette Score: Not applicable for single cluster")

Silhouette Score: 0.06418374993932296


In [11]:
cosine_sim = cosine_similarity(reduced_matrix, reduced_matrix)

def recommend(title, top_n=3):
    if title not in df['title'].values:
        return "Title not found"

    idx = df[df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]

    return [df.iloc[i[0]]['title'] for i in sim_scores]

In [12]:
print("Recommendations for '3 Idiots':")
print(recommend("3 Idiots"))

Recommendations for '3 Idiots':
['Dil Dhadakne Do', 'Thank You', 'Upstarts']


In [13]:
df[['title', 'type', 'listed_in', 'cluster']].sample(5, random_state=2)

Unnamed: 0,title,type,listed_in,cluster
4572,Oddbods: Party Monsters,Movie,Movies,4
2120,Fast & Furious Spy Racers,TV Show,Kids' TV,4
5437,Scooby-Doo on Zombie Island,Movie,Children & Family Movies,4
1558,Cutie and the Boxer,Movie,Documentaries,3
7230,Triumph of the Heart,Movie,"Dramas, Sports Movies",4
