In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
data = pd.read_csv(r"https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/netflixData.csv")
data.head()

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"


In [None]:
data['Title'].unique()[:15]

array(['(Un)Well', '#Alive', '#AnneFrank - Parallel Stories', '#blackAF',
       '#cats_the_mewvie', '#FriendButMarried', '#FriendButMarried 2',
       '#realityhigh', '#Rucker50', '#Selfie', '#Selfie 69',
       '10 Days in Sun City', '10 jours en or', '100 Days My Prince',
       '100 Humans'], dtype=object)

In [5]:
data.columns

Index(['Title', 'Description', 'Content Type', 'Genres'], dtype='object')

In [6]:
data = data[['Title','Description','Content Type','Genres']]
data.head()

Unnamed: 0,Title,Description,Content Type,Genres
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"


In [7]:

data.isna().sum()

Unnamed: 0,0
Title,0
Description,0
Content Type,0
Genres,0


In [8]:
data = data.dropna()

In [10]:
import nltk
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))
import string


In [13]:
def clean(text):
    text = text.lower()
    # remove digits
    text = re.sub(r'\d+','',text)
    # remove https
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)
    # remove html
    text = re.sub(r'<[^>]+>','',text)
    # Remove bracket content (e.g., [video], [ad])
    text = re.sub(r'\[.*?\]', '', text)
    # remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
     # Clean whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # divide sentence into word tokens
    words = text.split()
    # Remove stopwords
    words = [w for w in words if w not in stop]

    return " ".join(words)

In [14]:
data['Title']

Unnamed: 0,Title
0,(Un)Well
1,#Alive
2,#AnneFrank - Parallel Stories
3,#blackAF
4,#cats_the_mewvie
...,...
5962,الف مبروك
5963,دفعة القاهرة
5964,海的儿子
5965,반드시 잡는다


In [15]:
data['Title'] = data['Title'].apply(clean)

In [16]:
data['Title']

Unnamed: 0,Title
0,unwell
1,alive
2,annefrank parallel stories
3,blackaf
4,catsthemewvie
...,...
5962,الف مبروك
5963,دفعة القاهرة
5964,海的儿子
5965,반드시 잡는다


In [17]:
# Create an instance of TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english")

# Fit and transform the vectorizer on our corpus
tfidf_pre = tfidf.fit_transform(data["Genres"]).toarray()

# Compute the cosine similarity matrix
similarity = cosine_similarity(tfidf_pre)
indices = pd.Series(data.index,
                    index=data['Title']).drop_duplicates()

In [18]:
indices

Unnamed: 0_level_0,0
Title,Unnamed: 1_level_1
unwell,0
alive,1
annefrank parallel stories,2
blackaf,3
catsthemewvie,4
...,...
الف مبروك,5962
دفعة القاهرة,5963
海的儿子,5964
반드시 잡는다,5965


In [19]:
l1=[2,3]
#enumerate(l1)
list(enumerate(l1))

[(0, 2), (1, 3)]

In [20]:
def netFlix_recommendation(title, similarity = similarity):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    # order the similarity scores from highest to smallest
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    # extract the index of above similarity scores
    movieindices = [i[0] for i in similarity_scores]
    # provide the index to iloc to retrieve the respective title names
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("humans"))

14                     humans
353            ancient aliens
495                    babies
929                challenger
958                   chelsea
1084                connected
1099    coronavirus explained
1298                diagnosis
1544                explained
1629             fire chasers
Name: Title, dtype: object


In [21]:
print(netFlix_recommendation("alive"))

1                     alive
178                  aaviri
360            andhaghaaram
361             andhakaaram
398                 apostle
1759     game hindi version
1760     game tamil version
1761    game telugu version
1801              ghost lab
1804          ghost stories
Name: Title, dtype: object


Item based collaborative filtering

In [22]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Sample user-item interaction data
collab_filtered_data = {
	'User': ['Alice', 'Alice', 'Bob', 'Bob', 'Carol', 'Carol', 'Dave', 'Dave'],
	'Item': ['Item1', 'Item2', 'Item1', 'Item3', 'Item2', 'Item3', 'Item1', 'Item2'],
	'Rating': [5, 3, 4, 2, 4, 5, 2, 5]
}

collab_f_df = pd.DataFrame(collab_filtered_data)
# Create user-item matrix
user_item_matrix = collab_f_df.pivot_table(index='User', columns='Item', values='Rating', fill_value=0)

# Compute item similarity using cosine similarity
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Recommend items similar to 'Item1'
def recommend_similar_items(item, similarity_df, top_n=3):
	return similarity_df[item].sort_values(ascending=False)[1:top_n+1]

# Example recommendation
similar_items = recommend_similar_items('Item1', item_similarity_df)
print("Items similar to Item1:", similar_items)




Items similar to Item1: Item
Item2    0.527046
Item3    0.221455
Name: Item1, dtype: float64


Closer to 1 indicates the items are highly similar

Item2 is providing around 0.5 , this seems to be similar to Item1