In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Load the dataset; pandas infers zip compression from the ".zip" extension.
data = pd.read_csv("netflixData.csv.zip")
# Preview the first five rows (the wide frame wraps across several column groups).
print(data.head())

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [None]:
# Dimensions of the loaded frame as (rows, columns).
print(data.shape)

(5967, 13)


In [None]:
# Number of missing values in each column.
print(data.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [None]:
# Keep only the four columns used from here on. .copy() makes the result an
# independent frame, so the later assignment to data["Title"] does not raise
# SettingWithCopyWarning (or silently write to a temporary).
data = data[["Title", "Description", "Content Type", "Genres"]].copy()

In [None]:
# Re-check missing values: none of the four retained columns contains nulls.
print(data.isnull().sum())

Title           0
Description     0
Content Type    0
Genres          0
dtype: int64


In [None]:
!pip install nltk



In [None]:
# Standard-library helpers used by the text-cleaning step.
import re
import string

# NLTK supplies the English stop-word list and the Snowball stemmer.
import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus (reports "already up-to-date" when present).
nltk.download('stopwords')

# English Snowball stemmer; not referenced by the other cells shown in this notebook.
stemmer = nltk.SnowballStemmer('english')

# Stored as a set so word-by-word membership checks are fast.
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean(text, stop_words=None):
  """Normalise a piece of text for use as a lookup key.

  Steps, in order: lower-case; drop ``[...]`` spans, URLs and ``<...>`` tags;
  strip ASCII punctuation; delete newlines; drop any token containing a digit;
  remove stop words; join the remaining words with single spaces.

  Args:
    text: Value to clean. It is passed through ``str()`` first, so non-string
      values (including NaN/None) are cleaned as their string form.
    stop_words: Optional collection of words to remove. Defaults to the
      module-level ``stopword`` set.

  Returns:
    The cleaned string, with no leading, trailing or repeated spaces.
  """
  if stop_words is None:
    stop_words = stopword  # module-level stop-word set built in an earlier cell

  text = str(text).lower()
  # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
  # invalid Python string escapes, which warn on recent Python versions.
  text = re.sub(r'\[.*?\]', '', text)
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  # Newlines are deleted outright (not replaced by a space), as before.
  text = re.sub(r'\n', '', text)
  text = re.sub(r'\w*\d\w*', '', text)
  # split() with no argument discards the empty tokens that the removals above
  # leave behind; split(' ') kept them and produced doubled spaces in the output.
  words = [word for word in text.split() if word not in stop_words]
  return " ".join(words)



# Normalise every title with clean(); later title lookups must use this cleaned form.
data["Title"] = data["Title"].apply(clean)

In [None]:
# Inspect the cleaned titles.
print(data["Title"])

0                            unwell
1                             alive
2       annefrank  parallel stories
3                           blackaf
4                     catsthemewvie
                   ...             
5962                      الف مبروك
5963                   دفعة القاهرة
5964                           海的儿子
5965                        반드시 잡는다
5966             최강전사 미니특공대  영웅의 탄생
Name: Title, Length: 5967, dtype: object


In [None]:
# TfidfVectorizer converts text into TF-IDF weighted term vectors, giving less
# weight to terms that occur in many documents.
from sklearn.feature_extraction.text import TfidfVectorizer
# Also imported in the first cell; repeating it here is harmless.
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Vectorise each row's genre string with TF-IDF, ignoring English stop words.
feature = data['Genres'].tolist()
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)

# Pairwise cosine similarity between all rows (square: one row/column per title).
similarity = cosine_similarity(tfidf_matrix)

# Map cleaned title -> row label. Series.drop_duplicates() compares the *values*
# (the row labels), so it never removed repeated titles from the index; filter on
# the index instead so every title resolves to one row (its first occurrence).
indices = pd.Series(data.index, index=data['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:




# Optional: Create a function to get recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=similarity, top_n=10):
    """Return the titles most similar to ``title`` by cosine similarity.

    Args:
        title: Title to look up, in the same cleaned form used as the index of
            ``indices``.
        cosine_sim: Square similarity matrix; row ``i`` holds the scores of
            row ``i`` against every other row. Defaults to ``similarity``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A ``Title`` Series with up to ``top_n`` entries, best match first.
        The queried row itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once in the index yields a Series of row
    # numbers; use the first occurrence instead of failing in the sort below.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # NOTE(review): idx is used as a row *position* in cosine_sim and in .iloc;
    # this assumes data still has its default 0..n-1 index - confirm if rows are
    # ever filtered or reordered.

    # Exclude the query row explicitly. Skipping the first sorted entry is not
    # reliable: other rows can tie with the top score and sort ahead of it.
    sim_scores = [
        (row, score) for row, score in enumerate(cosine_sim[idx]) if row != idx
    ]
    # Stable sort: rows with equal scores keep their original order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [row for row, _ in sim_scores[:max(top_n, 0)]]
    return data['Title'].iloc[movie_indices]



In [None]:

# Titles in `indices` are stored in their cleaned form, so query with that form.
example_title = "alive"
if example_title not in indices:
    # Membership on a Series tests its index labels, i.e. the titles.
    print(f"Title '{example_title}' not found in the dataset.")
else:
    print(get_recommendations(example_title))

178                  aaviri
360            andhaghaaram
361             andhakaaram
398                 apostle
1759     game hindi version
1760     game tamil version
1761    game telugu version
1801              ghost lab
1804          ghost stories
2104             homunculus
Name: Title, dtype: object
