<a href="https://colab.research.google.com/github/SanjaraT/Similar-Shows/blob/main/478SAMP3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Loading Dataset

In [7]:
df=pd.read_csv('/content/drive/MyDrive/IMDB.csv')
df.head()

Unnamed: 0,Name,Year,Episodes,Type,Rating,Image-src,Description,Name-href
0,1. Breaking Bad,2008–2013,62 eps,TV-MA,9.5,https://m.media-amazon.com/images/M/MV5BYmQ4YW...,A chemistry teacher diagnosed with inoperable ...,https://www.imdb.com/title/tt0903747/?ref_=cht...
1,2. Planet Earth II,2016,6 eps,TV-G,9.5,https://m.media-amazon.com/images/M/MV5BMGZmYm...,David Attenborough returns with a new wildlife...,https://www.imdb.com/title/tt5491994/?ref_=cht...
2,3. Planet Earth,2006,11 eps,TV-PG,9.4,https://m.media-amazon.com/images/M/MV5BMzMyYj...,A documentary series on the wildlife found on ...,https://www.imdb.com/title/tt0795176/?ref_=cht...
3,4. Band of Brothers,2001,10 eps,TV-MA,9.4,https://m.media-amazon.com/images/M/MV5BMTI3OD...,The story of Easy Company of the U.S. Army 101...,https://www.imdb.com/title/tt0185906/?ref_=cht...
4,5. Chernobyl,2019,5 eps,TV-MA,9.4,https://m.media-amazon.com/images/M/MV5BNTdkN2...,"In April 1986, an explosion at the Chernobyl n...",https://www.imdb.com/title/tt7366338/?ref_=cht...


#Combining Name and Description

In [10]:
df['Combined'] = df['Name'] + ' ' + df['Description']
df.Combined

Unnamed: 0,Combined
0,1. Breaking Bad A chemistry teacher diagnosed ...
1,2. Planet Earth II David Attenborough returns ...
2,3. Planet Earth A documentary series on the wi...
3,4. Band of Brothers The story of Easy Company ...
4,"5. Chernobyl In April 1986, an explosion at th..."
...,...
245,246. RuPaul's Drag Race RuPaul searches for Am...
246,"247. Foyle's War As WWII rages, DCS Foyle figh..."
247,248. Southland The lives of Police Officers wo...
248,249. Kardes Payi The two brothers have a plumb...


#Preprocessing

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [12]:
nltk.download('stopwords') #common words
nltk.download('wordnet')   #similar words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
def preprocess_text(text):
    # 1. Lowercasing
    text = text.lower()

    # 2. Removing punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text) #r'-->raw string, ^-->negation, \w-->word, \s-->space,tab,newline

    # 3. Removing numbers
    text = re.sub(r'\d+', '', text) #\d+--> one or more consecutive digit characters

    # 4. Tokenization
    words = text.split()

    # 5. Removing stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words] #filters a list of words(present in stop words)

    # 6. Lemmatization(main verb)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # 7. Joining the words back into a string
    text = ' '.join(words)

    return text


Loops are for iterating and executing statements, while comprehensions are for building and returning new data structures

In [16]:
df['Combined'] = df['Combined'].apply(preprocess_text)

#Converting into a Vector

In [17]:
import gensim
from gensim.models import Word2Vec

# 1. Prepare the data
sentences = [text.split() for text in df['Combined']]

# 2. Train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# 3. Function to convert text to vector
def text_to_vector(text):
    words = text.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# 4. Apply the function to the "Combined" column
df['Combined_Vector'] = df['Combined'].apply(text_to_vector)

#Finding TOP 5 TV Shows

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# 1. Calculate cosine similarity matrix



In [19]:
similarity_matrix = cosine_similarity(df['Combined_Vector'].to_list())

# 2. Function to get top similar shows


In [20]:
def get_similar_shows(show_index, top_n=5):
    similarities = similarity_matrix[show_index]
    sorted_indices = similarities.argsort()[::-1]
    top_indices = sorted_indices[1:top_n + 1]  # Exclude the show itself
    return df.iloc[top_indices]['Name'].to_list()

In [21]:
similar_shows = get_similar_shows(0, top_n=5)
print(similar_shows)

['159. Pose', '128. Deadwood', '242. Queer Eye', '81. Peaky Blinders', '106. Star Trek: The Next Generation']
