In [1]:
# Step 1: Import the libraries used throughout the notebook.
# The version prints record which nltk / gensim releases produced the saved outputs.
import nltk
print(nltk.__version__)
import gensim
print(gensim.__version__)
# Download the NLTK "punkt" tokenizer data (presumably for word_tokenize below).
nltk.download("punkt")
import pandas as pd
import numpy as np
# NOTE(review): word_tokenize is not referenced in the cells reviewed here — confirm it is still needed.
from nltk.tokenize import word_tokenize

# NOTE(review): pandas is already imported above; this second import is redundant.
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity



3.8.1
4.3.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Step 2: Load the embedding-ready movie dataset and preview its first rows.
# NOTE(review): absolute path (looks like a Colab location) — adjust when running elsewhere.
DATASET_PATH = '/content/embedding_ready_dataset.csv'
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,title,release_date,genres,vote_average,overview,runtime,tagline,text_description,embeddind_prep
0,0,Fast X,2023-05-17,"['Action', 'Crime', 'Thriller']",7.4,Over many missions and against impossible odds...,142.0,The end of the road begins.,The end of the road begins. Over many missions...,Action Crime Thriller. The end of the road beg...
1,1,John Wick: Chapter 4,2023-03-22,"['Action', 'Thriller', 'Crime']",7.9,"With the price on his head ever increasing, Jo...",170.0,"No way back, one way out.","No way back, one way out. With the price on hi...","Action Thriller Crime. No way back, one way ou..."
2,2,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Family', 'Adventure', 'Fantasy'...",7.8,"While working underground to fix a water main,...",92.0,,While working underground to fix a water main...,Animation Family Adventure Fantasy Comedy. Wh...
3,3,Spider-Man: Across the Spider-Verse,2023-05-31,"['Action', 'Adventure', 'Animation', 'Science ...",8.8,"After reuniting with Gwen Stacy, Brooklyn’s fu...",140.0,It's how you wear the mask that matters,It's how you wear the mask that matters After ...,Action Adventure Animation Science Fiction. It...
4,4,Hypnotic,2023-05-11,"['Mystery', 'Thriller', 'Science Fiction']",6.5,A detective becomes entangled in a mystery inv...,94.0,Control is an illusion.,Control is an illusion. A detective becomes en...,Mystery Thriller Science Fiction. Control is a...


In [3]:
# Step 3: Preprocessing
def preprocess_text(text):
    """Turn a raw text description into a list of tokens for Doc2Vec.

    Tokenization is delegated to gensim's ``simple_preprocess``.

    Args:
        text: The raw description. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as "no text".

    Returns:
        list[str]: The tokens, or an empty list when ``text`` is missing.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids handing a non-string to simple_preprocess, which would raise.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return simple_preprocess(text)

# Tokenize every row's prepared text; the source column name really is spelled 'embeddind_prep'.
data['preprocessed_summary'] = data['embeddind_prep'].map(preprocess_text)

In [4]:
# Step 4: Tagging and Vectorization
# Each document is tagged with its row position (as a string), so a document's
# tag lines up with its positional index in `data`.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(data['preprocessed_summary'])
]

In [5]:
# Step 5: Free-text description of the kind of movie to search for (the query).
new_summary = "an orphan boy learns how to use his magical powers"

In [6]:
# Tokenize the query with the same preprocessing function that was applied to the corpus.
new_preprocessed_summary = preprocess_text(new_summary)

In [90]:
#Step 6 : Finding the best hyperparameters and best cosine similarity

def evaluate_model(model, inferred_vector):
    """Compute the cosine similarity between a query vector and every document vector.

    Args:
        model: A trained Doc2Vec model; its document vectors are read from ``model.dv``.
        inferred_vector: The query embedding, either 1-D ``(vector_size,)`` or
            2-D ``(1, vector_size)``.

    Returns:
        numpy.ndarray: Flattened similarities; for a single query, entry ``i`` is
        the similarity to the document stored at position ``i`` of ``model.dv``.
    """
    # Read the whole document-vector matrix from the model itself rather than
    # rebuilding it row by row from a global corpus: the result then always
    # matches the corpus this particular model was trained on.
    document_vectors = model.dv.vectors
    # cosine_similarity needs 2-D input; promote a bare 1-D query to one row.
    query = np.atleast_2d(inferred_vector)
    return cosine_similarity(query, document_vectors).flatten()


def try_different_parameters(vector_sizes, windows, min_counts, epochs,
                             query_tokens=None, workers=4):
    """Grid-search Doc2Vec hyperparameters against a single query.

    One model is trained on the module-level ``tagged_data`` for every
    combination of the supplied values. Each model is scored by the mean
    cosine similarity between the inferred query vector and all document
    vectors, and the highest-scoring combination is kept (the first one wins
    on ties).

    NOTE(review): a higher mean similarity to the *whole* corpus does not by
    itself show better retrieval quality — consider validating the chosen
    parameters against titles known to be relevant.

    Args:
        vector_sizes: Candidate values for Doc2Vec ``vector_size``.
        windows: Candidate values for Doc2Vec ``window``.
        min_counts: Candidate values for Doc2Vec ``min_count``.
        epochs: Candidate numbers of training epochs.
        query_tokens: Tokenized query to score against; defaults to the
            module-level ``new_preprocessed_summary``.
        workers: Number of training threads passed to Doc2Vec (default 4,
            the value that was previously hard-coded).

    Returns:
        tuple: ``(best_params, best_similarity)``. ``best_params`` is a dict with
        keys ``'vector_size'``, ``'window'``, ``'min_count'`` and ``'epochs'``.
        If any candidate list is empty the result is ``({}, -1.0)``.
    """
    from itertools import product

    if query_tokens is None:
        query_tokens = new_preprocessed_summary

    best_similarity = -1.0
    best_params = {}

    # product() yields combinations in the same order as four nested loops
    # (the right-most argument varies fastest).
    for vector_size, window, min_count, num_epochs in product(
            vector_sizes, windows, min_counts, epochs):
        # Train a fresh Doc2Vec model for this combination.
        model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count,
                        workers=workers, epochs=num_epochs)
        model.build_vocab(tagged_data)
        model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

        # Embed the query with this model and shape it as a single-row matrix.
        inferred_vector = model.infer_vector(query_tokens).reshape(1, -1)

        # Score: average similarity between the query and every document.
        avg_similarity = np.mean(evaluate_model(model, inferred_vector))

        # Strict '>' keeps the earliest combination when scores tie.
        if avg_similarity > best_similarity:
            best_similarity = avg_similarity
            best_params = {
                'vector_size': vector_size,
                'window': window,
                'min_count': min_count,
                'epochs': num_epochs
            }
    return best_params, best_similarity

# Candidate values per hyperparameter; the full grid is 3 * 4 * 3 * 7 = 252 model trainings.
vector_sizes = list(range(50, 151, 50))  # 50, 100, 150
windows = list(range(1, 8, 2))           # 1, 3, 5, 7
min_counts = list(range(1, 4))           # 1, 2, 3
epochs = list(range(10, 71, 10))         # 10, 20, ..., 70

# Run the grid search and report the winning combination.
best_params, best_similarity = try_different_parameters(
    vector_sizes=vector_sizes,
    windows=windows,
    min_counts=min_counts,
    epochs=epochs,
)

print("Best Hyperparameters:", best_params)
print("Best Cosine Similarity:", best_similarity)



Best Hyperparameters: {'vector_size': 50, 'window': 7, 'min_count': 2, 'epochs': 20}
Best Cosine Similarity: 0.5181477


In [7]:
# Step 7: Train the final Doc2Vec model.
# The hyperparameters are hard-coded to the combination reported by the grid
# search (vector_size=50, window=7, min_count=2, epochs=20).
# NOTE(review): unlike the grid search, no `workers` argument is passed here, so
# gensim's default applies — confirm the difference is intended.
model = Doc2Vec(vector_size=50, window=7, min_count=2, epochs=20)
# The vocabulary must be built from the tagged corpus before training starts.
model.build_vocab(tagged_data)
# corpus_count and epochs are read back from the model, as set by build_vocab / the constructor.
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [8]:
# Step 8: Infer an embedding for the tokenized query using the trained model.
inferred_vector = model.infer_vector(new_preprocessed_summary)

In [9]:
# Step 9 : Calculate Cosine Similarities and Retrieve Movie Titles and Summaries
# Collect one trained vector per corpus document, in corpus order.
document_vectors = np.array([model.dv[doc_idx] for doc_idx in range(len(tagged_data))])
# The query is presented as a single-row matrix; the result is flattened to one score per document.
cosine_similarities = cosine_similarity(inferred_vector.reshape(1, -1), document_vectors).ravel()

In [10]:
# Positions of the 10 highest similarity scores, best match first.
top_similar_indices = np.argsort(cosine_similarities)[::-1][:10]

In [11]:
# Step 10 : Display similar movie titles and summaries
# Indices are positional, so they are resolved with .iloc; the "summary" shown
# is the tokenized text that was fed to the model.
print("Similar Movies and Summaries:")
for idx in top_similar_indices:
    row = data.iloc[idx]
    title, summary = row['title'], row['preprocessed_summary']
    similarity = cosine_similarities[idx]
    print(f"Title: {title}")
    print(f"Cosine Similarity: {similarity:.4f}")
    print(f"Summary: {summary}\n")



Similar Movies and Summaries:
Title: Lu Over the Wall
Cosine Similarity: 0.8740
Summary: ['animation', 'family', 'fantasy', 'adventure', 'comedy', 'music', 'in', 'small', 'fishing', 'village', 'gloomy', 'middle', 'school', 'student', 'named', 'kai', 'meets', 'mermaid', 'named', 'lu']

Title: Young Mother-in-Law
Cosine Similarity: 0.8649
Summary: ['romance', 'young', 'mother', 'in', 'law', 'falls', 'in', 'love', 'with', 'her', 'daughter', 'boyfriend']

Title: Playing It Cool
Cosine Similarity: 0.8513
Summary: ['comedy', 'romance', 'love', 'it', 'balancing', 'act', 'young', 'man', 'meets', 'and', 'instantly', 'falls', 'in', 'love', 'with', 'an', 'engaged', 'woman']

Title: Muppets Haunted Mansion
Cosine Similarity: 0.8402
Summary: ['comedy', 'family', 'tv', 'movie', 'cue', 'the', 'light', 'ning', 'gonzo', 'is', 'challenged', 'to', 'spend', 'one', 'night', 'in', 'the', 'haunted', 'mansion', 'on', 'halloween', 'night']

Title: The Green Inferno
Cosine Similarity: 0.8378
Summary: ['horror',