# Importing necessary packages

In [2]:
from timeit import default_timer as timer
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist as scipy_cdist
import matplotlib.pyplot as plt
import pickle

# Seed NumPy's legacy global random state so that any NumPy-driven randomness
# in this notebook is repeatable across runs. This call only affects NumPy;
# it does not seed PyTorch's random number generators.
np.random.seed(0)  # for reproducibility

# Loading and Preprocessing the data

In [4]:
# Where the movie-plot CSV lives and which of its columns we need.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to the notebook or configurable.
MOVIES_CSV_PATH = 'C:/Users/ASUS/Desktop/NLP-Efficient-Semantic-Similarity-Search-with-FAISS-and-GPUs-main/datasets/wiki_movie_plots_deduped_with_summaries.csv'
MOVIE_COLUMNS = ['Title', 'PlotSummary']

# Build the working frame in a single chain:
#   1. read only the title and summary columns,
#   2. keep the first row for each distinct PlotSummary,
#   3. discard rows that are missing a Title or a PlotSummary,
#   4. renumber the rows 0..n-1 so index labels match row positions.
movies = (
    pd.read_csv(MOVIES_CSV_PATH, usecols=MOVIE_COLUMNS)
    .drop_duplicates(subset='PlotSummary')
    .dropna(subset=MOVIE_COLUMNS)
    .reset_index(drop=True)
)

# Report how many movies remain after the clean-up
print(f"Plots of {len(movies.index)} movies!")

# Truncate displayed cell text to at most 50 characters per column
pd.set_option("max_colwidth", 50)

Plots of 33870 movies!


# Encoding Plot Summaries

In [5]:
# Prefer a CUDA-enabled GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Instantiate the pre-trained 'paraphrase-MiniLM-L6-v2' SentenceTransformer on that device
encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2', device=torch_device)

# Embed every plot summary; row i of the result corresponds to row i of `movies`
plot_summaries = movies.PlotSummary.tolist()
plot_embeddings = encoder.encode(plot_summaries, device=torch_device)

# Save the encoded plot embeddings to a pickle file

In [7]:
# Persist the plot_embeddings array to a pickle file so it can be reloaded
# later without re-encoding every summary. The context manager guarantees the
# file handle is closed even if pickle.dump raises part-way through the write.
with open("SE10model.pickle", "wb") as pickle_out:
    pickle.dump(plot_embeddings, pickle_out)

# Finding most similar movies

In [8]:
# Define a sample input for which we will find the most similar movies
userInput = """When a teenage boy goes missing, individuals of a town who are still reeling from a mining accident, interact with each other amidst the distressing circumstances.
"""

# How many of the best-matching movies to report
TOP_K = 20

# Encode the input with the same encoder used for the plot summaries
userInput_embeddings = encoder.encode([userInput], device=torch_device)

# Measure the time taken to score the input against every plot summary
start = timer()

# Cosine similarity = 1 - cosine distance; result has one row per query text
# and one column per movie.
raw_similarities = 1 - scipy_cdist(userInput_embeddings, plot_embeddings, 'cosine')

# Rounded copy, used for display only. Ranking must use the unrounded scores:
# rounding first collapses close scores into ties, and the order of tied
# entries would then be arbitrary.
similarities = np.around(raw_similarities, decimals=2)

end = timer()

# Best score per movie across all query rows, computed once instead of on
# every loop iteration.
best_raw_scores = np.max(raw_similarities, axis=0)
best_display_scores = np.max(similarities, axis=0)

# Indices of the TOP_K highest cosine similarities, best match first
best_similar_indices = np.argsort(best_raw_scores)[::-1][:TOP_K]
for idx in best_similar_indices:
    # `movies` was re-indexed 0..n-1, so the embedding position is also the row label
    most_similar_title = movies.loc[idx, 'Title']
    most_similar_title_sim = best_display_scores[idx]
    print(f'"{most_similar_title}" - {most_similar_title_sim}')


"Little Accidents" - 0.67
"Dusari Goshta" - 0.51
"Finders Keepers" - 0.51
"Late Fragment" - 0.5
"Wild Boys of the Road" - 0.5
"Dig That Uranium" - 0.49
"72 Mile - Ek Pravas" - 0.49
