# Import Modules and Libraries

In [2]:
from timeit import default_timer as timer
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist as scipy_cdist
import matplotlib.pyplot as plt
import pickle


np.random.seed(0)  # seed NumPy's global random generator so NumPy-based randomness is repeatable

# Importing dataset and Preprocessing 

In [4]:

# Location of the movie-plot CSV; only the two columns used below are read.
csv_path = 'C:/Users/ASUS/Desktop/NLP-Efficient-Semantic-Similarity-Search-with-FAISS-and-GPUs-main/datasets/wiki_movie_plots_deduped_with_summaries.csv'

movies = (
    pd.read_csv(csv_path, usecols=['Title', 'PlotSummary'])
    # Keep a single row per distinct plot summary.
    .drop_duplicates(subset='PlotSummary')
    # Drop the rows that are missing a Title or a PlotSummary.
    .dropna(subset=['Title', 'PlotSummary'])
    # Renumber the remaining rows 0..n-1 and discard the old index.
    .reset_index(drop=True)
)

print(f"Plots of {len(movies.index)} movies!")

Plots of 33870 movies!


# Loading the Sentence Encoder and Encoding the Plot Summaries

In [5]:
# Cap the column width pandas uses when displaying frames.
pd.set_option("max_colwidth", 50)

# Run on the GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Instantiate the 'paraphrase-MiniLM-L6-v2' sentence encoder on that device,
# then encode every plot summary with it.
encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2', device=torch_device)
plot_texts = movies.PlotSummary.tolist()
plot_embeddings = encoder.encode(plot_texts, device=torch_device)

# Save the plot embeddings in a pickle file

In [7]:
# Write the plot embeddings to disk.
# The `with` block closes the file even if pickle.dump raises, so the handle
# is never leaked and a partially written file is not left open.
with open("SE10model.pickle", "wb") as pickle_out:
    pickle.dump(plot_embeddings, pickle_out)

# Give a hardcoded input and get the similar movies

In [8]:
# Find the movies whose plot summaries are most similar to a free-text query.
# The query below describes a missing teenage boy in a town recovering from a
# mining accident.
TOP_K = 7  # how many of the best matches to print

userInput = """When a teenage boy goes missing, individuals of a town who are still reeling from a mining accident, interact with each other amidst the distressing circumstances.
"""

userInput_embeddings = encoder.encode([userInput], device=torch_device)

start = timer()

# Cosine similarity = 1 - cosine distance; one row per query, one column per movie.
raw_similarities = 1 - scipy_cdist(userInput_embeddings, plot_embeddings, 'cosine')

# Collapse the query axis (a single query here, so this is just that row) and
# rank on the UNROUNDED scores: ranking after rounding would turn near-equal
# scores into ties whose order is arbitrary.
movie_scores = np.max(raw_similarities, axis=0)
top_indices = np.argsort(-movie_scores, kind='stable')[:TOP_K]

end = timer()  # start/end bracket the similarity computation and ranking

# Rounded copy, used only for display.
similarities = np.around(raw_similarities, decimals=2)
display_scores = np.max(similarities, axis=0)

# Print the matches from most to least similar. `iloc` is used because
# embedding columns correspond to row positions in `movies`.
for best_sim_idx in top_indices:
    most_similar_title = movies.iloc[best_sim_idx].Title
    most_similar_title_sim = display_scores[best_sim_idx]
    print(f'"{most_similar_title}" - {most_similar_title_sim}')

"Little Accidents" - 0.67
"Dusari Goshta" - 0.51
"Finders Keepers" - 0.51
"Late Fragment" - 0.5
"Wild Boys of the Road" - 0.5
"Dig That Uranium" - 0.49
"72 Mile - Ek Pravas" - 0.49
