In [184]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import linear_kernel
from operator import itemgetter

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [185]:
movies = pd.read_csv('imdb_movies.csv') # data set from https://www.kaggle.com/datasets/ashpalsingh1525/imdb-movies-dataset

In [186]:
movies.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [187]:
print(movies.isnull().sum())

names          0
date_x         0
score          0
genre         85
overview       0
crew          56
orig_title     0
status         0
orig_lang      0
budget_x       0
revenue        0
country        0
dtype: int64


In [188]:
movies = movies.dropna()

In [189]:
movies['combined_features'] = (
    movies['genre'] + ' ' +
    movies['overview'] + ' ' +
    movies['orig_lang'] + ' ' +
    movies['crew']
)

In [190]:
print(movies['combined_features'])

0        Drama, Action After dominating the boxing worl...
1        Science Fiction, Adventure, Action Set more th...
2        Animation, Adventure, Family, Fantasy, Comedy ...
3        Animation, Comedy, Family, Adventure, Fantasy ...
4        Action Good-hearted teenager William always li...
                               ...                        
10173    Drama In 1979 Santa Barbara, California, Dorot...
10174    Action When DEA agents are taken captive by a ...
10175    Drama, Thriller, Romance Barley Scott Blair, a...
10176    Action, Adventure, Science Fiction, Thriller, ...
10177    Animation, Family, Fantasy Princess Odette and...
Name: combined_features, Length: 10052, dtype: object


In [191]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['combined_features'])

In [192]:
print(f'shape of data: {tfidf_matrix.shape}')

shape of data: (10052, 70281)


In [193]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(set(synonyms))

In [204]:
def find_movies(user_input, index):
    user_input_synonyms = [get_synonyms(word) for word in user_input.split()]
    user_input += ', '.join([synonym for synonyms in user_input_synonyms for synonym in synonyms])

    user_tfidf = tfidf_vectorizer.transform([user_input])

    # Use cosine similarity between user input and each movie
    normalized_tfidf_matrix = normalize(tfidf_matrix, axis=1, norm='l2')
    normalized_user_tfidf = normalize(user_tfidf, axis=1, norm='l2')
    cosine_sim = linear_kernel(normalized_tfidf_matrix, normalized_user_tfidf) #Normalized dot product

    sim_scores = list(enumerate(cosine_sim.flatten()))  # Flatten cosine_sim array and get index and score pairs
    sim_scores = sorted(sim_scores, key=itemgetter(1), reverse=True)  # Sort in descending order based on similarity scores

    # Get indices of top 3 most similar movies
    similar_movies_indices = [index for index, _ in sim_scores[index:index+3]]
    print(f'similarity score: {sim_scores[index:index+3]}')
    # Print the 3 most similar movies
    print("\nSimilar Movies:\n")
    for index in similar_movies_indices:
        print(movies.iloc[index]['names'])
        print("Overview:", movies.iloc[index]['overview'])
        print("Genre:", movies.iloc[index]['genre'])
        print("Original Language:", movies.iloc[index]['orig_lang'])
        print("Crew:", movies.iloc[index]['crew'])
        print("\n")

In [205]:
print('\t*** Use key words, for example:  genre, crew, language, plot description')
print('\t or type exit to end the program ***')

while True:
  user_input = input('\nDescribe what movie do you want to watch: ')

  if(user_input == "exit"):
    break

  index = 0
  find_movies(user_input, index)

  while True:
    more_results = input('\nDo you want to see three more results? (y/n): ')

    if more_results == "y":
      index+=3
      find_movies(user_input,index)

    if more_results == "n":
      break


	*** Use key words, for example:  genre, crew, language, plot description
	 or type exit to end the program ***

Describe what movie do you want to watch: horror ouija
similarity score: [(7006, 0.13653203978080383), (5894, 0.1345439681353582), (3205, 0.13298849032399174)]

Similar Movies:

The Ouija Exorcism
Overview: In 1985 an exorcist locked a demonic spirit inside a Ouija board. Now, when his grandson stumbles upon the board game, the evil is unleashed and free to torment those responsible for its banishment.
Genre: Horror
Original Language:  English
Crew: Ben Morrison, Joe, Michael Palladino, Noah, Laura Kirchner, Bev, Branden Smith, Geoff, Brittney Bertier, Chloe, Lola Kelly, Ronni, J. Damian Anastasio, Dov, Tony Harutyunyan, Young Joe, Julia Rae, Young Bev


General Cemetery
Overview: Set in Iquitos, the story follows Andrea (Airam Galliani), a 15-years-ago teenage girl, who suffers the death of his father. With the help of his friends from school, encourage her to contact him u