In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import hstack


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
!wget -O ml-latest-small.zip http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

!unzip ml-latest-small.zip

--2024-08-09 18:50:53--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-08-09 18:50:54 (7.68 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
# Load the movie catalog from the archive extracted by the download cell
# (columns, as displayed below: movieId, title, genres with '|' between genres).
movies = pd.read_csv('ml-latest-small/movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# Load the per-user ratings (userId, movieId, rating, timestamp).
# The path is relative to the working directory, matching how movies.csv is
# read, so the notebook is not tied to Colab's /content directory.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Load the free-text tags users attached to movies (userId, movieId, tag, timestamp).
# The path is relative to the working directory, matching how movies.csv is
# read, so the notebook is not tied to Colab's /content directory.
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:


# Apply One-Hot Encoding to the 'genres' column
# (str.get_dummies splits on '|' by default: one 0/1 column per genre)
genres_onehot = movies['genres'].str.get_dummies()

# Collapse all tags applied to a movie into one space-separated string per movieId
tags_combined = tags.groupby('movieId')['tag'].agg(' '.join).reset_index()

# Remove columns left by an earlier run of this cell. Without this, a second
# run merges 'tag' against an existing 'tag' column, yielding 'tag_x'/'tag_y',
# and the movies['tag'] lookup below raises KeyError.
movies = movies.drop(columns=['tag', 'text'], errors='ignore')
movies = pd.merge(movies, tags_combined, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')  # movies with no tags get an empty string

# Free-text field used for TF-IDF: title + genres (as words) + tags, lowercased
genres_as_words = movies['genres'].str.replace('|', ' ', regex=False)
movies['text'] = (movies['title'] + ' ' + genres_as_words + ' ' + movies['tag']).str.lower()
movies.head()

Unnamed: 0,movieId,title,genres,tag,text
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun,toy story (1995) adventure animation children ...
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy magic board game Robin Williams game,jumanji (1995) adventure children fantasy fant...
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy old,grumpier old men (1995) comedy romance moldy old
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,,waiting to exhale (1995) comedy drama romance
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,father of the bride part ii (1995) comedy preg...


In [7]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Replace the raw text of every movie with its lemmatized form
movies['text'] = movies['text'].apply(lemmatize_text)
movies['text']

Unnamed: 0,text
0,toy story ( 1995 ) adventure animation child c...
1,jumanji ( 1995 ) adventure child fantasy fanta...
2,grumpier old men ( 1995 ) comedy romance moldy...
3,waiting to exhale ( 1995 ) comedy drama romance
4,father of the bride part ii ( 1995 ) comedy pr...
...,...
9737,black butler : book of the atlantic ( 2017 ) a...
9738,no game no life : zero ( 2017 ) animation come...
9739,flint ( 2017 ) drama
9740,bungo stray dog : dead apple ( 2018 ) action a...


In [8]:
# Fit a TF-IDF vocabulary on the lemmatized 'text' column (English stop words
# removed) and encode every movie as one row of the resulting matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['text'])

In [9]:

# Combine the TF-IDF matrix with the genres one-hot encoded matrix
# (stacked column-wise: one row per movie, TF-IDF columns followed by the genre columns).
# NOTE(review): genres are also written into movies['text'], so they feed both
# the TF-IDF part and the one-hot part — confirm this double weighting is intended.
combined_features = hstack([tfidf_matrix, genres_onehot])
combined_features

<9742x9351 sparse matrix of type '<class 'numpy.float64'>'
	with 81394 stored elements in COOrdinate format>

In [10]:


# Compute cosine similarity between all movies
# Row i holds movie i's similarity to every movie; get_recommendations reads it as cosine_sim[idx].
# NOTE(review): with 9742 rows this is a 9742 x 9742 result, presumably a dense
# array — check memory use before running on a larger catalog.
cosine_sim = cosine_similarity(combined_features, combined_features)


In [11]:
# Function to get top N similar movies
def get_recommendations(movie_title, cosine_sim=cosine_sim, top_n = 15):
    """Return the titles of the ``top_n`` movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value from ``movies['title']`` (case-sensitive). If several rows
        share the title, the first one is used.
    cosine_sim : array-like
        Pairwise similarity matrix whose row/column order matches the row
        order of ``movies``.
    top_n : int
        Maximum number of recommendations; values below 1 give an empty result.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar, never including the query
        movie itself.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``movies['title']``.
    """
    # Work with row positions (not index labels) because cosine_sim and .iloc
    # are both positional.
    positions = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    idx = positions[0]

    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    # Stable sort of the negated scores: highest similarity first, ties kept in
    # ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query movie by position instead of assuming it sorted first;
    # a tied row with a lower index would otherwise push it into the
    # recommendations.
    ranked = ranked[ranked != idx][:max(top_n, 0)]
    return movies['title'].iloc[ranked]

# Example: Recommend movies similar to 'Toy Story (1995)'
print(get_recommendations('Toy Story (1995)'))

2355                                   Toy Story 2 (1999)
6194                                     Wild, The (2006)
3568                                Monsters, Inc. (2001)
6486                               Shrek the Third (2007)
8219                                         Turbo (2013)
1706                                          Antz (1998)
9430                                         Moana (2016)
8927                             The Good Dinosaur (2015)
2809       Adventures of Rocky and Bullwinkle, The (2000)
6948                       Tale of Despereaux, The (2008)
3000                     Emperor's New Groove, The (2000)
7760    Asterix and the Vikings (Astérix et les Viking...
7355                                   Toy Story 3 (2010)
1757                                 Bug's Life, A (1998)
8900                                    Inside Out (2015)
Name: title, dtype: object


In [12]:
# Lowercased copy of every title, in row order, for case-insensitive lookups
case_insensitive_movies_list = list(map(str.lower, movies['title']))

In [13]:
def get_possible_movies(movie):
    """Return the lowercase titles that contain the longest matching prefix of ``movie``.

    The query is grown one character at a time; the candidate list is narrowed
    to titles containing the current prefix and kept sorted. As soon as a
    prefix matches nothing, the candidates from the previous prefix are
    returned. If not even the first character matches (or ``movie`` is empty),
    a copy of the full title list is returned in its original order.
    """
    candidates = list(case_insensitive_movies_list)
    for end in range(1, len(movie) + 1):
        prefix = movie[:end]
        narrowed = sorted(title for title in candidates if prefix in title)
        if not narrowed:
            # This prefix is too specific; fall back to the last non-empty set
            break
        candidates = narrowed
    return candidates

In [14]:
def _prompt_int(prompt, low=None, high=None):
    """Ask with ``prompt`` until the reply is an integer within ``[low, high]``.

    A bound left as ``None`` is not checked.
    """
    while True:
        reply = input(prompt)
        try:
            value = int(reply)
        except ValueError:
            print(f"'{reply}' is not a whole number, please try again.")
            continue
        too_low = low is not None and value < low
        too_high = high is not None and value > high
        if too_low or too_high:
            print("That number is out of range, please try again.")
            continue
        return value


def recommend():
    """Interactively ask for a movie and print recommendations for it.

    Reads the movie name from stdin. If the name (ignoring case and
    surrounding whitespace) is not an exact title, the candidates returned by
    ``get_possible_movies`` are listed and the user picks one by index. The
    user is then asked how many recommendations to show, and the result of
    ``get_recommendations`` is printed. Invalid numeric answers are asked for
    again instead of raising. Returns ``None``.
    """
    movie_name = input("Enter the Movie Name: ")
    movie_name_lower = movie_name.strip().lower()

    if movie_name_lower in case_insensitive_movies_list:
        chosen_title_lower = movie_name_lower
    else:
        list_movies = get_possible_movies(movie_name_lower)
        if not list_movies:
            print("No movies are available to choose from.")
            return
        print("Did you mean one of these movies?")
        for idx, title in enumerate(list_movies):
            print(f"{idx}: {title}")
        # Bound the index so a negative or too-large number cannot silently
        # select the wrong movie or raise IndexError.
        index_movie = _prompt_int(
            "Enter the index number for the correct movie: ",
            low=0,
            high=len(list_movies) - 1,
        )
        chosen_title_lower = list_movies[index_movie]

    num = _prompt_int("How many movies do you want? ", low=0)
    # Find the original title case from the movies dataframe. The list position
    # equals the row position, so use positional .iloc.
    row_position = case_insensitive_movies_list.index(chosen_title_lower)
    original_title = movies['title'].iloc[row_position]
    recommended_movies = get_recommendations(original_title, top_n=num)

    print("Here are the recommended movies:")
    print(recommended_movies)


In [15]:
# Interactive demo: recommend() reads the movie name, a disambiguation index
# (only when the name is not an exact title) and the result count from stdin.
recommend()

Enter the Movie Name: matrix
Did you mean one of these movies?
0: animatrix, the (2003)
1: matrix reloaded, the (2003)
2: matrix revolutions, the (2003)
3: matrix, the (1999)
Enter the index number for the correct movie: 2
How many movies do you want? 10
Here are the recommended movies:
4351     Matrix Reloaded, The (2003)
7324               Iron Man 2 (2010)
6470             Spider-Man 3 (2007)
8120    G.I. Joe: Retaliation (2013)
8178              After Earth (2013)
7545         I Am Number Four (2011)
8137                 Oblivion (2013)
6521             Transformers (2007)
8151               Iron Man 3 (2013)
7866               Battleship (2012)
Name: title, dtype: object


In [None]:
t