In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
df1 = pd.read_csv(r"C:\Users\Student\Desktop\Sutharsan_N_Internship\Recommendation_System\movie\movie_recommendation_dataset\movies.csv")
print(df1.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [23]:
df1.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [24]:
import re
#cleaning dataset
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [25]:
df1['genres_list'] = df1['genres'].str.replace('|', ' ')
df1['clean_title'] = df1['title'].apply(clean_title)

movies_data = df1[['movieId', 'clean_title', 'genres_list']]
print(movies_data.head())

   movieId                       clean_title  \
0        1                    Toy Story 1995   
1        2                      Jumanji 1995   
2        3             Grumpier Old Men 1995   
3        4            Waiting to Exhale 1995   
4        5  Father of the Bride Part II 1995   

                                   genres_list  
0  Adventure Animation Children Comedy Fantasy  
1                   Adventure Children Fantasy  
2                               Comedy Romance  
3                         Comedy Drama Romance  
4                                       Comedy  


In [26]:
df2 = pd.read_csv(r"C:\Users\Student\Desktop\Sutharsan_N_Internship\Recommendation_System\movie\movie_recommendation_dataset\ratings.csv")
print(df2.head())

   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [27]:
df2.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [28]:
ratings_data = df2.drop(['timestamp'], axis=1)
print(ratings_data.head())

   userId  movieId  rating
0       1      296     5.0
1       1      306     3.5
2       1      307     5.0
3       1      665     5.0
4       1      899     3.5


In [29]:
combined_data = ratings_data.merge(movies_data, on='movieId')
print(combined_data.head())

   userId  movieId  rating        clean_title                  genres_list
0       1      296     5.0  Pulp Fiction 1994  Comedy Crime Drama Thriller
1       3      296     5.0  Pulp Fiction 1994  Comedy Crime Drama Thriller
2       4      296     4.0  Pulp Fiction 1994  Comedy Crime Drama Thriller
3       5      296     4.0  Pulp Fiction 1994  Comedy Crime Drama Thriller
4       7      296     4.0  Pulp Fiction 1994  Comedy Crime Drama Thriller


# 2. Definition of the Search function

In [30]:
vectorizer_title = TfidfVectorizer(ngram_range=(1,2))

tfidf_title = vectorizer_title.fit_transform(movies_data['clean_title'])

def search_by_title(title):
    title = clean_title(title)
    query_vec = vectorizer_title.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_title).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies_data.iloc[indices][::-1]
    return results

movie_results = search_by_title("Toy Story")
print(movie_results)

       movieId               clean_title  \
3021      3114          Toy Story 2 1999   
14813    78499          Toy Story 3 2010   
0            1            Toy Story 1995   
59767   201588          Toy Story 4 2019   
20497   106022  Toy Story of Terror 2013   

                                            genres_list  
3021        Adventure Animation Children Comedy Fantasy  
14813  Adventure Animation Children Comedy Fantasy IMAX  
0           Adventure Animation Children Comedy Fantasy  
59767               Adventure Animation Children Comedy  
20497                         Animation Children Comedy  


In [31]:
vectorizer_genres = TfidfVectorizer(ngram_range=(1,2))

tfidf_genres = vectorizer_genres.fit_transform(movies_data['genres_list'])

def search_similar_genres(genres):
    query_vec = vectorizer_genres.transform([genres])
    similarity = cosine_similarity(query_vec, tfidf_genres).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = movies_data.iloc[indices][::-1]
    return results

gen = 'Adventure Comedy'
print(search_similar_genres(gen))

       movieId                                        clean_title  \
25659   124232                          The Fuller Brush Man 1948   
2379      2470                              Crocodile Dundee 1986   
21378   110223                         Prisoner of Zenda The 1979   
37582   152970                     Hunt for the Wilderpeople 2016   
5808      5920     Ace of Aces aka Super Ace The As des as L 1982   
57227   195905                                     Fools Day 2014   
3651      3752                              Me Myself  Irene 2000   
53513   187573                      Blondie Takes a Vacation 1939   
18675    97665  Asterix  Obelix God Save Britannia Astrix et O...   
37581   152968                                        Lusers 2015   

            genres_list  
25659  Adventure Comedy  
2379   Adventure Comedy  
21378  Adventure Comedy  
37582  Adventure Comedy  
5808   Adventure Comedy  
57227  Adventure Comedy  
3651   Adventure Comedy  
53513  Adventure Comedy  
18675 

# 3. Make the recommendation

The following function calculates scores for recommendation.

In [32]:
def scores_calculator(movie_id):
    #find the recommendations from users who like the same movie
    similar_users = combined_data[(combined_data['movieId']== movie_id) & (combined_data['rating']>=4)]['userId'].unique()
    similar_user_recs = combined_data[(combined_data['userId'].isin(similar_users)) & (combined_data['rating']>=4)]['movieId']
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    #print(similar_user_recs)
    
    #find the recommendations from all users who have watch the movies above
    all_users = combined_data[(combined_data['movieId'].isin(similar_user_recs.index)) & (combined_data['rating']>=4)]
    all_users_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()
    #print(all_users_recs)
    
    genres_of_selected_movie = combined_data[combined_data['movieId']==movie_id]['genres_list'].unique()
    genres_of_selected_movie = np.array2string(genres_of_selected_movie)
    movies_with_similar_genres = search_similar_genres(genres_of_selected_movie)
    
    indices = []
    for index in movies_with_similar_genres[(movies_with_similar_genres['movieId'].isin(similar_user_recs.index))]['movieId']:
        indices.append(index)
    
    #times a factor 1.5 to movies with similar genres and similar users
    similar_user_recs.loc[indices] = similar_user_recs.loc[indices]*1.5 

    #times a factor 0.9 to movies with similar genres and all users
    indices = []
    for index in movies_with_similar_genres[(movies_with_similar_genres['movieId'].isin(all_users_recs.index))]['movieId']:
        indices.append(index)
    all_users_recs.loc[indices] = all_users_recs.loc[indices]*0.9
    
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ['similar', 'all']
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
    
    rec_percentages = rec_percentages.sort_values('score', ascending=False)
    return rec_percentages

scores_calculator(3114)

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
115875,0.002656,0.000172,15.455353
115879,0.001557,0.000111,14.043053
114240,0.000275,0.000022,12.390929
67009,0.000305,0.000031,9.912743
7269,0.000305,0.000031,9.912743
...,...,...,...
853,0.000061,0.000271,0.225290
601,0.000061,0.000283,0.215494
797,0.000061,0.000296,0.206515
1075,0.000061,0.000339,0.180232


In [34]:
def recommendation_results(user_input, title_index):
    user_input = clean_title(user_input)
    title_candidates = search_by_title(user_input)
    movie_id = title_candidates.iloc[title_index]['movieId']
    scores = scores_calculator(movie_id)
    results = scores.head(10).merge(movies_data, left_index=True, right_on='movieId')[['clean_title', 'score', 'genres_list']]
    results = results.rename(columns={'clean_title': 'title', 'genres_list': 'genres'})
    return results

# Get user input
user_input = input("Enter a movie title: ")

# Display available movie options
title_candidates = search_by_title(user_input)
for i in range(min(5, len(title_candidates))):  # Show up to 5 candidates
    print(f"{i}: {title_candidates['clean_title'].iloc[i]}")

# Ask user to choose one of the available movies
title_index = int(input("\nChoose the movie by entering the corresponding index: "))

# Validate the choice
if title_index in range(len(title_candidates)):
    print("\nWe have the following recommendations:")
    print(recommendation_results(user_input, title_index))
else:
    print("Sorry, invalid choice. Please try again.")


0: Rush Hour 3 2007
1: Rush Hour 2 2001
2: Rush Hour 1998
3: Rush 2013
4: Rush  Rush In Rio 2003

We have the following recommendations:
                                  title       score  \
52494  Katt Williams Great America 2018  289.883929   
49703              The Adventurers 2017  289.883929   
29103            Nglen til Paradis 1970  289.883929   
56990                        Reich 2001  289.883929   
45126                  Zebra Force 1976  289.883929   
45598              The Alpha Caper 1973  289.883929   
36345                 Sister Smile 2009  289.883929   
36235             Le Grand Partage 2015  289.883929   
29708        The Battle of Neretva 1969  289.883929   
30094                 Without Fail 2014  289.883929   

                             genres  
52494                        Comedy  
49703  Action Adventure Crime Drama  
29103                        Comedy  
56990                        Action  
45126                        Action  
45598            (no genres l