In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the movie catalogue and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/content/movies.csv")
print(df.head(n=5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [None]:
# Count missing values per column of the movie catalogue.
df.isnull().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


In [None]:
import re

def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit or a space.

    Characters are deleted, not replaced, so removing punctuation between two
    spaces leaves both spaces in place.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [None]:
# Turn the pipe-delimited genre string into a space-delimited one.
# `regex=False` is explicit because '|' is a regex metacharacter and the
# default value of `regex` differs between pandas versions.
df['genres_list'] = df['genres'].str.replace('|', ' ', regex=False)
# Strip punctuation from the titles so they can be tokenised.
df['clean_title'] = df['title'].apply(clean_title)

# Keep only the columns the rest of the notebook works with.
movies_data = df[['movieId', 'clean_title', 'genres_list']]
print(movies_data.head())

   movieId                       clean_title  \
0        1                    Toy Story 1995   
1        2                      Jumanji 1995   
2        3             Grumpier Old Men 1995   
3        4            Waiting to Exhale 1995   
4        5  Father of the Bride Part II 1995   

                                   genres_list  
0  Adventure Animation Children Comedy Fantasy  
1                   Adventure Children Fantasy  
2                               Comedy Romance  
3                         Comedy Drama Romance  
4                                       Comedy  


In [None]:
# Load the user ratings and preview the first five rows.
df1 = pd.read_csv(filepath_or_buffer="/content/ratings.csv")
print(df1.head(n=5))

   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [None]:
# Count missing values per column of the ratings table.
df1.isnull().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


In [None]:
# Drop the rating timestamp; only userId, movieId and rating are kept.
ratings_data = df1.drop(columns=['timestamp'])
print(ratings_data.head(n=5))

   userId  movieId  rating
0       1      296     5.0
1       1      306     3.5
2       1      307     5.0
3       1      665     5.0
4       1      899     3.5


In [None]:
# Attach each rated movie's cleaned title and genres to its rating row.
# The inner join keeps only ratings whose movieId is present in movies_data.
combined_data = pd.merge(ratings_data, movies_data, on='movieId', how='inner')
print(combined_data.head(n=5))

   userId  movieId  rating                                 clean_title  \
0       1      296     5.0                           Pulp Fiction 1994   
1       1      306     3.5  Three Colors Red Trois couleurs Rouge 1994   
2       1      307     5.0  Three Colors Blue Trois couleurs Bleu 1993   
3       1      665     5.0                            Underground 1995   
4       1      899     3.5                     Singin in the Rain 1952   

                   genres_list  
0  Comedy Crime Drama Thriller  
1                        Drama  
2                        Drama  
3             Comedy Drama War  
4       Comedy Musical Romance  


In [None]:
# TF-IDF model over word unigrams and bigrams of the cleaned titles.
# `tfidf_title` has one row per row of `movies_data`, in the same order.
vectorizer_title = TfidfVectorizer(ngram_range=(1,2))

tfidf_title = vectorizer_title.fit_transform(movies_data['clean_title'])

def search_by_title(title, top_n=5):
    """Return the rows of ``movies_data`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with
    ``vectorizer_title`` and compared against every title by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title to search for.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies_data``, ordered from most to least
        similar (position 0 is the best match).
    """
    query_vec = vectorizer_title.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf_title).flatten()

    # Never ask for more rows than exist; argpartition would raise otherwise.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies_data.iloc[[]]

    # argpartition selects the top_n highest scores but leaves them in an
    # unspecified order, so they are sorted explicitly (descending) afterwards.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies_data.iloc[ranked]

# Example query: look up the titles closest to "Toy Story".
movie_results = search_by_title(title="Toy Story")
print(movie_results)

       movieId               clean_title  \
3021      3114          Toy Story 2 1999   
14813    78499          Toy Story 3 2010   
0            1            Toy Story 1995   
59767   201588          Toy Story 4 2019   
20497   106022  Toy Story of Terror 2013   

                                            genres_list  
3021        Adventure Animation Children Comedy Fantasy  
14813  Adventure Animation Children Comedy Fantasy IMAX  
0           Adventure Animation Children Comedy Fantasy  
59767               Adventure Animation Children Comedy  
20497                         Animation Children Comedy  


In [None]:
# TF-IDF model over word unigrams and bigrams of the space-separated genres.
# `tfidf_genres` has one row per row of `movies_data`, in the same order.
vectorizer_genres = TfidfVectorizer(ngram_range=(1,2))

tfidf_genres = vectorizer_genres.fit_transform(movies_data['genres_list'])

def search_similar_genres(genres, top_n=10):
    """Return the rows of ``movies_data`` whose genres best match ``genres``.

    Parameters
    ----------
    genres : str
        Space-separated genre names, e.g. ``'Adventure Comedy'``.
    top_n : int, default 10
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies_data``, ordered from most to least
        similar by cosine similarity of their TF-IDF genre vectors.
    """
    query_vec = vectorizer_genres.transform([genres])
    similarity = cosine_similarity(query_vec, tfidf_genres).flatten()

    # Never ask for more rows than exist; argpartition would raise otherwise.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies_data.iloc[[]]

    # argpartition selects the top_n highest scores but leaves them in an
    # unspecified order, so they are sorted explicitly (descending) afterwards.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies_data.iloc[ranked]

# Example query: movies whose genre profile is closest to Adventure + Comedy.
gen = 'Adventure Comedy'
print(search_similar_genres(genres=gen))

       movieId                                        clean_title  \
25659   124232                          The Fuller Brush Man 1948   
2379      2470                              Crocodile Dundee 1986   
21378   110223                         Prisoner of Zenda The 1979   
37582   152970                     Hunt for the Wilderpeople 2016   
5808      5920     Ace of Aces aka Super Ace The As des as L 1982   
57227   195905                                     Fools Day 2014   
3651      3752                              Me Myself  Irene 2000   
53513   187573                      Blondie Takes a Vacation 1939   
18675    97665  Asterix  Obelix God Save Britannia Astrix et O...   
37581   152968                                        Lusers 2015   

            genres_list  
25659  Adventure Comedy  
2379   Adventure Comedy  
21378  Adventure Comedy  
37582  Adventure Comedy  
5808   Adventure Comedy  
57227  Adventure Comedy  
3651   Adventure Comedy  
53513  Adventure Comedy  
18675 

In [None]:
def scores_calculator(movie_id, min_rating=4, similar_boost=1.5, all_factor=0.9):
    """Score candidate recommendations for the movie ``movie_id``.

    For every movie liked by users who also liked ``movie_id`` the function
    computes:

    * ``similar`` - the number of those users who liked the candidate, divided
      by the number of users who liked ``movie_id``;
    * ``all`` - the number of users overall who liked the candidate, divided by
      the number of distinct users who liked any candidate;
    * ``score`` - ``similar / all``.

    "Liked" means a rating of at least ``min_rating``. Candidates that are also
    returned by ``search_similar_genres`` for the selected movie's genres have
    ``similar`` multiplied by ``similar_boost`` and ``all`` multiplied by
    ``all_factor`` before the ratio is taken.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to recommend from.
    min_rating : float, default 4
        Minimum rating that counts as liking a movie.
    similar_boost : float, default 1.5
        Multiplier applied to ``similar`` for genre-matching candidates.
    all_factor : float, default 0.9
        Multiplier applied to ``all`` for genre-matching candidates.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``movieId`` with columns ``similar``, ``all`` and ``score``,
        sorted by ``score`` in descending order.
    """
    # Both masks are needed more than once; compute them a single time.
    liked = combined_data['rating'] >= min_rating
    is_selected = combined_data['movieId'] == movie_id

    # Users who liked the selected movie, and what else they liked.
    similar_users = combined_data.loc[is_selected & liked, 'userId'].unique()
    similar_user_recs = combined_data.loc[
        combined_data['userId'].isin(similar_users) & liked, 'movieId'
    ]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # How popular the same candidates are among everyone who liked any of them.
    all_users = combined_data[combined_data['movieId'].isin(similar_user_recs.index) & liked]
    all_users_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()

    # Genres of the selected movie as a plain space-separated string, passed
    # directly to the genre search instead of a bracketed array repr.
    selected_genres = ' '.join(combined_data.loc[is_selected, 'genres_list'].unique())
    genre_match_ids = search_similar_genres(selected_genres)['movieId']

    # Boost candidates that match on genres within the similar-user scores.
    boosted = genre_match_ids[genre_match_ids.isin(similar_user_recs.index)].tolist()
    similar_user_recs.loc[boosted] = similar_user_recs.loc[boosted] * similar_boost

    # Scale the all-user share of genre-matching candidates.
    scaled = genre_match_ids[genre_match_ids.isin(all_users_recs.index)].tolist()
    all_users_recs.loc[scaled] = all_users_recs.loc[scaled] * all_factor

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ['similar', 'all']
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

    return rec_percentages.sort_values('score', ascending=False)

# Example: score recommendations for movieId 3114.
scores_calculator(movie_id=3114)

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
115875,0.001433,0.000085,16.938207
3114,1.000000,0.098397,10.162924
6388,0.001433,0.000141,10.162924
2821,0.001433,0.000141,10.162924
31086,0.001433,0.000141,10.162924
...,...,...,...
33138,0.000478,0.001269,0.376405
437,0.000478,0.001316,0.362962
50011,0.000478,0.001316,0.362962
1043,0.000478,0.001457,0.327836


In [None]:
def recommendation_results(user_input, title=0):
    """Return the ten best-scoring recommendations for a searched title.

    Parameters
    ----------
    user_input : str
        Free-text movie title passed to ``search_by_title``.
    title : int, default 0
        Position, within the ``search_by_title`` results, of the movie to base
        the recommendations on.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``score`` and ``genres`` for the first ten rows
        returned by ``scores_calculator`` that have a match in ``movies_data``.
    """
    title_candidates = search_by_title(user_input)
    movie_id = title_candidates['movieId'].iloc[title]
    scores = scores_calculator(movie_id)

    # `scores` is indexed by movieId; join on it to pick up titles and genres.
    results = scores.head(10).merge(movies_data, left_index=True, right_on='movieId')
    results = results[['clean_title', 'score', 'genres_list']]

    # Return the renamed frame directly; rename(inplace=True) returns None.
    return results.rename(columns={'clean_title': 'title', 'genres_list': 'genres'})

user_input = "Toy Story"

# Run the title search once and reuse it, instead of once per printed line.
title_candidates = search_by_title(user_input)

print("Are you looking for (please choose a number): ")
# Iterate over the rows actually returned so a short result list cannot
# raise an IndexError.
for i, candidate_title in enumerate(title_candidates['clean_title']):
    print(i, ": ", candidate_title)

# Position of the chosen candidate in the list printed above.
title = 0
if int(title) in range(len(title_candidates)):
    print("We have following recommendations: ")
    print(recommendation_results(user_input, int(title)))
else:
    print("Sorry! please try again!")

Are you looking for (please choose a number): 
0 :  Toy Story 2 1999
1 :  Toy Story 3 2010
2 :  Toy Story 1995
3 :  Toy Story 4 2019
4 :  Toy Story of Terror 2013
We have following recommendations: 
                                                   title      score  \
22633             Toy Story Toons Hawaiian Vacation 2011  16.938207   
3021                                    Toy Story 2 1999  10.162924   
6269                                   Regeneration 1997  10.162924   
2729                                Male and Female 1919  10.162924   
9490   Battles Without Honor  Humanity Jingi naki tat...  10.162924   
6078                              Lawless Heart The 2003  10.162924   
34942                   Naomi and Elys No Kiss List 2015  10.162924   
8651             Death Rides a Horse Da uomo a uomo 1967  10.162924   
15245                         Phenix City Story The 1955  10.162924   
42393                         For the Love of Spock 2016  10.162924   

                   