### Imports

In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import re
import statistics

### Loading and preparing the dataset


In [24]:
movie_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/BxZuF3FrO7Bdw6McwsBaBw/movies.csv')
rating_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/R-bYYyyf7s3IUE5rsssmMw/ratings.csv')
tag_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/UZKHhXSl7Ft7t9mfUFZJPQ/tags.csv')

In [25]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [26]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [27]:
tag_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


First, we'll merge the dfs into one using the movieId field

In [28]:
user_movie_df = movie_df.merge(rating_df, on = 'movieId', how = 'inner')
df = user_movie_df.merge(tag_df, on = ['movieId', 'userId'], how = 'inner')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp_x,tag,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,1122227329,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,978575760,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,1525286001,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,magic board game,1528843932


Next, we drop any unnecessary columns

In [29]:
df.drop(columns = ['timestamp_x', 'timestamp_y'], inplace = True)
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game


### EDA

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3476 entries, 0 to 3475
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  3476 non-null   int64  
 1   title    3476 non-null   object 
 2   genres   3476 non-null   object 
 3   userId   3476 non-null   int64  
 4   rating   3476 non-null   float64
 5   tag      3476 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 163.1+ KB


## Reccomendations

We will be implementing three types of reccomendation systems

In [31]:
df_1 = df.copy() #will be used for the first rec system

### Popularity-based recommendation

In [32]:
#calculate the number of votes from users for every movie
num_votes = df_1.groupby('movieId').size().reset_index(name='numVotes')
#merge num_votes back to the df based on the movieId
df_1 = pd.merge(df_1, num_votes, on='movieId')
df_1

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar,3
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun,3
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game,4
...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars,2
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime,4
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy,4
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama,4


In [33]:
#calculate average user rating for every movie
avg_Ratings = df_1.groupby('movieId')['rating'].mean().reset_index(name = 'avgRating')
df_1 = pd.merge(df_1, avg_Ratings, on='movieId')

In [34]:
df_1.drop_duplicates(subset = ['movieId', 'title', 'avgRating', 'numVotes'], inplace = True)
df_1

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.500000
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.500000
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.000000
...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,4.0,Comedy,3,4.000000
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,3.5,adventure,3,3.500000
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,4.0,Josh Brolin,3,4.000000
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,Emilia Clarke,2,4.000000


We need to calculate a weighted score of each movie based on  its average rating (avgRating), number of votes (numV
otes), a global average rating (R), and an average number of votes (V).

We focus on giving weight on movies with more votes while still considering the overall average ratings.

In [35]:
#calculate average number of votes
V = statistics.mean(df_1['numVotes'])

#calculate average rating
R = statistics.mean(df_1['avgRating'])

def calc_weighted_score(numVotes, avgRating, V, R):
  return (numVotes*avgRating + V*R)/(numVotes + V)

df_1['score'] = calc_weighted_score(df_1['numVotes'], df_1['avgRating'], V, R)

df_1

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating,score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333,3.788714
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000,3.743421
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.500000,3.168895
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.500000,2.711680
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.000000,3.515304
...,...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,4.0,Comedy,3,4.000000,3.881749
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,3.5,adventure,3,3.500000,3.602644
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,4.0,Josh Brolin,3,4.000000,3.881749
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,Emilia Clarke,2,4.000000,3.854716


Getting the top 5 movies recommended by users:

In [36]:
top5 = df_1.sort_values(by = 'score', ascending = False).head()
print('Top 5 movies:')
top5[['title', 'genres', 'tag', 'score']]

Top 5 movies:


Unnamed: 0,title,genres,tag,score
199,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,good dialogue,4.967226
1337,Fight Club (1999),Action|Crime|Drama|Thriller,dark comedy,4.893394
604,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,Hal,4.884498
998,"Big Lebowski, The (1998)",Comedy|Crime,Coen Brothers,4.868802
164,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,assassin,4.852577


### Content-based recommendation

In [53]:
df_2 = df_1[['movieId', 'title', 'userId', 'avgRating', 'numVotes', 'score', 'genres', 'tag']].copy().reset_index(drop = True)
df_2['features'] = df_2['genres'].apply(lambda x: re.sub('[|]', ' ', x)) + ' ' + df_2['tag']
df_2

Unnamed: 0,movieId,title,userId,avgRating,numVotes,score,genres,tag,features
0,1,Toy Story (1995),336,3.833333,3,3.788714,Adventure|Animation|Children|Comedy|Fantasy,pixar,Adventure Animation Children Comedy Fantasy pixar
1,2,Jumanji (1995),62,3.750000,4,3.743421,Adventure|Children|Fantasy,fantasy,Adventure Children Fantasy fantasy
2,3,Grumpier Old Men (1995),289,2.500000,2,3.168895,Comedy|Romance,moldy,Comedy Romance moldy
3,5,Father of the Bride Part II (1995),474,1.500000,2,2.711680,Comedy,pregnancy,Comedy pregnancy
4,7,Sabrina (1995),474,3.000000,1,3.515304,Comedy|Romance,remake,Comedy Romance remake
...,...,...,...,...,...,...,...,...,...
1459,183611,Game Night (2018),62,4.000000,3,3.881749,Action|Comedy|Crime|Horror,Comedy,Action Comedy Crime Horror Comedy
1460,184471,Tomb Raider (2018),62,3.500000,3,3.602644,Action|Adventure|Fantasy,adventure,Action Adventure Fantasy adventure
1461,187593,Deadpool 2 (2018),62,4.000000,3,3.881749,Action|Comedy|Sci-Fi,Josh Brolin,Action Comedy Sci-Fi Josh Brolin
1462,187595,Solo: A Star Wars Story (2018),62,4.000000,2,3.854716,Action|Adventure|Children|Sci-Fi,Emilia Clarke,Action Adventure Children Sci-Fi Emilia Clarke


In [54]:
vectorizer = TfidfVectorizer(stop_words='english')

#fit and transform the 'features' column from text into numerical representations
X = vectorizer.fit_transform(df_2['features'])

In [55]:
similarity = cosine_similarity(X)

#recommendation function
def recommendation(title, df, similarity, top_n=3):

    try:
        #find index of querried movie
        id = df[df['title'] == title].index[0]
    except IndexError:
        print(f"Movie '{title}' not found in the dataset.")
        return

    #get similarity
    sim_scores = list(enumerate(similarity[id]))

    #sort the movies based on similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    print(f"Movies similar to '{title}' (First movie is itself):")
    for i, (index, score) in enumerate(sim_scores[:top_n+1]):
        movie = df.iloc[index]
        print(f"{i}. {movie['title']} (Similarity Score: {score:.3f})")
        print(f"   Genres: {movie['genres']}")
        print(f"   Tag: {movie['tag']}\n")

Example of a movie recommendations query:

In [57]:
recommendation("Jumanji (1995)" , df_2, similarity, 5)

Movies similar to 'Jumanji (1995)' (First movie is itself):
0. Jumanji (1995) (Similarity Score: 1.000)
   Genres: Adventure|Children|Fantasy
   Tag: fantasy

1. Chronicles of Narnia: Prince Caspian, The (2008) (Similarity Score: 1.000)
   Genres: Adventure|Children|Fantasy
   Tag: fantasy

2. Big (1988) (Similarity Score: 0.727)
   Genres: Comedy|Drama|Fantasy|Romance
   Tag: children

3. Tomb Raider (2018) (Similarity Score: 0.670)
   Genres: Action|Adventure|Fantasy
   Tag: adventure

4. Sintel (2010) (Similarity Score: 0.660)
   Genres: Animation|Fantasy
   Tag: adventure

5. Lord of the Rings: The Return of the King, The (2003) (Similarity Score: 0.655)
   Genres: Action|Adventure|Drama|Fantasy
   Tag: Adventure



### Collaborative filtering

In [43]:
#we first create a ratings matrix for the user ratings
user_rating_matrix = rating_df.pivot(index="movieId", columns="userId", values="rating")

# fill non existing values with 0
user_rating_matrix = user_rating_matrix.fillna(0)

user_rating_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


We will be using the Nearest Neighbours Classifier based on the cosine similarity metric at this point

In [44]:
rec = NearestNeighbors(metric = 'cosine')
rec.fit(user_rating_matrix)

In [45]:
#get recommendation on similar movies based on a title
def get_recommendations(title):

    movie = df_2[df_2['title'] == title]

    if movie.empty:
        print(f"Movie '{title}' not found in dataset.")
        return None
    #get movie id of queried movie
    movie_id = int(movie['movieId'])

    #locate the movie in the user rating matrix
    try:
        user_index = user_rating_matrix.index.get_loc(movie_id)
    except KeyError:
        print(f"Movie ID {movie_id} not found in the user rating matrix.")
        return None

    #get user ratings for the movie
    user_ratings = user_rating_matrix.iloc[user_index]

    #reshape ratings
    reshaped_df = user_ratings.values.reshape(1, -1)

    #find similar movies
    distances, indices = rec.kneighbors(reshaped_df, n_neighbors=15)

    #get the movieIds of the nearest neighbors (excluding the first, which is the queried movie itself)
    nearest_idx = user_rating_matrix.iloc[indices[0]].index[1:]

    #get the movie details for the nearest neighbors
    nearest_neighbors = pd.DataFrame({'movieId': nearest_idx})
    result = pd.merge(nearest_neighbors, df_2, on='movieId', how='left')

    #return the top recommendations
    return result[['title', 'avgRating', 'genres']].head()

Example of a movie recommendations query:

In [47]:
get_recommendations('Jurassic Park (1993)')

  movie_id = int(movie['movieId'])


Unnamed: 0,title,avgRating,genres
0,Terminator 2: Judgment Day (1991),2.625,Action|Sci-Fi
1,Forrest Gump (1994),3.666667,Comedy|Drama|Romance|War
2,Braveheart (1995),4.35,Action|Drama|War
3,"Fugitive, The (1993)",5.0,Thriller
4,Speed (1994),4.0,Action|Romance|Thriller
