In [1]:
import pandas as pd
import numpy as np
from main import CLEAN_DATA
from tasks import read_json_from_file
from tabulate import tabulate
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.DataFrame(read_json_from_file(f'{CLEAN_DATA}cleaned_data.json'))
print(df['Rating'].isnull().sum())

5


Action: Calculate the mean of the ratings 

In [3]:
mean_rating = round(df['Rating'].mean(), 2)
print(mean_rating)


3.87


Action: Apply the mean to all the Nan or Null values in the rating column

In [4]:
df['Rating'].fillna(df['Rating'].mean(), inplace=True)
print(df)

      User          Movie    Rating
0    Alice      Star Wars  5.000000
1    Frank  The Godfather  4.000000
2      Bob        Titanic  5.000000
3    Carol     The Matrix  3.000000
4     Dave      Inception  2.000000
5    Emily   Pulp Fiction  4.000000
6   Quincy      Star Wars  5.000000
7      Bob      Star Wars  2.000000
8    Frank   Forrest Gump  3.000000
9    Alice     The Matrix  4.000000
10    Paul        Titanic  4.000000
11   Carol      Inception  5.000000
12    Dave        Titanic  4.000000
13   Emily  The Godfather  5.000000
14   Alice      Inception  3.000000
15    Paul  The Godfather  5.000000
16   Carol   Pulp Fiction  4.000000
17     Ivy      Star Wars  5.000000
18   Emily   Forrest Gump  3.000000
19  Quincy     The Matrix  4.000000
20     Ivy     The Matrix  4.000000
21    Paul      Inception  1.000000
22     Bob  The Godfather  4.000000
23   Frank      Star Wars  5.000000
24     Ivy      Inception  3.000000
25    Dave   Forrest Gump  5.000000
26  Quincy      Inception  3

Action: To refer to all the users with their indices instead of the names, create their corresponding numerical indices

In [5]:
user_ids = df['User'].astype('category').cat.codes
movie_ids = df['Movie'].astype('category').cat.codes
print(df)

      User          Movie    Rating
0    Alice      Star Wars  5.000000
1    Frank  The Godfather  4.000000
2      Bob        Titanic  5.000000
3    Carol     The Matrix  3.000000
4     Dave      Inception  2.000000
5    Emily   Pulp Fiction  4.000000
6   Quincy      Star Wars  5.000000
7      Bob      Star Wars  2.000000
8    Frank   Forrest Gump  3.000000
9    Alice     The Matrix  4.000000
10    Paul        Titanic  4.000000
11   Carol      Inception  5.000000
12    Dave        Titanic  4.000000
13   Emily  The Godfather  5.000000
14   Alice      Inception  3.000000
15    Paul  The Godfather  5.000000
16   Carol   Pulp Fiction  4.000000
17     Ivy      Star Wars  5.000000
18   Emily   Forrest Gump  3.000000
19  Quincy     The Matrix  4.000000
20     Ivy     The Matrix  4.000000
21    Paul      Inception  1.000000
22     Bob  The Godfather  4.000000
23   Frank      Star Wars  5.000000
24     Ivy      Inception  3.000000
25    Dave   Forrest Gump  5.000000
26  Quincy      Inception  3

Action: Create a user movie matrix and calculate cosine similarity between them, you can choose to print for each

In [6]:
user_movie_matrix = pd.pivot_table(df, values='Rating', index=user_ids, columns=movie_ids, fill_value=0)
user_similarity = cosine_similarity(user_movie_matrix)

Action: Create the movie recommender function with user's name and limit recommendations to 5 by Default. 

In [7]:
def recommend_movies_with_names(user, top_n=5):
    user_id = user_ids[df['User'] == user].values[0]
    user_ratings = user_movie_matrix.loc[user_id]
    
    similar_users = user_similarity[user_id].argsort()[::-1][1:]
    weighted_sum = np.dot(user_similarity[user_id, similar_users], user_movie_matrix.iloc[similar_users].values)
    
    predicted_ratings = weighted_sum / np.sum(np.abs(user_similarity[user_id, similar_users]))
    unrated_movies = user_ratings[user_ratings == 0].index
    recommendations = pd.Series(predicted_ratings, index=user_movie_matrix.columns)[unrated_movies].sort_values(ascending=False)
    
    movie_names = df['Movie'].unique()[recommendations.index]
    
    recommended_movies = pd.DataFrame({
        'Movie': movie_names,
        'Predicted Rating': recommendations.values
    }).head(top_n)
    
    return recommended_movies

Action: Usage, call the functions and show the output in an ASCII format using tabulate. See sample Usage below

In [9]:
top_user_recommendations_with_names = recommend_movies_with_names("Alice", top_n=3)
print(tabulate(top_user_recommendations_with_names, headers='keys', tablefmt='fancy_grid'))

╒════╤══════════════╤════════════════════╕
│    │ Movie        │   Predicted Rating │
╞════╪══════════════╪════════════════════╡
│  0 │ The Matrix   │           1.14494  │
├────┼──────────────┼────────────────────┤
│  1 │ Star Wars    │           0.945625 │
├────┼──────────────┼────────────────────┤
│  2 │ Pulp Fiction │           0.660141 │
╘════╧══════════════╧════════════════════╛
