In [1]:
import pickle 
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

In [2]:
# Here we import all the functions needed for the recommender
from scripts.recommenderlib import *

In [3]:
import os
print(os.getcwd())

D:\hh-ds-24-07\projects\ds-capstone-recommendation


In [4]:
ratings = pd.read_csv('data/ml-latest-small/ratings_cleaned.csv')
movies = pd.read_csv('data/ml-latest-small/movies_cleaned.csv')
links = pd.read_csv('data/ml-latest-small/links_cleaned.csv')

In [None]:
# movies_links = pd.megre([])

In [5]:
ratings.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating
0,0,1,0,4.0
1,1,1,2,4.0
2,2,1,5,4.0
3,3,1,43,5.0
4,4,1,46,5.0


In [6]:
ratings.movieId.unique()

array([   0,    2,    5, ..., 9342, 9389, 9390], dtype=int64)

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   9742 non-null   int64  
 1   movieId      9742 non-null   int64  
 2   title        9742 non-null   object 
 3   genres       9742 non-null   object 
 4   released_yr  9742 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 380.7+ KB


### Initialize a sparse user-item rating matrix

In [8]:
ratings.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating
0,0,1,0,4.0
1,1,1,2,4.0
2,2,1,5,4.0
3,3,1,43,5.0
4,4,1,46,5.0


In [9]:
# (data, (row_ind, col_ind))
R = csr_matrix((ratings["rating"], (ratings["userId"], ratings["movieId"])))
R

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 100836 stored elements and shape (611, 9742)>

In [10]:
df_r = pd.DataFrame(R.todense())
df_r

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create a model and set the hyperparameters

In [None]:
import os
# Create the directory if it doesn't exist
os.makedirs('./data', exist_ok=True)

In [None]:

model = NMF(n_components=250, max_iter=1000)

model.fit(R)

# Save model 
with open('./model_nmf.pkl', 'wb') as file:
    pickle.dump(model, file)

In [11]:
# Load model
with open('./model_nmf.pkl', 'rb') as file:
    model = pickle.load(file)

In [12]:
model.reconstruction_err_

409.6605777307307

## Recommender function

### Skip over the following code, still needs refinement

In [13]:
def recommender_nmf(query, model_name, df_ratings, df_movies, df_links, k):

    # Create user vector
    df_new_user = pd.DataFrame(query, columns=df_movies["movieId"], index=["new_user"])
    df_new_user_modified = df_new_user.fillna(0)

    # call in the model_load
    model = load_model(model_name)

    # Create user-feature matrix P for new user
    P_new_user_matrix = model.transform(df_new_user_modified)

    # New dataframe 
    P_new_user = pd.DataFrame(
        P_new_user_matrix,
        columns=model.get_feature_names_out(),
        index=["new_user"],
    )

    # Reconstruct user-movie matrix/dataframe for new user
    Q_matrix = model.components_
    Q = pd.DataFrame(Q_matrix)
    R_hat_new_user_matrix = np.dot(P_new_user, Q)
    R_hat_new_user = pd.DataFrame(R_hat_new_user_matrix, index=["new_user"])

    ranked = R_hat_new_user.T.sort_values("new_user", ascending=False)
    recommended = ranked[~ranked.index.isin(query)].reset_index()
    recommended.columns = ["movieid", "score"]

    # Get movie ids and corresponding titles the same order
    movie_ids = recommended.iloc[:k]["movieid"]
    # titles = [df_movies.loc[id]["title"] for id in movie_ids]
    
    recommendations = df_movies[df_movies.movieId.isin(movie_ids)]

    # getting the poster images and saving them in a directory 
    api_key = '32963fd453f575aa44262db989d926d6'
    image_paths = get_movie_posters(df_links.tmdbId.loc[recommendations.movieId], api_key)
    
    return recommendations, image_paths

In [6]:
# shabnams ratings based on movies with genre action
user_query = {
    2: 4.8, 
    1101: 5,
    1370: 4.5,
    1515: 4.9,
    1580: 5, 
    1722: 4.5,
    1831: 4.7, 
    1858: 4.2,
    62374: 4.5, 
    67923: 4.6, 
}

model_name = 'NMF'
recommendations, image_paths = recommender_nmf(user_query, model_name, ratings, movies, links, k=10)
recommendations

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,released_yr
0,0,0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995.0
224,224,224,Star Wars: Episode IV - A New Hope,"['Action', 'Adventure', 'Sci-Fi']",1977.0
510,510,510,"Silence of the Lambs, The","['Crime', 'Horror', 'Thriller']",1991.0
514,514,514,Pretty Woman,"['Comedy', 'Romance']",1990.0
546,546,546,Mission: Impossible,"['Action', 'Adventure', 'Mystery', 'Thriller']",1996.0
615,615,615,Independence Day (a.k.a. ID4),"['Action', 'Adventure', 'Sci-Fi', 'Thriller']",1996.0
622,622,622,"Nutty Professor, The","['Comedy', 'Fantasy', 'Romance', 'Sci-Fi']",1996.0
815,815,815,Willy Wonka & the Chocolate Factory,"['Children', 'Comedy', 'Fantasy', 'Musical']",1971.0
911,911,911,Star Wars: Episode VI - Return of the Jedi,"['Action', 'Adventure', 'Sci-Fi']",1983.0
1073,1073,1073,Jerry Maguire,"['Drama', 'Romance']",1996.0


In [None]:
movies.set_index('movieId').loc[movie_id]

### Run the codes below

In [None]:
def recommended_movies(query, model, ratings, movies, k=5):
    # Ensure movie_ids from the model match the ratings DataFrame columns
    movie_ids = ratings.columns
    
    # Create a user vector with 0s for all movies the user hasn't rated
    new_user_row = pd.Series(0, index=movie_ids)
    
    # Fill in the ratings from the query
    for movie_id, rating in query.items():
        if movie_id in movie_ids:
            new_user_row[movie_id] = rating

    # Reshape the row to create a single-row matrix for the model
    new_user_matrix = new_user_row.values.reshape(1, -1)

    # Transform the user vector using the NMF model to get the user-feature matrix P
    P_new_user_matrix = model.transform(new_user_matrix)

    # Reconstruct the user-movie matrix for the new user
    Q_matrix = model.components_
    R_hat_new_user_matrix = np.dot(P_new_user_matrix, Q_matrix)

    # Create a DataFrame for predicted scores
    predicted_scores = pd.DataFrame(R_hat_new_user_matrix, columns=movie_ids, index=["new_user"])

    # Rank movies by predicted scores
    ranked = predicted_scores.T.sort_values("new_user", ascending=False)

    # Remove movies that the user has already rated
    ranked = ranked[~ranked.index.isin(query.keys())]

    # Get top-k recommendations
    recommendations = ranked.head(k).reset_index()
    recommendations.columns = ["movie_id", "score"]

    # Merge with movie titles
    recommendations = recommendations.merge(movies, on="movie_id")

    return recommendations[["movie_id", "title", "score"]]

In [None]:
# shabnams ratings based on movies with genre action
user_query = {
    2: 4.8, 
    1101: 5,
    1370: 4.5,
    1515: 4.9,
    1580: 5,
}

recommended_movies(user_query, model, ratings, movies, k=5)

In [None]:
# shabnams ratings based on movies with genre action
user_query = {
    2: 4.8, 
    1101: 5,
    1370: 4.5,
    1515: 4.9,
    1580: 5, 
    1722: 4.5,
    1831: 4.7, 
    1858: 4.2,
    62374: 4.5, 
    67923: 4.6, 
}

movie_ids, titles = recommended_movies(user_query, model, ratings, movies, k=10)

print("Recommended movies:\n")
for i, title in enumerate(titles):
    print(f"{i+1}. {title}")
    

### Build a simple recommender 

In [None]:
def recommend_popular(query, ratings, k=10):
    """
    Function that returns a list of k unseen, most pupular movies.
    """
    # Create a list of movies by popularity
    df_popularity = (
        ratings.groupby("movie_id")
        .agg(mean=("rating", "mean"))
        .sort_values("mean", ascending=False)
        .reset_index()
        .copy()
    )

    # Filter out movie_ids the user has seen (rated)
    df_popularity = df_popularity[df_popularity["movie_id"].isin(query)]

    return df_popularity["movie_id"].head(k).to_list()

In [None]:
query = ratings[ratings["user_id"] == 4][["movie_id", "rating"]]
query = query.set_index("movie_id").to_dict()["rating"]

recommened_ids = recommend_popular(query, ratings, k=10)

recommended_movies = movies[movies["movie_id"].isin(recommened_ids)]
recommended_movies[["movie_id", "title"]]