In [47]:
import pandas as pd
import numpy as np

In [48]:
df=pd.read_csv('movielens.csv')
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre
0,5755,184,3,958280246,Nadja (1994),Drama
1,4585,519,3,964321944,Robocop 3 (1993),Sci-Fi|Thriller
2,1503,3114,4,974762175,Toy Story 2 (1999),Animation|Children's|Comedy
3,2166,648,4,974614593,Mission: Impossible (1996),Action|Adventure|Mystery
4,3201,2178,5,968626301,Frenzy (1972),Thriller


In [49]:
df.shape

(1000209, 6)

In [50]:
# ratings_per_reviewer = df.groupby('user_id').size()

# # Get the reviewer IDs that have given at least 50 ratings
# reviewer_ids = ratings_per_reviewer[ratings_per_reviewer >= 50].index

# # Filter the dataset to only include reviewers with at least 50 ratings
# df = df[df['user_id'].isin(reviewer_ids)]

In [51]:
df.shape

(1000209, 6)

In [52]:
# ratings_per_movie = df.groupby('title').size()

# # Get the movie names that have received at least 200 ratings
# movie_ids = ratings_per_movie[ratings_per_movie >= 200].index

# # Filter the dataset to only include movies with at least 200 ratings
# df = df[df['title'].isin(movie_ids)]

In [53]:
df.shape

(1000209, 6)

In [54]:
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

In [55]:
# Initialize a surprise reader object
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)

# Load the data
#data = Dataset.load_from_df(df[['user_id','movie_id','rating']], reader=reader)
data = Dataset.load_from_df(df[['user_id','title','rating']], reader=reader)

# Build trainset object(perform this only when you are using whole dataset to train)
#trainset = data.build_full_trainset()

trainset, testset = train_test_split(data, test_size=0.2)

In [56]:
# Initialize model
svd = SVD()

# cross-validate
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b5dca07bb0>

In [57]:
predictions = svd.test(testset)

In [58]:
rmse = accuracy.rmse(predictions)

RMSE: 0.8754


In [59]:
mae = accuracy.mae(predictions)

MAE:  0.6881


In [60]:
df.sample(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre
763883,1699,187,3,974712584,Party Girl (1995),Comedy
501383,4808,2289,3,963028615,"Player, The (1992)",Comedy|Drama
720598,3836,913,5,965928859,"Maltese Falcon, The (1941)",Film-Noir|Mystery
701810,4169,1077,5,967164532,Sleeper (1973),Comedy|Sci-Fi
389250,1117,372,4,1001081946,Reality Bites (1994),Comedy|Drama
174534,2700,2338,1,973304998,I Still Know What You Did Last Summer (1998),Horror|Mystery|Thriller
137640,3394,1921,4,967492495,Pi (1998),Sci-Fi|Thriller
450959,1896,954,4,975273128,Mr. Smith Goes to Washington (1939),Drama
844111,1101,78,2,1010864734,"Crossing Guard, The (1995)",Drama
940704,4310,1291,4,976292145,Indiana Jones and the Last Crusade (1989),Action|Adventure


In [81]:
svd.predict(uid=3565,iid='Forrest Gump (1994)',r_ui=3.0)

Prediction(uid=3565, iid='Forrest Gump (1994)', r_ui=3.0, est=3.5717420960365143, details={'was_impossible': False})

In [82]:
def get_recommendations(data, user_id, top_n, algo):
    
    # creating an empty list to store the recommended product ids
    recommendations = []
    
    # creating an user item interactions matrix 
    user_movie_interactions_matrix = data.pivot(index='user_id', columns='title', values='rating')
    
    # extracting those product names which the user_id has not interacted yet
    non_interacted_movies = user_movie_interactions_matrix.loc[user_id][user_movie_interactions_matrix.loc[user_id].isnull()].index.tolist()
    
    # looping through each of the product names which user_id has not interacted yet
    for item_name in non_interacted_movies:
        
        # predicting the ratings for those non interacted product ids by this user
        est = algo.predict(user_id, item_name).est
        
        # appending the predicted ratings
        #movie_name = movies[movies['movie_id']==str(item_id)]['title'].values[0]
        recommendations.append((item_name, est))

    # sorting the predicted ratings in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)
    #print(recommendations)
    return recommendations[:top_n] # returing top n highest predicted rating products for this user

In [86]:
def get_recommendationsX(data, user_id, top_n, algo):
    
    # creating an empty list to store the recommended product ids
    recommendations = []
    
    # creating an user item interactions matrix 
    user_movie_interactions_matrix = data.pivot(index='user_id', columns='title', values='rating')
    
    # extracting those product names which the user_id has not interacted yet
    #non_interacted_movies = user_movie_interactions_matrix.loc[user_id][user_movie_interactions_matrix.loc[user_id].isnull()].index.tolist()
    rated_movies = set(data[data['user_id'] == user_id]['title'])
    all_movies = set(data['title'])
    unrated_movies = list(all_movies - rated_movies)
    
    # looping through each of the product names which user_id has not interacted yet
    for item_name in unrated_movies:
        
        # predicting the ratings for those non interacted product ids by this user
        est = algo.predict(user_id, item_name).est
        
        # appending the predicted ratings
        #movie_name = movies[movies['movie_id']==str(item_id)]['title'].values[0]
        recommendations.append((item_name, est))

    # sorting the predicted ratings in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)
    #print(recommendations)
    return recommendations[:top_n] # returing top n highest predicted rating products for this user

In [90]:
get_recommendationsX(data=df, user_id=1699, top_n=10, algo=svd)

[('Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)', 5),
 ('Paths of Glory (1957)', 5),
 ('Duck Soup (1933)', 5),
 ('Wallace & Gromit: The Best of Aardman Animation (1996)', 5),
 ('Pather Panchali (1955)', 5),
 ('General, The (1927)', 4.978955031670398),
 ('Wrong Trousers, The (1993)', 4.934882004797087),
 ('Paradise Lost: The Child Murders at Robin Hood Hills (1996)',
  4.919320249791686),
 ('Sanjuro (1962)', 4.9177659141327865),
 ('Grapes of Wrath, The (1940)', 4.890421219739969)]

In [88]:
get_recommendations(data=df, user_id=2010, top_n=10, algo=svd)

[('Taxi Driver (1976)', 4.4159445396782795),
 ('Brazil (1985)', 4.390912933809539),
 ('Lawrence of Arabia (1962)', 4.341856016656698),
 ('Good, The Bad and The Ugly, The (1966)', 4.331001254599082),
 ('Requiem for a Dream (2000)', 4.23688515857849),
 ('Shining, The (1980)', 4.090353033486862),
 ('Best in Show (2000)', 4.059305259387244),
 ('Exorcist, The (1973)', 4.042957691783742),
 ('Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)', 4.0338842802161805),
 ('Chinatown (1974)', 4.01043719029493)]

In [64]:
from surprise import dump

In [65]:
dump.dump('svd_model.pkl', algo=svd)

In [66]:
# Load the trained SVD model from the file
loaded_model = dump.load('svd_model.pkl')

# Access the loaded model
svd_model = loaded_model[1]

In [67]:
svd_model.predict(uid=3565,iid='Forrest Gump (1994)',r_ui=3.0)

Prediction(uid=3565, iid='Forrest Gump (1994)', r_ui=3.0, est=3.5717420960365143, details={'was_impossible': False})

# Broda Count

In [100]:
from collections import defaultdict

# Arrays representing the predicted top 10 recommended movies for 3 users
user1 = [('Taxi Driver (1976)', 4.4159445396782795),
         ('Brazil (1985)', 4.390912933809539),
         ('Lawrence of Arabia (1962)', 4.341856016656698),
         ('Good, The Bad and The Ugly, The (1966)', 4.331001254599082),
         ('Requiem for a Dream (2000)', 4.23688515857849),
         ('Shining, The (1980)', 4.090353033486862),
         ('Best in Show (2000)', 4.059305259387244),
         ('Exorcist, The (1973)', 4.042957691783742),
         ('Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)', 4.0338842802161805),
         ('Chinatown (1974)', 4.01043719029493)]

user2 = [('Sanjuro (1962)', 4.7480884622987825),
         ("Schindler's List (1993)", 4.733654163468202),
         ('Usual Suspects, The (1995)', 4.608804294265182),
         ('General, The (1927)', 4.603545020084608),
         ('Bridge on the River Kwai, The (1957)', 4.592910991368877),
         ('12 Angry Men (1957)', 4.572788211305827),
         ('Saving Private Ryan (1998)', 4.567316249661901),
         ('Shawshank Redemption, The (1994)', 4.541116774210836),
         ('Monty Python and the Holy Grail (1974)', 4.530737968169193),
         ('Close Shave, A (1995)', 4.520896833727098)]

user3 = [('Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)', 5),
         ('Paths of Glory (1957)', 5),
         ('Duck Soup (1933)', 5),
         ('Wallace & Gromit: The Best of Aardman Animation (1996)', 5),
         ('Pather Panchali (1955)', 5),
         ('General, The (1927)', 4.978955031670398),
         ('Wrong Trousers, The (1993)', 4.934882004797087),
         ('Paradise Lost: The Child Murders at Robin Hood Hills (1996)', 4.919320249791686),
         ('Sanjuro (1962)', 4.9177659141327865),
         ('Grapes of Wrath, The (1940)', 4.890421219739969)]

# Dictionary to store the total ratings for each movie
movie_ratings = defaultdict(float)
# Dictionary to store the number of ratings for each movie
movie_counts = defaultdict(int)

# Iterate over user1's array and aggregate the ratings
for movie, rating in user1:
    movie_ratings[movie] += rating
    movie_counts[movie] += 1

# Iterate over user2's array and aggregate the ratings
for movie, rating in user2:
    movie_ratings[movie] += rating
    movie_counts[movie] += 1

# Iterate over user3's array and aggregate the ratings
for movie, rating in user3:
    movie_ratings[movie] += rating
    movie_counts[movie] += 1

# Calculate the average ratings for each movie
#print(movie_ratings)
#print(movie_counts)
movie_averages = {movie: movie_ratings[movie] / movie_counts[movie] for movie in movie_ratings}
# Sort the movies based on their average ratings in descending order
top_movies = sorted(movie_averages.items(), key=lambda x: x[1], reverse=True)

# Get the top recommended movies for the combined group of three users
top_recommendations = [movie[0] for movie in top_movies[:10]]

print(top_recommendations)


['Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)', 'Paths of Glory (1957)', 'Duck Soup (1933)', 'Wallace & Gromit: The Best of Aardman Animation (1996)', 'Pather Panchali (1955)', 'Wrong Trousers, The (1993)', 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)', 'Grapes of Wrath, The (1940)', 'Sanjuro (1962)', 'General, The (1927)']


In [102]:
def grouprecommendations(uid1,uid2,uid3):
    user1=get_recommendations(data=df, user_id=uid1, top_n=50, algo=svd)
    user2=get_recommendations(data=df, user_id=uid2, top_n=50, algo=svd)
    user3=get_recommendations(data=df, user_id=uid3, top_n=50, algo=svd)
    movie_ratings = defaultdict(float)
    # Dictionary to store the number of ratings for each movie
    movie_counts = defaultdict(int)

    # Iterate over user1's array and aggregate the ratings
    for movie, rating in user1:
        movie_ratings[movie] += rating
        movie_counts[movie] += 1

    # Iterate over user2's array and aggregate the ratings
    for movie, rating in user2:
        movie_ratings[movie] += rating
        movie_counts[movie] += 1

    # Iterate over user3's array and aggregate the ratings
    for movie, rating in user3:
        movie_ratings[movie] += rating
        movie_counts[movie] += 1

    # Calculate the average ratings for each movie
    #print(movie_ratings)
    #print(movie_counts)
    movie_averages = {movie: movie_ratings[movie] / movie_counts[movie] for movie in movie_ratings}
    # Sort the movies based on their average ratings in descending order
    top_movies = sorted(movie_averages.items(), key=lambda x: x[1], reverse=True)

    # Get the top recommended movies for the combined group of three users
    top_recommendations = [movie[0] for movie in top_movies[:10]]
    return top_recommendations
    

In [103]:
grouprecommendations(5755,4585,1503)

['Fargo (1996)',
 'Shawshank Redemption, The (1994)',
 'Dersu Uzala (1974)',
 'Raising Arizona (1987)',
 'GoodFellas (1990)',
 'Deer Hunter, The (1978)',
 "I'm the One That I Want (2000)",
 'World of Apu, The (Apur Sansar) (1959)',
 'Roman Holiday (1953)',
 'Butch Cassidy and the Sundance Kid (1969)']

In [104]:
grouprecommendations(5755,1117,1503)

['Fargo (1996)',
 'Manchurian Candidate, The (1962)',
 'Raising Arizona (1987)',
 'GoodFellas (1990)',
 'Celebration, The (Festen) (1998)',
 'Love and Death (1975)',
 'Producers, The (1968)',
 'Iron Giant, The (1999)',
 'World of Apu, The (Apur Sansar) (1959)',
 'Good, The Bad and The Ugly, The (1966)']

In [105]:
grouprecommendations(2166,4585,1602)

['Pulp Fiction (1994)',
 'General, The (1927)',
 'Shawshank Redemption, The (1994)',
 'GoodFellas (1990)',
 'Maltese Falcon, The (1941)',
 'Waiting for Guffman (1996)',
 'Close Shave, A (1995)',
 'Deer Hunter, The (1978)',
 '400 Blows, The (Les Quatre cents coups) (1959)',
 'Palm Beach Story, The (1942)']