In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the ratings table from a CSV in the working directory.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a
# saved index that was written back out — confirm whether it should be dropped.
df=pd.read_csv('movielens.csv')
# Preview the first rows to check the columns
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre
0,5755,184,3,958280246,Nadja (1994),Drama
1,4585,519,3,964321944,Robocop 3 (1993),Sci-Fi|Thriller
2,1503,3114,4,974762175,Toy Story 2 (1999),Animation|Children's|Comedy
3,2166,648,4,974614593,Mission: Impossible (1996),Action|Adventure|Mystery
4,3201,2178,5,968626301,Frenzy (1972),Thriller


In [6]:
# (rows, columns) of the table as loaded, before any filtering
df.shape

(1000209, 6)

In [17]:
# Number of ratings submitted by each reviewer (indexed by user_id)
ratings_per_reviewer = df.groupby('user_id').size()

# IDs of the reviewers who have rated 50 or more movies
reviewer_ids = ratings_per_reviewer.loc[ratings_per_reviewer.ge(50)].index

# Drop every rating that comes from a reviewer below that threshold.
# Note: this rebinds `df`, so the unfiltered table is no longer available afterwards.
df = df.loc[df['user_id'].isin(reviewer_ids)]

In [18]:
# (rows, columns) left after keeping only reviewers with at least 50 ratings
df.shape

(943471, 6)

In [19]:
# Number of ratings received by each movie, grouped by its title
ratings_per_movie = df.groupby('title').size()

# Movies with 200 or more ratings.
# Despite the name, `movie_ids` holds titles (the group key above), not movie_id values.
movie_ids = ratings_per_movie.loc[ratings_per_movie.ge(200)].index

# Drop every rating that belongs to a movie below that threshold.
# Note: this rebinds `df` again, on top of the reviewer filter.
df = df.loc[df['title'].isin(movie_ids)]

In [20]:
# (rows, columns) left after also keeping only movies with at least 200 ratings
df.shape

(798174, 6)

In [21]:
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

In [22]:
# Initialize a surprise reader object.
# Only rating_scale matters when loading from a DataFrame: the file-parsing options
# (line_format, sep, skip_lines) apply to Dataset.load_from_file, not load_from_df.
# MovieLens ratings are whole stars from 1 to 5, so the scale starts at 1; a lower
# bound of 0 would widen the range that predictions are clipped to.
reader = Reader(rating_scale=(1, 5))

# Load the data. Columns must be ordered (user, item, rating).
# The title is used as the item id (rather than movie_id) so that predictions and
# recommendations can be requested and read by movie name.
data = Dataset.load_from_df(df[['user_id', 'title', 'rating']], reader=reader)

# Hold out 20% of the ratings for evaluation. The fixed seed makes the split, and
# therefore the reported metrics, reproducible across kernel restarts.
# (To train on every rating instead, use data.build_full_trainset().)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [23]:
# Initialize the model. SVD starts from randomly initialized factors, so a fixed
# seed is needed for the fitted model (and its metrics) to be reproducible.
svd = SVD(random_state=42)

# Fit once on the training split (this is a single fit, not cross-validation)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b5ac7bb5b0>

In [24]:
# Score the held-out ratings with the fitted model; the result feeds the accuracy metrics
predictions = svd.test(testset)

In [25]:
# Root-mean-squared error on the held-out predictions (the call also prints the value)
rmse = accuracy.rmse(predictions)

RMSE: 0.8570


In [26]:
# Mean absolute error on the held-out predictions (the call also prints the value)
mae = accuracy.mae(predictions)

MAE:  0.6721


In [27]:
# Show 10 random rows from the filtered table.
# No seed is set, so a re-run shows different rows than the saved output.
df.sample(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre
853878,2718,356,5,973286221,Forrest Gump (1994),Comedy|Romance|War
515398,4006,1909,3,965552385,"X-Files: Fight the Future, The (1998)",Mystery|Sci-Fi|Thriller
316227,307,916,5,976486086,Roman Holiday (1953),Comedy|Romance
407433,2010,1920,2,974677856,Small Soldiers (1998),Animation|Children's|Fantasy|War
941383,1528,1994,5,974745349,Poltergeist (1982),Horror|Thriller
258695,4250,1954,3,965308398,Rocky (1976),Action|Drama
73018,2124,3763,5,974650999,F/X (1986),Action|Crime|Thriller
915870,3565,172,4,966781319,Johnny Mnemonic (1995),Action|Sci-Fi|Thriller
363964,4813,1441,4,978218369,Benny & Joon (1993),Comedy|Romance
283405,3531,1238,5,966997649,Local Hero (1983),Comedy


In [28]:
# Estimate the rating user 3565 would give this title. The item id is the title
# string because the dataset was built with 'title' as the item column.
# r_ui is echoed back in the returned Prediction (see the output).
# NOTE(review): 3.0 looks like a placeholder rather than this user's actual rating — confirm.
svd.predict(uid=3565,iid='Forrest Gump (1994)',r_ui=3.0)

Prediction(uid=3565, iid='Forrest Gump (1994)', r_ui=3.0, est=4.052774735722321, details={'was_impossible': False})

In [29]:
def get_recommendations(data, user_id, top_n, algo):
    """Recommend the movies a user has not rated yet, ranked by predicted rating.

    The unrated titles are computed directly from the user's own rows instead of
    pivoting the whole table into a user x title matrix. That avoids building a
    large dense matrix for a single lookup, and it no longer fails with
    ``ValueError`` when the same (user_id, title) pair appears more than once.

    Args:
        data: DataFrame with at least the columns 'user_id', 'title' and 'rating'.
        user_id: Raw user id, as stored in ``data['user_id']``.
        top_n: Maximum number of recommendations to return.
        algo: Fitted model exposing ``predict(uid, iid)`` whose result has an
            ``est`` attribute (the predicted rating).

    Returns:
        A list of ``(title, predicted_rating)`` tuples sorted by predicted rating
        in descending order, truncated to ``top_n`` entries. Ties keep
        alphabetical title order.

    Raises:
        KeyError: If ``user_id`` has no rows in ``data``.
    """
    user_rows = data[data['user_id'] == user_id]
    if user_rows.empty:
        # Same exception type the previous pivot-based lookup raised for unknown users
        raise KeyError(user_id)

    # A row whose rating is missing does not count as an interaction
    rated_titles = set(user_rows.loc[user_rows['rating'].notna(), 'title'])

    # Sorted so that equal predictions keep a deterministic (alphabetical) order
    candidate_titles = sorted(data['title'].dropna().unique())

    # Predict a rating for every title the user has not rated yet
    recommendations = [
        (title, algo.predict(user_id, title).est)
        for title in candidate_titles
        if title not in rated_titles
    ]

    # Highest predicted rating first; list.sort is stable, so ties stay alphabetical
    recommendations.sort(key=lambda pair: pair[1], reverse=True)
    return recommendations[:top_n]

In [30]:
# Top 10 not-yet-rated titles for user 2010, ranked by the SVD's predicted rating.
# Note: the `data` argument is the filtered DataFrame `df`, not the Surprise Dataset
# that is also bound to the name `data` in this notebook.
get_recommendations(data=df, user_id=2010, top_n=10, algo=svd)

[('Citizen Kane (1941)', 4.699716508462726),
 ('Lawrence of Arabia (1962)', 4.445385472709563),
 ('Rear Window (1954)', 4.295477822069641),
 ('Chinatown (1974)', 4.248169253997325),
 ('Brazil (1985)', 4.175378270191173),
 ('Shining, The (1980)', 4.168591789489589),
 ('This Is Spinal Tap (1984)', 4.1658912897092195),
 ('Jean de Florette (1986)', 4.148078913074392),
 ('Graduate, The (1967)', 4.10094413621979),
 ('Bridge on the River Kwai, The (1957)', 4.091043725749235)]

In [31]:
from surprise import dump

In [32]:
# Persist the fitted model to disk; only the algorithm is passed, no predictions
dump.dump('svd_model.pkl', algo=svd)

In [39]:
# Load the trained SVD model from the file.
# NOTE(review): Surprise's dump module is pickle-based, so only load files from a
# trusted source — unpickling untrusted data can execute arbitrary code.
loaded_model = dump.load('svd_model.pkl')

# dump.load returns a (predictions, algo) tuple; index 1 is the algorithm.
# predictions was not passed to dump.dump above, so it is not saved with the model.
svd_model = loaded_model[1]

In [40]:
# Sanity check: the reloaded model should give the same estimate as the in-memory
# model did for the same user/title pair
svd_model.predict(uid=3565,iid='Forrest Gump (1994)',r_ui=3.0)

Prediction(uid=3565, iid='Forrest Gump (1994)', r_ui=3.0, est=4.052774735722321, details={'was_impossible': False})