In [None]:
import pandas as pd
# Use the %pip magic rather than a `!pip` shell escape: %pip installs into
# the environment of the running kernel, whereas `!pip` uses whichever pip
# happens to be first on the shell PATH.
%pip install scikit-surprise
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split



In [None]:
# Read the movies table and the ratings table from CSV files located in
# the current working directory.
MOVIES_CSV = 'movies.csv'
RATINGS_CSV = 'ratings.csv'

movies = pd.read_csv(MOVIES_CSV)
ratings = pd.read_csv(RATINGS_CSV)

In [None]:
# Wrap the (userId, movieId, rating) columns in a Surprise Dataset.
# NOTE(review): rating_scale=(1, 5) assumes no rating lies outside 1-5 —
# confirm against ratings['rating'].min() / .max().
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Hold out 25% of the ratings for evaluation. The seed is fixed so that the
# split, and therefore the RMSE/MAE computed from it, is identical on every
# Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [None]:
# Basic k-NN collaborative filter configured for Pearson similarity
# computed between users (user_based=True).
similarity_config = {'name': 'pearson', 'user_based': True}
algo = KNNBasic(sim_options=similarity_config)

# fit() is left as the last expression, so the cell displays its return value.
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x78a4df3a3be0>

In [None]:
# Run the fitted model over the held-out test set.
predictions = algo.test(testset)

# accuracy.rmse prints its own "RMSE: ..." line (visible in the cell output)
# and returns the value, which is then echoed at full precision.
rmse = accuracy.rmse(predictions)
print(f"Root Mean Squared Error: {rmse}")

# Same pattern for MAE: the helper prints "MAE:  ..." and the returned value
# is printed again below.
mae = accuracy.mae(predictions)
print(f"Mean Absolute Error: {mae}")

RMSE: 0.9732
Root Mean Squared Error: 0.973224539913668
MAE:  0.7502
Mean Absolute Error: 0.750174555748855


In [None]:
def predict_rating(user_id, movie_id, model=None):
    """Return the estimated rating of `movie_id` for `user_id`, or None.

    Args:
        user_id: Raw user id, as it appears in the ratings data.
        movie_id: Raw movie id, as it appears in the ratings data.
        model: Fitted object exposing `predict(user_id, movie_id)`. When
            omitted, the notebook-level `algo` is used.

    Returns:
        The estimate (`prediction.est`), or None when the model reports that
        a real prediction was impossible.
    """
    if model is None:
        model = algo

    prediction = model.predict(user_id, movie_id)

    # Surprise's predict() does not let PredictionImpossible escape: it
    # handles it internally, returns a fallback estimate and records the
    # failure in details['was_impossible']. Checking that flag is therefore
    # the way to detect an impossible prediction; an `except` clause around
    # predict() would never fire.
    if prediction.details.get('was_impossible', False):
        return None
    return prediction.est

In [None]:
def get_top_n_recommendations(user_id, n=10):
    """Recommend the `n` unrated movies with the highest predicted rating.

    Reads the notebook-level `movies` and `ratings` frames and calls
    `predict_rating` for every movie the user has not rated.

    Args:
        user_id: Raw user id to produce recommendations for.
        n: Maximum number of movies to return.

    Returns:
        The rows of `movies` for the selected movies, ordered from highest
        to lowest predicted rating.
    """
    all_movies = movies['movieId'].unique()
    # A set gives constant-time membership checks; testing against an array
    # would rescan it for every movie in the catalogue.
    rated_movies = set(ratings.loc[ratings['userId'] == user_id, 'movieId'])
    unrated_movies = [movie for movie in all_movies if movie not in rated_movies]

    # Predict a rating for every unrated movie, skipping those for which
    # predict_rating returns None.
    predictions = []
    for movie_id in unrated_movies:
        predicted_rating = predict_rating(user_id, movie_id)
        if predicted_rating is not None:
            predictions.append((movie_id, predicted_rating))

    # Highest predicted rating first (the sort is stable for ties).
    predictions.sort(key=lambda x: x[1], reverse=True)

    top_n_movie_ids = [movie_id for movie_id, _ in predictions[:n]]
    rank = {movie_id: position for position, movie_id in enumerate(top_n_movie_ids)}

    # isin() yields rows in catalogue order, which discards the ranking
    # computed above; reorder the selected rows by their rank before returning.
    top_n_movies = movies[movies['movieId'].isin(top_n_movie_ids)]
    order = top_n_movies['movieId'].map(rank).to_numpy().argsort(kind='stable')
    return top_n_movies.iloc[order]


In [None]:
def add_new_user_ratings(new_user_id, movie_ratings):
    """Return the ratings table extended with one user's ratings.

    `movie_ratings` is a list of (movie_id, rating) tuples. The notebook-level
    `ratings` frame is not modified; the rows are appended to a copy and the
    result is returned with a fresh 0..n-1 index.
    """
    user_ids = [new_user_id] * len(movie_ratings)

    # Split the pairs into one list per column.
    movie_ids = []
    scores = []
    for entry in movie_ratings:
        movie_ids.append(entry[0])
    for entry in movie_ratings:
        scores.append(entry[1])

    user_rows = pd.DataFrame({
        'userId': user_ids,
        'movieId': movie_ids,
        'rating': scores,
    })
    combined = pd.concat([ratings, user_rows], ignore_index=True)
    return combined

In [None]:
# Example new user data
# NOTE(review): 5000 is presumably an id that does not occur in ratings.csv —
# confirm before relying on it as a "new" user.
new_user_id = 5000
# Each tuple is (movie_id, rating).
new_user_ratings = [(1, 4), (2, 5), (3, 3)]  # User has rated movies 1, 2, and 3

# Add new user ratings to the dataset
# This rebinds the notebook-level `ratings`, so the cell is not idempotent:
# running it again appends the same three rows a second time.
ratings = add_new_user_ratings(new_user_id, new_user_ratings)

In [None]:
# Rebuild the Surprise dataset from the current `ratings` frame, reusing the
# `reader` defined earlier. This rebinds `data` and `trainset`.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# Train on every rating (no held-out test split).
trainset = data.build_full_trainset()  # Optionally use full dataset for retraining
# Refit the existing `algo` object in place; as the last expression, its
# return value is displayed by the cell.
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x78a4df3a3be0>

In [None]:
import pickle

In [None]:
# Save the model to a file
# Serialises the fitted `algo` object to model.pkl in the current working
# directory. Pickle files can execute arbitrary code when loaded, so only
# load this file back from a trusted location.
with open('model.pkl', 'wb') as file:
    pickle.dump(algo, file)

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive so files can be
# copied to it from the notebook.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the pickled model into the mounted Drive folder.
# NOTE(review): assumes model.pkl was written to /content, i.e. that /content
# is the notebook's working directory — confirm.
!cp /content/model.pkl /content/drive/MyDrive