In [3]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162993 sha256=1db32c75e08cd4177ed6e940b09776b43bb7522efdb68a78b0367f88f84eba6c
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [21]:
# Importing the required libraries with the csv files for movies and ratings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd
import re

movies_data = pd.read_csv("movies.csv")
ratings_data = pd.read_csv("ratings.csv")

# User_id for model testing
user_id = 128

In [22]:
# Defining a custom tokenizer such genre entries like Sci-fi and (no genres listed) could be classified as a whole

def custom_tokenizer(text):
        pattern = r'[|]+'  # Split on | only
        tokens = re.split(pattern, text)
        return tokens

vector = TfidfVectorizer(tokenizer=custom_tokenizer)
x = vector.fit_transform(movies_data["genres"])



In [23]:
# Task 1

def recommend_movies(movie_title, data, tfidf_matrix):

    idx = data[data['title'] == movie_title].index[0]

    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    sim_scores_idx = sim_scores.argsort()[::-1][1:11]  # Excluding the first movie (itself)

    # Get titles of recommended movies
    recommendations = data.iloc[sim_scores_idx]['title']

    return recommendations.values

# Example usage:
movie_title = "John Wick (2014)"
recommendations = recommend_movies(movie_title, movies_data, x)
print("Recommended movies for {}: \n".format(movie_title))
for i, movie in enumerate(recommendations, 1):
    print("{}. {}".format(i, movie))

Recommended movies for John Wick (2014): 

1. Osterman Weekend, The (1983)
2. The Forgotten (1989)
3. The Last Survivors (2014)
4. The Operative (2001)
5. Paintball (2009)
6. Darc (2018)
7. Panic (2001)
8. Hunting Emma (2017)
9. Skyjacked (1972)
10. Spirit (2012)


In [24]:
# Task 2

# Content based recommendation
def content_based(user_preferences, data, tfidf_matrix, vectorizer, n=10):
    # Calculating cosine similarity between user preferences and all movies
    user_preferences_vector = vectorizer.transform(user_preferences)
    sim_scores = cosine_similarity(user_preferences_vector, tfidf_matrix)

    # Sorting out the indices of movies with highest similarity scores
    sim_scores_idx = sim_scores.argsort()[0][::-1][:n]

    # Get titles of recommended movies
    recommendations = data.iloc[sim_scores_idx]['title']

    return recommendations.values

user_preferences = movies_data[movies_data['movieId'].isin(ratings_data[ratings_data['userId'] == user_id]['movieId'])]['genres']
cbr = content_based(user_preferences, movies_data, x, vector, n=10)

print("Content-Based Recommendations for User", user_id)
for i, movie in enumerate(cbr, 1):
    print("{}. {}".format(i, movie))


Content-Based Recommendations for User 128
1. The Most Assassinated Woman in the World (2018)
2. Inferno (2016)
3. Grand Piano (2013)
4. Cat o' Nine Tails, The (Gatto a nove code, Il) (1971)
5. 23 Paces to Baker Street (1956)
6. Fate (2008)
7. 8 Remains (2018)
8. Shattered (1991)
9. Venetian Bird (1953)
10. The Devil with Seven Faces (1971)


In [25]:
# Collaborative filtering based on user-item relation

# Making the dataset readable by the surprise lib and splitting it into training and testing dat
reader = Reader(rating_scale=(0.5, 5))
dat = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(dat, test_size=0.2, random_state=42)

# Training the model
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ba8f754ae00>

In [26]:
# Testing the model
# Accessing the predictions
collab_preds = algo.test(testset)

def get_top_n(predictions, n=10, userId=None):
    # First map the predictions to each user
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            if uid not in top_n:
                top_n[uid] = []
            top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve top N
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Showcasing the results
i = 0
top_n_collaborative = get_top_n(collab_preds, n=10, userId=user_id)
print("\nCollaborative Filtering Recommendations for User", user_id)
for movie_id, estimated_rating in top_n_collaborative[user_id]:
    i += 1
    movie_title = movies_data[movies_data['movieId'] == movie_id]['title'].iloc[0]
    print("{}. {}".format(i,movie_title, "- Estimated Rating:", round(estimated_rating, 2)))


Collaborative Filtering Recommendations for User 128
1. Whiplash (2014)
2. Dark Knight Rises, The (2012)
3. Bohemian Rhapsody (2018)
4. Doctor Strange (2016)
5. Rogue One: A Star Wars Story (2016)
6. Iron Man 2 (2010)


In [27]:
# Task 3, Accuracy metrics for the collaborative filtering based approach

from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_collaborative_filtering(predictions):
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae

evaluate_collaborative_filtering(collab_preds)

RMSE: 0.8308
MAE:  0.6320


(0.8307598829983691, 0.6320388716132164)

In [None]:
# Here, what I thought of doing was to find all the recommendations based on the movie John Wick
# using all the methods developed.