# Priporočilni sistem

Seminarska naloga izdelave priporočilnega sistema.
By Samo Pritržnik

## Podatki

Za seminarsko nalogo bom uporabil movielens podatke. Opis podatkov je v readme.txt.

In [28]:
# import library for reading .dat files
import numpy as np
import matplotlib as mpl
import pandas as pd
from csv import DictReader
import pickle as pkl
import random
from scipy.spatial.distance import cosine


### Branje ocen

In [29]:
class UserItemData:
    # Class for loading and filtering the MovieLens dataset
    def __init__(self, path, from_date=None, to_date=None, min_ratings=None):
        self.data = pd.read_csv(path, delimiter='\t')
        self.process_data(from_date, to_date, min_ratings)

    # Function for filtering the data
    def process_data(self, from_date, to_date, min_ratings):
        # Convert date columns to a single datetime column
        self.data['datetime'] = pd.to_datetime(self.data['date_year'].astype(str) + '-' +
                                               self.data['date_month'].astype(str).str.zfill(2) + '-' +
                                               self.data['date_day'].astype(str).str.zfill(2) + ' ' +
                                               self.data['date_hour'].astype(str).str.zfill(2) + ':' +
                                               self.data['date_minute'].astype(str).str.zfill(2) + ':' +
                                               self.data['date_second'].astype(str).str.zfill(2))

        # Filter by date range if specified
        if from_date:
            from_date = pd.to_datetime(from_date, dayfirst=True)
            self.data = self.data[self.data['datetime'] >= from_date]
        
        if to_date:
            to_date = pd.to_datetime(to_date, dayfirst=True)
            self.data = self.data[self.data['datetime'] <= to_date]
        
        # Filter by minimum ratings for each movie if specified
        if min_ratings:
            movie_counts = self.data['movieID'].value_counts()
            movies_with_min_ratings = movie_counts[movie_counts >= min_ratings].index
            self.data = self.data[self.data['movieID'].isin(movies_with_min_ratings)]

    # Function for getting the number of ratings
    def nratings(self):
        return len(self.data)

uim = UserItemData('podatki/user_ratedmovies.dat')
print(uim.nratings())

uim = UserItemData('podatki/user_ratedmovies.dat', from_date='12.1.2007', to_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855618
73584


### Branje filmov

In [30]:
class MovieData:
    def __init__(self, path):
        self.data = pd.read_csv(path, delimiter='\t', encoding='latin1')

    # Function for getting the movie title
    def get_title(self, movie_id):
        return self.data[self.data['id'] == movie_id]['title'].values[0]
    
md = MovieData('podatki/movies.dat')
print(md.get_title(499))
        

Mr. Wonderful


## Prediktor

Z besedo "prediktor" bomo označevali razrede, ki za določenega uporabnika na nek način ocenijo, s kakšno vrednostjo bi ta uporabnik ocenil filme oz. produkte, ki jih ima na voljo. Ti razredi bodo imeli metodo fit(self, X), kjer je X tipa UserItemData, in metodo predict(self, user_id), kjer je user_id ID uporabnika. Metodo fit bomo uporabljali za učenje modela, predict pa za izračun priporočenih vrednosti za podanega uporabnika.

### Naključni prediktor

In [31]:
class RandomPredictor:
    # function for predicting random ratings
    def __init__(self, min_rating, max_rating):
        self.min_rating = min_rating
        self.max_rating = max_rating

    # Function for fitting the predictor
    def fit(self, user_item_data):
        self.user_item_data = user_item_data

    # Function for predicting ratings
    def predict(self, user_id):
        # Get all movie unique IDs
        movie_ids = self.user_item_data.data['movieID'].unique()
        
        # Return random rating for each movie
        return {movie_id: round(random.uniform(self.min_rating, self.max_rating)) for movie_id in movie_ids}

md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 3
Film: Grumpy Old Men, ocena: 5
Film: Money Train, ocena: 4
Film: The Usual Suspects, ocena: 3
Film: City Hall, ocena: 4


### Priporočanje

In [32]:
class Recommender:
    # Class for making recommendations
    def __init__(self, predictor):
        # load the predictor
        self.predictor = predictor

    # Function for fitting the recommender
    def fit(self, X):

        # Store RatingsMatrix
        self.X = X

        # Fit the predictor
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        # Get all movie IDs
        predictions = self.predictor.predict(userID)

        # Remove movies already seen by the user
        if not rec_seen:
            # Filter out movies the user has already rated
            rated_movies = set(self.X.data[self.X.data['userID'] == userID]['movieID'])

            # Remove the seen movies from predictions
            predictions = {movie_id: rating for movie_id, rating in predictions.items() if movie_id not in rated_movies}

        # Sort the predictions and return the top n
        sorted_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)

        # Return the top n predictions
        return sorted_predictions[:n]

md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=10, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Batman Returns, ocena: 5
Film: Men in Black, ocena: 5
Film: The Devil's Advocate, ocena: 5
Film: Last Man Standing, ocena: 5
Film: Star Wreck: In the Pirkinning, ocena: 5
Film: The Adventures of Pluto Nash, ocena: 5
Film: Anacondas: The Hunt for the Blood Orchid, ocena: 5
Film: Cheaper by the Dozen 2, ocena: 5
Film: The Devil Wears Prada, ocena: 5
Film: The Bourne Ultimatum, ocena: 5


### Napovedovanje s povprečjem

In [33]:
class AveragePredictor:
    # Class for making average rating predictions
    def __init__(self, b):
        # b is the shrinkage parameter
        self.b = b

        # Dictionary storing the average ratings
        self.avg_ratings = {}

    # Function for fitting the predictor
    def fit(self, user_item_data):

        # Calculate the global average rating
        g_avg = user_item_data.data['rating'].mean()

        # Calculate the average rating for each movie
        for movie_id in user_item_data.data['movieID'].unique():
            # Get all ratings for the movie
            movie_data = user_item_data.data[user_item_data.data['movieID'] == movie_id]

            # Sum all ratings for the movie
            vs = movie_data['rating'].sum() 

            # Number of ratings for the movie
            n = len(movie_data)  

            # Calculate the adjusted average
            self.avg_ratings[movie_id] = (vs + self.b * g_avg) / (n + self.b)

    def predict(self, user_id):
        # Return the calculated average ratings
        return self.avg_ratings

md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat')

# Using AveragePredictor with b=0
avg_pred = AveragePredictor(b=0)
rec = Recommender(avg_pred)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

# Using AveragePredictor with b=100
avg_pred_b100 = AveragePredictor(b=100)
rec_b100 = Recommender(avg_pred_b100)
rec_b100.fit(uim)
rec_items_b100 = rec_b100.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items_b100:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Brother Minister: The Assassination of Malcolm X, ocena: 5.0
Film: Synthetic Pleasures, ocena: 5.0
Film: Adam & Steve, ocena: 5.0
Film: Gabbeh, ocena: 5.0
Film: Eve and the Fire Horse, ocena: 5.0
Film: The Usual Suspects, ocena: 4.225943646209935
Film: The Godfather: Part II, ocena: 4.147270016140489
Film: Cidade de Deus, ocena: 4.116537379202792
Film: The Dark Knight, ocena: 4.104137827503372
Film: 12 Angry Men, ocena: 4.1036381855925095


### Priporočanje najbolj gledanih filmov

In [34]:
class ViewsPredictor:
    # Class for making views count predictions
    def __init__(self):
        self.views_count = {}

    # Function for fitting the predictor
    def fit(self, user_item_data):
        # Count the number of views for each movie
        self.views_count = user_item_data.data['movieID'].value_counts().to_dict()

    # Function for predicting ratings
    def predict(self, user_id):
        # Return the views count for each movie
        return self.views_count

md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat')

views_pred = ViewsPredictor()
rec = Recommender(views_pred)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404


## Napovedovanje ocen s podobnostjo med produkti

In [35]:
class ItemBasedPredictor:
    # Class for making item-based predictions
    def __init__(self, min_values=0, threshold=0):
        # min_values is the minimum number of common users two items must share
        self.min_values = min_values

        # threshold is the minimum similarity value for two items
        self.threshold = threshold
        
        # Initialize dictionary to store similarities for each item
        self.similarities = {}

        # Initialize the user-item data
        self.user_item_data = None 

    # Function for fitting the predictor
    def fit(self, user_item_data):
        # Store the user-item data
        self.user_item_data = user_item_data  

        # Pivot table to create a matrix of users and movie ratings
        matrix = user_item_data.data.pivot_table(index='userID', columns='movieID', values='rating')
        
        # Calculate the global mean for normalization
        global_mean = user_item_data.data['rating'].mean()

        # Initialize dictionary for each movie
        movies = matrix.columns
        for movie in movies:
            self.similarities[movie] = {}

        # Calculate similarities for each pair of movies
        for i in range(len(movies)):
            for j in range(i+1, len(movies)):
                movie1_id = movies[i]
                movie2_id = movies[j]

                # Get the ratings for the two movies
                ratings1 = matrix[movie1_id]
                ratings2 = matrix[movie2_id]

                # Find common user ratings
                common = matrix.loc[:, [movie1_id, movie2_id]].dropna()

                # Check for minimum number of common users
                if len(common) < self.min_values:
                    continue

                # Adjust ratings by subtracting the global mean
                adjusted_ratings1 = common[movie1_id] - global_mean
                adjusted_ratings2 = common[movie2_id] - global_mean

                # Compute similarity
                similarity = 1 - cosine(adjusted_ratings1, adjusted_ratings2)

                # Apply threshold
                if similarity < self.threshold:
                    similarity = 0

                self.similarities[movie1_id][movie2_id] = similarity
                self.similarities[movie2_id][movie1_id] = similarity  # Symmetric

    def similarity(self, p1, p2):
        # Return the calculated similarity
        return self.similarities.get(p1, {}).get(p2, 0)

    def predict(self, user_id):
        # Get all movies the user has rated
        user_ratings = self.user_item_data.data[self.user_item_data.data['userID'] == user_id]

        # Initialize a dictionary to store the predicted ratings
        predictions = {}

        # Iterate over all movies in the dataset
        for movie in self.similarities.keys():
            # Initialize the sum of similarities and weighted ratings
            sim_sum = 0
            weighted_ratings_sum = 0

            # Iterate over movies the user has rated
            for rated_movie, row in user_ratings.iterrows():
                rated_movie_id = row['movieID']
                rated_movie_rating = row['rating']

                # Check if similarity exists between the rated movie and the current movie
                if rated_movie_id in self.similarities[movie]:
                    # Get the similarity
                    similarity = self.similarities[movie][rated_movie_id]

                    # Update sums
                    sim_sum += similarity
                    weighted_ratings_sum += similarity * rated_movie_rating

            # Calculate the predicted rating
            if sim_sum > 0:
                # At least one similar movie found
                predicted_rating = weighted_ratings_sum / sim_sum
                # Round the rating to the nearest integer
                predictions[movie] = predicted_rating
            else:
                predictions[movie] = 0  # Default prediction when no similar movies are found

        return predictions


md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.4547404102984455
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.5481222835158739
Predictions for 78: 
Film: The Usual Suspects, ocena: 4.228046000647015
Film: Shichinin no samurai, ocena: 4.177394960401547
Film: The Silence of the Lambs, ocena: 4.135397128915336
Film: Sin City, ocena: 4.1068039883765195
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.04311498579356
Film: The Incredibles, ocena: 4.0295056627728485
Film: The Lord of the Rings: The Return of the King, ocena: 3.991238364121903
Film: Batman Begins, ocena: 3.9752863828301552
Film: Good Will Hunting, ocena: 3.958027464091038
Film: The Lord of the Rings: The Two Towers, ocena: 3.946619226584744
Film: A Beautiful Mind, ocena: 3.9371194009695474
Film: Rain Man, ocena: 3.9338611915310477
Film: Die Hard, ocena: 3.9253627506306104
Film: Indiana 

### Najbolj podobni filmi

In [36]:
def print_top_similar_pairs(predictor, movie_data, top_n=20):
    # Flatten the similarity matrix into a list of tuples (movie1, movie2, similarity)
    similarity_list = []

    # Iterate over the keys of the similarity dictionary
    for movie1 in predictor.similarities:
        for movie2, similarity in predictor.similarities[movie1].items():
            if movie1 < movie2:
                # Append the tuple (movie1, movie2, similarity)
                similarity_list.append((movie1, movie2, similarity))

    # Sort the list by similarity in descending order
    sorted_similarities = sorted(similarity_list, key=lambda x: x[2], reverse=True)

    # Print the top N pairs
    for movie_pair in sorted_similarities[:top_n]:
        movie1_title = movie_data.get_title(movie_pair[0])
        movie2_title = movie_data.get_title(movie_pair[1])
        print(f"Film1: {movie1_title}, Film2: {movie2_title}, podobnost: {movie_pair[2]}")


print_top_similar_pairs(rp, md, top_n=20)

Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.88443435002167
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8661699323818958
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8560584042064425
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7996680596118538
Film1: Star Wars, Film2: Star Wars: Episode V - The Empire Strikes Back, podobnost: 0.7809614989355249
Film1: Star Wars: Episode V - The Empire Strikes Back, Film2: Star Wars: Episode VI - Return of the Jedi, podobnost: 0.7195558303640922
Film1: Ace Ventura: Pet Detective, Film2: The Mask, podobnost: 0.7150043669683668
Film1: Star Wars, Film2: Star Wars: Episode VI - Return of the Jedi, podobnost: 0.6899991545524874
Film1: Speed, Film2: Pretty Woman, podobnost: 0.6369612953704181
Film1: The Mask, Film2: Mrs. 

## Priporočanje glede na trenutno ogledano vsebino

In [37]:
class ItemBasedPredictor:
    # Class for making item-based predictions
    def __init__(self, min_values=0, threshold=0):
        self.min_values = min_values
        self.threshold = threshold
        self.similarities = {}
        self.user_item_data = None  # Add this line to initialize the user-item data attribute


    def fit(self, user_item_data):
        self.user_item_data = user_item_data  # Store the user-item data

        # Pivot table to create a matrix of users and movie ratings
        matrix = user_item_data.data.pivot_table(index='userID', columns='movieID', values='rating')
        
        # Calculate the global mean for normalization
        global_mean = user_item_data.data['rating'].mean()

        # Initialize dictionary for each movie
        movies = matrix.columns
        for movie in movies:
            self.similarities[movie] = {}

        # Calculate similarities for each pair of movies
        for i in range(len(movies)):
            for j in range(i+1, len(movies)):
                movie1_id = movies[i]
                movie2_id = movies[j]

                # Get the ratings for the two movies
                ratings1 = matrix[movie1_id]
                ratings2 = matrix[movie2_id]

                # Find common user ratings
                common = matrix.loc[:, [movie1_id, movie2_id]].dropna()

                # Check for minimum number of common users
                if len(common) < self.min_values:
                    continue

                # Adjust ratings by subtracting the global mean
                adjusted_ratings1 = common[movie1_id] - global_mean
                adjusted_ratings2 = common[movie2_id] - global_mean

                # Compute similarity
                similarity = 1 - cosine(adjusted_ratings1, adjusted_ratings2)

                # Apply threshold
                if similarity < self.threshold:
                    similarity = 0

                self.similarities[movie1_id][movie2_id] = similarity
                self.similarities[movie2_id][movie1_id] = similarity  # Symmetric

    def similarity(self, p1, p2):
        # Return the calculated similarity
        return self.similarities.get(p1, {}).get(p2, 0)

    def predict(self, user_id):
        user_ratings = self.user_item_data.data[self.user_item_data.data['userID'] == user_id]

        # Initialize a dictionary to store the predicted ratings
        predictions = {}

        # Iterate over all movies in the dataset
        for movie in self.similarities.keys():
            # Initialize the sum of similarities and weighted ratings
            sim_sum = 0
            weighted_ratings_sum = 0

            # Iterate over movies the user has rated
            for rated_movie, row in user_ratings.iterrows():
                rated_movie_id = row['movieID']
                rated_movie_rating = row['rating']

                # Check if similarity exists between the rated movie and the current movie
                if rated_movie_id in self.similarities[movie]:
                    similarity = self.similarities[movie][rated_movie_id]
                    sim_sum += similarity
                    weighted_ratings_sum += similarity * rated_movie_rating

            # Calculate the predicted rating
            if sim_sum > 0:
                predicted_rating = weighted_ratings_sum / sim_sum
                predictions[movie] = predicted_rating
            else:
                predictions[movie] = 0  # Default prediction when no similar movies are found

        return predictions
    
    def similarItems(self, item, n):
        # Check if the item exists in the similarity matrix
        if item not in self.similarities:
            return []

        # Retrieve all items and their similarity scores to the given item
        similar_items = self.similarities[item].items()

        # Sort the items based on similarity scores in descending order
        sorted_items = sorted(similar_items, key=lambda x: x[1], reverse=True)

        # Return the top n items
        return sorted_items[:n]


md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

# Assuming the predictor has been fitted with user-item data
rec_items = rp.similarItems(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring":')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring":
Film: The Lord of the Rings: The Two Towers, ocena: 0.8661699323818958
Film: The Lord of the Rings: The Return of the King, ocena: 0.8560584042064425
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.41941713043280493
Film: Star Wars, ocena: 0.4049987157833208
Film: The Matrix, ocena: 0.39792857472383913
Film: Raiders of the Lost Ark, ocena: 0.38859782772892826
Film: Star Wars: Episode VI - Return of the Jedi, ocena: 0.35507407372833644
Film: Schindler's List, ocena: 0.33830610938730676
Film: The Usual Suspects, ocena: 0.3353406259919578
Film: Indiana Jones and the Last Crusade, ocena: 0.3176956942607996


### Moja priporočila

6320 - uporabnik

In [38]:
md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

# Assuming the predictor has been fitted with user-item data
rec_items = rec.recommend(6320, n=15, rec_seen=False)
print('Predictions for user 6320:')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for user 6320:
Film: Shichinin no samurai, ocena: 5.000000000000001
Film: The Usual Suspects, ocena: 5.0
Film: Pulp Fiction, ocena: 5.0
Film: The Shawshank Redemption, ocena: 5.0
Film: Schindler's List, ocena: 5.0
Film: Blade Runner, ocena: 5.0
Film: The Silence of the Lambs, ocena: 5.0
Film: The Godfather, ocena: 5.0
Film: Reservoir Dogs, ocena: 5.0
Film: American Beauty, ocena: 5.0
Film: Fight Club, ocena: 5.0
Film: Memento, ocena: 5.0
Film: Le fabuleux destin d'Amélie Poulain, ocena: 5.0
Film: Eternal Sunshine of the Spotless Mind, ocena: 5.0
Film: Fargo, ocena: 4.983487896862781


## Priporočanje z metodo Slope one

In [39]:
class SlopeOnePredictor:
    # Class for making Slope One predictions
    def __init__(self):
        self.deviations = {}
        self.frequencies = {}
        self.user_item_data = None

    # Function for fitting the predictor
    def fit(self, user_item_data):
        self.user_item_data = user_item_data

        # Pivot the data to create a matrix of users and movie ratings
        matrix = user_item_data.data.pivot_table(index='userID', columns='movieID', values='rating')

        # Iterate over all movies and calculate deviations and frequencies
        for movie1 in matrix.columns:
            self.deviations[movie1] = {}
            self.frequencies[movie1] = {}
            for movie2 in matrix.columns:
                if movie1 != movie2:
                    # Get common users
                    common_ratings = matrix.loc[:, [movie1, movie2]].dropna()
                    if len(common_ratings) > 0:
                        # Calculate the deviations
                        self.deviations[movie1][movie2] = (common_ratings[movie1] - common_ratings[movie2]).mean()

                        # Calculate the frequencies
                        self.frequencies[movie1][movie2] = len(common_ratings)

    # Function for predicting ratings
    def predict(self, user_id):
        if self.user_item_data is None:
            raise Exception("Nisi dal podatkov.")

        # Get all movies the user has rated
        user_ratings = self.user_item_data.data[self.user_item_data.data['userID'] == user_id]

        predictions = {}
        frequencies = {}

        # Iterate over the user ratings
        for row in user_ratings.itertuples(index=False):
            # Extract movieID and rating from the row
            movie = getattr(row, 'movieID')  # Extract movieID
            rating = getattr(row, 'rating')  # Extract rating

            # Iterate over all movies
            for deviation_movie, deviation in self.deviations.get(movie, {}).items():
                # Check if the movie has already been rated by the user
                if deviation_movie not in user_ratings['movieID'].values:
                    # Update predictions and frequencies
                    if deviation_movie not in predictions:
                        predictions[deviation_movie] = 0
                        frequencies[deviation_movie] = 0

                    # Update predictions and frequencies
                    frequency = self.frequencies[movie][deviation_movie]
                    predictions[deviation_movie] += (rating + deviation) * frequency
                    frequencies[deviation_movie] += frequency

        # Calculate the mean of all predictions
        for movie in predictions:
            # Update the prediction
            predictions[movie] /= frequencies[movie]

        return predictions

uim = UserItemData('podatki/user_ratedmovies.dat', min_ratings=1000)
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for movie_id, rating in rec_items:
    print(f"Film: {md.get_title(movie_id)}, ocena: {rating}")

Predictions for 78: 
Film: Star Wars: Episode I - The Phantom Menace, ocena: 4.9833355804233515
Film: Ace Ventura: Pet Detective, ocena: 4.948704601564591
Film: The Mask, ocena: 4.851429977421409
Film: Austin Powers: The Spy Who Shagged Me, ocena: 4.718918097553563
Film: Mrs. Doubtfire, ocena: 4.71361724811941
Film: Pretty Woman, ocena: 4.710689304532709
Film: Titanic, ocena: 4.659408044652687
Film: Speed, ocena: 4.6410800075149625
Film: Mission: Impossible III, ocena: 4.616012576187981
Film: Men in Black, ocena: 4.377538716160303
Film: Batman, ocena: 4.375971640898608
Film: Spider-Man, ocena: 4.3204775022956845
Film: The Lion King, ocena: 4.266277000490918
Film: Ocean's Eleven, ocena: 4.197147854163864
Film: The Fifth Element, ocena: 4.165709969788519


# Evalvacija priporočilnega sistema

In [45]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
class Recommender:
    def __init__(self, predictor):
        self.predictor = predictor

    def fit(self, X):
        self.X = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        predictions = self.predictor.predict(userID)
        if not rec_seen:
            # Filter out movies the user has already rated
            rated_movies = set(self.X.data[self.X.data['userID'] == userID]['movieID'])
            predictions = {movie_id: rating for movie_id, rating in predictions.items() if movie_id not in rated_movies}

        # Sort the predictions and return the top n
        sorted_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
        return sorted_predictions[:n]
    
    def evaluate(self, test_data, n, threshold=4):
        # Calculate the global average rating
        global_avg_rating = test_data.data['rating'].mean()

        # Initialize sums for metrics
        mae_sum = 0
        mse_sum = 0
        precision_sum = 0
        recall_sum = 0
        f1_sum = 0
        num_users = 0

        for user_id in test_data.data['userID'].unique():
            user_actual_ratings = test_data.data[test_data.data['userID'] == user_id]
            user_predicted_ratings = self.predictor.predict(user_id)

            # Calculate MAE and MSE
            actual = []
            predicted = []
            relevant_items = set()
            recommended_items = set()
            for _, row in user_actual_ratings.iterrows():
                movie_id = row['movieID']
                actual_rating = row['rating']
                predicted_rating = user_predicted_ratings.get(movie_id, global_avg_rating)
                actual.append(actual_rating)
                predicted.append(predicted_rating)

                if actual_rating > threshold:
                    relevant_items.add(movie_id)
                if predicted_rating > threshold:
                    recommended_items.add(movie_id)

            mae_sum += mean_absolute_error(actual, predicted)
            mse_sum += mean_squared_error(actual, predicted)

            # Calculate precision and recall
            true_positives = len(relevant_items.intersection(recommended_items))
            precision = true_positives / len(recommended_items) if recommended_items else 0
            recall = true_positives / len(relevant_items) if relevant_items else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

            precision_sum += precision
            recall_sum += recall
            f1_sum += f1
            num_users += 1

        # Calculate averages
        mae = mae_sum / num_users
        rmse = (mse_sum / num_users) ** 0.5
        precision = precision_sum / num_users
        recall = recall_sum / num_users
        f1 = f1_sum / num_users

        return rmse, mae, precision, recall, f1

md = MovieData('podatki/movies.dat')
uim = UserItemData('podatki/user_ratedmovies.dat', min_ratings=1000, to_date='1.1.2008')
rp = RandomPredictor(3, 5)
rec = Recommender(rp)
rec.fit(uim)
uim_test = UserItemData('podatki/user_ratedmovies.dat', min_ratings=200, from_date='2.1.2008')
mse, mae, prec, rec, f1 = rec.evaluate(uim_test, 20)
print(f"Random predictor: MSE={mse}, MAE={mae}, Precision={prec}, Recall={rec}, F1={f1} ")

rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
uim_test = UserItemData('podatki/user_ratedmovies.dat', min_ratings=200, from_date='2.1.2008')
mse, mae, prec, rec, f1 = rec.evaluate(uim_test, 20)
print(f"Item predictor: MSE={mse}, MAE={mae}, Precision={prec}, Recall={rec}, F1={f1} ")

rp = ViewsPredictor()
rec = Recommender(rp)
rec.fit(uim)
uim_test = UserItemData('podatki/user_ratedmovies.dat', min_ratings=200, from_date='2.1.2008')
mse, mae, prec, rec, f1 = rec.evaluate(uim_test, 20)
print(f"Views predictor: MSE={mse}, MAE={mae}, Precision={prec}, Recall={rec}, F1={f1} ")

rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)
uim_test = UserItemData('podatki/user_ratedmovies.dat', min_ratings=200, from_date='2.1.2008')
mse, mae, prec, rec, f1 = rec.evaluate(uim_test, 20)
print(f"Slope predictor: MSE={mse}, MAE={mae}, Precision={prec}, Recall={rec}, F1={f1} ")

Random predictor: MSE=0.9226895564384013, MAE=0.6956353741800367, Precision=0.06487594904114342, Recall=0.04378334209502327, F1=0.04438600193512654 
Item predictor: MSE=1.8124474577933816, MAE=1.1791350758668626, Precision=0.012486652040538965, Recall=0.010214397891076336, F1=0.009420782147865918 
Views predictor: MSE=570.8535017489829, MAE=287.95609469579125, Precision=0.09095877752750274, Recall=0.19680972102588765, F1=0.10764338115574235 
Slope predictor: MSE=0.861078525795194, MAE=0.6583039859118182, Precision=0.007711075732277145, Recall=0.005802344494924001, F1=0.005331309022001404 
