<a href="https://colab.research.google.com/github/Xeesto/UEP/blob/main/Systemy_Rekomendacyjne.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd

In [7]:
# Read the two input tables from the working directory: the movie catalogue and the user ratings
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [None]:
# Number of rows in the movies table
num_movies = movies.shape[0]
print(f"Liczba filmów w bazie: {num_movies}")

Liczba filmów w bazie: 9742


In [None]:
# Number of rows in the ratings table (one row per submitted rating)
num_ratings = ratings.shape[0]
print(f"Liczba ocen w bazie: {num_ratings}")

Liczba ocen w bazie: 100836


In [None]:
# Number of distinct users that appear in the ratings table
num_users = ratings.userId.nunique()
print(f"Liczba użytkowników w bazie: {num_users}")


Liczba użytkowników w bazie: 610


In [None]:
# Average number of ratings submitted per user:
# first count the ratings of each user, then average those counts
ratings_per_user = ratings.groupby('userId')['rating'].count()
avg_ratings_per_user = ratings_per_user.mean()
print(f"Średnia liczba ocen użytkownika: {avg_ratings_per_user:.2f}")

Średnia liczba ocen użytkownika: 165.30


In [None]:
# Average number of ratings received per movie.
# The grouping runs over the ratings table, so movies that have no rating at all
# are not part of this average.
ratings_per_movie = ratings.groupby('movieId')['rating'].count()
avg_ratings_per_movie = ratings_per_movie.mean()
print(f"Średnia liczba ocen na film: {avg_ratings_per_movie:.2f}")

Średnia liczba ocen na film: 10.37


In [None]:
# Distribution of rating values, ordered from the lowest to the highest rating
rating_counts = ratings['rating'].value_counts().sort_index()
print("Rozkład ocen:", rating_counts, sep="\n")

Rozkład ocen:
rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64


In [None]:
# Lookup table movieId -> title, built from (movieId, title) row pairs
movie_titles = dict(movies[['movieId', 'title']].itertuples(index=False, name=None))

In [None]:
# Optional debugging aid: uncomment to print the whole movieId -> title dictionary
# print("Słownik filmów:")
# print(movie_titles)

In [None]:
# Look up the title of a single movie by its ID
movie_id = 13  # movie ID to look up
title = movie_titles.get(movie_id)

# dict.get() returns None for a missing key; compare against None explicitly so
# that an existing but empty title is not reported as "not found"
if title is not None:
    print(f"Film o ID {movie_id} to: {title}")
else:
    print(f"Nie znaleziono filmu o ID: {movie_id}")

Film o ID 13 to: Balto (1995)


In [None]:
# Per-movie rating statistics: mean rating and number of ratings
movie_stats = (
    ratings.groupby('movieId')['rating']
    .agg(mean='mean', count='count')
    .reset_index()
)

# Priors for the Bayesian average, computed over the movies in movie_stats:
#   C - mean number of ratings per movie
#   m - mean of the per-movie mean ratings
C = movie_stats['count'].mean()
m = movie_stats['mean'].mean()


In [None]:
# Bayesian average of a group of ratings
def bayesian_avg(ratings, prior_count=None, prior_mean=None):
    """Return the Bayesian average of ``ratings``.

    The result is ``(prior_count * prior_mean + sum) / (prior_count + count)``,
    i.e. the plain mean pulled towards ``prior_mean`` as if ``prior_count``
    extra ratings equal to ``prior_mean`` had been observed.

    Parameters
    ----------
    ratings
        Object exposing ``.sum()`` and ``.count()``, e.g. the pandas Series of
        one movie's ratings that ``groupby(...).agg`` passes in. Note that the
        parameter name shadows the module-level ``ratings`` table inside this
        function.
    prior_count
        Weight of the prior. Defaults to the module-level ``C``.
    prior_mean
        Value the average is pulled towards. Defaults to the module-level ``m``.

    Returns
    -------
    The Bayesian average as a float-like scalar.
    """
    # Fall back to the notebook-wide priors so existing single-argument calls keep working
    if prior_count is None:
        prior_count = C
    if prior_mean is None:
        prior_mean = m
    return (prior_count * prior_mean + ratings.sum()) / (prior_count + ratings.count())

In [None]:
# Bayesian average rating of every movie, as a two-column frame (movieId, bayesian_avg)
bayesian_avg_ratings = (
    ratings.groupby('movieId')['rating']
    .agg(bayesian_avg)
    .rename('bayesian_avg')
    .reset_index()
)

# This cell overwrites movie_stats with its own output. Drop the columns a previous
# run may already have merged in, so that re-running the cell does not produce
# suffixed duplicates (bayesian_avg_x / bayesian_avg_y, title_x / title_y).
movie_stats = (
    movie_stats
    .drop(columns=['bayesian_avg', 'title'], errors='ignore')
    .merge(bayesian_avg_ratings, on='movieId')
    # Inner join with the catalogue: attaches the title to every movie it contains
    .merge(movies[['movieId', 'title']], on='movieId')
)

In [None]:
# Rank the movies from the highest to the lowest Bayesian average
top_movies = movie_stats.sort_values('bayesian_avg', ascending=False)

In [None]:
# Show the five best-ranked movies by Bayesian average
print("Top 5 filmy według średniej bayesowskiej:")
top_five = top_movies.loc[:, ['movieId', 'title', 'bayesian_avg']].head(5)
print(top_five)

Top 5 filmy według średniej bayesowskiej:
      movieId                                      title  bayesian_avg
277       318           Shawshank Redemption, The (1994)      4.392070
659       858                      Godfather, The (1972)      4.236457
2224     2959                          Fight Club (1999)      4.227052
224       260  Star Wars: Episode IV - A New Hope (1977)      4.192646
46         50                 Usual Suspects, The (1995)      4.190567


SURPRISE

In [1]:
!pip install numpy==1.23.5




In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2461560 sha256=785b40d0aa838751adf59fb63b6b7ca81a3285ffff17b044af68fc279e5984c0
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [3]:
import numpy as np
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.prediction_algorithms.knns import KNNBasic, KNNBaseline

In [8]:
# Rating scale for Surprise: the smallest and largest rating found in the data
rating_scale = (ratings['rating'].min(), ratings['rating'].max())
min_rating, max_rating = rating_scale

reader = Reader(rating_scale=rating_scale)
# Columns are handed over in user, item, rating order
data = Dataset.load_from_df(ratings.loc[:, ['userId', 'movieId', 'rating']], reader)

In [9]:
# 5-fold cross-validation of an SVD model with default hyper-parameters.
# Seed the random number generators first (as the Surprise FAQ recommends for
# reproducible experiments) and give the model its own random_state, so that the
# metrics below and every later fit of `algo` are the same on each run.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

algo = SVD(random_state=SEED)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8730  0.8701  0.8723  0.8805  0.8767  0.8745  0.0037  
MAE (testset)     0.6708  0.6713  0.6675  0.6772  0.6728  0.6719  0.0031  
Fit time          3.73    1.52    1.56    1.54    1.50    1.97    0.88    
Test time         0.17    0.16    0.26    0.11    0.12    0.16    0.05    


{'test_rmse': array([0.87300129, 0.87014269, 0.87226513, 0.88047291, 0.87672338]),
 'test_mae': array([0.67084473, 0.67128644, 0.66754631, 0.67722128, 0.67276033]),
 'fit_time': (3.725538730621338,
  1.5180644989013672,
  1.5553226470947266,
  1.5388898849487305,
  1.4990928173065186),
 'test_time': (0.1737673282623291,
  0.15750527381896973,
  0.2621443271636963,
  0.11430835723876953,
  0.11573386192321777)}

In [10]:
# Refit the model on every available rating (no hold-out) before producing recommendations
trainset = data.build_full_trainset()
# Trailing ';' suppresses the echoed object repr, whose memory address changes on every run
algo.fit(trainset);

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7cdca846f590>

In [11]:
# Top-N recommendations for a single user
# ---------------------------------------
# The user ID and the list length are defined once here, so the "already rated"
# filter and the predictions can never refer to different users.
TARGET_USER_ID = 600  # raw userId, as it appears in the ratings table
TOP_N = 10            # number of recommendations to show

all_movie_ids = set(movies['movieId'])

# Movies the target user has already rated
rated_movie_ids = set(ratings.loc[ratings['userId'] == TARGET_USER_ID, 'movieId'])
if not rated_movie_ids:
    # Nothing is known about this user, so say so instead of silently listing movies
    print(f"Uwaga: użytkownik o ID {TARGET_USER_ID} nie ma żadnych ocen w bazie.")

# Candidates: every catalogue movie the user has not rated yet
unrated_movie_ids = all_movie_ids - rated_movie_ids

# Predicted rating for each candidate, as (movieId, estimate) pairs
predictions = [
    (movie_id, algo.predict(TARGET_USER_ID, movie_id).est)
    for movie_id in unrated_movie_ids
]

# Highest predicted rating first; keep the TOP_N best
# (the variable keeps its original name, top_10, for compatibility)
predictions.sort(key=lambda pair: pair[1], reverse=True)
top_10 = predictions[:TOP_N]

# Attach the movie titles to the selected IDs
recommended_movies = pd.DataFrame(top_10, columns=['movieId', 'predicted_rating'])
recommended_movies = recommended_movies.merge(movies, on='movieId')
print(recommended_movies[['movieId', 'title', 'predicted_rating']])

   movieId                                            title  predicted_rating
0     1223  Grand Day Out with Wallace and Gromit, A (1989)          4.306862
1      541                              Blade Runner (1982)          4.278327
2     1233                     Boot, Das (Boat, The) (1981)          4.246605
3     1204                        Lawrence of Arabia (1962)          4.188972
4     3435                          Double Indemnity (1944)          4.155550
5    78499                               Toy Story 3 (2010)          4.143663
6     1208                            Apocalypse Now (1979)          4.119316
7    68157                      Inglourious Basterds (2009)          4.117311
8      951                           His Girl Friday (1940)          4.077474
9     1304        Butch Cassidy and the Sundance Kid (1969)          4.076185
