<a href="https://colab.research.google.com/github/UEPP40/PUM/blob/golon/system_rekomendacyjny.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Wczytanie danych o ocenach i filmach
ratings = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv')
movies = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv')

# Liczba wszystkich ocen
total_ratings = ratings.shape[0]

# Liczba filmów w bazie
total_movies = movies.shape[0]

# Liczba użytkowników
total_users = ratings['userId'].nunique()

# Średnia liczba ocen na użytkownika
avg_ratings_per_user = total_ratings / total_users

# Średnia liczba ocen na film
avg_ratings_per_movie = total_ratings / total_movies

# Rozkład ocen
ratings_distribution = ratings['rating'].value_counts().sort_index()

# Wyświetlenie wyników
print("Liczba wszystkich ocen:", total_ratings)
print("Liczba filmów w bazie:", total_movies)
print("Liczba użytkowników:", total_users)
print("Średnia liczba ocen na użytkownika:", avg_ratings_per_user)
print("Średnia liczba ocen na film:", avg_ratings_per_movie)
print("\nRozkład ocen:")
print(ratings_distribution)


Liczba wszystkich ocen: 100836
Liczba filmów w bazie: 9742
Liczba użytkowników: 610
Średnia liczba ocen na użytkownika: 165.30491803278687
Średnia liczba ocen na film: 10.350646684459043

Rozkład ocen:
rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64


In [2]:
# Słownik tytułów filmów
movie_titles = dict(zip(movies['movieId'], movies['title']))

# Obliczenie C i m
C = ratings['rating'].mean()
m = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
m = m['count'].mean()

# Funkcja bayesian_avg
def bayesian_avg(ratings):
    bayesian_avg = (C * m + ratings.sum()) / (C + ratings.count())
    return bayesian_avg

# Obliczenie bayesian_avg_ratings
bayesian_avg_ratings = ratings.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']

# Połączenie z danymi o filmach
movie_stats = movies.merge(bayesian_avg_ratings, on='movieId')

# Wyświetlenie wyników
print(movie_stats.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  bayesian_avg  
0  Adventure|Animation|Children|Comedy|Fantasy      4.024276  
1                   Adventure|Children|Fantasy      3.645857  
2                               Comedy|Romance      3.708193  
3                         Comedy|Drama|Romance      5.028823  
4                                       Comedy      3.558189  


In [3]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Wczytanie danych o ocenach i filmach
ratings = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv')
movies = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv')

# Liczba wszystkich ocen
total_ratings = ratings.shape[0]

# Liczba filmów w bazie
total_movies = movies.shape[0]

# Liczba użytkowników
total_users = ratings['userId'].nunique()

# Średnia liczba ocen na użytkownika
avg_ratings_per_user = total_ratings / total_users

# Średnia liczba ocen na film
avg_ratings_per_movie = total_ratings / total_movies

# Rozkład ocen
ratings_distribution = ratings['rating'].value_counts().sort_index()

# Wyświetlenie wyników
print("Liczba wszystkich ocen:", total_ratings)
print("Liczba filmów w bazie:", total_movies)
print("Liczba użytkowników:", total_users)
print("Średnia liczba ocen na użytkownika:", avg_ratings_per_user)
print("Średnia liczba ocen na film:", avg_ratings_per_movie)
print("\nRozkład ocen:")
print(ratings_distribution)

# Słownik tytułów filmów
movie_titles = dict(zip(movies['movieId'], movies['title']))

# Obliczenie C i m
C = ratings['rating'].mean()
m = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
m = m['count'].mean()

# Funkcja bayesian_avg
def bayesian_avg(ratings):
    bayesian_avg = (C * m + ratings.sum()) / (C + ratings.count())
    return bayesian_avg

# Obliczenie bayesian_avg_ratings
bayesian_avg_ratings = ratings.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']

# Połączenie z danymi o filmach
movie_stats = movies.merge(bayesian_avg_ratings, on='movieId')

# Wyświetlenie wyników
print(movie_stats.head())

def create_sparse(df):
    M = df['userId'].nunique()
    N = df['movieId'].nunique()
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N))))
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]
    X = csr_matrix((df["rating"], (user_index, item_index)), shape=(M,N))
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Utworzenie macierzy rzadkiej
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_sparse(ratings)

n_total = X.shape[0] * X.shape[1]
n_ratings = X.nnz
sparsity = n_ratings / n_total
sparsity_p = round(sparsity * 100, 2)

print(f"Matrix sparsity: {sparsity_p}%")


Liczba wszystkich ocen: 100836
Liczba filmów w bazie: 9742
Liczba użytkowników: 610
Średnia liczba ocen na użytkownika: 165.30491803278687
Średnia liczba ocen na film: 10.350646684459043

Rozkład ocen:
rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  bayesian_avg  
0  Adventure|Animation|Children|Comedy|Fantasy      4.024276  
1                   Adventure|Children|Fantasy      3.645857  
2                               Comedy|Romance      3.708193  
3                         Comedy|Drama|Romance      5.028823  
4                  

In [4]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k, metric='cosine'):
    X = X.T
    neighbour_ids = []
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1, -1)
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    neighbour = neighbour[0][1:]  # Pomijamy pierwszy element, który jest samym filmem
    for n in neighbour:
        neighbour_ids.append(movie_inv_mapper[n])
    return neighbour_ids

# Przykładowe użycie funkcji do znalezienia 10 rekomendacji dla filmu o identyfikatorze 1 (Toy Story)
recommendations = find_similar_movies(1, X, movie_mapper, movie_inv_mapper, k=10)

# Wyświetlenie rekomendacji
for i, movie_id in enumerate(recommendations, 1):
    print(f"{i}. {movie_titles[movie_id]}")


1. Toy Story 2 (1999)
2. Jurassic Park (1993)
3. Independence Day (a.k.a. ID4) (1996)
4. Star Wars: Episode IV - A New Hope (1977)
5. Forrest Gump (1994)
6. Lion King, The (1994)
7. Star Wars: Episode VI - Return of the Jedi (1983)
8. Mission: Impossible (1996)
9. Groundhog Day (1993)
10. Back to the Future (1985)


In [9]:
# Install surprise library
!pip install surprise

# Importing relevant libraries
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Assuming 'ratings' DataFrame is already loaded
# Get minimum and maximum rating from the dataset
min_rating = ratings.rating.min()
max_rating = ratings.rating.max()
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Define the algorithm
algo = SVD()

# Perform cross-validation
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8679  0.8746  0.8677  0.8771  0.8801  0.8735  0.0050  
MAE (testset)     0.6694  0.6719  0.6690  0.6743  0.6741  0.6717  0.0023  
Fit time          1.44    1.13    1.14    1.14    1.10    1.19    0.13    
Test time         0.17    0.32    0.10    0.18    0.09    0.17    0.08    


{'test_rmse': array([0.86794448, 0.8745855 , 0.86766498, 0.87711982, 0.88009021]),
 'test_mae': array([0.66940616, 0.67189651, 0.6689616 , 0.6743451 , 0.67409027]),
 'fit_time': (1.4414112567901611,
  1.1312918663024902,
  1.139693021774292,
  1.1427602767944336,
  1.0997066497802734),
 'test_time': (0.17045092582702637,
  0.3210625648498535,
  0.09974288940429688,
  0.18238592147827148,
  0.08806276321411133)}

In [10]:

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25)

algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.8753


0.875299218301957