In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [12]:

# Load the '::'-delimited rating and movie files. A multi-character separator
# is interpreted by pandas as a regular expression, which is why the Python
# parser engine is requested explicitly.
ratings = pd.read_csv('ratings.dat', sep='::', engine='python',
                      names=['userId', 'movieId', 'rating', 'timestamp'])

movies = pd.read_csv('movies.dat', sep='::', engine='python',
                     names=['movieId', 'title', 'genres'],
                     encoding='latin1')

# Count catalogue movies that never occur in the ratings by membership rather
# than by subtracting two counts: the subtraction is only right when every
# rated movieId is present in `movies` and the ids there are unique.
unrated_movie_count = int((~movies['movieId'].isin(ratings['movieId'])).sum())

print("Liczba wszystkich ocen:", len(ratings))
print("Liczba unikalnych użytkowników:", ratings['userId'].nunique())
print("Liczba unikalnych filmów ocenionych:", ratings['movieId'].nunique())
print("Liczba wszystkich filmów w movies.dat:", len(movies))
print("Liczba filmów bez ocen:", unrated_movie_count)
print("Przykładowe filmy:")
# Seed the preview so re-running the notebook shows the same rows, and cap the
# sample size so a catalogue with fewer than 10 rows does not raise.
print(movies.sample(min(10, len(movies)), random_state=42))




Liczba wszystkich ocen: 1000209
Liczba unikalnych użytkowników: 6040
Liczba unikalnych filmów ocenionych: 3706
Liczba wszystkich filmów w movies.dat: 3883
Liczba filmów bez ocen: 177
Przykładowe filmy:
      movieId                                              title  \
183       185                                    Net, The (1995)   
773       783                Hunchback of Notre Dame, The (1996)   
1769     1837                          Odd Couple II, The (1998)   
1753     1819                        Storefront Hitchcock (1997)   
1425     1450  Prisoner of the Mountains (Kavkazsky Plennik) ...   
1891     1960                           Last Emperor, The (1987)   
718       727                                 War Stories (1995)   
2876     2945                               Mike's Murder (1984)   
3519     3588                 King of Marvin Gardens, The (1972)   
2412     2481                              My Name Is Joe (1998)   

                            genres  
183         

In [13]:
# User-movie matrix: one row per userId, one column per movieId, holding the
# rating; combinations that do not occur in `ratings` are NaN.
user_movie_matrix = ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
# Missing ratings are replaced with 0 before similarities are computed.
user_movie_matrix_filled = user_movie_matrix.fillna(0)

# Pairwise similarity between users: e.g. the entry at [109, 300] is the
# cosine similarity between the rating rows of users 109 and 300.
user_similarity = cosine_similarity(user_movie_matrix_filled)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_movie_matrix_filled.index,
    columns=user_movie_matrix_filled.index,
)

#  Rating prediction: find the k most similar users, take their ratings of
#  the movie and average them.
def predict_rating(user_id, movie_id, k=5):
    """Predict a user's rating of a movie from the k most similar users.

    The k users with the highest similarity to ``user_id`` (the user itself
    excluded) are selected from ``user_similarity_df``. The prediction is the
    unweighted mean of the ratings those users gave ``movie_id`` in
    ``user_movie_matrix``; neighbours who did not rate the movie are ignored.

    Parameters
    ----------
    user_id : label of the user in ``user_similarity_df``.
    movie_id : column label of the movie in ``user_movie_matrix``.
    k : int, default 5
        Number of most similar users to consult.

    Returns
    -------
    float
        Mean neighbour rating, or NaN when no prediction can be made: either
        none of the k neighbours rated the movie, or the movie has no column
        in ``user_movie_matrix`` at all.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``user_similarity_df``.
    """
    # A movie without a column in the rating matrix would make the .loc
    # lookup below raise KeyError. Callers already treat NaN as
    # "no prediction", so report it the same way.
    if movie_id not in user_movie_matrix.columns:
        return float('nan')

    similar_users = user_similarity_df[user_id].drop(user_id).sort_values(ascending=False)
    top_users = similar_users.head(k).index
    neighbor_ratings = user_movie_matrix.loc[top_users, movie_id].dropna()
    # The mean of an empty selection is NaN, which signals "no neighbour rated it".
    return neighbor_ratings.mean()

# Select the best k: score each candidate on a fixed random sample of ratings
# and keep the one with the lowest RMSE.
# NOTE(review): the sampled ratings come from the same `ratings` frame the
# rating matrix was pivoted from, so this is not a held-out estimate.
sample_k_search = ratings.sample(1000, random_state=42)
best_rmse = float('inf')
k_best = None

for k in [10, 30, 50, 100, 200, 500]:
    # (prediction, true rating) for every sampled row
    scored = [
        (predict_rating(row['userId'], row['movieId'], k), row['rating'])
        for _, row in sample_k_search.iterrows()
    ]
    # Rows for which no prediction could be made (NaN) are left out of the score
    usable = [(guess, truth) for guess, truth in scored if not np.isnan(guess)]
    predicted = [guess for guess, _ in usable]
    actual = [truth for _, truth in usable]

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    print(f"k = {k:<4} RMSE = {rmse:.4f}")
    if rmse < best_rmse:
        best_rmse, k_best = rmse, k

print(f"\nNajlepsze k = {k_best}, RMSE = {best_rmse:.4f}")

#  Return the top-N recommended movies for a user
def recommend_movies_for_user(user_id, k=None, top_n=5, min_ratings=1):
    """Recommend unseen movies ranked by the mean rating of similar users.

    The k users most similar to ``user_id`` are taken from
    ``user_similarity_df``. Every movie the user has not rated is scored by the
    mean rating those neighbours gave it, and the ``top_n`` best-scoring
    movies are looked up in ``movies``.

    Parameters
    ----------
    user_id : label of the user in ``user_similarity_df`` / ``user_movie_matrix``.
    k : int, optional
        Number of neighbours. When omitted, the module-level ``k_best`` is
        read at call time, so re-running the k search is picked up without
        redefining this function.
    top_n : int, default 5
        Maximum number of movies to return.
    min_ratings : int, default 1
        Minimum number of neighbours that must have rated a movie for it to
        be eligible. Raising it keeps movies rated by a single neighbour from
        dominating the list.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title``, best recommendation first. May hold
        fewer than ``top_n`` rows when there are not enough eligible movies or
        when a recommended id is missing from ``movies``.
    """
    if k is None:
        k = k_best

    similar_users = user_similarity_df[user_id].drop(user_id).sort_values(ascending=False)
    top_users = similar_users.head(k).index
    similar_ratings = user_movie_matrix.loc[top_users]

    own_ratings = user_movie_matrix.loc[user_id]
    seen_movies = own_ratings[own_ratings.notna()].index

    ranked = similar_ratings.mean().sort_values(ascending=False).drop(index=seen_movies)
    # Movies no neighbour rated have a NaN mean; they must not fill the list
    # when fewer than top_n real candidates exist.
    support = similar_ratings.count().reindex(ranked.index)
    ranked = ranked[(support >= min_ratings) & ranked.notna()]
    top_movie_ids = ranked.head(top_n).index

    # The isin() filter yields rows in `movies` order, which discards the
    # ranking; reorder the selected rows by their position in the ranked ids.
    rank_of = {movie_id: position for position, movie_id in enumerate(top_movie_ids)}
    selected = movies.loc[movies['movieId'].isin(top_movie_ids), ['movieId', 'title']]
    order = selected['movieId'].map(rank_of).to_numpy().argsort(kind='stable')
    return selected.iloc[order]

# Show the five recommendations for user 109 with the selected k
print(f"\nRekomendacje dla użytkownika 109 (k={k_best}):")
recommendations_for_109 = recommend_movies_for_user(109, k_best, 5)
print(recommendations_for_109)

# RMSE of the selected k on a larger random sample of ratings.
# NOTE(review): the sampled ratings come from the same `ratings` frame the
# rating matrix was pivoted from, so this is not a held-out estimate.
sample_eval = ratings.sample(10000, random_state=123)

# (prediction, true rating) for every sampled row
scored_eval = [
    (predict_rating(row['userId'], row['movieId'], k_best), row['rating'])
    for _, row in sample_eval.iterrows()
]
# Rows for which no prediction could be made (NaN) are left out of the score
usable_eval = [(guess, truth) for guess, truth in scored_eval if not np.isnan(guess)]
predicted = [guess for guess, _ in usable_eval]
actual = [truth for _, truth in usable_eval]

rmse = np.sqrt(mean_squared_error(actual, predicted))
print(f"\nRMSE (k={k_best}): {rmse:.4f}")


k = 10   RMSE = 1.0423
k = 30   RMSE = 0.9753
k = 50   RMSE = 0.9792
k = 100  RMSE = 0.9746
k = 200  RMSE = 0.9823
k = 500  RMSE = 0.9869

Najlepsze k = 100, RMSE = 0.9746

Rekomendacje dla użytkownika 109 (k=100):
      movieId                                 title
1371     1392                   Citizen Ruth (1996)
2591     2660  Thing From Another World, The (1951)
2612     2681                Free Enterprise (1998)
3295     3364            Asphalt Jungle, The (1950)
3791     3861              Replacements, The (2000)

RMSE (k=100): 0.9495
