# Качаем библиотеку

In [None]:
!pip install surprise



# Импортируем библиотеки

In [None]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from collections import defaultdict
import numpy as np

# Загружаем данные

In [None]:
!git clone https://github.com/divensambhwani/MovieLens-100K_Recommender-System.git  > /dev/null 2>&1
%cd MovieLens-100K_Recommender-System

/content/MovieLens-100K_Recommender-System/MovieLens-100K_Recommender-System


In [None]:
# Read the raw ratings table and preview its first five rows.
ratings = pd.read_csv("data/ratings.csv")
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Print column dtypes and non-null counts to sanity-check the loaded table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


# Разделяем на train и test

In [None]:
# Chronological split: the earliest 80% of ratings are used for training and
# the most recent 20% are held out for evaluation.
train_ratio = 0.8
df = ratings.sort_values(by="timestamp")
train_size = int(len(df) * train_ratio)

train_df, test_df = df.iloc[:train_size], df.iloc[train_size:]

In [None]:
# Wrap the training frame in a Surprise dataset; ratings are declared to lie
# on the 0.5-5 scale.
reader = Reader(rating_scale=(0.5, 5))
train_data = Dataset.load_from_df(train_df.loc[:, ["userId", "movieId", "rating"]], reader)
# Build the trainset from every row of the training frame.
trainset = train_data.build_full_trainset()

# Обучение userKNN

In [None]:
# User-based KNN with cosine similarity between users.
sim_options = dict(name="cosine", user_based=True)
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7ebe6f4ee4d0>

In [None]:
# Turn the held-out frame into plain (user, item, rating) tuples and score them
# with the fitted model.
test_ratings = list(
    test_df[["userId", "movieId", "rating"]].itertuples(index=False, name=None)
)

predictions = model.test(test_ratings)

# Считаем NDCG@10

In [None]:
def ndcg_at_k(predictions, k=10):
    """Compute the mean NDCG@k over users from rating predictions.

    For every user the predicted items are ranked by estimated rating
    (descending). DCG is accumulated from the true ratings of the items in
    that ranked order and normalised by the ideal DCG, i.e. the DCG of the
    user's true ratings sorted in descending order. Users whose ideal DCG is
    zero are skipped.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: ranking cut-off; only the top ``k`` items per user contribute.

    Returns:
        Mean NDCG@k as a float, or 0.0 when no user could be scored.
    """
    # Keep the estimate and the true rating together so that sorting by the
    # estimate carries each item's true relevance along with it.
    per_user = defaultdict(list)
    for uid, _iid, true_r, est, _ in predictions:
        per_user[uid].append((est, true_r))

    ndcgs = []
    for pairs in per_user.values():
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)[:k]
        dcg = sum(
            rel / np.log2(pos + 2) for pos, (_, rel) in enumerate(ranked) if rel > 0
        )

        ideal = sorted((rel for _, rel in pairs), reverse=True)[:k]
        idcg = sum(rel / np.log2(pos + 2) for pos, rel in enumerate(ideal))

        if idcg > 0:
            ndcgs.append(dcg / idcg)

    # Avoid np.mean([]) -> NaN plus a RuntimeWarning when nothing was scored.
    if not ndcgs:
        return 0.0
    return float(np.mean(ndcgs))

# Score the held-out predictions with a cut-off of ten items per user.
ndcg_10 = ndcg_at_k(predictions, 10)
print(f"NDCG@10: {ndcg_10:.4f}")

NDCG@10: 0.8465


# Создаем датасет, который идет в прод

In [None]:
def get_recommendations(model, trainset, k=10):
    """Build top-k recommendations of not-yet-rated items for every train user.

    Args:
        model: fitted estimator exposing ``predict(raw_uid, raw_iid)`` whose
            result has an ``est`` attribute.
        trainset: Surprise trainset the model was fitted on.
        k: number of items to recommend per user.

    Returns:
        ``defaultdict(list)`` mapping raw user id -> list of up to ``k`` raw
        item ids, ordered by descending estimated rating.
    """
    # Resolve each inner item id to its raw id once, outside the user loop.
    catalogue = [(inner_iid, trainset.to_raw_iid(inner_iid)) for inner_iid in trainset.all_items()]

    recommendations = defaultdict(list)
    for uid in trainset.all_users():
        user_raw_id = trainset.to_raw_uid(uid)
        # trainset.ur[uid] holds (inner item id, rating) pairs, so the filter
        # must compare inner ids; comparing raw ids against these tuples never
        # matches and lets already-rated items through.
        rated_items = {inner_iid for inner_iid, _ in trainset.ur[uid]}

        preds = [
            (raw_iid, model.predict(user_raw_id, raw_iid).est)
            for inner_iid, raw_iid in catalogue
            if inner_iid not in rated_items
        ]
        top_k = sorted(preds, key=lambda x: x[1], reverse=True)[:k]
        recommendations[user_raw_id] = [iid for iid, _ in top_k]

    return recommendations

# One row per user: the raw user id and that user's ordered list of item ids.
recommendations = get_recommendations(model, trainset, k=10)
final_dataset = pd.DataFrame(
    {
        "userid": list(recommendations.keys()),
        "recommendation": list(recommendations.values()),
    }
)

print(final_dataset)

     userid                                     recommendation
0       429  [99, 581, 148, 496, 685, 1105, 1046, 1151, 53,...
1       107  [99, 148, 496, 467, 1151, 1696, 53, 389, 3073,...
2       191  [99, 148, 496, 626, 467, 53, 1140, 3073, 3096,...
3        99  [99, 581, 148, 496, 685, 1105, 1151, 1696, 53,...
4        54  [99, 148, 496, 626, 467, 1151, 1696, 53, 3073,...
..      ...                                                ...
517     445  [99, 148, 496, 626, 467, 1151, 53, 1140, 3073,...
518      18  [99, 148, 496, 626, 467, 1151, 53, 1140, 3073,...
519      10  [99, 148, 496, 685, 1105, 1151, 1696, 53, 389,...
520     424  [148, 496, 626, 467, 53, 1140, 3073, 3096, 347...
521     495  [99, 148, 496, 685, 1046, 467, 1151, 53, 1140,...

[522 rows x 2 columns]


In [None]:
# Persist the recommendations table to a CSV file in the current directory.
# NOTE(review): with to_csv's defaults the DataFrame index is written as an
# extra leading column - confirm the downstream consumer expects that.
final_dataset.to_csv("recommendations.csv")

In [None]:
# Hand the exported CSV to the Colab file-download helper.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, which is likely why the import lives here and not in the top cell.
from google.colab import files
files.download('recommendations.csv')