In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

movies_path = "movies.csv"
ratings_path = "ratings.csv"

## **1. Item-Based Collaborative Filtering**

In [41]:
movies_path = "movies.csv"
ratings_path = "ratings.csv"

movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
ratings_sparse = csr_matrix(ratings_matrix)

item_similarity = cosine_similarity(ratings_sparse.T)
item_similarity_df = pd.DataFrame(item_similarity, index=ratings_matrix.columns, columns=ratings_matrix.columns)

def get_item_based_recommendations(user_id, n=10):
    user_ratings = ratings_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings > 0].index
    scores = pd.Series(dtype=np.float64)
    for item in rated_items:
        sim_items = item_similarity_df[item]
        scores = scores.add(sim_items * user_ratings[item], fill_value=0)
    scores = scores.drop(rated_items)
    return scores.nlargest(n).index.tolist()

IB_rec = get_item_based_recommendations(1, 10)
movies[movies['movieId'].isin(IB_rec)][['title', 'movieId']]

Unnamed: 0,title,movieId
484,Blade Runner (1982),541
843,Die Hard (1988),1036
959,"Princess Bride, The (1987)",1197
962,Aliens (1986),1200
975,Alien (1979),1214
999,"Terminator, The (1984)",1240
1050,Indiana Jones and the Last Crusade (1989),1291
1232,"Fifth Element, The (1997)",1527
1674,Indiana Jones and the Temple of Doom (1984),2115
3322,Memento (2000),4226


## **2. Hybrid Recommender System**

In [None]:
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

rated_movie_ids = set(ratings['movieId'].unique())
movies_filtered = movies[movies['movieId'].isin(rated_movie_ids)].reset_index(drop=True)
ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
ratings_matrix = ratings_matrix[movies_filtered['movieId']].fillna(0)
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
genre_matrix = tfidf.fit_transform(movies_filtered['genres'].fillna(''))
genre_similarity = cosine_similarity(genre_matrix)
item_similarity = cosine_similarity(ratings_matrix.T)

alpha = 0.5
hybrid_similarity = alpha * item_similarity + (1 - alpha) * genre_similarity
np.fill_diagonal(hybrid_similarity, 0)
movie_idx_map = {mid: idx for idx, mid in enumerate(movies_filtered['movieId'])}

def get_hybrid_rec(user_id, n=10):
    if user_id not in ratings_matrix.index:
        avg_ratings = ratings.groupby('movieId')['rating'].mean()
        avg_ratings = avg_ratings[movies_filtered['movieId']]
        return avg_ratings.nlargest(n).index.tolist()

    user_ratings = ratings_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings > 0].index

    scores = np.zeros(len(movies_filtered))
    for item in rated_items:
        item_idx = movie_idx_map[item]
        scores += hybrid_similarity[item_idx] * user_ratings[item]

    for item in rated_items:
        scores[movie_idx_map[item]] = -np.inf

    top_indices = np.argsort(scores)[-n:][::-1]
    return movies_filtered.iloc[top_indices]['movieId'].tolist()

hybrid_recs = get_hybrid_rec(1, 10)
movies[movies['movieId'].isin(hybrid_recs)][['title', 'movieId']]



Unnamed: 0,title,movieId
258,Léon: The Professional (a.k.a. The Professiona...,293
843,Die Hard (1988),1036
999,"Terminator, The (1984)",1240
1232,"Fifth Element, The (1997)",1527
1259,Face/Off (1997),1573
1289,"Hunt for Red October, The (1990)",1610
1563,Lethal Weapon (1987),2000
2329,Total Recall (1990),2916
2386,RoboCop (1987),2985
5004,Kill Bill: Vol. 1 (2003),6874


## **3. Baseline RecSys(Смотрите также, Популярное)**

In [43]:
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

def also_watched(movie_id, n=5):
    users_who_watched = ratings[ratings['movieId'] == movie_id]['userId'].unique()
    co_movies = ratings[ratings['userId'].isin(users_who_watched)]
    co_movies_count = co_movies.groupby('movieId').size().sort_values(ascending=False)
    co_movies_count = co_movies_count[co_movies_count.index != movie_id]
    top_movies = co_movies_count.head(n).index.tolist()
    return movies[movies['movieId'].isin(top_movies)]

def popular_movies(n=10):
    movie_popularity = ratings.groupby('movieId').size().sort_values(ascending=False)
    top_movies = movie_popularity.head(n).index.tolist()
    return movies[movies['movieId'].isin(top_movies)]

also_rec = also_watched(movie_id=1, n=10)
pop_rec = popular_movies(n=10)

print("Also watched:")
print(also_rec[['title', 'movieId']])
print("\nPopular movies:")
print(pop_rec[['title', 'movieId']])


Also watched:
                                                  title  movieId
230           Star Wars: Episode IV - A New Hope (1977)      260
260                                 Pulp Fiction (1994)      296
316                                 Forrest Gump (1994)      356
426                                Jurassic Park (1993)      480
525                    Silence of the Lambs, The (1991)      593
646                Independence Day (a.k.a. ID4) (1996)      780
958   Star Wars: Episode V - The Empire Strikes Back...     1196
960   Raiders of the Lost Ark (Indiana Jones and the...     1198
971   Star Wars: Episode VI - Return of the Jedi (1983)     1210
1029                          Back to the Future (1985)     1270

Popular movies:
                                          title  movieId
98                            Braveheart (1995)      110
230   Star Wars: Episode IV - A New Hope (1977)      260
260                         Pulp Fiction (1994)      296
279            Shawshank R

## **4. GNN**

In [55]:
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import LGConv

movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

user_ids = ratings.userId.unique()
movie_ids = ratings.movieId.unique()

user_map = {u: i for i, u in enumerate(user_ids)}
movie_map = {m: i for i, m in enumerate(movie_ids)}
idx_to_movie = {i: m for i, m in enumerate(movie_ids)}

num_users = len(user_ids)
num_movies = len(movie_ids)
num_nodes = num_users + num_movies

edges = []
for _, r in ratings.iterrows():
    u = user_map[r.userId]
    m = movie_map[r.movieId] + num_users
    edges.append([u, m])
    edges.append([m, u])

edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
data = Data(edge_index=edge_index)


class LightGCN(nn.Module):
    def __init__(self, num_nodes, emb_dim=64, num_layers=3):
        super().__init__()
        self.emb = nn.Embedding(num_nodes, emb_dim)
        self.convs = nn.ModuleList([LGConv() for _ in range(num_layers)])
        self.num_layers = num_layers

    def forward(self, edge_index):
        x = self.emb.weight
        all_embeddings = [x]
        for conv in self.convs:
            x = conv(x, edge_index)
            all_embeddings.append(x)
        x = torch.stack(all_embeddings, dim=0).mean(dim=0)  # усреднение по слоям
        return x

def bpr_loss(user_emb, pos_emb, neg_emb):
    pos_scores = (user_emb * pos_emb).sum(dim=1)
    neg_scores = (user_emb * neg_emb).sum(dim=1)
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    return loss

model = LightGCN(num_nodes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

user_pos = ratings.groupby('userId')['movieId'].apply(set).to_dict()

def sample_batch(batch_size=1024):
    users, pos_movies, neg_movies = [], [], []
    for _ in range(batch_size):
        u = random.choice(list(user_ids))
        pos_m = random.choice(list(user_pos[u]))
        while True:
            neg_m = random.choice(movie_ids)
            if neg_m not in user_pos[u]:
                break
        users.append(user_map[u])
        pos_movies.append(movie_map[pos_m] + num_users)
        neg_movies.append(movie_map[neg_m] + num_users)
    return torch.tensor(users), torch.tensor(pos_movies), torch.tensor(neg_movies)

for epoch in range(30):
    model.train()
    optimizer.zero_grad()
    embeddings = model(data.edge_index)

    users, pos_movies, neg_movies = sample_batch()
    user_emb = embeddings[users]
    pos_emb = embeddings[pos_movies]
    neg_emb = embeddings[neg_movies]

    loss = bpr_loss(user_emb, pos_emb, neg_emb)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

model.eval()
embeddings = model(data.edge_index)

user_id = 1
user_emb = embeddings[user_map[user_id]]
movie_emb = embeddings[num_users:]

scores = torch.matmul(movie_emb, user_emb)
top_indices = torch.topk(scores, 10).indices

gnn_recs = [idx_to_movie[idx.item()] for idx in top_indices]
rec_movies = movies[movies['movieId'].isin(recommended_ids)][['movieId', 'title']]
print(rec_movies)

Epoch 1/30, Loss: 0.6888
Epoch 2/30, Loss: 0.6542
Epoch 3/30, Loss: 0.6486
Epoch 4/30, Loss: 0.6499
Epoch 5/30, Loss: 0.6406
Epoch 6/30, Loss: 0.6527
Epoch 7/30, Loss: 0.6346
Epoch 8/30, Loss: 0.6333
Epoch 9/30, Loss: 0.6331
Epoch 10/30, Loss: 0.6112
Epoch 11/30, Loss: 0.5952
Epoch 12/30, Loss: 0.6129
Epoch 13/30, Loss: 0.6041
Epoch 14/30, Loss: 0.5651
Epoch 15/30, Loss: 0.5878
Epoch 16/30, Loss: 0.5883
Epoch 17/30, Loss: 0.5718
Epoch 18/30, Loss: 0.5508
Epoch 19/30, Loss: 0.5501
Epoch 20/30, Loss: 0.5417
Epoch 21/30, Loss: 0.5419
Epoch 22/30, Loss: 0.5591
Epoch 23/30, Loss: 0.5110
Epoch 24/30, Loss: 0.4908
Epoch 25/30, Loss: 0.4842
Epoch 26/30, Loss: 0.5011
Epoch 27/30, Loss: 0.4623
Epoch 28/30, Loss: 0.4606
Epoch 29/30, Loss: 0.4484
Epoch 30/30, Loss: 0.4522
      movieId                                   title
545       616                  Aristocats, The (1970)
715       892                    Twelfth Night (1996)
815      1007         Apple Dumpling Gang, The (1975)
1374     1732