In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


# ***Traditional Way***

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder



# Load ratings data (user-item-rating)
ratings = pd.read_csv('u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load movie information (movie_id and movie_title)
movie_info = pd.read_csv('u.item', sep='|', header=None, encoding='latin-1', usecols=[0, 1], names=['item_id', 'movie_title'])

# Step 2: Create the User-Item Rating Matrix
rating_matrix = ratings.pivot_table(index='user_id', columns='item_id', values='rating')

# Step 3: Calculate the Item-Item Similarity Matrix using Cosine Similarity
# Fill NaNs with zeros (or you can use a different strategy)
rating_matrix = rating_matrix.fillna(0)

# Compute cosine similarity between items
item_similarity = cosine_similarity(rating_matrix.T)  # Transpose to get items on rows and columns

# Step 4: Create a DataFrame for Item Similarity
item_similarity_df = pd.DataFrame(item_similarity, index=rating_matrix.columns, columns=rating_matrix.columns)

# Step 5: Recommend Items Based on User's Previous Ratings
def get_similar_items(user_id, top_k=5):
    # Get the items rated by the user
    user_ratings = rating_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings > 0].index.tolist()

    # Create an empty list to store similarity scores
    similar_items = {}

    # For each item rated by the user, find the most similar items
    for item in rated_items:
        similar_scores = item_similarity_df[item]

        # Add the similarity scores to the dictionary, excluding already rated items
        for i, score in similar_scores.items():
            if i not in rated_items:
                if i in similar_items:
                    similar_items[i] += score
                else:
                    similar_items[i] = score

    # Sort the items based on similarity scores (highest first)
    recommended_items = sorted(similar_items.items(), key=lambda x: x[1], reverse=True)[:top_k]

    # Get the movie titles
    recommended_movie_ids = [item[0] for item in recommended_items]
    recommended_movie_titles = movie_info[movie_info['item_id'].isin(recommended_movie_ids)]['movie_title'].values

    return recommended_movie_titles

# Example: Recommend 5 items for user 1
recommended_movies = get_similar_items(user_id=196, top_k=5)
print("Recommended Movies for User 1:")
for idx, movie in enumerate(recommended_movies, 1):
    print(f"{idx}. {movie}")


Recommended Movies for User 1:
1. Star Wars (1977)
2. Monty Python and the Holy Grail (1974)
3. Raiders of the Lost Ark (1981)
4. Back to the Future (1985)
5. When Harry Met Sally... (1989)


# ***Data Preprocessing***

In [20]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load interaction data
df = pd.read_csv("u.data", sep='\t', header=None, names=["user_id", "item_id", "rating", "timestamp"])
df.drop("timestamp", axis=1, inplace=True)

# Load movie names
movie_info = pd.read_csv("u.item", sep='|', encoding='latin-1', header=None, usecols=[0, 1], names=["item_id", "title"])

# Label encode users/items
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df['user'] = user_encoder.fit_transform(df['user_id'])
df['item'] = item_encoder.fit_transform(df['item_id'])

num_users = df['user'].nunique()
num_items = df['item'].nunique()
num_nodes = num_users + num_items

# Create edge index for user-item graph
edges = torch.tensor(df[['user', 'item']].values).T
edges[1] += num_users  # shift item IDs to avoid overlap with users

edge_index = torch.cat([edges, edges[[1, 0]]], dim=1)  # bidirectional

# PyG data
data = Data(edge_index=edge_index, num_nodes=num_nodes)


In [4]:
print(data)

Data(edge_index=[2, 200000], num_nodes=2625)


# ***GCN***

In [21]:
class GNNRecSys(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.embedding_user = nn.Embedding(num_users, embedding_dim)
        self.embedding_item = nn.Embedding(num_items, embedding_dim)
        self.conv1 = GCNConv(embedding_dim, embedding_dim)
        self.conv2 = GCNConv(embedding_dim, embedding_dim)

    def forward(self, edge_index):
        x = torch.cat([self.embedding_user.weight, self.embedding_item.weight], dim=0)
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x


# ***GAT***

In [22]:
class GATRecSys(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, heads=2):
        super().__init__()
        self.embedding_user = nn.Embedding(num_users, embedding_dim)
        self.embedding_item = nn.Embedding(num_items, embedding_dim)
        self.gat1 = GATConv(embedding_dim, embedding_dim, heads=heads, concat=True)
        self.gat2 = GATConv(embedding_dim * heads, embedding_dim, heads=1, concat=False)

    def forward(self, edge_index):
        x = torch.cat([self.embedding_user.weight, self.embedding_item.weight], dim=0)
        x = F.elu(self.gat1(x, edge_index))
        x = self.gat2(x, edge_index)
        return x


# ***Loss Function***

In [23]:
def bpr_loss(user_emb, pos_emb, neg_emb):
    pos_scores = torch.sum(user_emb * pos_emb, dim=1)
    neg_scores = torch.sum(user_emb * neg_emb, dim=1)
    return -torch.mean(F.logsigmoid(pos_scores - neg_scores))


# ***Training***

In [24]:
def train_model(model, data, df, epochs=20, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        embeddings = model(data.edge_index)

        users = torch.tensor(df['user'].values)
        pos_items = torch.tensor(df['item'].values)

        # Random negative sampling
        neg_items = torch.randint(0, num_items, (len(users),))

        user_emb = embeddings[users]
        pos_emb = embeddings[pos_items + num_users]
        neg_emb = embeddings[neg_items + num_users]

        loss = bpr_loss(user_emb, pos_emb, neg_emb)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}/{epochs} Loss: {loss.item():.4f}")


In [25]:
def recommend(user_id, model, k=5):
    model.eval()

    # Check if user exists in the dataset
    if user_id not in user_encoder.classes_:
        print(f"User {user_id} not found in the dataset.")
        return pd.DataFrame()

    # Get movies the user has already watched from df
    watched_movies = df[df['user_id'] == user_id]['item_id'].values
    watched_movie_indices = item_encoder.transform(watched_movies) if len(watched_movies) > 0 else np.array([])

    with torch.no_grad():
        # Get user and item embeddings
        user_idx = user_encoder.transform([user_id])[0]
        embeddings = model(data.edge_index)
        user_emb = embeddings[user_idx]
        item_embs = embeddings[num_users:]

        # Compute scores for all items
        scores = torch.matmul(item_embs, user_emb)

        # Set scores of watched movies to negative infinity to exclude them
        if len(watched_movie_indices) > 0:
            scores[watched_movie_indices] = float('-inf')

        # Get top-k unwatched items
        top_items = torch.topk(scores, k=k).indices.numpy()

        # Convert encoded item indices to raw item IDs
        item_ids = item_encoder.inverse_transform(top_items)

        # Return movie information for recommended items
        return movie_info[movie_info['item_id'].isin(item_ids)]

In [26]:
gnn_model = GNNRecSys(num_users, num_items)
train_model(gnn_model, data, df)

print("GNN Recommendations for user 196")
print(recommend(196, gnn_model))

gat_model = GATRecSys(num_users, num_items)
train_model(gat_model, data, df)

print("GAT Recommendations for user 196")
print(recommend(196, gat_model))


Epoch 1/20 Loss: 0.6788
Epoch 2/20 Loss: 0.6031
Epoch 3/20 Loss: 0.5397
Epoch 4/20 Loss: 0.5374
Epoch 5/20 Loss: 0.4814
Epoch 6/20 Loss: 0.4753
Epoch 7/20 Loss: 0.4676
Epoch 8/20 Loss: 0.4440
Epoch 9/20 Loss: 0.4354
Epoch 10/20 Loss: 0.4316
Epoch 11/20 Loss: 0.4255
Epoch 12/20 Loss: 0.4229
Epoch 13/20 Loss: 0.4281
Epoch 14/20 Loss: 0.4187
Epoch 15/20 Loss: 0.4111
Epoch 16/20 Loss: 0.4101
Epoch 17/20 Loss: 0.4013
Epoch 18/20 Loss: 0.3927
Epoch 19/20 Loss: 0.3879
Epoch 20/20 Loss: 0.3763
GNN Recommendations for user 196
     item_id                      title
49        50           Star Wars (1977)
99       100               Fargo (1996)
180      181  Return of the Jedi (1983)
257      258             Contact (1997)
293      294           Liar Liar (1997)
Epoch 1/20 Loss: 0.6768
Epoch 2/20 Loss: 0.6601
Epoch 3/20 Loss: 0.6269
Epoch 4/20 Loss: 0.4824
Epoch 5/20 Loss: 0.4840
Epoch 6/20 Loss: 0.4628
Epoch 7/20 Loss: 0.4368
Epoch 8/20 Loss: 0.4189
Epoch 9/20 Loss: 0.4168
Epoch 10/20 Loss: 0.

# ***LightGCN***

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.sparse import coo_matrix, diags


df = pd.read_csv('u.data', sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = df.drop(columns=['rating', 'timestamp'])


# Subsample for faster training
sampled_users = df['user_id'].drop_duplicates().sample(n=500, random_state=42)
df = df[df['user_id'].isin(sampled_users)]
df = df.sample(n=2000, random_state=42)


# TRAIN - TEST SPLIT
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_users = train_df['user_id'].unique()
train_items = train_df['item_id'].unique()
test_df = test_df[(test_df['user_id'].isin(train_users)) & (test_df['item_id'].isin(train_items))]

le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()

train_df['user_id_idx'] = le_user.fit_transform(train_df['user_id'])
train_df['item_id_idx'] = le_item.fit_transform(train_df['item_id'])
test_df['user_id_idx'] = le_user.transform(test_df['user_id'])
test_df['item_id_idx'] = le_item.transform(test_df['item_id'])

num_users = train_df['user_id_idx'].nunique()
num_items = train_df['item_id_idx'].nunique()


# ***Adjacency Matrix***

In [12]:
def build_adj_matrix(num_users, num_items, user_item_pairs):
    rows = user_item_pairs['user_id_idx']
    cols = user_item_pairs['item_id_idx'] + num_users  # shift item index

    data = np.ones(len(rows))
    adj = coo_matrix((data, (rows, cols)), shape=(num_users + num_items, num_users + num_items))
    adj = adj + adj.T

    deg = np.array(adj.sum(axis=1)).flatten()
    deg_inv_sqrt = np.power(deg, -0.5)
    deg_inv_sqrt[np.isinf(deg_inv_sqrt)] = 0.
    D_inv_sqrt = diags(deg_inv_sqrt)

    norm_adj = D_inv_sqrt @ adj @ D_inv_sqrt
    return norm_adj.tocoo()


In [13]:
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, num_layers=2):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)
        self.num_layers = num_layers

    def forward(self, norm_adj):
        all_embeddings = torch.cat([self.user_embedding.weight, self.item_embedding.weight], dim=0)
        out = all_embeddings
        embeddings_list = [out]

        for _ in range(self.num_layers):
            out = torch.sparse.mm(norm_adj, out)
            embeddings_list.append(out)

        final_embedding = torch.mean(torch.stack(embeddings_list, dim=0), dim=0)
        user_embeds = final_embedding[:self.user_embedding.num_embeddings]
        item_embeds = final_embedding[self.user_embedding.num_embeddings:]
        return user_embeds, item_embeds


In [14]:
class BPRLoss(nn.Module):
    def forward(self, user_emb, pos_emb, neg_emb):
        pos_score = torch.sum(user_emb * pos_emb, dim=1)
        neg_score = torch.sum(user_emb * neg_emb, dim=1)
        loss = -torch.mean(torch.log(torch.sigmoid(pos_score - neg_score)))
        return loss


# ***Training***

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

adj = build_adj_matrix(num_users, num_items, train_df)
indices = torch.tensor([adj.row, adj.col], dtype=torch.long)
values = torch.tensor(adj.data, dtype=torch.float32)
shape = adj.shape
norm_adj = torch.sparse.FloatTensor(indices, values, torch.Size(shape)).to(device)

model = LightGCN(num_users, num_items).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = BPRLoss()

epochs = 20
batch_size = 1024
train_df_values = train_df[['user_id_idx', 'item_id_idx']].values

for epoch in range(epochs):
    model.train()
    user_emb, item_emb = model(norm_adj)
    epoch_loss = 0
    np.random.shuffle(train_df_values)

    for i in range(0, len(train_df_values), batch_size):
        batch = train_df_values[i:i+batch_size]
        u = torch.tensor(batch[:, 0], dtype=torch.long).to(device)
        i_pos = torch.tensor(batch[:, 1], dtype=torch.long).to(device)
        j_neg = torch.randint(0, num_items, (len(batch),), dtype=torch.long).to(device)

        u_emb = user_emb[u]
        i_emb = item_emb[i_pos]
        j_emb = item_emb[j_neg]

        loss = criterion(u_emb, i_emb, j_emb)

        optimizer.zero_grad()
        loss.backward(retain_graph=True)  # Retain graph if you plan to call backward multiple times
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")


  indices = torch.tensor([adj.row, adj.col], dtype=torch.long)
  norm_adj = torch.sparse.FloatTensor(indices, values, torch.Size(shape)).to(device)


Epoch 1, Loss: 1.3647
Epoch 2, Loss: 1.3361
Epoch 3, Loss: 1.2952
Epoch 4, Loss: 1.2419
Epoch 5, Loss: 1.1791
Epoch 6, Loss: 1.1037
Epoch 7, Loss: 1.0163
Epoch 8, Loss: 0.9157
Epoch 9, Loss: 0.8234
Epoch 10, Loss: 0.7133
Epoch 11, Loss: 0.6182
Epoch 12, Loss: 0.5283
Epoch 13, Loss: 0.4440
Epoch 14, Loss: 0.3727
Epoch 15, Loss: 0.3025
Epoch 16, Loss: 0.2465
Epoch 17, Loss: 0.2044
Epoch 18, Loss: 0.1667
Epoch 19, Loss: 0.1398
Epoch 20, Loss: 0.1139


In [16]:
# Load the movie names from u.item (or any CSV with movie info)
movie_info_df = pd.read_csv('u.item', sep='|', header=None, encoding='latin-1')
movie_info_df.columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_url', 'unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 'war', 'western']

# Keep only necessary columns
movie_info_df = movie_info_df[['movie_id', 'movie_title']]


In [18]:
def recommend_movies(user_raw_id, top_k=5):
    model.eval()

    # Check if user exists in the dataset
    if user_raw_id not in le_user.classes_:
        print(f"User {user_raw_id} not found in the dataset.")
        return []

    # Transform raw user ID to encoded index
    uid = le_user.transform([user_raw_id])[0]

    # Get movies the user has already watched (from train_df and optionally test_df)
    watched_movies = train_df[train_df['user_id'] == user_raw_id]['item_id'].values
    if test_df is not None:
        watched_movies = np.union1d(watched_movies, test_df[test_df['user_id'] == user_raw_id]['item_id'].values)

    # Get encoded indices of watched movies
    watched_movie_indices = le_item.transform(watched_movies) if len(watched_movies) > 0 else np.array([])

    with torch.no_grad():
        # Get user and item embeddings
        user_emb, item_emb = model(norm_adj)

        # Compute scores for all items
        scores = torch.matmul(item_emb, user_emb[uid])

        # Set scores of watched movies to a very low value (e.g., -infinity) to exclude them
        if len(watched_movie_indices) > 0:
            scores[watched_movie_indices] = float('-inf')

        # Get top-k items (unwatched movies)
        top_items = torch.topk(scores, top_k).indices.cpu().numpy()

        # Convert encoded item indices to raw movie IDs
        movie_ids = le_item.inverse_transform(top_items)

        # Get movie titles from movie_info_df
        movie_names = movie_info_df[movie_info_df['movie_id'].isin(movie_ids)]['movie_title'].values

        return movie_names

# Example usage
recommended_movies = recommend_movies(user_raw_id=196)
print("Recommended Movies for User 196 (excluding watched movies):")
for idx, movie in enumerate(recommended_movies, 1):
    print(f"{idx}. {movie}")

Recommended Movies for User 196 (excluding watched movies):
1. Angels and Insects (1995)
2. Muppet Treasure Island (1996)
3. Sense and Sensibility (1995)
4. Things to Do in Denver when You're Dead (1995)
5. Forbidden Christ, The (Cristo proibito, Il) (1950)


In [19]:
watched_movies = train_df[train_df['user_id'] == 196]['item_id'].values
watched_movie_titles = movie_info_df[movie_info_df['movie_id'].isin(watched_movies)]['movie_title'].values
print("Watched Movies by User 196:", watched_movie_titles)

Watched Movies by User 196: ['Boogie Nights (1997)' 'That Thing You Do! (1996)']
