<a href="https://colab.research.google.com/github/Tpmonkey-Nuttee/SpecTop/blob/main/RCNNforRec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn

torch.manual_seed(0)
np.random.seed(0)

# Hyper Paremeters Settings

In [None]:
### Dataset paramenters

# given values [x1, x2, x3, ..., x10]
# training [x1, x2, x3, ..., x7]
# validate [x8]
# test [x9, x10]
ratio_training = 0.7
ratio_validation = 0.1
ratio_test = 0.2

### Model parameters

# hidden dimension, They didn't tell us the value of `d`
h_dim = 8

w = 8 # [8, 16, 32, 64]
k = 2 # [2, 3, 4, 5, 6]

### Recall@N
N = 5 # [1, 5, 10]

Download dataset (optional)

In [None]:
!gdown 1rP413gUI_qwwsGFyc08vRQ5s0J48w8jz

Downloading...
From (original): https://drive.google.com/uc?id=1rP413gUI_qwwsGFyc08vRQ5s0J48w8jz
From (redirected): https://drive.google.com/uc?id=1rP413gUI_qwwsGFyc08vRQ5s0J48w8jz&confirm=t&uuid=957922ce-24c7-4cb9-92a3-df2b63128339
To: /content/Gowalla_totalCheckins.txt
100% 395M/395M [00:06<00:00, 57.9MB/s]


# Data Processing

In [None]:
# We are using Gowalla dataset.
# https://snap.stanford.edu/data/loc-gowalla.html
df = pd.read_csv("drive/MyDrive/dataset/Gowalla_totalCheckins.txt", sep="\t", header=None)
df.columns = ["user", "check_in_time", "latitude", "longitude", "location"]

# We don't need these.
df = df.drop(columns=["check_in_time", "latitude", "longitude"])

df

FileNotFoundError: [Errno 2] No such file or directory: 'drive/MyDrive/dataset/Gowalla_totalCheckins.txt'

In [None]:
# Remove location with less than 10 interactions.
location_to_user = df.groupby("location")["user"].apply(set).apply(list)

valid_location = location_to_user[location_to_user.apply(len) > 9].index

df2 = df[df["location"].isin(valid_location)]

df2

Unnamed: 0,user,location
0,0,22847
1,0,420315
2,0,316637
3,0,16516
4,0,5535878
...,...,...
6442832,196561,214095
6442837,196561,214095
6442849,196577,181224
6442850,196577,159231


In [None]:
# User must have more than 15 interactions
user_to_location = df2.groupby('user')['location'].apply(list)

valid_user = user_to_location[user_to_location.apply(len) > 14].index

df3 = df2[df2["user"].isin(valid_user)]

df3

Unnamed: 0,user,location
0,0,22847
1,0,420315
2,0,316637
3,0,16516
4,0,5535878
...,...,...
6442803,196561,214095
6442822,196561,214095
6442824,196561,422113
6442832,196561,214095


In [None]:
len_user = df3['user'].nunique()
len_item = df3["location"].nunique()

len_user, len_item

(31568, 57423)

In [None]:
# Map location id to normalized one
# So embedding actually works lol
sorted_locations = sorted(list(set(df3["location"].values)))

loc_norm = dict(zip(sorted_locations, range(len_item)))

In [None]:
df3["location"] = df3["location"].apply(lambda x: loc_norm[x])
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3["location"] = df3["location"].apply(lambda x: loc_norm[x])


Unnamed: 0,user,location
0,0,4595
1,0,42438
2,0,38496
3,0,2808
4,0,57414
...,...,...
6442803,196561,32573
6442822,196561,32573
6442824,196561,42486
6442832,196561,32573


In [None]:
dataset = df3.groupby('user')['location'].apply(list).reset_index()
print(dataset)

         user                                           location
0           0  [4595, 42438, 38496, 2808, 57414, 2484, 4213, ...
1           2  [2228, 13233, 13233, 13233, 39247, 6883, 13233...
2           4  [2219, 28704, 5750, 883, 8235, 28836, 12741, 9...
3           5  [17980, 4730, 257, 257, 7637, 7637, 56175, 566...
4           7  [42438, 42438, 4213, 42438, 13229, 2115, 11294...
...       ...                                                ...
31563  196152  [24939, 27333, 24939, 24939, 53076, 1164, 4285...
31564  196183  [26469, 36933, 37824, 6896, 38924, 26276, 4459...
31565  196353  [35447, 24519, 21516, 21516, 21516, 21516, 215...
31566  196489  [33907, 24883, 27833, 20180, 10804, 25828, 603...
31567  196561  [11920, 32573, 17618, 32573, 32573, 32573, 325...

[31568 rows x 2 columns]


# Model

In [None]:
class RCNN:
    def __init__(self):
        self.embedding = nn.Embedding(len_item, h_dim)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        # Weight input gate
        self.w_i = nn.Linear(h_dim * 2, h_dim) # 2H x H
        self.b_i = torch.randn(h_dim) # H x 1

        # Weight forget gate
        self.w_f = nn.Linear(h_dim * 2, h_dim) # 2H x H
        self.b_f = torch.randn(h_dim) # H x 1

        # Weight cell state
        self.w_c = nn.Linear(h_dim * 2, h_dim) # 2H x H
        self.b_c = torch.randn(h_dim) # H x 1

        # Weight output gate
        self.w_o = nn.Linear(h_dim * 2, h_dim) # 2H x H
        self.b_o = torch.randn(h_dim) # H x 1

    def forward(self, items: list[int]):
        # split data, only use training one
        items = items[:round(len(items) * ratio_training)]

        item_embed = self.embedding(torch.tensor(items)) # H x |U[i]|

        last_c = torch.zeros(h_dim) # H x 1
        last_h = torch.zeros(h_dim) # H x 1

        for item in item_embed:
            # Recurrent layer
            # TODO: Convolution, fully connected layers
            hx = torch.cat([last_h, item], 0) # 2H x 1
            i = self.sigmoid(self.w_i(hx) + self.b_i)
            f = self.sigmoid(self.w_f(hx) + self.b_f)
            new_c = self.tanh(self.w_c(hx) + self.b_c)
            c = torch.mul(f, last_c) + torch.mul(i, new_c)
            o = self.sigmoid(self.w_o(hx) + self.b_o)
            h = torch.mul(o, self.tanh(c))

            last_c = c
            last_h = h

            print(o)

    def predict(self, items):
        raise NotImplementedError

    def calculate_loss(self):
        raise NotImplementedError

    def backprop(self, loss):
        raise NotImplementedError

In [None]:
model = RCNN()

model.forward(dataset.iloc[0]["location"])

tensor([0.3300, 0.4602, 0.2847, 0.0943, 0.9113, 0.1257, 0.7624, 0.7853],
       grad_fn=<SigmoidBackward0>)
tensor([0.4793, 0.4754, 0.3554, 0.1600, 0.9299, 0.2904, 0.5317, 0.7048],
       grad_fn=<SigmoidBackward0>)
tensor([0.4222, 0.2768, 0.4667, 0.0746, 0.9629, 0.1354, 0.6737, 0.8688],
       grad_fn=<SigmoidBackward0>)
tensor([0.3424, 0.5973, 0.2292, 0.1124, 0.8850, 0.2800, 0.8200, 0.8150],
       grad_fn=<SigmoidBackward0>)
tensor([0.2553, 0.5438, 0.3332, 0.0997, 0.9093, 0.1101, 0.7498, 0.6956],
       grad_fn=<SigmoidBackward0>)
tensor([0.3253, 0.5079, 0.2563, 0.0856, 0.8996, 0.0882, 0.6805, 0.7705],
       grad_fn=<SigmoidBackward0>)
tensor([0.2387, 0.4869, 0.2675, 0.1013, 0.9084, 0.2470, 0.7369, 0.6689],
       grad_fn=<SigmoidBackward0>)
tensor([0.4654, 0.4774, 0.3646, 0.1557, 0.9260, 0.2984, 0.5385, 0.6968],
       grad_fn=<SigmoidBackward0>)
tensor([0.3161, 0.6306, 0.3505, 0.1034, 0.9507, 0.1728, 0.8533, 0.8472],
       grad_fn=<SigmoidBackward0>)
tensor([0.4645, 0.4856, 0.36

In [None]:
m = nn.Sigmoid()

x = torch.randn(h_dim)
y = torch.randn(h_dim)
z = x + y
print(x)
print(y)
print("=")
print(z)

tensor([-1.1997, -0.6273,  0.6078,  1.5428,  1.7732, -0.3666, -0.6488,  0.5503])
tensor([-0.0311, -1.1765, -0.1152, -1.5276,  0.2398, -0.0060, -0.4117,  0.6002])
=
tensor([-1.2308, -1.8038,  0.4926,  0.0152,  2.0129, -0.3726, -1.0605,  1.1505])


In [None]:
# from GPT
# Dont use GPT, It sucks badly -Nou
def evaluate(model, X, y, k=1):
    model.eval() # use model from touch
    X_t = torch.tensor(X, dtype=torch.long)
    y_t = torch.tensor(y, dtype=torch.long)
    with torch.no_grad():
        logits = model(X_t)
        preds = torch.argsort(logits, dim=1, descending=True) # ranking

    hit, ap = 0, 0
    for i in range(len(y_t)):
        top_k = preds[i, :k]
        true_item = y_t[i].item()
        if true_item in top_k:
            hit += 1
            rank = (preds[i] == true_item).nonzero(as_tuple=True)[0].item() + 1
            ap += 1.0 / rank
    HR = hit / len(y_t)
    MAP = ap / len(y_t)
    return HR, MAP

HR1, MAP = evaluate(model, X, y, k=1)
print(f"HR@1 = {HR1:.3f}, MAP = {MAP:.3f}")