In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
np.loadtxt('..\\data\\ml-100k\\ua.test', skiprows=0, delimiter='\t').astype('int32')

array([[        1,        20,         4, 887431883],
       [        1,        33,         4, 878542699],
       [        1,        61,         4, 878542420],
       ...,
       [      943,       570,         1, 888640125],
       [      943,       808,         4, 888639868],
       [      943,      1067,         2, 875501756]])

In [3]:
def load_data(path, delimiter='\t'):
    train = np.loadtxt(path+'ua.base', skiprows=0, delimiter=delimiter).astype('int32')
    test = np.loadtxt(path+'ua.test', skiprows=0, delimiter=delimiter).astype('int32')
    total = np.concatenate((train, test), axis=0)

    n_u = np.unique(total[:, 0]).size #num of users
    n_i = np.unique(total[:, 1]).size #num of items

    train_data = np.zeros((n_u, n_i), dtype='float32')
    test_data = np.zeros((n_u, n_i), dtype='float32')

    for i in range(train.shape[0]):
        train_data[train[i][0]-1][train[i][1]-1] = train[i][2]
    
    for i in range(test.shape[0]):
        test_data[test[i][0]-1][test[i][1]-1] = test[i][2]

    return train_data, test_data

In [4]:
train_data, test_data = load_data(path='..\\data\\ml-100k\\')

In [5]:
a = 1601
sum(train_data[:, a])/np.count_nonzero(train_data[:, a])

3.3333333333333335

In [6]:
train_data

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]], dtype=float32)

In [8]:
def average_item_rating(train_data):
    list_average_item_rating = [sum(train_data[:, i])/(np.count_nonzero(train_data[:, i])+0.1) for i in range(train_data.shape[1])]
    return list_average_item_rating

In [9]:
list_average_item_rating = average_item_rating(train_data=train_data)

In [12]:
list_average_item_rating[1581]

0.0

In [13]:
def get_item_features(item_path): 
   i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

   movies = pd.read_csv(item_path, sep='|', names=i_cols,encoding='latin-1')
   genres = movies[['unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']]
   genres = genres.to_numpy()

   return genres
    

In [14]:
item_featuers = get_item_features('..\\data\\ml-100k\\u.item')

In [15]:
item_featuers[0]

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [16]:
import transformers

In [17]:
import torch

In [18]:
from torch.utils.data import DataLoader, Dataset

In [19]:
len(list_average_item_rating)

1682

In [20]:
class CustomDataset(Dataset):
    def __init__(self, item_features: np.ndarray, labels: list) -> None:
        self.item_features = item_features
        self.labels = labels

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx) -> None:
        item = self.item_features[idx]
        label = self.labels[idx]

        return item, label

In [21]:
training_data = CustomDataset(item_features=item_featuers, labels=list_average_item_rating)

In [22]:
training_data_loader = DataLoader(training_data, batch_size=16, shuffle=True)

In [23]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [24]:
device

device(type='cpu')

In [25]:
from transformers import BertForSequenceClassification

In [26]:
model = BertForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', num_labels=1) # 1 label for regression
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model c

In [27]:
num_epoch_community = 1
num_training_steps_community = num_epoch_community*len(training_data_loader) 

In [58]:
progress_bar = tqdm(range(num_training_steps_community))

model.train()
for epoch in range(num_epoch_community):
    for batch in training_data_loader:
        outputs = model(input_ids = batch[0], labels=batch[1].float())
        print((outputs))
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/106 [00:00<?, ?it/s]

SequenceClassifierOutput(loss=tensor(0.9968, grad_fn=<MseLossBackward>), logits=tensor([[2.5861],
        [2.8939],
        [2.6799],
        [2.6132],
        [2.7920],
        [2.8478],
        [2.9307],
        [3.1704],
        [2.9968],
        [3.1480],
        [2.9593],
        [2.9529],
        [2.8504],
        [2.9844],
        [2.8103],
        [2.9708]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=tensor(0.6620, grad_fn=<MseLossBackward>), logits=tensor([[3.0802],
        [2.9560],
        [2.9090],
        [2.8321],
        [2.9836],
        [3.0318],
        [2.8905],
        [2.8499],
        [2.8347],
        [2.7607],
        [2.9566],
        [2.9444],
        [3.0065],
        [3.0055],
        [2.9544],
        [3.0242]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=tensor(0.8360, grad_fn=<MseLossBackward>), logits=tensor([[3.0123],
        [3.0097],
        [3.2121],
  

In [71]:
input_id = 199
input = torch.Tensor([item_featuers[input_id]]).long()

In [72]:
input

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [75]:
for i in range(len(list_average_item_rating)):
    input = torch.Tensor([item_featuers[i]]).long()
    model.eval()
    with torch.no_grad():
        predict_score = model(input_ids=input)

    print(predict_score.logits)

tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0285]])
tensor([[3.0286]])
tensor([[3.0286]])
tensor([[3.0285]])
tensor([[3.0

KeyboardInterrupt: 

In [74]:
list_average_item_rating[input_id]

3.8461538461538463

In [61]:
next(iter(training_data_loader))[0]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]])

In [103]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [107]:
encoding = tokenizer("We are very happy to show you the Transformers library.", return_tensors='pt')
print(encoding)

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          3075,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [108]:
model(**encoding)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1837]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [143]:
loss = torch.nn.MSELoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn((3, 5)).float()
output = loss(input, target)
print(output)

tensor(1.6690, grad_fn=<MseLossBackward>)


In [48]:
a = None
if not a: 
    print('d')

d


In [408]:
for epoch in range(num_epoch_community):
    for batch in training_data_loader:
        print("````"*12)
        print(batch)

````````````````````````````````````````````````
[tensor([[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), tensor([2.7273, 2.4799, 3.6579, 3.2059, 3.6895, 1.9048, 2.7621, 3.5855],
       dtype=torch.float64)]
````````````````````````````````````````````````
[tensor([[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
        [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

In [75]:
np.count_nonzero(list_average_item_rating)

1682

In [83]:
list_average_item_rating

[3.8596938775510203,
 3.1983471074380163,
 3.0588235294117645,
 3.5454545454545454,
 3.2911392405063293,
 3.4347826086956523,
 3.8034682080924855,
 4.025773195876289,
 3.876865671641791,
 3.8292682926829267,
 3.80184331797235,
 4.386454183266932,
 3.451219512195122,
 3.9565217391304346,
 3.780392156862745,
 3.1944444444444446,
 3.0941176470588236,
 2.8,
 3.9,
 3.3442622950819674,
 2.7721518987341773,
 4.164285714285715,
 4.114457831325301,
 3.4213836477987423,
 3.4302788844621515,
 3.417910447761194,
 3.1538461538461537,
 3.9518072289156625,
 2.6576576576576576,
 3.9722222222222223,
 3.6413793103448278,
 3.7564102564102564,
 3.460674157303371,
 2.7142857142857144,
 2.1818181818181817,
 2.0,
 2.25,
 3.0,
 3.269230769230769,
 2.8653846153846154,
 3.081081081081081,
 3.804195804195804,
 3.0,
 3.3552631578947367,
 4.0675675675675675,
 3.576923076923077,
 3.588709677419355,
 4.116504854368932,
 3.2911392405063293,
 4.365656565656566,
 3.473684210526316,
 3.7093023255813953,
 2.9590163934426

In [368]:
next(iter(training_data_loader))



[tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]),
 tensor([3.5909, 5.0000, 4.0000, 3.8611, 3.0000, 4.0000, 3.0521, 3.4474],
        dtype=torch.float64)]

In [388]:
import math

In [403]:
for i, value in enumerate(list_average_item_rating):
    if math.isnan(value):
        print(f'có nè {i}')
        break

In [401]:
np.count_nonzero(train_data[:, 1581])

0

In [402]:
list_average_item_rating[1581]

0.0