In [None]:
import numpy as np
import random
import json
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from keras.preprocessing.sequence import pad_sequences
import torch.nn.functional as F
# from d2l import torch as d2l


import glob
from tqdm.notebook import tqdm

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read
# below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Seed every RNG the notebook touches so that runs are repeatable.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN: request deterministic kernels and switch off the auto-tuner, which
# may otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Load the dataset metadata. The context manager closes the file handle
# instead of leaving it open until garbage collection.
with open('/content/drive/MyDrive/dl_assigment_2/meta.json', 'r') as meta_file:
    meta = json.load(meta_file)
tokens = meta['tokens']  # vocabulary; its length is the number of distinct tokens
num_user = meta['num_user']
num_token = len(tokens)

In [None]:
# Register the padding token once. The membership guard keeps this cell
# idempotent, so re-running it does not append duplicate "<pad>" entries.
# NOTE(review): num_token is computed before this append, so the index of
# "<pad>" equals num_token and falls outside an embedding table of that
# size -- recompute num_token before feeding this id to the model.
if "<pad>" not in tokens:
    tokens.append("<pad>")

In [None]:
# Display the vocabulary index of the padding token.
tokens.index("<pad>")

13369

In [None]:
# Load the training and validation splits, closing each file after reading.
with open('/content/drive/MyDrive/dl_assigment_2/train.json', 'r') as train_file:
    train_data = json.load(train_file)
with open('/content/drive/MyDrive/dl_assigment_2/valid.json', 'r') as valid_file:
    valid_data = json.load(valid_file)

In [None]:
# Length of the longest token sequence in the training split (0 if empty).
max_length = max((len(example["token_id"]) for example in train_data), default=0)

print(max_length)

60


In [None]:
class tweetDataset(Dataset):
    """Map-style dataset that pads each sample's token ids to a fixed length.

    Args:
        data: indexable collection of dicts. Each dict must hold a
            "token_id" entry (a sequence of integer token ids); all other
            entries are passed through unchanged.
        max_length: width every token sequence is padded (or cut) to.
        tokens: vocabulary list; stored on the instance but not used here.
    """

    def __init__(self, data, max_length, tokens):
        self.data = data
        self.max_length = max_length
        self.tokens = tokens

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return a padded copy of sample ``idx``.

        The stored sample is never modified. Writing the padded tensor back
        into ``self.data`` would make every later access see a sequence of
        ``max_length`` ids and report ``val_length == max_length``.

        Returns:
            A new dict with the sample's original entries, plus:
            "token_id": int64 tensor of shape (max_length,), zero-padded at
                the end. Sequences longer than ``max_length`` keep their
                trailing ``max_length`` ids (the same side Keras
                ``pad_sequences`` keeps by default).
            "val_length": number of real (non-padding) ids in "token_id",
                never larger than ``max_length``.
        """
        raw = self.data[idx]

        ids = list(raw["token_id"])
        overflow = len(ids) - self.max_length
        if overflow > 0:
            ids = ids[overflow:]
        val_length = len(ids)

        padded = torch.zeros(self.max_length, dtype=torch.long)
        padded[:val_length] = torch.as_tensor(ids, dtype=torch.long)

        # Shallow copy: only the two derived entries differ from the source.
        item = dict(raw)
        item["token_id"] = padded
        item["val_length"] = val_length
        return item

In [None]:
# Wrap both splits so every sample is padded to max_length.
train_dataset = tweetDataset(train_data, max_length, tokens)
valid_dataset = tweetDataset(valid_data, max_length, tokens)

BATCH_SIZE = 2
# Training batches are reshuffled each epoch; an incomplete last batch is dropped.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
# Validation is read in a fixed order: shuffling cannot improve an accuracy
# estimate and only makes evaluation runs harder to compare.
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Pull one batch from the training loader and show each field in turn.
sample = next(iter(train_dataloader))

print('Sample from train dataloader: ')
for label, value in (
    ('USER ID: ', sample['user_id']),
    ('TOKEN ID: ', sample['token_id']),
    ('TOKEN ID shape should be BATCH by LENGTH: ', sample['token_id'].shape),
    ("length:", sample['val_length']),
):
    print(label, value)

Sample from train dataloader: 
USER ID:  tensor([2, 3])
TOKEN ID:  tensor([[ 8039.,  6211.,  6199.,  2495., 11873.,   491.,   565.,  6833., 10048.,
          6191.,  4163.,  2795.,  1983.,  6480., 11865.,  8733., 11431.,  7320.,
          6920., 12017., 11916., 13021.,   661., 10638., 11888.,   661.,  5870.,
          8537.,  7320., 12262.,  6388., 12271.,   491.,  6918.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.],
        [ 2897., 12017., 10504.,  7815.,  6679.,  9828.,  5636.,  6388.,  7921.,
         12017.,  4981., 12017.,  6670.,  4236., 13021., 13060.,  6936., 11874.,
          6223., 11087.,  8891.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0., 

In [None]:
# Shape of the padded token-id batch.
sample['token_id'].shape

torch.Size([2, 60])

In [None]:
class Model(nn.Module):
    """Embedding -> LSTM -> single-head self-attention -> linear classifier.

    Args:
        num_token: number of rows in the embedding table.
        num_user: number of output classes.
        embed_dim: embedding width.
        rnn_dim: LSTM hidden size; also the attention embedding width.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, num_token, num_user, embed_dim, rnn_dim, num_layers):
        super(Model, self).__init__()
        self.num_token = num_token
        self.num_user = num_user
        self.embed_dim = embed_dim
        self.rnn_dim = rnn_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(num_token, embed_dim)
        # Defined but not applied in forward(); kept so the attribute
        # remains available to existing code.
        self.dropout = nn.Dropout(p=0.5)
        # batch_first is left at its default (False): inputs are (time, batch, dim).
        self.atten = nn.MultiheadAttention(rnn_dim, 1, dropout=0.1)

        # Unidirectional LSTM. Switching bidirectional to True would double
        # the output width, so the layers after it would need 2 * rnn_dim.
        self.rnn = nn.LSTM(embed_dim, rnn_dim, num_layers=num_layers, batch_first=True, bidirectional=False)
        self.out_linear = nn.Linear(rnn_dim, num_user)

    def forward(self, token_id, val_length):
        """Classify a batch of padded token-id sequences.

        Args:
            token_id: integer tensor of shape (batch, time) with padding at
                the end of each row.
            val_length: number of real tokens per row (tensor or list of
                ints, each >= 1 and <= time).

        Returns:
            Tensor of shape (batch, num_user) with unnormalised class scores.
        """
        # (batch, time) -> (batch, time, embed_dim)
        embeddings = self.embedding(token_id)
        self.rnn.flatten_parameters()

        # pack_padded_sequence needs the lengths as a CPU int64 tensor.
        lengths = torch.as_tensor(val_length, dtype=torch.long).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed)
        # (batch, steps, rnn_dim) where steps == max(lengths); positions past
        # a row's own length are filled with zeros.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        batch_size, steps = outputs.size(0), outputs.size(1)
        lengths_dev = lengths.to(outputs.device)

        # True marks padded positions, which attention must not use as keys.
        # Without this mask a row's result depends on how much padding its
        # batch-mates force onto it.
        positions = torch.arange(steps, device=outputs.device)
        pad_mask = positions.unsqueeze(0) >= lengths_dev.unsqueeze(1)

        seq_first = outputs.transpose(0, 1)  # (steps, batch, rnn_dim)
        attended, _ = self.atten(seq_first, seq_first, seq_first, key_padding_mask=pad_mask)

        # Read each row at its own last real timestep. Indexing with [-1]
        # would land on a padded position for every row shorter than the
        # longest one in the batch.
        rows = torch.arange(batch_size, device=outputs.device)
        final = attended[lengths_dev - 1, rows]  # (batch, rnn_dim)

        outs = self.out_linear(final)
        return outs

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook still runs on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): the layer sizes look chosen to sit just under the
# 20M-parameter limit printed further down -- confirm before changing them.
model = Model(num_token, num_user, embed_dim=512, rnn_dim=1158, num_layers=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.8e-5, weight_decay=1e-7)

In [None]:
# Report the total parameter count next to the stated 20M limit.
num_param = sum(p.numel() for p in model.parameters())
print('Number of parameters: {}'.format(num_param))
print('[NOTE] Number of parameters SHOULD NOT exceed 20,000,000 (20 million).')
# Smoke-test the forward pass on one batch. no_grad() avoids building an
# autograd graph that this shape check would never use.
with torch.no_grad():
    pred = model(sample['token_id'].long().to(device), sample['val_length'].to("cpu"))
print('Prediction shape would be BATCH X NUM_USER(OUTPUT) : ', pred.shape)

Number of parameters: 19967392
[NOTE] Number of parameters SHOULD NOT exceed 20,000,000 (20 million).
Prediction shape would be BATCH X NUM_USER(OUTPUT) :  torch.Size([2, 8])


In [None]:
# Train for num_epoch epochs, validating after each one and keeping a copy
# (in memory and on disk) of the model with the best validation accuracy.
criteria = nn.CrossEntropyLoss()
avg_loss = 0.0
best_valid_accu = 0.0
best_epoch = -1
best_model = None
num_epoch = 100

for epoch in tqdm(range(num_epoch)):
    # ---- training ----
    model.train()
    # Reset per epoch: without this the value keeps growing across epochs
    # and is no longer the mean loss of any single one.
    avg_loss = 0.0
    for sample in train_dataloader:
        optimizer.zero_grad()

        pred = model(sample['token_id'].long().to(device), sample['val_length'].to("cpu"))
        loss = criteria(pred, sample['user_id'].long().to(device))

        loss.backward()
        optimizer.step()

        avg_loss += loss.item() / len(train_dataloader)

    # ---- validation ----
    model.eval()
    correct_cnt = 0
    data_cnt = 0
    with torch.no_grad():
        for sample in valid_dataloader:
            pred = model(sample['token_id'].long().to(device), sample['val_length'].to("cpu"))
            pred_user_id = torch.argmax(pred, dim=-1).cpu()

            correct_cnt += (pred_user_id == sample['user_id']).sum().item()
            data_cnt += sample['token_id'].shape[0]

    # Guard against an empty validation set instead of dividing by zero.
    curr_valid_accu = correct_cnt / data_cnt if data_cnt else 0.0

    print('[EPOCH {}] TRAIN LOSS: {}'.format(epoch, avg_loss))
    print('[EPOCH {}] VALID ACCURACY UPDATED: {}'.format(epoch, curr_valid_accu))

    # ">=" keeps the existing tie behaviour: an equal score also refreshes
    # the saved checkpoint.
    if curr_valid_accu >= best_valid_accu:
        best_valid_accu = curr_valid_accu
        best_epoch = epoch
        best_model = copy.deepcopy(model)
        torch.save(best_model.state_dict(), 'best_baseline.pth')
        print('[EPOCH {}] BEST VALID ACCURACY UPDATED: {}'.format(epoch, best_valid_accu))

  0%|          | 0/100 [00:00<?, ?it/s]

[EPOCH 0] VALID ACCURACY UPDATED: 0.44662922620773315
[EPOCH 0] BEST VALID ACCURACY UPDATED: 0.44662922620773315
[EPOCH 1] VALID ACCURACY UPDATED: 0.5449438095092773
[EPOCH 1] BEST VALID ACCURACY UPDATED: 0.5449438095092773
[EPOCH 2] VALID ACCURACY UPDATED: 0.5870786309242249
[EPOCH 2] BEST VALID ACCURACY UPDATED: 0.5870786309242249
[EPOCH 3] VALID ACCURACY UPDATED: 0.5814606547355652
[EPOCH 4] VALID ACCURACY UPDATED: 0.5898876190185547
[EPOCH 4] BEST VALID ACCURACY UPDATED: 0.5898876190185547
[EPOCH 5] VALID ACCURACY UPDATED: 0.6067415475845337
[EPOCH 5] BEST VALID ACCURACY UPDATED: 0.6067415475845337
[EPOCH 6] VALID ACCURACY UPDATED: 0.632022500038147
[EPOCH 6] BEST VALID ACCURACY UPDATED: 0.632022500038147
[EPOCH 7] VALID ACCURACY UPDATED: 0.6151685118675232
[EPOCH 8] VALID ACCURACY UPDATED: 0.6348314881324768
[EPOCH 8] BEST VALID ACCURACY UPDATED: 0.6348314881324768


In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
# Display the token ids of the batch currently bound to `sample`.
sample['token_id']

tensor([[ 5721.,   434.,  3841.,  7382., 13106., 11370., 11891.,  2727.,  3818.,
           491.,   667.,   240., 12017.,   726., 11853., 12089.,  9487.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.],
        [ 7815.,  7454., 12017.,  7815.,  4345.,   460.,   491.,  4693.,   687.,
         11853., 13171.,  4547., 11908., 12954.,   799.,  8324., 11853.,  1732.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,   

In [None]:
# Toy example: three rows padded with zeros, and the true length of each row.
seq = torch.tensor([[1,2,0], [3,0,0], [4,5,6]])
lens = [2, 1, 3]
# enforce_sorted=False allows the lengths to be given in any order.
packed = pack_padded_sequence(seq, lens, batch_first=True, enforce_sorted=False)

In [None]:
# Take the batch currently bound to `sample` and read its sequence lengths
# from the batch itself. Hard-coded lengths are only right for one specific
# batch and silently mismatch any other.
x = sample['token_id']
x_len = sample['val_length'].tolist()
x

tensor([[ 5721.,   434.,  3841.,  7382., 13106., 11370., 11891.,  2727.,  3818.,
           491.,   667.,   240., 12017.,   726., 11853., 12089.,  9487.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.],
        [ 7815.,  7454., 12017.,  7815.,  4345.,   460.,   491.,  4693.,   687.,
         11853., 13171.,  4547., 11908., 12954.,   799.,  8324., 11853.,  1732.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,   

In [None]:
 # Pack from the batch tensor rather than from `x`, so re-running this cell
 # does not try to pack an already-packed sequence.
 x = nn.utils.rnn.pack_padded_sequence(sample['token_id'], x_len, batch_first=True, enforce_sorted=False)
 x

PackedSequence(data=tensor([ 7815.,  5721.,  7454.,   434., 12017.,  3841.,  7815.,  7382.,  4345.,
        13106.,   460., 11370.,   491., 11891.,  4693.,  2727.,   687.,  3818.,
        11853.,   491., 13171.,   667.,  4547.,   240., 11908., 12017., 12954.,
          726.,   799., 11853.,  8324., 12089., 11853.,  9487.,  1732.]), batch_sizes=tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]), sorted_indices=tensor([1, 0]), unsorted_indices=tensor([1, 0]))