# Load library

In [1]:
import torch
import csv
import math
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import warnings
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe,vocab
from torch.utils.data import DataLoader
import tqdm
from nltk.tokenize.treebank import TreebankWordDetokenizer
import nltk
from nltk.corpus import stopwords
from torch.nn.utils.rnn import pad_sequence
import numpy as np

warnings.filterwarnings("ignore")

# Data Preprocessing

1. How do you choose the tokenizer for this task? Could we use the white space to tokenize the text? What about using the complicated tokenizer instead? Make some discussion.

I think the answer about the first question is yes, but the result may not be good. Since that the white space tokenizer will split the word by the white space, but the word may not be split by the white space.

But the task in this task is enough to use the white space tokenizer.

I choose `torchtext` with `basic_english` for the tokenizer. 

2. Why we need the special tokens like ⟨pad⟩, ⟨unk⟩?

Special tokens like ⟨pad⟩ and ⟨unk⟩ are used in natural language processing tasks to represent special cases that might occur in the data.
- ⟨pad⟩: it is used to pad the sequence to the same length.
- ⟨unk⟩: it is used to represent the word that is not in the vocabulary.

3. Briefly explain how your procedure is run to handle the text data.

- First, using `pandas` to read the data.
- Second, combine the `headline` and `short_description` to `text`, and filter the stop words with `nltk`.
- Thrid, using `torchtext` to build the vocabulary and tokenize the text.
- Fourth, with `GloVe` to build the embedding matrix.
- Fifth, build dataset and dataloader. And using `torch.nn.utils.rnn.pad_sequence` make sure the length of each batch is the same.

In [2]:
# download nltk stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, text, category):
        self.text = text
        self.category = category

    def __getitem__(self, idx):
        return self.category[idx], self.text[idx]

    def __len__(self):
        return len(self.text)
    
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, text):
        self.text = text

    def __getitem__(self, idx):
        return self.text[idx]

    def __len__(self):
        return len(self.text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# read train.csv with pandas
import pandas as pd
df = pd.read_csv('train_c.csv')
print(df.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [4]:
# combine headline and short_description into text, and remove idx
df['text_full'] = df['title'] + ' ' + df['text']
# remove headline and short_description stopwords
df["text_w"] = df["text_full"].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop_words)]))
dataset = MyDataset(df['text_w'], df['label'])
# split dataset into train and valid with random_split
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])

In [5]:
test_df = pd.read_csv('test_c.csv')
test_df['text_full'] = test_df['title'] + ' ' + test_df['text']
test_df["text_w"] = test_df["text_full"].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop_words)]))
test_dataset = TestDataset(test_df['text_w'])

In [6]:
text_vec = GloVe(name='6B', dim=100)

tokenizer = get_tokenizer('basic_english')

In [7]:
# text_pipeline = lambda x: text_vec.get_vecs_by_tokens(tokenizer(x), lower_case_backup=False)
myvocab=vocab(text_vec.stoi, min_freq=0, specials=['<pad>','<unk>'], special_first = True) 

myvocab.set_default_index(myvocab['<unk>'])

label_pipeline = lambda x: int(x) - 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(myvocab(tokenizer(_text)), dtype=torch.int64)
         text_list.append(processed_text)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, padding_value=0)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    return label_list.to(device), text_list.to(device)

def test_collate_batch(batch):
    text_list = []
    for (_text) in batch:
         processed_text = torch.tensor(myvocab(tokenizer(_text)), dtype=torch.int64)
         text_list.append(processed_text)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, padding_value=0)
    return text_list.to(device)

# Transformer

2. Discuss the model structure or hyperparameter setting in your design.

Model structure:
- Embedding layer: using the embedding matrix to get the embedding of each word. We called `encoder` and load the pretrained embedding matrix.
- Positional encoding: using the positional encoding to add the position information to the embedding. We called `pos_encoder`.
- Encoder: using encoder to encode the input sequence. We called `transformer_encoder`.
- Using `torch.mean` to get the mean of the output of encoder.
- Decoder: using decoder to decode the encoded sequence. After that we can get the `num_class` output. We called `decoder`.

Hyperparameter setting:
- `d_model` = 100
- `d_hid` = 400
- `nlayers` = 4
- `nhead` = 10
- `dropout` = 0.5

The hyperparameter setting is the default setting in the `torch.nn.Transformer`.

But `d_model` is 100. Since the embedding matrix is 100 dimension.

And the `n_head` needs to be a factor of `d_model`, so I set it to 10.

In [8]:
class TransformerModel(nn.Module):

    def __init__(self, num_class: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5, activatioln: str = 'relu') -> None:
        super().__init__()
        pretrain_emb=torch.cat([torch.zeros((2,100)),text_vec.vectors]) # add <pad> and <unk> to the pretrained embedding
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout,)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding.from_pretrained(pretrain_emb, freeze=False)
        self.decoder = nn.Linear(d_model, num_class)

    def forward(self, src: Tensor) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.encoder(src)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = torch.mean(output, dim=0)
        output = self.decoder(output)
        # output = torch.argmax(output, dim=1)
        # output = output * -1
        return output
    
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [9]:
batch = 4

train_dataloader = DataLoader(train_dataset, batch_size=batch, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=test_collate_batch)

In [10]:
num_class = 2
d_model = 100  # embedding dimension
d_hid = 400  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 10  # number of heads in ``nn.MultiheadAttention``
dropout = 0.5  # dropout probability
total_epoch = 50
act_funct = 'silu'

model = TransformerModel(num_class, d_model, nhead, d_hid, nlayers, dropout, activatioln=act_funct).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss().to(device)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_epoch, eta_min=1e-5)

In [11]:
total_params = sum(p.numel() for p in model.parameters())
print("Total number of parameters: ", total_params)

Total number of parameters:  40243002


In [12]:
next(iter(train_dataloader))

(tensor([-1,  0,  0,  0]),
 tensor([[   27,   300,   712,   222],
         [    1,  1345,  2026,   424],
         [50352,  2814,   141,  1678],
         ...,
         [  172,     0,     0,     0],
         [ 2133,     0,     0,     0],
         [    4,     0,     0,     0]]))

In [13]:
# train_loop = tqdm.tqdm((train_dataloader), total=len(train_dataloader))
# for idx, (label, text) in enumerate(train_loop):
#   print(label)
#   o = model(text[:5])
#   print('model out:', o)
#   # pred = torch.argmax(o, dim=1)
#   # print('pred', pred)
#   print(criterion(o, label*-1))
#   break

  0%|          | 0/4160 [00:00<?, ?it/s]

tensor([-1, -1, -1, -1])
model out: tensor([[-0.1852, -0.3484],
        [-0.5409, -0.1949],
        [ 0.0767, -0.6480],
        [ 0.3924, -0.6769]], grad_fn=<AddmmBackward0>)
tensor(0.9493, grad_fn=<NllLossBackward0>)





### Lol

In [None]:
best_acc, best_loss = 0, 1e9

for i in range(total_epoch):
    train_acc, train_loss = 0, 0
    valid_acc, valid_loss = 0, 0

    model.train()
    train_loop = tqdm.tqdm((train_dataloader), total=len(train_dataloader))
    for idx, (label, text) in enumerate(train_loop):
        optimizer.zero_grad()
        output = model(text[:300])
        # print('Output', output)
        loss = criterion(output, label*(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pred = torch.argmax(F.softmax(output, dim=1) ,dim=1)
        train_acc += (pred == label).sum().item()
        train_loop.set_description(f"Epoch [{i+1}/{total_epoch}]")
        train_loop.set_postfix(train_loss=train_loss/(len(train_dataloader) * batch), train_acc=train_acc/(len(train_dataloader) * batch))

    model.eval()
    total_loss = 0.
    valid_loop = tqdm.tqdm((valid_dataloader), total=len(valid_dataloader))
    with torch.no_grad():
        for idx, (label, text) in enumerate(valid_loop):
            output = model(text[:300])
            loss = criterion(output, label)
            valid_loss += loss.item()
            pred = torch.argmax(F.softmax(output, dim=1), dim=1)
            valid_acc += (pred == label).sum().item()
            valid_loop.set_description(f"Epoch [{i+1}/{total_epoch}]")
            valid_loop.set_postfix(valid_loss=valid_loss/(len(valid_dataloader) * batch), valid_acc=valid_acc/(len(valid_dataloader) * batch))

    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), 'best_loss_model.pth')
    scheduler.step()

Epoch [1/50]:  13%|█▎        | 534/4160 [03:55<29:46,  2.03it/s, train_acc=0.0445, train_loss=0.0172]

In [None]:
# load best model
model.load_state_dict(torch.load('best_loss_model.pth'))

pred = np.zeros(len(test_dataloader))
index = 0

test_loop = tqdm.tqdm((test_dataloader), total=len(test_dataloader))
for idx, (text) in enumerate(test_loop):
    with torch.no_grad():
        output = model(text)
        pred[idx] = torch.argmax(F.softmax(output, dim=1), dim=1)
    index += 1

with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'category'])
    for i in range(len(pred)):
        writer.writerow([i+1, int(pred[i]) + 1])