In [1]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 5.4 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.6


In [2]:
import sys
### Direct this path to the appropriate folder for importing libaries
from google.colab import drive
drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/Newspaper Classification')


Mounted at /content/drive


In [3]:
import pandas as pd
import glob
from tqdm import tqdm
import re
import itertools
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import recall_score,f1_score,precision_score,accuracy_score
import torch
from torch import nn


#Load data

In [4]:
finale_dataframe = pd.read_csv("/content/drive/MyDrive/Newspaper Classification/Data/version_1_punctuation_13-11-2021.csv",encoding="utf8")

In [5]:
finale_dataframe=finale_dataframe[~finale_dataframe['label'].str.contains("po")]
len(finale_dataframe)

200000

In [6]:


X_train, X_test, y_train, y_test = train_test_split(
                                                    finale_dataframe.drop("label", axis=1),
                                                    finale_dataframe["label"], 
                                                    test_size=0.2, 
                                                    shuffle=True, 
                                                    random_state=42, 
                                                    stratify = finale_dataframe["label"]
                                                    )

X_valid, X_test, y_valid, y_test = train_test_split(
                                                    X_test,
                                                    y_test, 
                                                    test_size=0.5, 
                                                    shuffle=True, 
                                                    random_state=42, 
                                                    stratify = y_test
                                                    )

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())
print(y_valid.value_counts())


In [None]:
lst = []
for _,x in X_train.iterrows():
    lst.extend(x["data"].split("."))
lens = [len(x.split(" ")) for x in lst]
 
print(sum(lens)/len(lens))
plt.hist(lens, bins = list(range(0, 200,5)))

In [None]:
lens = [len(x["data"].split(".")) for _,x in X_train.iterrows()]
print(sum(lens)/len(lens))
plt.hist(lens, bins = list(range(0, 200,5)))


##Load unseen test

In [7]:
unseen_test = pd.read_csv("/content/drive/MyDrive/Newspaper Classification/Data/unseen_test_punctuation_v1_3-12-2021.csv",encoding = "utf8")

In [8]:
unseen_test=unseen_test[~unseen_test['label'].str.contains("po")]
len(unseen_test)

5000

#Initializing data loader

In [9]:
import numpy as np
def filterLen(x):
    if len(x.split(" ")) > 2:
        return True
    return False
def split_conversation(X,y):
    label = ["ec","hp","ed","sp","st"]
    x_data = []
    y_data = []
    for i in range(0,len(X)):
        X_mold = X["data"].iloc[i].split(".")
        X_mold = filter(filterLen, X_mold)
        x_data.append(list(X_mold))
        y_data.append(label.index(y.iloc[i]))
    return x_data, y_data

In [10]:
import torch
import torch.nn
from torch.utils.data import Dataset, DataLoader

class DADataset(Dataset):
    
    def __init__(self, tokenizer, text, label, max_len=50):
        
        self.text = text
        self.label = label
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        
        sentences = self.text[index]
        labels = self.label[index]

        # Tokenize all of the sentences and map the tokens to thier word IDs.
        input_ids = []
        attention_masks = []

        # For every sentence...
        for sent in sentences:
            # `encode_plus` will:
            #   (1) Tokenize the sentence.
            #   (2) Prepend the `[CLS]` token to the start.
            #   (3) Append the `[SEP]` token to the end.
            #   (4) Map tokens to their IDs.
            #   (5) Pad or truncate the sentence to `max_length`
            #   (6) Create attention masks for [PAD] tokens.
            encoded_dict = tokenizer.encode(sent)
                        
            # Add the encoded sentence to the list.    
            input_ids.append(encoded_dict.ids)
            

        # Convert the lists into tensors.
        input_ids = torch.tensor(input_ids)
        if(len(input_ids)>self.max_len):
            input_ids = input_ids[:self.max_len,:]
        else:
            mold = torch.zeros(self.max_len, 50,dtype=int)
            mold[:len(input_ids),:] = input_ids
            input_ids = mold
        labels = torch.tensor(labels)
        
        return {
            "input_ids":input_ids,
            "label":labels,
        }


In [11]:

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
from tokenizers import ByteLevelBPETokenizer
def train_tokenizer(corpus):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(corpus, vocab_size=50000, min_frequency=10, special_tokens=["[PAD]","[UNK]"])
    tokenizer.enable_padding(direction='right', length=50, pad_id=tokenizer.get_vocab()['[PAD]'])
    tokenizer.enable_truncation(max_length=50)
    return tokenizer

In [12]:
tokenizer=train_tokenizer(X_train["data"])

In [13]:
tokenizer.save("/content/drive/MyDrive/Newspaper Classification/Model-without Politics/HAN/tokenizer.json",)

In [14]:
tokenizer = Tokenizer.from_file("/content/drive/MyDrive/Newspaper Classification/Model-without Politics/HAN/tokenizer.json")

In [15]:
moldx, moldy =split_conversation(X_train,y_train)
train_dataset = DADataset(tokenizer=tokenizer, text = moldx, label = moldy, max_len=40)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, drop_last=True)

In [16]:
moldx, moldy =split_conversation(X_valid,y_valid)
val_dataset = DADataset(tokenizer=tokenizer, text = moldx, label = moldy, max_len=40)
val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False, drop_last=True)

In [17]:
moldx, moldy =split_conversation(X_test,y_test)
test_dataset = DADataset(tokenizer=tokenizer, text = moldx, label = moldy, max_len=40)
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False, drop_last=True)

In [18]:
moldx, moldy =split_conversation(unseen_test,unseen_test["label"])
unseen_dataset =DADataset(tokenizer=tokenizer, text = moldx, label = moldy, max_len=40)
unseen_loader = DataLoader(dataset=unseen_dataset, batch_size=64, shuffle=False, drop_last=True)

In [19]:
len(train_loader), len(val_loader), len(test_loader), len(unseen_loader)

(2500, 20000, 20000, 78)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
class Word_RNN(nn.Module):
    
    def __init__(self,device,vocab,embedding_size = 300, model_name="bert-base-uncased", hidden_size=128, bidirectional=True, num_layers=1):
        super(Word_RNN, self).__init__()
        
        self.device = device
        # embedding layer 
        self.embedding = nn.Embedding(vocab, embedding_size)
        self.embedding_size = embedding_size
        self.rnn = nn.GRU(
            input_size=embedding_size, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
            bidirectional=bidirectional,
            batch_first=True
        )
        self.hidden_size = hidden_size
    def forward(self, input_ids):
        """
            x.shape = [batch_size, sentence,seq_len]
        """
        features = self.embedding(input_ids)
        hidden = torch.empty((0,input_ids.shape[1], input_ids.shape[2],self.hidden_size*2)).to(self.device)
        for i in range(0,input_ids.shape[0]):
            outputs, _ = self.rnn(features[i])
            outputs = outputs.unsqueeze(0)
            hidden = torch.cat((hidden, outputs), dim=0).to(self.device)
        return hidden

In [22]:
class WordAttention(nn.Module):

    def __init__(self, hidden_size=128*2):
        super(WordAttention, self).__init__()
        
        self.context = nn.Linear(in_features=hidden_size, out_features=1)
        self.weight = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
        self.softmax = nn.Softmax(dim=-1)
        self.tanh = nn.Tanh()
        
    def forward(self, hidden_states):
       
        energy = self.tanh(self.context(self.weight(hidden_states)))
        
        attention = self.softmax(energy)
        
        m = torch.mul(attention,hidden_states)
        m = torch.sum(m,dim=-2)
        return m

In [23]:
class SentenceAttention(nn.Module):

    def __init__(self, hidden_size=64*2):
        super(SentenceAttention, self).__init__()
        
        self.context = nn.Linear(in_features=hidden_size, out_features=1)
        self.weight = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
        self.softmax = nn.Softmax(dim=-1)
        self.tanh = nn.Tanh()
        
    def forward(self, hidden_states):
       
        energy = self.tanh(self.context(self.weight(hidden_states)))
        
        attention = self.softmax(energy)
        
        m = torch.mul(attention,hidden_states)
        m = torch.sum(m,dim=-2)
        return m

In [24]:
class HAN(nn.Module):
    
    def __init__(self, vocab_size,hidden_size=768, num_classes=5, device=device):
        super(HAN, self).__init__()
        self.device = device
        self.wordatt = WordAttention().to(self.device)
        self.wordgru = Word_RNN(self.device,vocab_size).to(self.device)
        self.sentenceatt = SentenceAttention().to(self.device)
        self.sentencegru = nn.GRU(
            input_size=256, 
            hidden_size=64, 
            num_layers=1, 
            bidirectional=True,
            batch_first=True
        ).to(self.device)
        self.classifier = nn.Sequential(*[
            nn.Linear(in_features=128, out_features=num_classes),
            nn.LeakyReLU(),
            nn.Softmax(dim=-1)
        ]).to(self.device)
    
    def forward(self, inputs):  

        x = self.wordgru(inputs).to(self.device)
        x = self.wordatt(x)
        outouts,_ = self.sentencegru(x)
        x = self.sentenceatt(outouts)
        # Xuất kết quả phân loại
        return self.classifier(x)

In [25]:
model = HAN(tokenizer.get_vocab_size(),device=device)

In [26]:
model.load_state_dict(torch.load("/content/drive/MyDrive/Newspaper Classification/Model-without Politics/HAN/HAN_v2.pth"))

<All keys matched successfully>

In [None]:
def evaluate(model, data_loader):
    accuracies = []
    losses = []
    model.eval()
    with torch.no_grad():
        for x in data_loader:
            # Forward pass
            targets = x["label"].to(device)
            outputs = model(x["input_ids"].to(device))
            loss = loss_function(outputs, targets)

            accuracy = (outputs.argmax(dim=-1) == targets).type(torch.float32).mean().item()
            accuracies.append(accuracy)
            losses.append(loss.item())
    
    return np.mean(losses), np.mean(accuracies)

In [None]:
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(params, lr=0.00005, weight_decay=0.0005)

# and a learning rate scheduler which decreases the learning rate by 2x every 10 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=5,
                                               gamma=0.5)
loss_function = nn.CrossEntropyLoss().to(device)

In [None]:
n_batch = len(train_loader)
max_acc = 0

In [None]:
_,max_acc = evaluate(model, unseen_loader)

In [None]:
max_acc

0.8852163461538461

In [None]:

train_accs = []
train_losses = []
test_accs = []
test_losses = []
for epoch in range(10):
    model.train()
    with tqdm(train_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch + 1}")
        for batch_idx, x in enumerate(tepoch):

            targets = x["label"].to(device)

            #forward
            logits = model(x["input_ids"].to(device))
            #print(torch.argmax(logits,dim=1))
            loss = loss_function(logits, targets)

            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


            train_accs.append((logits.argmax(dim=-1) == targets).type(torch.float32).mean().item())
            train_losses.append(loss.item())

            tepoch.set_postfix(loss=train_losses[-1], acc=train_accs[-1])

            if batch_idx >= n_batch - 1:
                val_loss, val_acc =  evaluate(model, unseen_loader)
                test_losses.append(val_loss)
                test_accs.append(val_acc)
                tepoch.set_postfix(loss=np.mean(train_losses), acc=np.mean(train_accs), val_loss=val_loss, val_acc=val_acc)

    if max_acc < val_acc:
        max_acc = val_acc
        print(f"Save at epoch={epoch+1} with lr={lr_scheduler.get_last_lr()} and loss={val_loss}")
        torch.save(model.state_dict(), "/content/drive/MyDrive/Newspaper Classification/Model-without Politics/HAN/HAN_v2.pth")

Epoch 1: 100%|██████████| 2500/2500 [57:59<00:00,  1.39s/batch, acc=0.971, loss=0.934, val_acc=0.88, val_loss=1.02]
Epoch 2: 100%|██████████| 2500/2500 [57:47<00:00,  1.39s/batch, acc=0.972, loss=0.933, val_acc=0.878, val_loss=1.03]
Epoch 3: 100%|██████████| 2500/2500 [57:45<00:00,  1.39s/batch, acc=0.973, loss=0.932, val_acc=0.877, val_loss=1.03]
Epoch 4:  75%|███████▌  | 1884/2500 [43:09<14:02,  1.37s/batch, acc=0.969, loss=0.938]

In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/Newspaper Classification/Model-without Politics/HAN/HAN_unseen_continue_continue.pth")

In [46]:
def my_max(arr):
    if len(arr)==0:
        raise TypeError("empty list")
    max = arr[0]
    for i in arr:
        if i > max:
            max=i
    return max

In [48]:
def my_second_max(arr):
    if len(arr)==0:
        raise TypeError("empty list")
    max_second= arr[0]
    max = arr[0]
    for i in arr:
        if i > max:
            max_second = max
            max=i
        elif i > max_second:
            max_second=i
    return max_second

In [50]:
my_second_max([0,2,1])

1