In [None]:
import ast
import os

import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [None]:
# Load the augmented, labelled dataset from the spreadsheet.
df_final = pd.read_excel('Dataset_label_Augment.xlsx')
# The target_list cells are parsed from text (presumably list literals — verify against the file).
# ast.literal_eval only accepts Python literals, so a crafted cell cannot execute code
# the way the built-in eval() would.
df_final['target_list'] = df_final['target_list'].apply(ast.literal_eval)

In [None]:
# Fraction of rows that go to the training split; the remainder is held out.
train_size = 0.8

# Draw the training rows with a fixed seed so the split is repeatable.
_train_rows = df_final.sample(frac=train_size, random_state=200)

# Everything that was not sampled becomes the held-out split; both splits get a fresh 0..n-1 index.
valid_dataset = df_final.drop(_train_rows.index).reset_index(drop=True)
train_dataset = _train_rows.reset_index(drop=True)

print(
    f"FULL Dataset: {df_final.shape}, "
    f"TRAIN Dataset: {train_dataset.shape}, "
    f"TEST Dataset: {valid_dataset.shape}"
)

In [7]:
import torch
class Mydataset(Dataset):
    """Dataset that tokenizes one text row at a time and pairs it with its label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``clean_text`` column and a ``target_list`` column.
    max_len : int, default 512
        Each sample is padded or truncated to exactly this many tokens.
    pretrained_path : str, default "autodl-tmp/bert-base-uncased"
        Location handed to ``BertTokenizer.from_pretrained``.
    """

    def __init__(self, df, max_len=512, pretrained_path="autodl-tmp/bert-base-uncased"):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_path)
        self.text = df['clean_text']
        self.targets = df['target_list']
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        The result is a dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
        length ``max_len``) and ``targets`` (float tensor built from the row's target list).
        """
        # DataLoader samplers hand out positions 0..len-1, so index positionally (.iloc).
        # A plain [] lookup is label-based and raises KeyError whenever the frame was
        # filtered or sampled without resetting its index.
        text = str(self.text.iloc[index])

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
               }


In [None]:
# Wrap each split in the tokenizing dataset (training first, then validation).
training_set, validation_set = Mydataset(train_dataset), Mydataset(valid_dataset)

# Loader settings: only the training loader shuffles; both load in the main process.
train_params = dict(batch_size=32, shuffle=True, num_workers=0)
test_params = dict(batch_size=32, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **test_params)

In [14]:
import shutil
from torch import nn
import torch
from transformers import BertModel
import torch.nn.functional as F

class MyBert(nn.Module):
    """BERT encoder followed by dropout (p=0.2) and a linear head with 34 outputs."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('autodl-tmp/bert-base-uncased')
        self.Dropout = nn.Dropout(0.2)
        # Head width follows the encoder's hidden size; one output per label.
        self.classifier = nn.Linear(self.bert.config.hidden_size, 34)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the batch and return the classifier's raw (un-activated) outputs."""
        pooled = self.bert(input_ids, attention_mask, token_type_ids).pooler_output
        return self.classifier(self.Dropout(pooled))
    
def focal_loss(logits, labels, alpha=1, gamma=2, reduction='mean'):
    """Focal re-weighting of element-wise binary cross-entropy computed from logits.

    Args:
        logits: raw model outputs.
        labels: float targets with the same shape as ``logits``.
        alpha: constant multiplier applied to every element.
        gamma: exponent of the ``(1 - pt)`` modulating factor.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced element-wise tensor.
    """
    per_element_bce = F.binary_cross_entropy_with_logits(logits, labels, reduction='none')
    # exp(-BCE) is the probability assigned to the true label (for hard 0/1 labels).
    prob_true = torch.exp(-per_element_bce)
    weighted = alpha * (1 - prob_true) ** gamma * per_element_bce

    if reduction == 'mean':
        return weighted.mean()
    if reduction == 'sum':
        return weighted.sum()
    return weighted

def loss_fn(logits, labels):
    """Training criterion: focal loss with alpha=1, gamma=2 and its default reduction."""
    focal_settings = {'alpha': 1, 'gamma': 2}
    return focal_loss(logits, labels, **focal_settings)


def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Serialize ``state`` to ``checkpoint_path`` and optionally copy it as the best model.

    Args:
        state: object passed to ``torch.save`` (the caller passes a checkpoint dict).
        is_best: when true, the freshly written checkpoint is also copied to
            ``best_model_path``.
        checkpoint_path: destination file for the checkpoint.
        best_model_path: destination file for the best-model copy.

    Missing parent directories of the files being written are created first, so
    a fresh checkout without e.g. ``./models`` does not fail on the first save.
    """
    def _ensure_parent(path):
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname; os.makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    _ensure_parent(checkpoint_path)
    torch.save(state, checkpoint_path)
    if is_best:
        _ensure_parent(best_model_path)
        shutil.copyfile(checkpoint_path, best_model_path)

In [15]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Module-level buffers: train_model appends validation labels to val_targets and
# sigmoid scores to val_outputs; the metric cell at the end reads them.
val_targets, val_outputs = [], []

In [16]:
def train_model(n_epochs, valid_loss_min_input, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    """Train ``model`` for ``n_epochs``, validating and checkpointing after every epoch.

    Relies on module-level names defined in other cells: ``device``, ``loss_fn``,
    ``save_ckp`` and the ``val_targets`` / ``val_outputs`` buffers.

    Args:
        n_epochs: number of passes over ``training_loader``.
        valid_loss_min_input: starting value for the best validation loss seen so far.
        training_loader: yields dict batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.
        validation_loader: same batch layout, used for evaluation only.
        model: called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer stepping the model parameters.
        checkpoint_path: file that receives the checkpoint after every epoch.
        best_model_path: file that receives a copy whenever validation loss does
            not exceed the best seen so far.

    Returns:
        ``(lst_train_loss, lst_valid_loss, lst_train_accuracy, lst_valid_accuracy)``,
        each holding one value per epoch.

    Side effects:
        After each epoch ``val_targets`` / ``val_outputs`` contain that epoch's
        validation labels and sigmoid scores (earlier epochs are discarded).
    """
    def _to_device(batch):
        # Move one batch dict onto the target device with the dtypes the model/loss expect.
        return (batch['ids'].to(device, dtype=torch.long),
                batch['mask'].to(device, dtype=torch.long),
                batch['token_type_ids'].to(device, dtype=torch.long),
                batch['targets'].to(device, dtype=torch.float))

    valid_loss_min = valid_loss_min_input
    lst_train_loss = []
    lst_valid_loss = []
    lst_train_accuracy = []
    lst_valid_accuracy = []

    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0
        train_correct = 0
        valid_correct = 0

        model.train()
        print('Epoch {}: Training Start'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids, mask, token_type_ids, targets = _to_device(data)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            if batch_idx % 100 == 0:
                print(f'Epoch: {epoch}, BATCH: {batch_idx}, Training Loss:  {loss.item()}')

            loss.backward()
            optimizer.step()
            # Incremental mean: after the last batch this IS the epoch's average loss.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

            # NOTE(review): argmax compares only the top-scoring label; if targets are
            # multi-hot vectors this is not a true multi-label accuracy — confirm intent.
            preds = torch.argmax(outputs, dim=1)
            targets_labels = torch.argmax(targets, dim=1)
            train_correct += (preds == targets_labels).sum().item()

        print('Epoch {}: Training End'.format(epoch))
        print('Epoch {}: Validation Start'.format(epoch))

        model.eval()
        # Start from empty buffers so downstream metrics see one epoch of predictions,
        # not the concatenation of every epoch run so far.
        val_targets.clear()
        val_outputs.clear()

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids, mask, token_type_ids, targets = _to_device(data)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

                preds = torch.argmax(outputs, dim=1)
                targets_labels = torch.argmax(targets, dim=1)
                valid_correct += (preds == targets_labels).sum().item()

        print('Epoch {}: Validation End'.format(epoch))

        # train_loss / valid_loss are already per-batch means (see the incremental update),
        # so they must not be divided by the number of batches again.
        train_accuracy = train_correct / len(training_loader.dataset)
        valid_accuracy = valid_correct / len(validation_loader.dataset)

        lst_valid_loss.append(valid_loss)
        lst_train_loss.append(train_loss)
        lst_train_accuracy.append(train_accuracy)
        lst_valid_accuracy.append(valid_accuracy)

        print('Epoch: {} \t Avgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'
              .format(epoch, train_loss, valid_loss))

        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        is_best = valid_loss <= valid_loss_min
        if is_best:
            print('Validation loss decreased from {:.6f} to {:.6f}). Saving model'
                  .format(valid_loss_min, valid_loss))

        # One write per epoch; save_ckp also copies to best_model_path when is_best.
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)
        if is_best:
            valid_loss_min = valid_loss

        print('Epoch {}  Done\n'.format(epoch))

    return lst_train_loss,lst_valid_loss,lst_train_accuracy,lst_valid_accuracy

   

In [17]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the classifier and place its parameters on the chosen device.
model = MyBert().to(device)

# Replicate across GPUs only when more than one is present.
gpu_count = torch.cuda.device_count()
model = nn.DataParallel(model) if gpu_count > 1 else model

optimizer = torch.optim.Adam(model.parameters(), lr=1e-05)

In [None]:
# Destination files for the rolling checkpoint and the best-so-far copy.
checkpoint_path = './models/current_checkpoint.pt'
best_model = './models/best_model.pt'

# 15 epochs, starting with an infinite "best" validation loss so the first epoch always counts as best.
lst_train_loss, lst_valid_loss, lst_train_accuracy, lst_valid_accuracy = train_model(
    n_epochs=15,
    valid_loss_min_input=np.inf,
    training_loader=training_loader,
    validation_loader=validation_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=checkpoint_path,
    best_model_path=best_model,
)

In [None]:
# Binarize the collected sigmoid scores at 0.5 to obtain 0/1 predictions per label.
_scores = np.array(val_outputs)
val_predicts = (_scores >= 0.5).astype(int)

accuracy = accuracy_score(val_targets, val_predicts)
f1_score_micro, f1_score_macro = (
    f1_score(val_targets, val_predicts, average=_avg) for _avg in ('micro', 'macro')
)

print(
    f"Accuracy Score = {accuracy}",
    f"F1 Score (Micro) = {f1_score_micro}",
    f"F1 Score (Macro) = {f1_score_macro}",
    sep="\n",
)
print(classification_report(val_targets, val_predicts))