In [1]:
import os
import re
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from transformers import RobertaModel, BertModel, AutoModel
from transformers import RobertaTokenizer, BertTokenizer, AutoTokenizer
import math
import time
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

%matplotlib inline

In [2]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: TITAN X (Pascal)


# Data Preprocessing

In [3]:
train_data = pd.read_csv('kaggle_data/train.csv')
test_data = pd.read_csv('kaggle_data/test.csv')
sample = pd.read_csv('kaggle_data/sample_submission.csv')

num_bins = int(np.floor(1 + np.log2(len(train_data))))
print(num_bins)
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

bins = train_data.bins.to_numpy()
target = train_data.target.to_numpy()

def rmse_score(y_true,y_pred):
    return np.sqrt(mean_squared_error(y_true,y_pred))

train_data.head()

12


Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,bins
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,7
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,7
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,6
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,5
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,8


In [4]:
config = {
    'lr': 2e-5,
    'wd':0.01,
    'batch_size':16,
    'valid_step':10,
    'max_len':256,
    'epochs':3,
    'nfolds':5,
    'seed':42,
}


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed=config['seed'])


In [5]:
train_data.head() #顯示前五筆數據

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,bins
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,7
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,7
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,6
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,5
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,8


In [6]:
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
class CLRPDataset(Dataset):
    def __init__(self,df,tokenizer,max_len=256, testing=False):
        self.excerpt = df['excerpt'].to_numpy()
        self.testing = testing
        if not self.testing:self.targets = df['target'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                #return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        
        if not self.testing:
            target = torch.tensor(self.targets[idx],dtype=torch.float) 
            return torch.tensor(encode.get('input_ids')), torch.tensor(encode.get('attention_mask')), target
        else:return torch.tensor(encode.get('input_ids')), torch.tensor(encode.get('attention_mask'))
    
    def __len__(self):
        return len(self.excerpt)

In [7]:
from sklearn.model_selection import train_test_split

train_data, val_data =\
    train_test_split(train_data, test_size=0.1, random_state=2020)

len(train_data), len(val_data)

(2550, 284)

In [8]:
train_ds = CLRPDataset(train_data, tokenizer, config['max_len'])
train_dataloader = DataLoader(train_ds,
                        batch_size = config["batch_size"],
                        shuffle=True,
                        num_workers = 0,
                        pin_memory=True,
                        drop_last=False)


tmp_train = iter(train_dataloader)
ttdata = next(tmp_train)
ttdata

[tensor([[    0,   133, 26944,  ...,     1,     1,     1],
         [    0, 11475,    10,  ...,     1,     1,     1],
         [    0, 41415,    34,  ...,     1,     1,     1],
         ...,
         [    0,   100,  1017,  ...,    11,  6063,     2],
         [    0,   673,    90,  ...,     1,     1,     1],
         [    0, 36428,     5,  ...,     1,     1,     1]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([-3.1434, -0.2625,  0.1190, -0.8422, -1.3988, -3.0781, -2.0482, -0.5420,
         -1.9856,  0.9027, -0.1402, -1.3788, -2.3111, -1.5844, -1.3640, -1.1902])]

In [9]:
val_ds = CLRPDataset(val_data,tokenizer,config['max_len'])
val_dataloader = DataLoader(val_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 0,
                        pin_memory=True,
                        drop_last=False)

tmp_val = iter(val_dataloader)
tvdata = next(tmp_val)
tvdata

[tensor([[    0,   133,   372,  ...,     1,     1,     1],
         [    0, 44036, 40279,  ...,     1,     1,     1],
         [    0,   725,  6372,  ...,     1,     1,     1],
         ...,
         [    0,   243,    21,  ...,     1,     1,     1],
         [    0,  3762,  3279,  ...,     1,     1,     1],
         [    0,  4148,   358,  ...,     1,     1,     1]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([-1.0355, -0.0885,  0.4723, -0.0925, -0.8278, -0.2035, -0.5835, -1.9648,
         -2.4837, -1.5459, -0.9846, -1.9428, -0.1007, -0.0057,  0.1430, -2.0819])]

In [10]:
tmpDataset = CLRPDataset(train_data, tokenizer)
tmpIter = iter(tmpDataset)
token_ids, tmp_mask, target_score = next(tmpIter)
token_ids, tmp_mask = token_ids.unsqueeze(0), tmp_mask.unsqueeze(0)
token_ids, tmp_mask, target_score

(tensor([[    0,   133,   475, 12363,     9,  6872,   259,    58,   684,     7,
           1719,   103,  8178,   343,     6,    13,   103,  2292, 11040, 43849,
              8, 19638,    56,   416,    57,   303,   131,    53,    99,   343,
             24,   115,    28,     6, 37325,    58,    23,    10,   872,     7,
           3094,   131,   600,   103,     6,    19,  6020,   226, 26378,  6125,
             23,    49,   471,     6,  2047,    24,     7,    28,  4146,    97,
             87,     5,   248, 12336,   293,    50,    22, 27625,  7042,   293,
             60,    61,     5,  4278,     9,  1870,  1490,    13, 43901,     6,
              8, 47707,    51,   554,    15,    49,   507, 42750,     4,  5053,
          10614,     6,   959,     6,     9,     5,  3091,     9,     5, 40802,
           1947,    11,  5028,    21,    98,   444,  8315, 21779,     4, 26846,
           3435,  1085, 26606,    21,   684,    25,     7,     5, 18947,     9,
              5,  1870,  1459,    98,   

In [11]:
import torch.nn as nn

class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim, num_targets):
        super(AttentionHead, self).__init__()
        self.W1 = nn.Linear(in_features, hidden_dim)
        self.W2 = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        query, key = self.W1(features), self.W2(features)
        att = torch.tanh(query + key)
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)
        return context_vector

In [12]:
class AttentionPooling(nn.Module):
    def __init__(self, num_layers, hidden_size, hiddendim_fc):
        super(AttentionPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_fc = hiddendim_fc
        self.dropout = nn.Dropout(0.1)

        q_t = np.random.normal(loc=0.0, scale=0.1, size=(1, self.hidden_size))
        self.q = nn.Parameter(torch.from_numpy(q_t)).float().cuda()
        w_ht = np.random.normal(loc=0.0, scale=0.1, size=(self.hidden_size, self.hiddendim_fc))
        self.w_h = nn.Parameter(torch.from_numpy(w_ht)).float().cuda()

    def forward(self, all_hidden_states):
        hidden_states = torch.stack([all_hidden_states[layer_i][:, 0].squeeze()
                                     for layer_i in range(1, self.num_hidden_layers+1)], dim=-1)
        hidden_states = hidden_states.view(-1, self.num_hidden_layers, self.hidden_size)
        out = self.attention(hidden_states)
        out = self.dropout(out)
        return out

    def attention(self, h):
        v = torch.matmul(self.q, h.transpose(-2, -1)).squeeze(1)
        v = F.softmax(v, -1)
        v_temp = torch.matmul(v.unsqueeze(1), h).transpose(-2, -1)
        v = torch.matmul(self.w_h.transpose(1, 0), v_temp).squeeze(2)
        return v

In [13]:
%%time
import torch
import torch.nn as nn

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 64, 1

        # Instantiate BERT model
        #self.bert = BertModel.from_pretrained('bert-base-uncased')
        #self.bert = RobertaModel.from_pretrained('roberta-base')
        #self.bert = AutoModel.from_pretrained('google/electra-base-discriminator', output_hidden_states=True)
        self.bert = AutoModel.from_pretrained('roberta-base', output_hidden_states=True)
    
        self.head = AttentionHead(D_in, D_in, 1)
        
        self.pooler = AttentionPooling(12, D_in, D_in)
        
        self.linear = nn.Linear(D_in, D_out)
        
        self.dropout = nn.Dropout(0.1)
        
        # Instantiate an one-layer feed-forward classifier
        self.cls = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        
        # Feed input to BERT
        '''
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)[0]
        outputs = self.head(outputs)
        outputs = self.dropout(outputs)
        logits = self.linear(outputs)
        '''
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        all_hidden_state = torch.stack(outputs[2])        
        
        #print(all_hidden_state[-3:].shape)
        last_hidden_state_cls = outputs[0][:, 0, :]
        last_hidden_state = outputs[0]
        outputs = self.pooler(all_hidden_state)
        logits = self.linear(outputs)  
             
              

        return logits

Wall time: 0 ns


In [65]:
tmp_model = BertClassifier()
tmp_output = tmp_model(token_ids, tmp_mask)
tmp_output

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[0.4568]], grad_fn=<AddmmBackward>)

In [14]:
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    '''
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    '''
    scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [15]:
import random
import time

# Specify loss function

def loss_fn(outputs,targets):
    outputs = outputs.view(-1)
    targets = targets.view(-1)
    return torch.sqrt(nn.MSELoss()(outputs,targets))
    
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            
            # Zero out any previously calculated gradients
            model.zero_grad()
            inputs, b_attn_mask, targets = tuple(t.to(device) for t in batch)
            
            # Perform a forward pass. This will return logits.
            logits = model(inputs, b_attn_mask)
            
            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, targets)
            batch_loss += loss.item()
            total_loss += loss.item()
            #print(loss)
            #input()
            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
                
        

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        
        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
        
        #儲存模型
        torch.save(model.state_dict(), 'ckpt/E_{}.h5'.format(epoch_i+1))
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [16]:
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=3)
train(bert_classifier, train_dataloader, val_dataloader, epochs=3, evaluation=True)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.983948   |     -      |     -     |   9.47   
   1    |   40    |   0.783552   |     -      |     -     |   8.57   
   1    |   60    |   0.747434   |     -      |     -     |   8.65   
   1    |   80    |   0.681698   |     -      |     -     |   8.68   
   1    |   100   |   0.740745   |     -      |     -     |   8.78   
   1    |   120   |   0.641834   |     -      |     -     |   9.11   
   1    |   140   |   0.630646   |     -      |     -     |   8.92   
   1    |   159   |   0.584796   |     -      |     -     |   8.22   
----------------------------------------------------------------------
   1    |    -    |   0.726826   |  0.562223  |   0.00    |   72.87  
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elaps

In [17]:
test_ds = CLRPDataset(test_data,tokenizer,config['max_len'],True)
test_dataloader = DataLoader(test_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 0,
                        pin_memory=True,
                        drop_last=False)

In [18]:
def get_prediction(test_dataloader, model_path, ckpt_path):        
    model = BertClassifier(freeze_bert=False)
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model.to(device)
    model.eval()
    
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    predictions = list()
    for i, (batch) in tqdm(enumerate(test_dataloader)):
        inputs, b_attn_mask = tuple(t.to(device) for t in batch)
        outputs = model(inputs, b_attn_mask)
        outputs = outputs.cpu().detach().numpy().ravel().tolist()
        predictions.extend(outputs)
        
    torch.cuda.empty_cache()
    return np.array(predictions)

In [20]:
pred = get_prediction(test_dataloader, 'roberta-base', 'ckpt/E_2.h5')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1it [00:00,  6.98it/s]


In [21]:
sample['target'] = pred
sample.to_csv('submission.csv',index=False)

In [22]:
sample

Unnamed: 0,id,target
0,c0f722661,0.082889
1,f0953f0a5,0.093809
2,0df072751,0.110757
3,04caf4e0c,0.194042
4,0e63f8bea,0.154859
5,12537fe78,0.094247
6,965e592c0,0.096656
