In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
!pwd


/home/orolol


In [3]:
df = pd.read_csv('workspace/learningLabAgency/data/train.csv')
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [4]:
import torch
from torch import nn
from transformers import AutoModel

class TransformerClassifier(nn.Module):
    def __init__(self, transformer_model_name, num_labels):
        super(TransformerClassifier, self).__init__()
        self.transformer = AutoModel.from_pretrained(transformer_model_name)
        self.classifier = nn.Linear(self.transformer.config.hidden_size, 1)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs[0][:, 0, :]
        logits = self.classifier(cls_output)
        
        return logits

model_name = "bert-base-uncased" 
num_labels = 7  

In [5]:


from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_embed = BertModel.from_pretrained('bert-base-uncased')



In [6]:
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Define the device


class EssayDataset(Dataset):
    
        def __init__(self, df, tokenizer, model):
            self.df = df
            self.tokenizer = tokenizer
            self.model = model
    
        def __len__(self):
            return len(self.df)
    
        def __getitem__(self, idx):
            text = self.df['full_text'][idx]
            tokens = self.tokenizer.tokenize(text)
    
            if len(tokens) > 512:
                tokens = tokens[:512]
                
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            
            if len(token_ids) < 512:
                token_ids += [0] * (512 - len(token_ids))

            attention_mask = [1 if token_id != 0 else 0 for token_id in token_ids]
            input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)  
            attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)  
    
            return input_ids, self.df['score'][idx], attention_mask
        
dataset = EssayDataset(df, tokenizer, model_embed)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
model = TransformerClassifier(model_name, num_labels).to(device)  

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [7]:

def quadratic_weighted_kappa(y_true, y_pred):
    """
    Calculates the Quadratic Weighted Kappa
    y_true: array, true labels
    y_pred: array, predicted labels
    """
    o = confusion_matrix(y_true, y_pred)
    n = o.sum()
    row_sums = o.sum(axis=1)
    col_sums = o.sum(axis=0)
    
    expected = np.outer(row_sums, col_sums) / n
    weight_matrix = np.outer(np.arange(o.shape[0]), np.arange(o.shape[0]))
    weight_matrix = (weight_matrix - weight_matrix.T) ** 2 / (o.shape[0] - 1) ** 2
    
    kappa = 1 - (np.sum(weight_matrix * o) / np.sum(weight_matrix * expected))
    return kappa

def confusion_matrix(y_true, y_pred):
    """
    Generate a confusion matrix for calculating QWK
    """
    print(y_true)
    print(y_pred)
    max_rating = max(max(y_true), max(y_pred)) + 1
    matrix = np.zeros((max_rating, max_rating))
    for t, p in zip(y_true, y_pred):
        matrix[t, p] += 1
    return matrix



print("Strat training")
losses = []
mean_loss = 0
for epoch in range(10):
    running_loss = 0.0
    
    for i, data in enumerate(dataloader, 0):
        
        inputs, labels, mask = data
        inputs = inputs.to(device)
        mask = mask.to(device)
        labels = labels.to(device)
        # inputs are in Size([2, 1, 512, 768]) but we need them in Size([2, 512, 768])
        inputs = inputs.squeeze(1)
        
        optimizer.zero_grad()
        outputs = model(inputs, mask)
        outputs_rounded = outputs.detach().round().long()
        outputs_rounded = torch.clamp(outputs_rounded, min=0, max=6)
        kappa = quadratic_weighted_kappa(labels.cpu().detach().numpy(), outputs_rounded.cpu().detach().numpy())
        loss = criterion(outputs, labels.float())
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if len(losses) > 20:
            mean_loss = np.mean(losses[-20:])
        print(f"Epoch {epoch + 1}, Batch {i + 1} loss: {loss.item()} mean : {mean_loss} kappa {kappa}")

            
print('Finished Training')

Strat training
[2 2 2 3 5 4 2 2]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]


  kappa = 1 - (np.sum(weight_matrix * o) / np.sum(weight_matrix * expected))
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Batch 1 loss: 8.65086841583252 mean : 0 kappa nan
[2 2 4 4 5 4 3 2]
[[2]
 [3]
 [5]
 [4]
 [4]
 [2]
 [3]
 [0]]
Epoch 1, Batch 2 loss: 3.1422946453094482 mean : 0 kappa nan
[3 3 3 3 2 3 1 1]
[[2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]]
Epoch 1, Batch 3 loss: 0.7344210147857666 mean : 0 kappa nan
[2 3 3 3 2 3 4 4]
[[2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]]
Epoch 1, Batch 4 loss: 1.9284213781356812 mean : 0 kappa nan
[3 2 6 2 3 3 2 3]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
Epoch 1, Batch 5 loss: 52.22657012939453 mean : 0 kappa nan
[4 5 3 4 3 4 4 2]
[[6]
 [6]
 [6]
 [6]
 [6]
 [6]
 [6]
 [6]]


TypeError: only integer scalar arrays can be converted to a scalar index