In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
!pwd


/home/orolol/workspace


In [3]:
df = pd.read_csv('kaggle-compet/learningLabAgency/data/train.csv')
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [16]:
import torch 
import torch.nn as nn


class TransformerBlock(nn.Module):

    def __init__(
        self, features, ffn_features, n_heads,
        rezero=True, **kwargs
    ):
        super().__init__(**kwargs)

        self.norm1 = nn.LayerNorm(features)
        self.atten = nn.MultiheadAttention(features, n_heads)

        self.norm2 = nn.LayerNorm(features)
        self.ffn = PositionWise(features, ffn_features)

        self.rezero = rezero

        if rezero:
            self.re_alpha = nn.Parameter(torch.zeros((1, )))
        else:
            self.re_alpha = 1

    def forward(self, x):
        # Step 1: Multi-Head Self Attention
        y1 = self.norm1(x)
        y1, _atten_weights = self.atten(y1, y1, y1)

        y = x + self.re_alpha * y1

        # Step 2: PositionWise Feed Forward Network
        y2 = self.norm2(y)
        y2 = self.ffn(y2)

        y = y + self.re_alpha * y2

        return y

    def extra_repr(self):
        return 're_alpha = %e' % (self.re_alpha, )


class PositionWise(nn.Module):

    def __init__(self, features, ffn_features, **kwargs):
        super().__init__(**kwargs)

        self.net = nn.Sequential(
            nn.Linear(features, ffn_features),
            nn.GELU(),
            nn.Linear(ffn_features, features),
        )

    def forward(self, x):
        return self.net(x)


class TransformerEncoder(nn.Module):

    def __init__(
        self, features, ffn_features, n_heads, n_blocks,
        rezero=True, **kwargs
    ):
        super().__init__(**kwargs)

        self.encoder = nn.Sequential(*[
            TransformerBlock(
                features, ffn_features, n_heads, rezero
            ) for _ in range(n_blocks)
        ])

    def forward(self, x):
        print(x.size())
        #y = x.permute((1, 0, 2))
        y = self.encoder(x)

        # result : (N, L, features)
        result = y.permute((1, 0, 2))

        return result


In [22]:
import torch
from torch import nn
from transformers import AutoModel

class TransformerClassifier(nn.Module):
    def __init__(self, transformer_model_name, num_labels):
        super(TransformerClassifier, self).__init__()
        self.transformer = AutoModel.from_pretrained(transformer_model_name)
        self.classifier = nn.Linear(self.transformer.config.hidden_size, num_labels)
        
    def forward(self, input_ids, attention_mask):
        # Pass the inputs through the transformer model
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        # We take only the outputs from the first token (CLS token) for classification
        cls_output = outputs[0][:, 0, :]
        
        # Pass the CLS token outputs through the classifier to get predictions
        logits = self.classifier(cls_output)
        
        return logits

# Example usage
model_name = "bert-base-uncased"  # You can replace this with any model from Hugging Face's model hub
num_labels = 6  # For binary classification, adjust according to your task

In [17]:
# DF is comprised of 3 columns: 'essay_id', 'full_text', 'score'
# 'essay_id' is a unique identifier for each essay
# 'full_text' is the text of the essay
# 'score' is the score of the essay
# 'score' is the target variable

# We will use 'full_text' as the input and 'score' as the target variable
# First let's encode the 'full_text' using BERT

# We will use the 'transformers' library by Hugging Face

from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# First let's tokenize the 'full_text'
text = df['full_text'][0]
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
print('Length:', len(tokens))

Tokens: ['many', 'people', 'have', 'car', 'where', 'they', 'live', '.', 'the', 'thing', 'they', 'don', "'", 't', 'know', 'is', 'that', 'when', 'you', 'use', 'a', 'car', 'al', '##ot', 'of', 'thing', 'can', 'happen', 'like', 'you', 'can', 'get', 'in', 'acc', '##ide', '##t', 'or', 'the', 'smoke', 'that', 'the', 'car', 'has', 'is', 'bad', 'to', 'breath', 'on', 'if', 'someone', 'is', 'walk', 'but', 'in', 'va', '##uba', '##n', ',', 'germany', 'they', 'don', '##t', 'have', 'that', 'pro', '##ble', 'because', '70', 'percent', 'of', 'va', '##uba', '##n', "'", 's', 'families', 'do', 'not', 'own', 'cars', ',', 'and', '57', 'percent', 'sold', 'a', 'car', 'to', 'move', 'there', '.', 'street', 'park', '##ig', ',', 'driveway', '##s', 'and', 'home', 'garage', '##s', 'are', 'forbidden', 'on', 'the', 'outskirts', 'of', 'fr', '##ei', '##bu', '##rd', 'that', 'near', 'the', 'french', 'and', 'swiss', 'borders', '.', 'you', 'pro', '##bal', '##y', 'won', "'", 't', 'see', 'a', 'car', 'in', 'va', '##uba', '##n',

In [18]:
# Now we need to embed the tokens using BERT

# First let's convert the tokens to token IDs

# Truncate the tokens to fit the BERT's maximum sequence length
if len(tokens) > 512:
    tokens = tokens[:512]

token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Token IDs:', token_ids)

# Now let's convert the token IDs to embeddings

input_ids = torch.tensor(token_ids).unsqueeze(0)
print('Input IDs shape:', input_ids.shape)
outputs = model(input_ids)
last_hidden_states = outputs.last_hidden_state

print('Last hidden states shape:', last_hidden_states.shape)



Token IDs: [2116, 2111, 2031, 2482, 2073, 2027, 2444, 1012, 1996, 2518, 2027, 2123, 1005, 1056, 2113, 2003, 2008, 2043, 2017, 2224, 1037, 2482, 2632, 4140, 1997, 2518, 2064, 4148, 2066, 2017, 2064, 2131, 1999, 16222, 5178, 2102, 2030, 1996, 5610, 2008, 1996, 2482, 2038, 2003, 2919, 2000, 3052, 2006, 2065, 2619, 2003, 3328, 2021, 1999, 12436, 19761, 2078, 1010, 2762, 2027, 2123, 2102, 2031, 2008, 4013, 3468, 2138, 3963, 3867, 1997, 12436, 19761, 2078, 1005, 1055, 2945, 2079, 2025, 2219, 3765, 1010, 1998, 5401, 3867, 2853, 1037, 2482, 2000, 2693, 2045, 1012, 2395, 2380, 8004, 1010, 11202, 2015, 1998, 2188, 7381, 2015, 2024, 10386, 2006, 1996, 12730, 1997, 10424, 7416, 8569, 4103, 2008, 2379, 1996, 2413, 1998, 5364, 6645, 1012, 2017, 4013, 10264, 2100, 2180, 1005, 1056, 2156, 1037, 2482, 1999, 12436, 19761, 2078, 1005, 1055, 4534, 2138, 2027, 2024, 3294, 1000, 2482, 2489, 1000, 2021, 2065, 2070, 2008, 3268, 1999, 12436, 19761, 2078, 2008, 8617, 1037, 2482, 6095, 2003, 3039, 1010, 2021, 20

In [19]:
# Now let's create a custom PyTorch Dataset

from torch.utils.data import Dataset

class EssayDataset(Dataset):
    
        def __init__(self, df, tokenizer, model):
            self.df = df
            self.tokenizer = tokenizer
            self.model = model
    
        def __len__(self):
            return len(self.df)
    
        def __getitem__(self, idx):
            text = self.df['full_text'][idx]
            tokens = self.tokenizer.tokenize(text)
    
            if len(tokens) > 512:
                tokens = tokens[:512]
                
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            
            #Let's padd the token_ids
            token_ids = token_ids + [0] * (512 - len(token_ids))
            
            input_ids = torch.tensor(token_ids).unsqueeze(0)
    
            outputs = self.model(input_ids)
            last_hidden_states = outputs.last_hidden_state
    
            return last_hidden_states, self.df['score'][idx]
        
dataset = EssayDataset(df, tokenizer, model)


In [24]:

# Now let's create a custom PyTorch DataLoader

from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Now let's create a simple Transformer model

class TransformerModel(nn.Module):
    
    def __init__(self):
        super(TransformerModel, self).__init__()
        
        self.encoder = TransformerEncoder(
            features=768,
            ffn_features=3072,
            n_heads=12,
            n_blocks=12
        )
        
        self.fc = nn.Linear(768, 1)
        
    def forward(self, x):
        x = self.encoder(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        
        return x
    
model = TransformerClassifier(model_name, num_labels)


In [25]:

# Now let's train the model

import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Strat training")
for epoch in range(10):
    running_loss = 0.0
    
    for i, data in enumerate(dataloader, 0):
        print(f"Epoch {epoch + 1}, Batch {i + 1}")
        inputs, labels = data
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0
            
print('Finished Training')

Strat training
Epoch 1, Batch 1


TypeError: TransformerClassifier.forward() missing 1 required positional argument: 'attention_mask'