In [1]:
! pip install torch
! pip install transformers
! pip install tqdm 
! pip install pandas



In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Loading the data

In [3]:
def load_data(file_path):
    """Read a tab-separated file of (score, sentence1, sentence2) and repair bad rows.

    The first three columns are taken, in order, as score, sentence1 and
    sentence2. Some rows arrive with both sentences fused into the sentence1
    field (separated by a tab) and an empty sentence2. Such rows are repaired
    when sentence1 splits into exactly two parts; otherwise they are dropped.
    Rows whose sentence2 is missing and whose sentence1 is not a string
    (e.g. also missing) cannot be repaired and are dropped as well.

    Args:
        file_path: Path (or buffer) accepted by ``pd.read_table``.

    Returns:
        DataFrame with the score column divided by 5 (0-5 scale -> 0-1).
        The original index labels are kept, so dropped rows leave gaps.
    """
    data = pd.read_table(file_path)
    # Report the per-column missing-value counts of the raw file.
    print(data.isnull().sum())

    columns = data.keys()
    score_col, first_col, second_col = columns[0], columns[1], columns[2]

    unrepairable = []
    # Only rows that contain at least one missing value need attention.
    for index, row in data[data.isnull().any(axis=1)].iterrows():
        if not pd.isnull(row[second_col]):
            continue
        fused = row[first_col]
        # A missing sentence1 is a float NaN and has no .split(); treat it as unrepairable.
        parts = fused.split('\t') if isinstance(fused, str) else []
        if len(parts) != 2:
            unrepairable.append(index)
            continue
        data.at[index, first_col] = parts[0]
        data.at[index, second_col] = parts[1]

    data = data.drop(index=unrepairable)
    # Rescale every score from the 0-5 range to 0-1.
    data[score_col] = data[score_col] / 5
    return data

In [4]:
# Load the training and validation splits; each call also prints the
# per-column missing-value counts of the raw file.
train_data = load_data('train.csv')
valid_data = load_data('dev.csv')

score        0
sentence1    0
sentence2    5
dtype: int64
score        0
sentence1    0
sentence2    2
dtype: int64


In [5]:
# Preview the last ten validation rows as a sanity check on the loaded data.
valid_data.tail(10)

Unnamed: 0,score,sentence1,sentence2
1460,0.4,New UN peacekeeping chief named for Central Af...,UN takes over peacekeeping in Central African ...
1461,1.0,Oil falls in Asian trade,Oil prices down in Asian trade
1462,0.6,Israeli forces detain Palestinian MP in Hebron,Israeli forces detain 2 Palestinians in overni...
1463,0.6,Israeli police clash with Palestinian proteste...,Israel Police Clash With Palestinians in Jerus...
1464,0.0,"3 killed, 4 injured in Los Angeles shootings",Five killed in Saudi Arabia shooting
1465,0.4,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?
1466,0.0,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations
1467,0.4,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o..."
1468,0.0,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...
1469,0.0,New video shows US police officers beating men...,New York police officer critically wounded in ...


In [6]:
# Inspect the size of the training set and how often each rescaled score value occurs.
print(train_data.shape)
print(train_data['score'].value_counts())

(5709, 3)
score
0.0000    367
0.8000    351
0.6000    308
1.0000    265
0.7600    263
         ... 
0.0134      1
0.1454      1
0.3400      1
0.3556      1
0.8660      1
Name: count, Length: 139, dtype: int64


### Defining device variable

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


### Task 1A: using BERT to perform regression

### Creating a dataset class

In [34]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sentence pair per item.

    Args:
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, text_pair, max_length=..., padding=..., truncation=..., return_tensors='pt')``
            and returning a mapping of tensors with a leading batch axis of size 1.
        data: DataFrame with 'sentence1', 'sentence2' and 'score' columns.
        max_length: Length every encoded example is padded/truncated to.
    """

    def __init__(self, tokenizer, data, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded pair at position ``idx`` plus its float 'labels' tensor."""
        # Positional lookup: the frame's index may have gaps after rows were dropped.
        row = self.data.iloc[idx]
        # Pass the sentences as a proper pair so the tokenizer inserts the
        # separator itself and assigns segment ids, instead of relying on a
        # literal ' [SEP] ' string being parsed as a special token.
        encoded = self.tokenizer(
            row['sentence1'],
            row['sentence2'],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        # Drop only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        item = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        item['labels'] = torch.tensor(float(row['score']), dtype=torch.float)
        return item

In [35]:
# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Length each tokenized example is padded/truncated to by TextDataset.
max_length = 50

### Creating the datasets and data loaders

In [36]:
# Wrap the train/validation frames; items are tokenized lazily on access.
train_dataset = TextDataset(tokenizer, train_data, max_length)
valid_dataset = TextDataset(tokenizer, valid_data, max_length)

# Batches of 8; only the training loader is shuffled.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [37]:
# Load pre-trained model
# num_labels=1 gives the head a single output, used here as a regression score.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
# NOTE(review): sep_token is not referenced in any visible cell - confirm it is still needed.
sep_token = '[SEP]'

# Load pre-trained model tokenizer
# NOTE(review): same checkpoint as the tokenizer created in an earlier cell; this reload looks redundant.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Adam over all model parameters with learning rate 1e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# NOTE(review): the visible train/evaluate functions take the loss returned by the
# model itself and never call this criterion.
criterion = torch.nn.MSELoss()

### Defining the model architecture:

In [39]:
class Model(torch.nn.Module):
    """Thin wrapper that exposes the wrapped model's output as a ``(loss, logits)`` tuple.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose return value has ``loss`` and ``logits`` attributes.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and unpack its output.

        Args:
            input_ids: Token id tensor forwarded to the wrapped model.
            attention_mask: Attention mask forwarded to the wrapped model.
            labels: Optional targets. When omitted (inference), the wrapped
                model is called with ``labels=None`` and the returned loss is
                whatever the wrapped model reports in that case.

        Returns:
            Tuple ``(loss, logits)`` taken from the wrapped model's output.
        """
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits
    
# Wrap the loaded BERT model and move it to the selected device.
# NOTE(review): this rebinds `model`; re-running the cell without reloading the
# base model wraps the wrapper a second time.
model = Model(model)
model.to(device)


Model(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [40]:
def train(model, loader, optimizer, criterion, epochs=1, valid_loader=None):
    """Fine-tune ``model`` on ``loader`` for ``epochs`` passes.

    Args:
        model: Module called as ``model(input_ids, attention_mask, labels)``
            and returning ``(loss, logits)``.
        loader: Sized iterable of batches, each a mapping with 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Not used for the training loss (the model returns its own
            loss); only forwarded to ``evaluate``.
        epochs: Number of passes over ``loader``.
        valid_loader: Optional loader evaluated after every epoch.

    Returns:
        Mean per-batch training loss of the last epoch (0.0 if no epoch ran).
    """
    num_batches = len(loader)
    epoch_loss = 0.0
    for epoch in range(epochs):
        # evaluate() switches the model to eval mode, so training mode has to be
        # re-enabled at the start of every epoch, not just once before the loop.
        model.train()
        # Reset per epoch so the reported average is not cumulative across epochs.
        running_loss = 0.0
        print(f'Epoch {epoch + 1}')
        for batch in tqdm.tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            loss, _ = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()
            # Each batch contributes exactly once to the epoch total.
            running_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        epoch_loss = running_loss / max(num_batches, 1)
        print(f'Epoch: {epoch + 1}, Training loss: {epoch_loss}')
        if valid_loader is not None:
            evaluate(model, valid_loader, criterion)
    return epoch_loss

def evaluate(model, loader, criterion):
    """Compute and print the mean per-batch loss of ``model`` over ``loader``.

    The model is put into eval mode and stays in it after the call. Gradients
    are disabled while iterating. ``criterion`` is accepted for signature
    compatibility but not used; the loss is the one returned by the model.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.
    """
    model.eval()
    loss_sum = 0
    with torch.no_grad():
        for features in tqdm.tqdm(loader):
            # Move the three tensors the model consumes onto the active device.
            ids, mask, targets = (
                features[field].to(device)
                for field in ('input_ids', 'attention_mask', 'labels')
            )
            batch_loss, _ = model(ids, mask, targets)
            loss_sum += batch_loss.item()
    mean_loss = loss_sum / len(loader)
    print(f'Validation loss: {mean_loss}')
    return mean_loss

In [41]:
# Fine-tune for two epochs, evaluating on the validation loader after each one.
train(model, train_loader, optimizer, criterion, epochs=2, valid_loader=valid_loader)

Epoch 1


  1%|          | 4/714 [00:13<40:53,  3.46s/it]


KeyboardInterrupt: 