In [1]:
! pip install torch
! pip install transformers
! pip install tqdm 
! pip install pandas



In [49]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import tqdm
import pandas as pd

### Loading the data

In [62]:
def load_data(file_path, max_score=5):
    """Load a tab-separated sentence-pair file, repair merged rows, rescale scores.

    The first three columns are treated, in order, as the score, the first
    sentence and the second sentence. A per-column count of missing values is
    printed right after reading, i.e. before any repair takes place.

    Rows whose second sentence is missing are repaired when the first-sentence
    field holds exactly two tab-separated parts (they become sentence 1 and
    sentence 2). Every other such row is dropped, including rows whose first
    sentence is itself missing. The index is not reset after dropping, so the
    returned frame may have gaps in its labels.

    Args:
        file_path: Path (or anything ``pd.read_table`` accepts) of the file.
        max_score: Upper end of the raw score scale; every score is divided by
            it. Defaults to 5, which maps a 0-5 scale onto 0-1.

    Returns:
        pandas.DataFrame with the repaired sentences and rescaled scores.
    """
    data = pd.read_table(file_path)
    # Report how many values are missing in each column before repairing.
    print(data.isnull().sum())

    columns = data.keys()
    score_col, first_col, second_col = columns[0], columns[1], columns[2]

    # A missing second sentence means the row did not split properly: both
    # sentences may have ended up, tab-separated, inside the first-sentence field.
    needs_repair = data[second_col].isnull()
    unrepairable = []
    for index, merged in data.loc[needs_repair, first_col].items():
        # A missing first sentence is NaN (not a str) and cannot be split.
        parts = merged.split('\t') if isinstance(merged, str) else []
        if len(parts) != 2:
            unrepairable.append(index)
            continue
        data.at[index, first_col] = parts[0]
        data.at[index, second_col] = parts[1]

    # Drop all unrepairable rows in one go; index labels are left untouched.
    data = data.drop(index=unrepairable)

    # Rescale every score from the 0..max_score range to 0..1.
    data[score_col] = data[score_col] / max_score
    return data

In [63]:
# Read the training split first, then the validation (dev) split, both through
# the shared loader; each call prints its own missing-value report.
train_data = load_data(file_path='train.csv')
valid_data = load_data(file_path='dev.csv')

score        0
sentence1    0
sentence2    5
dtype: int64
score        0
sentence1    0
sentence2    2
dtype: int64


In [64]:
# Show the last ten rows of the validation split as the cell's rich output.
valid_data.iloc[-10:]

Unnamed: 0,score,sentence1,sentence2
1460,0.4,New UN peacekeeping chief named for Central Af...,UN takes over peacekeeping in Central African ...
1461,1.0,Oil falls in Asian trade,Oil prices down in Asian trade
1462,0.6,Israeli forces detain Palestinian MP in Hebron,Israeli forces detain 2 Palestinians in overni...
1463,0.6,Israeli police clash with Palestinian proteste...,Israel Police Clash With Palestinians in Jerus...
1464,0.0,"3 killed, 4 injured in Los Angeles shootings",Five killed in Saudi Arabia shooting
1465,0.4,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?
1466,0.0,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations
1467,0.4,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o..."
1468,0.0,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...
1469,0.0,New video shows US police officers beating men...,New York police officer critically wounded in ...


In [65]:
# Print the (rows, columns) shape of the training split, followed by how often
# each rescaled score value occurs.
print(train_data.shape, train_data['score'].value_counts(), sep='\n')

(5709, 3)
score
0.0000    367
0.8000    351
0.6000    308
1.0000    265
0.7600    263
         ... 
0.0134      1
0.1454      1
0.3400      1
0.3556      1
0.8660      1
Name: count, Length: 139, dtype: int64


### Defining device variable

In [66]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cpu


### Task 1A: Using BERT with a linear layer to predict the score directly

In [4]:
# Tokenizer for the pre-trained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pre-trained BERT with a sequence-classification head configured for 2 labels.
# NOTE(review): the section title talks about predicting the score directly;
# confirm that a 2-label head (rather than a single output) is intended.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
sep_token = '[SEP]'

# Quick sanity check: tokenize a sample sentence and show the resulting tokens.
text = "Hello, my dog is cute"
tokens = tokenizer.tokenize(text)
print(tokens)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['hello', ',', 'my', 'dog', 'is', 'cute']
