In [0]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("There are {} GPUs available.".format(torch.cuda.device_count()))
    print("We will use GPU {}".format(torch.cuda.get_device_name(0)))
else:
    print("There is no GPU available, using the CPU instead!")

In [0]:
import pandas as pd

# Load both splits from the working directory; later cells read the "text" and "label" columns.
train_content, test_content = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

def read_file(contents):
    """Split a data frame into parallel lists of texts and labels.

    Args:
        contents: pandas DataFrame with a ``"text"`` column and a ``"label"``
            column.

    Returns:
        A ``(text_list, label_list)`` tuple; element ``i`` of each list comes
        from row ``i`` of ``contents`` (positional order, whatever the index).

    Raises:
        KeyError: if either column is missing.
    """
    # Pull each column out in one step instead of materialising every row
    # through ``iloc``; ``tolist()`` also yields plain Python scalars.
    return contents["text"].tolist(), contents["label"].tolist()

# Unpack each split into parallel text / label lists.
train_text, train_label = read_file(train_content)
test_text, test_label = read_file(test_content)

# Report the number of training and test examples, one count per line.
print(len(train_text), len(test_text), sep="\n")

In [0]:
# Now we gonna load the module
!pip install transformers

In [0]:
from transformers import BertTokenizer

# Instantiate the tokenizer for the 'bert-base-uncased' checkpoint with lower-casing enabled.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [0]:
import nltk
import ssl


def _download_nltk_resource(resource):
    """Download an NLTK resource, preferring a certificate-verified connection.

    A normal (verified) download is attempted first. Only if that fails is the
    download retried with HTTPS certificate verification disabled, and the
    default context factory is restored afterwards so the rest of the session
    keeps verifying certificates.

    Args:
        resource: NLTK resource identifier, e.g. ``'punkt'``.

    Returns:
        The result of the last ``nltk.download`` call (truthy on success).
    """
    if nltk.download(resource):
        return True

    # Best-effort fallback for machines with a broken certificate store:
    # disable verification for this one retry only.
    verified_context_factory = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        return nltk.download(resource)
    finally:
        ssl._create_default_https_context = verified_context_factory


_download_nltk_resource('punkt')
sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')

In [0]:
def convert_para_to_id(contents, para_length, segmenter=None, encoder=None):
    """Encode paragraphs as fixed-length token-id lists with attention masks.

    Each paragraph is split into sentences, every sentence is encoded
    separately (with special tokens), and the ids are concatenated in order
    until at least ``para_length`` ids have been collected. The result is then
    truncated to ``para_length`` or right-padded with id 0.

    Args:
        contents: iterable of paragraph strings.
        para_length: exact number of ids to emit per paragraph.
        segmenter: object with ``tokenize(text) -> list of sentences``.
            Defaults to the notebook-level ``sent_segmenter``.
        encoder: object with ``encode(sentence, add_special_tokens=True)``
            returning a list of ids. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        ``(contents_ids_list, attentions_list)``: two lists with one entry per
        paragraph, each entry a list of length ``para_length``. The mask holds
        1 for real tokens and 0 for padding positions.
    """
    if segmenter is None:
        segmenter = sent_segmenter
    if encoder is None:
        encoder = tokenizer

    contents_ids_list = []
    attentions_list = []
    for content in contents:
        token_ids = []
        for sentence in segmenter.tokenize(content):
            # Stop once the budget is filled. Each sentence is encoded exactly
            # once, so later sentences are used instead of repeating the first.
            if len(token_ids) >= para_length:
                break
            token_ids.extend(encoder.encode(sentence, add_special_tokens=True))

        token_ids = token_ids[:para_length]
        # Measure the real length BEFORE padding so that padded positions are
        # masked out with 0 rather than marked as attended.
        real_count = len(token_ids)
        pad_count = para_length - real_count

        contents_ids_list.append(token_ids + [0] * pad_count)
        attentions_list.append([1] * real_count + [0] * pad_count)
    return contents_ids_list, attentions_list


# Encode every training paragraph with a paragraph length of 200.
train_ids_list, train_attention_list = convert_para_to_id(train_text, para_length=200)

# Both lists should report the same length: one entry per training paragraph.
print(len(train_ids_list), len(train_attention_list), sep="\n")

In [0]:
# Convert label list to Tensor
import torch

# Wrap the label, token-id and attention-mask lists as LongTensors.
labels, input_ids, attention_mask = (
    torch.LongTensor(values)
    for values in (train_label, train_ids_list, train_attention_list)
)

In [0]:
# Combine the training inputs into a TensorDataset
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Each dataset item is an aligned (input ids, attention mask, label) triple.
train_data = TensorDataset(input_ids, attention_mask, labels)

# Batches of 100 examples, drawn through a RandomSampler over the dataset.
train_data_loader = DataLoader(
    train_data,
    batch_size=100,
    sampler=RandomSampler(train_data),
)
print(len(train_data_loader))

In [0]:
# Load the Model
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load the 'bert-base-uncased' checkpoint into a sequence-classification model
# configured for 2 labels, without returning attentions or hidden states.
model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False
)

print("The Model Loading Completed!...")

# Move the model to the device selected earlier in the notebook. Unlike an
# unconditional `model.cuda()`, this also works when only the CPU is available.
model.to(device)

In [0]:
# Build the AdamW optimizer over all model parameters (learning rate 2e-5, epsilon 1e-8).
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)
print("Optimizer Loading Completed!...")

In [0]:
from transformers import get_linear_schedule_with_warmup

# Number of full passes over the training data.
epochs = 2

# Total number of batches processed over the whole run (batches per epoch x epochs).
total_steps = epochs * len(train_data_loader)

# Linear learning-rate schedule over `total_steps` with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [0]:
# Define a helper function for calculating Accuracy
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match the labels.

    Args:
        preds: 2-D NumPy array of scores; the predicted class of each row is
            the index of its largest value along axis 1.
        labels: NumPy array of true class ids; it is flattened before the
            comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [0]:
# Use the first 100 test examples as the development (validation) set.
dev_text, dev_label = test_text[:100], test_label[:100]

# Encode the dev paragraphs with the same paragraph length (200) used for training.
dev_ids_list, dev_attention_list = convert_para_to_id(dev_text, 200)
print(len(dev_ids_list))

# Wrap ids, masks and labels as LongTensors and bundle them into one dataset.
dev_input_ids = torch.LongTensor(dev_ids_list)
dev_attention_mask = torch.LongTensor(dev_attention_list)
dev_labels = torch.LongTensor(dev_label)
dev_data = TensorDataset(dev_input_ids, dev_attention_mask, dev_labels)

# Batches of 10 examples, drawn through a RandomSampler over the dev dataset.
dev_data_loader = DataLoader(
    dev_data,
    batch_size=10,
    sampler=RandomSampler(dev_data),
)
print(len(dev_data_loader))

In [0]:
# Now training
import random
import numpy as np

seed_val = 42

# Seed the Python, NumPy and PyTorch random number generators.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

training_stats = []  # one dict of metrics per epoch


for epoch_i in range(epochs):

    print("")
    print("======== Epoch {:} / {:} ========".format(epoch_i + 1, epochs))
    print("Training...")

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_data_loader):

        # Progress report every 10 batches, using the 1-based batch number.
        if (step + 1) % 10 == 0:
            print("Batch {} of {}".format(step + 1, len(train_data_loader)))

        # Each batch is an (input ids, attention mask, labels) triple.
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(batch_input_ids, token_type_ids=None,
                        attention_mask=batch_input_mask,
                        labels=batch_labels)
        # Index the output instead of tuple-unpacking it: integer indexing
        # works for plain tuples and for the dict-like output objects returned
        # by newer transformers releases, whereas unpacking the latter yields
        # its keys. With labels supplied, element 0 is the loss.
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 (in-place variant; the un-suffixed
        # `clip_grad_norm` is deprecated).
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_data_loader)

    print("")
    print(" Average Training Loss is {:.2f}".format(avg_train_loss))

    # Now perform validation

    print("")
    print("Running Validation...")

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in dev_data_loader:
        dev_b_input_ids = batch[0].to(device)
        dev_b_input_mask = batch[1].to(device)
        dev_b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(dev_b_input_ids,
                            token_type_ids=None,
                            attention_mask=dev_b_input_mask,
                            labels=dev_b_labels)
        # Same indexing as in training: element 0 is the loss, element 1 the logits.
        loss, logits = outputs[0], outputs[1]
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = dev_b_labels.to("cpu").numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Both metrics are averaged over batches (not over individual examples).
    avg_val_loss = total_eval_loss / len(dev_data_loader)
    avg_val_accuracy = total_eval_accuracy / len(dev_data_loader)
    print("Validation loss :{}".format(avg_val_loss))
    print("Accuracy is {}".format(avg_val_accuracy))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy
        }
    )

print("")
print("Training Complete!...")

In [0]:
# Persist the fine-tuned model object to "BERT_v1.0.pkl".
# NOTE(review): this saves the whole model object rather than just its weights;
# loading it back presumably requires the same model class to be importable —
# confirm before sharing the file or upgrading libraries.
torch.save(model, "BERT_v1.0.pkl")