In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
from sklearn.model_selection import train_test_split
from keras.utils import pad_sequences

import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import ExponentialLR

from transformers import BertTokenizer, BertConfig, BertModel
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup

from tqdm import tqdm, trange

import warnings
warnings.filterwarnings('ignore')

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
gpu_num = torch.cuda.device_count()
gpu_num

1

In [4]:
torch.cuda.get_device_name()

'NVIDIA GeForce GTX 1660 SUPER'

In [5]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [6]:
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [7]:
#config
EPOCHS = 4
batch_size = 32
MAX_LEN = 256
MODEL_PATH = 'bert-base-uncased'

In [8]:
# First load the real tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, do_lower_case=True)
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')

('.\\tokenizer_config.json',
 '.\\special_tokens_map.json',
 '.\\vocab.txt',
 '.\\added_tokens.json')

In [9]:
sep = tokenizer.sep_token
train['inputs'] = train.discourse_type + sep +train.discourse_text
new_label = {"discourse_effectiveness": {"Ineffective": 0, "Adequate": 1, "Effective": 2}}
train = train.replace(new_label)
train = train.rename(columns = {"discourse_effectiveness": "label"})

In [10]:
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,label,inputs
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,1,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,1,"Position[SEP]On my perspective, I think that t..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,1,Claim[SEP]I think that the face is a natural l...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,1,"Evidence[SEP]If life was on Mars, we would kno..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,1,Counterclaim[SEP]People thought that the face ...


In [11]:
sentences = train.inputs.values
sentence_token = [tokenizer.tokenize(sen) for sen in sentences]
print(sentence_token[0])

['lead', '[SEP]', 'hi', ',', 'i', "'", 'm', 'isaac', ',', 'i', "'", 'm', 'going', 'to', 'be', 'writing', 'about', 'how', 'this', 'face', 'on', 'mars', 'is', 'a', 'natural', 'land', '##form', 'or', 'if', 'there', 'is', 'life', 'on', 'mars', 'that', 'made', 'it', '.', 'the', 'story', 'is', 'about', 'how', 'nasa', 'took', 'a', 'picture', 'of', 'mars', 'and', 'a', 'face', 'was', 'seen', 'on', 'the', 'planet', '.', 'nasa', 'doesn', "'", 't', 'know', 'if', 'the', 'land', '##form', 'was', 'created', 'by', 'life', 'on', 'mars', ',', 'or', 'if', 'it', 'is', 'just', 'a', 'natural', 'land', '##form', '.']


In [12]:
sentence_ids = [tokenizer.convert_tokens_to_ids(sen) for sen in sentence_token]
print(sentence_ids[0])

[2599, 102, 7632, 1010, 1045, 1005, 1049, 7527, 1010, 1045, 1005, 1049, 2183, 2000, 2022, 3015, 2055, 2129, 2023, 2227, 2006, 7733, 2003, 1037, 3019, 2455, 14192, 2030, 2065, 2045, 2003, 2166, 2006, 7733, 2008, 2081, 2009, 1012, 1996, 2466, 2003, 2055, 2129, 9274, 2165, 1037, 3861, 1997, 7733, 1998, 1037, 2227, 2001, 2464, 2006, 1996, 4774, 1012, 9274, 2987, 1005, 1056, 2113, 2065, 1996, 2455, 14192, 2001, 2580, 2011, 2166, 2006, 7733, 1010, 2030, 2065, 2009, 2003, 2074, 1037, 3019, 2455, 14192, 1012]


In [13]:
sentence_ids = pad_sequences(sentence_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
print(sentence_ids[0])

[ 2599   102  7632  1010  1045  1005  1049  7527  1010  1045  1005  1049
  2183  2000  2022  3015  2055  2129  2023  2227  2006  7733  2003  1037
  3019  2455 14192  2030  2065  2045  2003  2166  2006  7733  2008  2081
  2009  1012  1996  2466  2003  2055  2129  9274  2165  1037  3861  1997
  7733  1998  1037  2227  2001  2464  2006  1996  4774  1012  9274  2987
  1005  1056  2113  2065  1996  2455 14192  2001  2580  2011  2166  2006
  7733  1010  2030  2065  2009  2003  2074  1037  3019  2455 14192  1012
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [14]:
attention_mask = [[1 if id > 0 else 0 for id in sen] for sen in sentence_ids]
print(attention_mask[0])
#temp_ids = tokenizer(sentences, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='pt')
#dict_keys(['input_ids', 'token_type_ides', 'attention_mask'])
#print(temp_ids['input_ids'][0])
#print(temp_ids['token_type_ids'][0])
#print(temp_ids['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [15]:
labels = train.label.values
print(labels[:5])

[1 1 1 1 1]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(sentence_ids, labels, test_size=0.2, random_state=666)
train_masks, test_masks, _, _ = train_test_split(attention_mask, sentence_ids,test_size=0.2, random_state=666)

In [17]:
X_train = torch.tensor(X_train).to(device)
X_test = torch.tensor(X_test).to(device)
y_train = torch.tensor(y_train).to(device)
y_test = torch.tensor(y_test).to(device)

train_masks = torch.tensor(train_masks).to(device)
test_masks = torch.tensor(test_masks).to(device)

In [18]:
train_dataset = TensorDataset(X_train, train_masks, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test, test_masks, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [19]:
model = BertForSequenceClassification.from_pretrained(MODEL_PATH).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=len(train_dataloader)*EPOCHS)

In [21]:
def accuracy(labels, preds):
    preds = np.argmax(preds, axis=1).flatten()
    labels = labels.flatten()
    acc = np.sum(preds == labels)
    return acc

In [26]:
train_loss = []

for i in tqdm(range(EPOCHS), desc='Epoch'):
    model.train()
    
    tr_loss = 0
    tr_examples = 0
    tr_steps = 0
    
    for i, batch_data in enumerate(train_dataloader):
        batch_data = tuple(data.to(device) for data in batch_data)
        inputs_ids, inputs_masks, inputs_labels = batch_data
        optimizer.zero_grad()
        outputs = model(inputs_ids, token_type_ids=None, attention_mask=inputs_masks, labels=inputs_labels)
        print('keys : ', outputs.keys())
        loss=outputs['loss']
        train_loss.append(loss.item())
        tr_loss += loss.item()
        tr_examples += inputs_ids.size(0)
        tr_steps += 1
        loss.backward()
        optimizer.step()
        scheduler.step()
        
    print('Training loss : {}'.format(tr_loss / tr_steps))
    
    model.eval()
    eval_acc = 0.0, 0.0
    eval_steps, eval_examples = 0.0, 0.0
    
    for batch in test_dataloader:
        batch = tuple(data.to(device) for data in batch_data)
        inputs_ids, inputs_masks, inputs_labels = batch
        with torch.no_grad():
            preds = model(inputs_ids, token_type_ides=None, attention_masks=inputs_masks)
        preds = preds['logits'].detach().to('cpu').numpy()
        labels = inputs_labels.to('cpu').numpy()
        
        eval_acc += accuracy(labels, preds)
        eval_steps += 1
    
    print('Test Accuracy : {}'.format(eval_acc / eval_steps))
    print('/n/n')

Epoch:   0%|                                                                                     | 0/4 [00:03<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 6.00 GiB total capacity; 5.26 GiB already allocated; 0 bytes free; 5.32 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [24]:
pip install --upgrade setuptools pip



ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.1.5 requires pyqt5<5.13, which is not installed.
spyder 5.1.5 requires pyqtwebengine<5.13, which is not installed.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.



Collecting setuptools
  Using cached setuptools-65.3.0-py3-none-any.whl (1.2 MB)
Collecting pip
  Using cached pip-22.2.2-py3-none-any.whl (2.0 MB)
Installing collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 63.4.1
    Uninstalling setuptools-63.4.1:
      Successfully uninstalled setuptools-63.4.1
  Attempting uninstall: pip
    Found existing installation: pip 22.1.2
    Uninstalling pip-22.1.2:
      Successfully uninstalled pip-22.1.2
Successfully installed pip-22.2.2 setuptools-65.3.0
