In [32]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import ExponentialLR

from transformers import BertTokenizer, BertConfig, BertModel
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup

from tqdm import tqdm, trange

from sklearn.metrics import log_loss

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [34]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [35]:
# Number of CUDA devices as reported by torch.cuda.device_count(); displayed as the cell output.
gpu_num = torch.cuda.device_count()
gpu_num

In [36]:
# Show which accelerator is in use. torch.cuda.get_device_name() raises when no
# CUDA device is present, so only call it when CUDA is actually available and
# fall back to a plain label otherwise (the notebook explicitly supports CPU).
gpu_name = torch.cuda.get_device_name() if torch.cuda.is_available() else 'cpu (no CUDA device)'
gpu_name

In [37]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/feedback-prize-effectiveness/train.csv',
        '../input/feedback-prize-effectiveness/test.csv',
    )
)

In [38]:
# Preview the first rows of the raw training frame.
train.head()

In [39]:
# Config: hyper-parameters and model choice shared by the cells below.
EPOCHS = 4  # number of passes over the training dataloader; also sizes the LR schedule
batch_size = 32  # batch size handed to every DataLoader
MAX_LEN = 256  # token-id sequences are padded/truncated to this length
MODEL_PATH = 'bert-base-uncased'  # checkpoint name used for both the tokenizer and the classifier

In [40]:
# Fetch the tokenizer that belongs to the configured checkpoint (lower-casing enabled).
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    do_lower_case=True,
)
# Write the tokenizer files into the current working directory.
tokenizer.save_pretrained(save_directory='.')

In [41]:
# Model input text = "<discourse type><SEP><discourse text>", joined with the tokenizer's separator token.
sep = tokenizer.sep_token
train['inputs'] = train['discourse_type'] + sep + train['discourse_text']

# Encode the three effectiveness classes as integers and expose them under the column name "label".
new_label = {
    "discourse_effectiveness": {
        "Ineffective": 0,
        "Adequate": 1,
        "Effective": 2,
    }
}
train = (
    train
    .replace(new_label)
    .rename(columns={"discourse_effectiveness": "label"})
)

In [42]:
# Preview the frame again to inspect the added 'inputs' column and the integer 'label' column.
train.head()

In [43]:
# Split every input string into tokens with the loaded tokenizer.
sentence = train['inputs'].to_numpy()
sentence_token = list(map(tokenizer.tokenize, sentence))
# Show the tokens of the first example as a sanity check.
print(sentence_token[0])

In [44]:
# Map each token list to its vocabulary ids.
sentence_ids = list(map(tokenizer.convert_tokens_to_ids, sentence_token))
# Show the ids of the first example as a sanity check.
print(sentence_ids[0])

In [45]:
# Bring every id sequence to exactly MAX_LEN entries: longer sequences lose
# their tail, shorter ones are padded at the end.
sentence_ids = pad_sequences(
    sentence_ids,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
    dtype='long',
)
print(sentence_ids[0])

In [46]:
# Attention mask: 1 where the position holds a real token id, 0 where it is padding.
# Done as a single vectorised comparison rather than a Python loop over every id
# (which was slow and also shadowed the builtin `id`). Like the original, this
# treats any id <= 0 as padding.
attention_mask = (np.asarray(sentence_ids) > 0).astype('int64')
print(attention_mask[0])

In [47]:
# Integer class targets taken from the 'label' column, in the frame's row order.
labels = train['label'].to_numpy()
print(labels[0:5])

In [48]:
# Split ids, attention masks and labels in ONE call so all three stay row-aligned
# by construction. Previously the masks were split in a second call that only
# matched because the same random_state was repeated (and sentence_ids was passed
# again just to be thrown away). Same test_size and seed, so the partition is unchanged.
X_train, X_test, train_masks, test_masks, y_train, y_test = train_test_split(
    sentence_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=666,
)

In [49]:
# Turn every split into a tensor and move it to the selected device in one pass.
# The held-out labels get their own name (y_test_t) so the original y_test stays untouched.
X_train, X_test, y_train, y_test_t, train_masks, test_masks = (
    torch.tensor(split).to(device)
    for split in (X_train, X_test, y_train, y_test, train_masks, test_masks)
)

In [50]:
# Training batches are reshuffled every epoch.
train_dataset = TensorDataset(X_train, train_masks, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Held-out batches are only scored, never trained on, so shuffling buys nothing;
# keep a fixed order so evaluation is deterministic.
test_dataset = TensorDataset(X_test, test_masks, y_test_t)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [51]:
# Load the pretrained checkpoint with a 3-way classification head
# (one output per effectiveness class), then move it to the selected device.
model = BertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=3)
model = model.to(device)

In [52]:
# Sanity check: is the model's first parameter stored on a CUDA device?
first_param = next(model.parameters())
first_param.is_cuda

In [53]:
# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per training batch, so the schedule length is
# (batches per epoch) x (number of epochs); no warm-up phase is used.
total_training_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [54]:
def accuracy(labels, preds):
    """Count how many predictions match their labels.

    Despite the name, this returns the NUMBER of correct predictions, not a
    ratio; callers must divide by the number of examples themselves.

    Args:
        labels: array of true class indices; flattened before comparison.
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.

    Returns:
        The number of positions where the predicted class equals the label.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    return np.sum(predicted_classes == true_classes)

In [55]:
# Debug check that the training DataLoader exists; this prints the loader object itself, not its batches.
print(train_dataloader)

In [56]:
# Per-batch training loss across all epochs; plotted after training.
train_loss = []

for epoch in tqdm(range(EPOCHS), desc='Epoch'):
    # ---------------- training pass ----------------
    model.train()

    tr_loss = 0.0
    tr_steps = 0  # starts at 0 so the mean below divides by the true batch count
    for batch_data in train_dataloader:
        inputs_ids, inputs_masks, inputs_labels = tuple(data.to(device) for data in batch_data)
        optimizer.zero_grad()
        # Passing labels makes the model return the classification loss alongside the logits.
        outputs = model(inputs_ids, token_type_ids=None, attention_mask=inputs_masks, labels=inputs_labels)
        loss = outputs['loss']
        train_loss.append(loss.item())
        tr_loss += loss.item()
        tr_steps += 1
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per batch, not per epoch
    print('Training loss : {}'.format(tr_loss / max(tr_steps, 1)))

    # ---------------- evaluation pass ----------------
    model.eval()
    eval_correct = 0   # number of correctly classified held-out examples
    eval_examples = 0  # number of held-out examples seen

    for batch in test_dataloader:
        # Unpack the held-out batch itself (not the last training batch).
        inputs_ids, inputs_masks, inputs_labels = tuple(data.to(device) for data in batch)
        with torch.no_grad():
            preds = model(inputs_ids, token_type_ids=None, attention_mask=inputs_masks)
        preds = preds['logits'].detach().to('cpu').numpy()
        # Named batch_labels so the notebook-level `labels` array is not overwritten.
        batch_labels = inputs_labels.to('cpu').numpy()

        # accuracy() returns a count of correct predictions, so normalise by
        # the number of examples (not the number of batches) at the end.
        eval_correct += accuracy(batch_labels, preds)
        eval_examples += batch_labels.shape[0]

    print('Test Accuracy : {}'.format(eval_correct / max(eval_examples, 1)))
    print('\n\n')

In [59]:
# Plot the loss recorded for every training batch (all epochs concatenated).
fig, ax = plt.subplots(figsize=(12, 10))
ax.set_title('Training Loss')
ax.set_xlabel('Batch')
ax.set_ylabel('Loss')
ax.plot(train_loss)
plt.show()

In [60]:
# Build the test inputs the same way as the training inputs:
# "<discourse type><SEP><discourse text>" -> tokens -> ids -> fixed-length ids.
test['inputs'] = test.discourse_type + sep + test.discourse_text
sents = test.inputs.values
tokens = [tokenizer.tokenize(sen) for sen in sents]
ids = [tokenizer.convert_tokens_to_ids(sen) for sen in tokens]
ids = pad_sequences(ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
# Attention mask: 1 for real tokens, 0 for padding. One vectorised comparison
# instead of a Python loop over every id (which also shadowed the builtin `id`).
# Like the original, this treats any id <= 0 as padding.
masks = (np.asarray(ids) > 0).astype('int64')

In [61]:
# Convert the test ids and masks to tensors on the selected device.
ids, masks = (torch.tensor(arr).to(device) for arr in (ids, masks))

In [62]:
# Wrap the test tensors for batched inference.
dataset = TensorDataset(ids, masks)
# Do NOT shuffle at inference time: the batches must come out in the same order
# as the rows of `test`, otherwise predictions cannot be matched back to them.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [63]:
model.eval()

# Predicted class index for every test example, in the order the dataloader
# yields them, so the results remain available after the loop.
test_pred_labels = []

for batch in dataloader:
    inputs_ids, inputs_masks = tuple(data.to(device) for data in batch)
    with torch.no_grad():
        outputs = model(inputs_ids, token_type_ids=None, attention_mask=inputs_masks)
    logits = outputs['logits'].detach().to('cpu').numpy()

    # Highest-scoring class per example in this batch.
    batch_pred = np.argmax(logits, axis=1)
    test_pred_labels.extend(batch_pred.tolist())
    print(batch_pred)