In [None]:
!pip install transformers==3.3.1

Collecting transformers==3.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 12.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 60.6MB/s 
[?25hCollecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 56.6MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64

In [None]:
# Fetch the CharacterBERT reference implementation; its `modeling` and `utils`
# packages and its `download.py` script are used by the cells below.
# NOTE(review): the clone is not pinned to a commit, so results may drift if
# the upstream repository changes.
!git clone https://github.com/helboukkouri/character-bert.git

Cloning into 'character-bert'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 51 (delta 12), reused 31 (delta 0), pack-reused 0[K
Unpacking objects: 100% (51/51), done.


In [None]:
# Work from inside the cloned repository so that `download.py` and the
# `modeling` / `utils` imports used later resolve relative to it.
%cd character-bert

/content/character-bert


In [None]:
# Download and unpack the pretrained medical CharacterBERT checkpoint
# (per the script's log: ~200MB archive, ~420MB once extracted). It is loaded
# later from ./pretrained-models/medical_character_bert/.
!python download.py --model='medical_character_bert'

21/12/2020 12:33:05 - INFO - download.py -   Downloading medical_character_bert model (~200MB tar.xz archive)
21/12/2020 12:33:09 - INFO - download.py -   Extracting model from archive (~420MB folder)
21/12/2020 12:33:22 - INFO - download.py -   Removing archive
21/12/2020 12:33:22 - INFO - download.py -   Done.


In [None]:
import pandas as pd
import numpy as np
import time
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from tqdm import tqdm, trange
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from modeling.character_bert import CharacterBertModel
from utils.character_cnn import CharacterIndexer

In [None]:
def epoch_time(start_time, end_time):
  """Split the span between two timestamps into whole minutes and seconds.

  Args:
    start_time: Timestamp in seconds (e.g. from ``time.time()``).
    end_time: Timestamp in seconds taken after ``start_time``.

  Returns:
    A ``(minutes, seconds)`` tuple of ints; fractional parts are truncated.
  """
  total_seconds = end_time - start_time
  whole_minutes = int(total_seconds / 60)
  leftover_seconds = int(total_seconds - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [None]:
# Build a 6-class BERT sequence classifier from the bert-base-uncased
# configuration only (no pretrained weights are loaded at this point).
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=6)
model = BertForSequenceClassification(config=config)

# Replace the classifier's encoder with the pretrained medical CharacterBERT,
# keeping the classification head created above.
character_bert_model = CharacterBertModel.from_pretrained(
    './pretrained-models/medical_character_bert/')
model.bert = character_bert_model

# The data-preparation cell uses only `tokenizer.basic_tokenizer` to split text
# into tokens; CharacterBERT consumes character ids rather than wordpieces.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use the GPU when one is present, otherwise fall back to CPU instead of
# crashing on a hard-coded .cuda() call. `device` stays a plain string.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
df_train = pd.read_csv('dataset.csv')
# Replace the label column with integer codes; `uniq` keeps the original label
# value for each code.
df_train['sentiment'], uniq = pd.factorize(df_train['sentiment'])
# The classification head was sized before the data was read, so fail early if
# the data contains more classes than the head can represent.
assert len(uniq) <= config.num_labels, "dataset has more classes than config.num_labels"
X = df_train['comment'].tolist()

# Reserve two positions for the special tokens so that no sequence exceeds the
# encoder's position-embedding table.
max_tokens = model.bert.config.max_position_embeddings - 2
# Wrap every sequence in [CLS] ... [SEP], as the CharacterBERT usage example
# does: the sequence-classification head pools the first position, which is
# expected to be the [CLS] token.
tokenized = [
    ['[CLS]', *tokenizer.basic_tokenizer.tokenize(text)[:max_tokens], '[SEP]']
    for text in X
]
indexer = CharacterIndexer()  # This converts each token into a list of character indices
input_tensor = indexer.as_padded_tensor(tokenized)
# Hold out 10% of the examples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    input_tensor, df_train['sentiment'].tolist(), test_size=0.1, random_state=42)

In [None]:
batch_size = 32

# as_tensor converts the label lists once and returns an existing tensor
# unchanged, so re-running this cell does not re-wrap tensors.
y_train, y_test = torch.as_tensor(y_train), torch.as_tensor(y_test)

train_data = TensorDataset(X_train, y_train)
# Draw training batches in a fresh random order every epoch.
train_dataloader = DataLoader(
    train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

val_data = TensorDataset(X_test, y_test)
# Keep the evaluation order fixed so predictions line up with y_test.
val_dataloader = DataLoader(
    val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)

In [None]:
param_optimizer = list(model.named_parameters())
# Biases and LayerNorm weights are excluded from weight decay. In PyTorch the
# LayerNorm parameters are named `...LayerNorm.weight` / `...LayerNorm.bias`,
# so the former 'gamma' / 'beta' patterns never matched anything.
no_decay = ['bias', 'LayerNorm.weight']
# AdamW reads the per-group key `weight_decay`; any other key is ignored and
# the optimizer default (0.0) would be applied to every parameter.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

In [None]:
epochs = 3
train_loss_set = []  # loss of every training step, across all epochs

for _ in trange(epochs, desc="Epoch"):
  start_time = time.time()
  model.train()
  # Running totals for the current epoch
  tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0

  for step, batch in enumerate(train_dataloader):
    # Each batch is (character-id tensor, label tensor); move both to the device
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_labels = batch

    optimizer.zero_grad()

    # The first element of the model output holds the class logits
    outputs = model(b_input_ids)[0]
    loss = loss_fn(outputs, b_labels)

    loss.backward()
    optimizer.step()

    # Read the scalar loss once and reuse it for both bookkeeping targets
    step_loss = loss.item()
    train_loss_set.append(step_loss)
    tr_loss += step_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  end_time = time.time()

  # Elapsed (minutes, seconds) for the epoch, then the mean per-step loss
  print(epoch_time(start_time, end_time))

  print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))

Epoch:  33%|███▎      | 1/3 [03:51<07:43, 231.64s/it]

(3, 51)

Train loss: 1.0491705411817969


Epoch:  67%|██████▋   | 2/3 [07:44<03:52, 232.09s/it]

(3, 53)

Train loss: 0.8557209728694544


Epoch: 100%|██████████| 3/3 [11:37<00:00, 232.43s/it]

(3, 52)

Train loss: 0.7482494985185019





In [None]:
preds = []
# Put the model in inference mode: without this, dropout stays active from
# training and the predictions are noisy and non-deterministic.
model.eval()
with torch.no_grad():
  # Running accuracy counters over the validation set
  correct = 0
  total = 0
  for i, batch in enumerate(val_dataloader):
    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_labels = batch

    # The first element of the model output holds the class logits
    outputs = model(b_input_ids)[0]
    prediction = torch.argmax(outputs, dim=1)
    total += b_labels.size(0)
    correct += (prediction == b_labels).sum().item()
    # Store predictions on the CPU so later per-element reads do not each
    # trigger a device-to-host transfer
    preds.append(prediction.cpu())


In [None]:
# Flatten the per-batch prediction tensors into one list of Python ints,
# preserving batch order.
final_preds = [
  int(label_id)
  for batch_preds in preds
  for label_id in batch_preds
]

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# NOTE(review): in the recorded run classes 4 and 5 score 0.00 across the
# board, i.e. the model never predicted them — worth checking class imbalance.
report = classification_report(y_test, final_preds)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.73      0.70       154
           1       0.75      0.86      0.80       407
           2       0.72      0.60      0.65        82
           3       0.43      0.30      0.35        44
           4       0.00      0.00      0.00        27
           5       0.00      0.00      0.00        14

    accuracy                           0.72       728
   macro avg       0.43      0.41      0.42       728
weighted avg       0.67      0.72      0.69       728



  _warn_prf(average, modifier, msg_start, len(result))
