In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
df= pd.read_csv("preprocessed_data.csv")
df.groupby('tag').describe()
labels=[]
for i in range(len(df)):
    temp=''
    if('permission'in df['tag'][i]):
        labels.append('yes')
    else:
        labels.append('no')
rest_texts, test_texts, rest_labels, test_labels = train_test_split(df['sentence'], labels, test_size=0.1, random_state=1)
train_texts, dev_texts, train_labels, dev_labels = train_test_split(rest_texts, rest_labels, test_size=0.1, random_state=1)

print("Train size:", len(train_texts))
print("Dev size:", len(dev_texts))
print("Test size:", len(test_texts))

Train size: 766
Dev size: 86
Test size: 95


In [24]:
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
print(label2idx)

{'yes': 0, 'no': 1}


In [25]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(multi_class="ovr", solver="lbfgs"))
])

parameters = {'lr__C': [0.1, 0.5, 1, 2, 5, 10, 100, 1000]}

best_classifier = GridSearchCV(pipeline, parameters, cv=5, verbose=1)
best_classifier.fit(train_texts, train_labels)
best_predictions = best_classifier.predict(test_texts)

baseline_accuracy = np.mean(best_predictions == test_labels)
print("Baseline accuracy:", baseline_accuracy)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Baseline accuracy: 0.8631578947368421


In [26]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Using the bert model

In [27]:
BERT_MODEL = "nlpaueb/bert-base-uncased-contracts"
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels = len(label2idx))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/bert-base-uncased-contracts and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [28]:
import logging
import numpy as np

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

MAX_SEQ_LENGTH=100

class BertInputItem(object):
    """An item with all the necessary attributes for finetuning BERT."""

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        

def convert_examples_to_inputs(example_texts, example_labels, label2idx, max_seq_length, tokenizer, verbose=0):
    """Loads a data file into a list of `InputBatch`s."""
    
    input_items = []
    examples = zip(example_texts, example_labels)
    for (ex_index, (text, label)) in enumerate(examples):

        # Create a list of token ids
        input_ids = tokenizer.encode(f"[CLS] {text} [SEP]")
        if len(input_ids) > max_seq_length:
            input_ids = input_ids[:max_seq_length]

        # All our tokens are in the first input segment (id 0).
        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label2idx[label]

        input_items.append(
            BertInputItem(text=text,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))

        
    return input_items

train_features = convert_examples_to_inputs(train_texts, train_labels, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)
dev_features = convert_examples_to_inputs(dev_texts, dev_labels, label2idx, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_inputs(test_texts, test_labels, label2idx, MAX_SEQ_LENGTH, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (710 > 512). Running this sequence through the model will result in indexing errors


In [29]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def get_data_loader(features, max_seq_length, batch_size, shuffle=True): 

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    return dataloader

BATCH_SIZE = 16

train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True)
dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)
test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

In [30]:
def evaluate(model, dataloader):
    model.eval()
    
    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, attention_mask=input_mask,
                                          token_type_ids=segment_ids, labels=label_ids).loss
            logits= model(input_ids, attention_mask=input_mask,
                                          token_type_ids=segment_ids, labels=label_ids).logits
            print(tmp_eval_loss)

        outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids = label_ids.to('cpu').numpy()
        
        predicted_labels += outputs
        correct_labels += list(label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)
        
    return eval_loss, correct_labels , predicted_labels

In [31]:
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup

GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 20
LEARNING_RATE = 5e-5
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 5

num_train_steps = int(len(train_dataloader.dataset) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps, num_train_steps)



In [32]:
import torch
import os
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support

OUTPUT_DIR = "/tmp/"
MODEL_FILE_NAME = "pytorch_model.bin"
PATIENCE = 2

loss_history = []
no_improvement = 0
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
        loss = outputs[0]

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        tr_loss += loss.item()

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)  
            
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            
    dev_loss, _, _ = evaluate(model, dev_dataloader)
    
    print("Loss history:", loss_history)
    print("Dev loss:", dev_loss)
    
    if len(loss_history) == 0 : #or dev_loss < min(loss_history):
        no_improvement = 0
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
    else:
        no_improvement += 1
    
    if no_improvement >= PATIENCE: 
        print("No improvement on development set. Finish training.")
        break
        
    
    loss_history.append(dev_loss)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):


Training iteration:   0%|          | 0/48 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):


Evaluation iteration:   0%|          | 0/6 [00:00<?, ?it/s]

tensor(0.0344, device='cuda:0')
tensor(0.3329, device='cuda:0')
tensor(0.2334, device='cuda:0')
tensor(0.0631, device='cuda:0')
tensor(0.5825, device='cuda:0')
tensor(0.0303, device='cuda:0')
Loss history: []
Dev loss: 0.21276293819149336


Epoch:   5%|█▊                                   | 1/20 [00:05<01:37,  5.12s/it]

Training iteration:   0%|          | 0/48 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/6 [00:00<?, ?it/s]

tensor(0.0394, device='cuda:0')
tensor(0.6122, device='cuda:0')
tensor(0.8091, device='cuda:0')
tensor(0.0245, device='cuda:0')
tensor(0.7108, device='cuda:0')


Epoch:  10%|███▋                                 | 2/20 [00:09<01:21,  4.51s/it]

tensor(0.1737, device='cuda:0')
Loss history: [0.21276293819149336]
Dev loss: 0.3949471547578772


Training iteration:   0%|          | 0/48 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/6 [00:00<?, ?it/s]

tensor(0.0160, device='cuda:0')
tensor(0.2717, device='cuda:0')
tensor(0.0190, device='cuda:0')
tensor(0.0139, device='cuda:0')
tensor(0.3284, device='cuda:0')


Epoch:  10%|███▋                                 | 2/20 [00:13<01:57,  6.54s/it]

tensor(0.0159, device='cuda:0')
Loss history: [0.21276293819149336, 0.3949471547578772]
Dev loss: 0.11081688556199272
No improvement on development set. Finish training.





In [33]:
model_state_dict = torch.load(os.path.join(OUTPUT_DIR, MODEL_FILE_NAME), map_location=lambda storage, loc: storage)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, state_dict=model_state_dict, num_labels = len(target_names))
model.to(device)

model.eval()

train_loss, train_correct, train_predicted = evaluate(model, train_dataloader)
dev_loss, dev_correct, dev_predicted = evaluate(model, dev_dataloader)
test_loss, test_correct, test_predicted = evaluate(model, test_dataloader)

print("Training performance:", precision_recall_fscore_support(train_correct, train_predicted, average="micro"))
print("Development performance:", precision_recall_fscore_support(dev_correct, dev_predicted, average="micro"))
print("Test performance:", precision_recall_fscore_support(test_correct, test_predicted, average="micro"))
print("Test loss: ", test_loss)

bert_accuracy = np.mean(test_predicted == test_correct)

print(classification_report(test_correct, test_predicted, target_names=target_names))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):


Evaluation iteration:   0%|          | 0/48 [00:00<?, ?it/s]

tensor(0.0306, device='cuda:0')
tensor(0.0955, device='cuda:0')
tensor(0.0477, device='cuda:0')
tensor(0.2375, device='cuda:0')
tensor(0.1505, device='cuda:0')
tensor(0.2611, device='cuda:0')
tensor(0.0340, device='cuda:0')
tensor(0.2295, device='cuda:0')
tensor(0.5011, device='cuda:0')
tensor(0.2887, device='cuda:0')
tensor(0.5448, device='cuda:0')
tensor(0.0269, device='cuda:0')
tensor(0.0796, device='cuda:0')
tensor(0.0330, device='cuda:0')
tensor(0.0488, device='cuda:0')
tensor(0.0822, device='cuda:0')
tensor(0.2355, device='cuda:0')
tensor(0.0322, device='cuda:0')
tensor(0.0308, device='cuda:0')
tensor(0.0295, device='cuda:0')
tensor(0.0666, device='cuda:0')
tensor(0.0354, device='cuda:0')
tensor(0.2484, device='cuda:0')
tensor(0.0778, device='cuda:0')
tensor(0.0329, device='cuda:0')
tensor(0.0370, device='cuda:0')
tensor(0.0305, device='cuda:0')
tensor(0.0304, device='cuda:0')
tensor(0.0321, device='cuda:0')
tensor(0.0658, device='cuda:0')
tensor(0.2270, device='cuda:0')
tensor(0

Evaluation iteration:   0%|          | 0/6 [00:00<?, ?it/s]

tensor(0.0344, device='cuda:0')
tensor(0.3329, device='cuda:0')
tensor(0.2334, device='cuda:0')
tensor(0.0631, device='cuda:0')
tensor(0.5825, device='cuda:0')
tensor(0.0303, device='cuda:0')


Evaluation iteration:   0%|          | 0/6 [00:00<?, ?it/s]

tensor(0.4506, device='cuda:0')
tensor(0.4022, device='cuda:0')
tensor(0.0678, device='cuda:0')
tensor(0.3830, device='cuda:0')
tensor(0.0833, device='cuda:0')
tensor(0.0380, device='cuda:0')
Training performance: (0.9621409921671018, 0.9621409921671018, 0.9621409921671018, None)
Development performance: (0.9302325581395349, 0.9302325581395349, 0.9302325581395349, None)
Test performance: (0.9157894736842105, 0.9157894736842105, 0.9157894736842105, None)
Test loss:  0.23747440924247107
              precision    recall  f1-score   support

         yes       0.84      0.94      0.89        33
          no       0.97      0.90      0.93        62

    accuracy                           0.92        95
   macro avg       0.90      0.92      0.91        95
weighted avg       0.92      0.92      0.92        95

