In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os


#os.environ['HF_HOME'] = '/data/users/ugarg/hf/hf_cache/'
os.environ['TRANSFORMERS_CACHE'] = '/data/users/ugarg/hf/hf_cache/'

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import torch
import numpy as np
import random
from transformers import AdamW, AutoTokenizer,  AutoModel
from torch.nn.functional import one_hot
from collections import Counter
from transformers import DataCollatorForTokenClassification
from torch.utils.data import DataLoader
from data_layer import create_splits_and_vocab 
from utils import tokenize_batch
from models import Model
from torch.optim import AdamW, Adam
from transformers import get_scheduler
from accelerate import Accelerator
import joblib
from engines import rel_trainer, postprocess_rel, rel_predict_on_batch, ner_trainer, postprocess_ner, ner_predict_on_batch, mtl_trainer
from prediction import predict

from utils import align_labels_with_tokens
def seed_everything(seed: int):
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    


In [7]:
def main(data_path, train_size, model_checkpoint, batch_size, dropout, lr, early_stopping_steps, num_train_epochs, device, seed, task):
    seed_everything(seed)
    print(f'\nYo! Model will run on GPU: {device}\n')
    dataset, ner_id2label, ner_label2id, rel_id2label, rel_label2id = create_splits_and_vocab(data_path, train_size, seed)
    
    
    
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    
    print('\nTokenizing Data...')
    ## batch size =5000 because it makes the batches of different lengths and dataloader the doesnt work
    tokenized_dataset = dataset.map(tokenize_batch, fn_kwargs = {'tokenizer':tokenizer, 
                                                        'ner_label2id':ner_label2id, 
                                                            'rel_label2id' :rel_label2id}, batched = True,remove_columns = dataset['train'].column_names, batch_size = 5000)
    
    print('Tokenizing Data Done.\n')
    
    #collate
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    
    #data loader

    train_dataloader = DataLoader(
        tokenized_dataset["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=batch_size,
    )
    eval_dataloader = DataLoader(
        tokenized_dataset["val"], collate_fn=data_collator, batch_size=batch_size
    )
    print('\nDataLoaders Created.\n')
    
    print('\nCreating Model')
    model = Model(model_checkpoint, ner_id2label, ner_label2id, rel_id2label, rel_label2id, dropout = dropout, device = device)
    model.to(device)
    print('\nModel Created\n')

    optimizer = AdamW(model.parameters(), lr=lr)
    
    #accelarator
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    
    
    #scheduler
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    
    print('Starting Training..')
    model.to(device)
    if task == 'rel':
        model, optimizer = rel_trainer(model, tokenizer, rel_id2label, train_dataloader, accelerator, optimizer, 
                                       lr_scheduler, eval_dataloader, num_training_steps, num_train_epochs, early_stopping_steps, device)

    elif task =='ner':
        model, optimizer = ner_trainer(model, tokenizer, ner_id2label, train_dataloader, accelerator, optimizer, lr_scheduler, eval_dataloader, num_training_steps, num_train_epochs, early_stopping_steps, device)
    
    
    elif task =='mtl':
        model, optimizer = mtl_trainer(model, tokenizer, train_dataloader, accelerator, optimizer, lr_scheduler, eval_dataloader, num_training_steps, num_train_epochs, early_stopping_steps, device)
    return model, tokenizer, eval_dataloader


In [16]:
model, tokenizer, eval_dataloader = main('data/', 1800,
        model_checkpoint = "bert-base-uncased",batch_size = 32, dropout = 0.1, lr = 1e-4, early_stopping_steps = 4, num_train_epochs = 30, device = 'cuda:1',seed = 42, task = 'rel')


Yo! Model will run on GPU: cuda:1

Creating Splits with training size 1800...
Creating Vocab...
ner_id2label: 
{0: 'B_subject', 1: 'I_subject', 2: 'B_person', 3: 'B_movie', 4: 'B_director', 5: 'I_director', 6: 'B_release_year', 7: 'B_country', 8: 'I_producer', 9: 'I_char', 10: 'I_movie', 11: 'I_person', 12: 'B_cast', 13: 'I_cast', 14: 'B_char', 15: 'B_location', 16: 'I_release_year', 17: 'B_language', 18: 'O', 19: 'I-movie', 20: 'I_mpaa_rating', 21: 'I_language', 22: 'I_genre', 23: 'B_producer', 24: 'I_country', 25: 'B_mpaa_rating', 26: 'B_genre'}

ner_label2id: 
{'B_subject': 0, 'I_subject': 1, 'B_person': 2, 'B_movie': 3, 'B_director': 4, 'I_director': 5, 'B_release_year': 6, 'B_country': 7, 'I_producer': 8, 'I_char': 9, 'I_movie': 10, 'I_person': 11, 'B_cast': 12, 'I_cast': 13, 'B_char': 14, 'B_location': 15, 'I_release_year': 16, 'B_language': 17, 'O': 18, 'I-movie': 19, 'I_mpaa_rating': 20, 'I_language': 21, 'I_genre': 22, 'B_producer': 23, 'I_country': 24, 'B_mpaa_rating': 25, '

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Tokenizing Data Done.


DataLoaders Created.


Creating Model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Model Created

Starting Training..


  0%|          | 0/1710 [00:00<?, ?it/s]

[Epoch 0 / 30]

Epoch: 0 	Training Loss: 0.264357 	Validation Loss: 0.042348 	Validation Acc: 0.052980
Loss improved, saving model..
[Epoch 1 / 30]

Epoch: 1 	Training Loss: 0.145253 	Validation Loss: 0.024804 	Validation Acc: 0.682119
Loss improved, saving model..
[Epoch 2 / 30]

Epoch: 2 	Training Loss: 0.076876 	Validation Loss: 0.014254 	Validation Acc: 0.852097
Loss improved, saving model..
[Epoch 3 / 30]

Epoch: 3 	Training Loss: 0.047961 	Validation Loss: 0.011065 	Validation Acc: 0.885210
Loss improved, saving model..
[Epoch 4 / 30]

Epoch: 4 	Training Loss: 0.034824 	Validation Loss: 0.009409 	Validation Acc: 0.887417
Loss improved, saving model..
[Epoch 5 / 30]

Epoch: 5 	Training Loss: 0.026414 	Validation Loss: 0.009201 	Validation Acc: 0.894040
Loss improved, saving model..
[Epoch 6 / 30]

Epoch: 6 	Training Loss: 0.020612 	Validation Loss: 0.007957 	Validation Acc: 0.909492
Loss improved, saving model..
[Epoch 7 / 30]

Epoch: 7 	Training Loss: 0.016677 	Validation Loss: 0

In [23]:
data = joblib.load('data/val.joblib')
data['utterances'] = data.utterances.apply(lambda x: ' '.join(x))
#data['IOB Slot tags'] = data['IOB Slot tags'].apply(lambda x: ' '.join(x))
#data['Core Relations'] = data['Core Relations'].apply(lambda x: ' '.join(x))
data

# data = pd.read_excel('data/hw1_test.xlsx', engine = 'openpyxl')
# data.head()

Unnamed: 0,utterances,IOB Slot tags,Core Relations
0,bring up all diving related movies,"[O, O, O, B_subject, O, O]",[movie.subjects]
1,what are the names of movies with zombies in them,"[O, O, O, O, O, O, O, B_subject, O, O]",[movie.subjects]
2,can you give me a list of movies produced by b...,"[O, O, O, O, O, O, O, O, O, O, B_producer, I_p...",[movie.produced_by]
3,show me the dirty dancing movie,"[O, O, O, B_movie, I_movie, O]",[no_role]
4,i'd like to see foreign films,"[O, O, O, O, O, O]",[movie.country]
...,...,...,...
448,dutch movie listing,"[B_country, O, O]","[movie.country, movie.language]"
449,show all movies by paramount,"[O, O, O, O, B_producer]",[movie.production_companies]
450,films produced by twentieth century fox,"[O, O, O, B_producer, I_producer, I_producer]",[movie.production_companies]
451,list french movies,"[O, B_country, O]","[movie.country, movie.language]"


In [30]:
!python test.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
# data['ner_act'] = data['IOB Slot tags'].apply(lambda x: ' '.join(x))
# data['ner_acc'] = data.apply(lambda x: 1 if x['ner_pred'] == x['ner_act'] else 0, axis=1)
# np.mean(data.ner_acc.values)
# #data[data.ner_acc==0].head()

In [24]:
def final_prediction(data, task):
    if task == 'rel':
        data['rel_pred'] = ''
        tokenizer = AutoTokenizer.from_pretrained("checkpoints/rel/tokenizer.pt/")
        checkpoint = torch.load('checkpoints/rel/checkpoint.tar') 
        model = checkpoint['model']
        #model.to('cpu')
        rel_id2label = joblib.load('util_files/rel_id2label.joblib')
        preds = predict(data.utterances.values, model, tokenizer, rel_id2label, task = task)
        rel_preds = []

        for idx, pred in enumerate(preds):
            data.loc[idx, 'rel_pred'] = pred[1]
            #rel_preds.append((pred[0], pred[1], ' '.join(data.loc[idx, 'Core Relations'])))
        
        
        
            
        
        return data
            
    if task == 'ner':
        tokenizer = AutoTokenizer.from_pretrained("checkpoints/ner/tokenizer.pt/")
        checkpoint = torch.load('checkpoints/ner/checkpoint.tar') 
        model = checkpoint['model']
        
        ner_id2label = joblib.load('util_files/ner_id2label.joblib')
        
        preds = []
        data['ner_pred'] = ''
        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
        for idx, example in enumerate(data.utterances.values):
            example = example.split(' ')

            d = tokenizer(example, is_split_into_words=True)
            #print(d)
            d['rel_labels'] = [-101 for i in d['input_ids']]
            d['ner_labels'] = [0 for i in d['input_ids']]
            #print(tokenizer.convert_ids_to_tokens(d['input_ids']))

            for c in d:
                d[c] = torch.Tensor(d[c]).long().unsqueeze(0)
            _, pred, _ = model(d, task = 'ner')
            pred  = pred.argmax(dim=2)[0].detach().cpu().clone().numpy()

            word_ids = d.word_ids()


            prev_word_id = None
            for ind in range(len(word_ids)):
                if word_ids[ind] is None or prev_word_id == word_ids[ind]:
                    pred[ind] = -100
                else:
                    prev_word_id = word_ids[ind]




            pred = [i for i in pred if i!=-100]

            pred = [ner_id2label[i] for i in pred]
            #print(pred)
            data.loc[idx, 'ner_pred'] = ' '.join(pred)
            
    return data

In [31]:
data = final_prediction(data, task = 'ner')

In [32]:
data

Unnamed: 0,utterances,IOB Slot tags,Core Relations,rel_pred,rel_act,rel_acc,ner_pred
0,bring up all diving related movies,"[O, O, O, B_subject, O, O]",[movie.subjects],movie.subjects,movie.subjects,1,I_genre I_genre I_genre B_language I_genre I_g...
1,what are the names of movies with zombies in them,"[O, O, O, O, O, O, O, B_subject, O, O]",[movie.subjects],movie.subjects,movie.subjects,1,I_genre I_genre I_genre I_genre I_genre I_genr...
2,can you give me a list of movies produced by b...,"[O, O, O, O, O, O, O, O, O, O, B_producer, I_p...",[movie.produced_by],movie.produced_by,movie.produced_by,1,I_genre I_genre I_genre I_genre I_genre I_genr...
3,show me the dirty dancing movie,"[O, O, O, B_movie, I_movie, O]",[no_role],no_role,no_role,1,I_genre I_genre I_genre I_producer I_mpaa_rati...
4,i'd like to see foreign films,"[O, O, O, O, O, O]",[movie.country],movie.country,movie.country,1,I_genre I_genre I_genre I_genre I_genre I_genre
...,...,...,...,...,...,...,...
448,dutch movie listing,"[B_country, O, O]","[movie.country, movie.language]",movie.country movie.language,movie.country movie.language,1,B_producer I_genre I_genre
449,show all movies by paramount,"[O, O, O, O, B_producer]",[movie.production_companies],movie.production_companies,movie.production_companies,1,I_genre I_genre I_genre I_genre B_movie
450,films produced by twentieth century fox,"[O, O, O, B_producer, I_producer, I_producer]",[movie.production_companies],movie.production_companies,movie.production_companies,1,I_genre I_genre I_genre B_movie B_release_year...
451,list french movies,"[O, B_country, O]","[movie.country, movie.language]",movie.country movie.language,movie.country movie.language,1,I_genre B_producer I_genre


In [169]:
with open('data/prediction_no_postprocess.txt', 'w') as f:
    for i in range(len(data)):
    
        l = data.loc[i].values[1:]
        f.write(l[0] + "\t" + l[1] + "\n")

In [17]:
with open('data/dev_preds.txt', 'r') as f:
    dev_preds = f.readlines()
with open('data/dev_act.txt', 'r') as f:
    dev_act = f.readlines()

In [20]:
dev_preds = [i.split('\t')[0] for i in dev_preds]
dev_act = [i.split('\t')[0] for i in dev_act]

In [25]:
for i in zip(dev_preds, dev_act):
    if len(i[0].split())!=len(i[1].split()):
        print(i)

In [None]:
data['rel_pred'] = data.rel_pred.apply(lambda x: '' if x=='no_role' else x)
# with open('data/dev_preds.txt', 'w') as f:
#     for i in range(len(data)):
    
#         l = data.loc[i].values[3:5]
#         f.write(l[0] + "\t" + l[1] + "\n")
# with open('data/dev_act.txt', 'w') as f:
#     for i in range(len(data)):
    
#         l = data.loc[i].values[1:3]
#         f.write(l[0] + "\t" + l[1] + "\n")

In [180]:
data.loc[0].values[1:3]

array(['O O O B_subject O O', 'movie.subjects'], dtype=object)

In [13]:
data

Unnamed: 0,utterances,IOB Slot tags,Core Relations,ner_pred,rel_pred
0,bring up all diving related movies,O O O B_subject O O,movie.subjects,O O O B_subject O O,movie.subjects
1,what are the names of movies with zombies in them,O O O O O O O B_subject O O,movie.subjects,O O O O O O O B_subject O O,movie.subjects
2,can you give me a list of movies produced by b...,O O O O O O O O O O B_producer I_producer,movie.produced_by,O O O O O O O O O O B_producer I_producer,movie.produced_by
3,show me the dirty dancing movie,O O O B_movie I_movie O,no_role,O O O B_movie I_movie O,no_role
4,i'd like to see foreign films,O O O O O O,movie.country,O O O O O O,movie.country
...,...,...,...,...,...
448,dutch movie listing,B_country O O,movie.country movie.language,B_country O O,movie.country movie.language
449,show all movies by paramount,O O O O B_producer,movie.production_companies,O O O O B_producer,movie.production_companies
450,films produced by twentieth century fox,O O O B_producer I_producer I_producer,movie.production_companies,O O O B_producer I_producer I_producer,movie.production_companies
451,list french movies,O B_country O,movie.country movie.language,O B_country O,movie.country movie.language


In [131]:
# data['ner_act'] = data['IOB Slot tags'].apply(lambda x: ' '.join(x))
# data['ner_acc'] = data.apply(lambda x: 1 if x['ner_pred'] == x['ner_act'] else 0, axis=1)
# np.mean(data.ner_acc.values)
# #data[data.ner_acc==0].head()

0.9227373068432672

In [150]:
# c = []
# for pred in rel_preds:
#     if pred[1] == pred[2]:
#         c.append(1)
#     else:
#         c.append(0)
    

In [151]:
# np.mean(c)

0.9094922737306843

In [27]:
data['rel_act'] = data['Core Relations'].apply(lambda x: ' '.join(x))
data['rel_acc'] = data.apply(lambda x: 1 if x['rel_pred'] == x['rel_act'] else 0, axis=1)
np.mean(data.rel_acc.values)
#data[data.ner_acc==0].head()

0.9006622516556292

In [156]:
data

Unnamed: 0,utterances,IOB Slot tags,Core Relations,rel_pred,rel_act,rel_acc
0,bring up all diving related movies,"[O, O, O, B_subject, O, O]",[movie.subjects],movie.subjects,movie.subjects,1
1,what are the names of movies with zombies in them,"[O, O, O, O, O, O, O, B_subject, O, O]",[movie.subjects],movie.subjects,movie.subjects,1
2,can you give me a list of movies produced by b...,"[O, O, O, O, O, O, O, O, O, O, B_producer, I_p...",[movie.produced_by],movie.produced_by,movie.produced_by,1
3,show me the dirty dancing movie,"[O, O, O, B_movie, I_movie, O]",[no_role],no_role,no_role,1
4,i'd like to see foreign films,"[O, O, O, O, O, O]",[movie.country],movie.country,movie.country,1
...,...,...,...,...,...,...
448,dutch movie listing,"[B_country, O, O]","[movie.country, movie.language]",movie.country movie.language,movie.country movie.language,1
449,show all movies by paramount,"[O, O, O, O, B_producer]",[movie.production_companies],movie.production_companies,movie.production_companies,1
450,films produced by twentieth century fox,"[O, O, O, B_producer, I_producer, I_producer]",[movie.production_companies],movie.production_companies,movie.production_companies,1
451,list french movies,"[O, B_country, O]","[movie.country, movie.language]",movie.country movie.language,movie.country movie.language,1


In [182]:
def opener(filename):
    with open(filename, 'r') as f:
        uncleaned_relations = [line.strip().split(' ') for line in f.readlines()]
        intents = []
        # will split into slots and intents
        for uncleaned_relation in uncleaned_relations:
            for i in range(len(uncleaned_relation)):
                if '\t' in uncleaned_relation[i]:
                    intent = []
                    split_relations = uncleaned_relation[i].split('\t')
                    uncleaned_relation[i] = split_relations[0]
                    intent.append(split_relations[1])
                    if i+1 < len(uncleaned_relation):
                        intent+=uncleaned_relation[i+1:]
                    intents.append(intent)
        relations = uncleaned_relations
        return relations, intents

In [220]:
#parser = argparse.ArgumentParser()
#parser.add_argument('--test', action='store_true')
#args = parser.parse_args()

y_true, y_true_intents = opener('data/dev_act.txt')
y_pred, y_pred_intents = opener('data/dev_preds.txt')
print(y_true[1], y_pred[1])

print("Sentence Accuracy is {:.2f}%".format(accuracy_score(y_true, y_pred) * 100))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_subject', 'O', 'O'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_subject', 'O', 'O']
Sentence Accuracy is 55.37%


In [222]:
y_true[i]

TypeError: list indices must be integers or slices, not tuple

In [223]:
for i in range(len(y_true)):
    if len(y_true[i])!=len(y_pred[i]):
        print(i)

12
31
34
63
128
173
199
205
221
225
226
243
281
353
361
363
394
424


In [185]:
def accuracy_score(y_true, y_pred):
    """Accuracy classification score.
    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.
    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.
    Returns:
        score : float.
    Example:
        >>> from seqeval.metrics import accuracy_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> accuracy_score(y_true, y_pred)
        0.80
    """
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    nb_correct = sum(y_t==y_p for y_t, y_p in zip(y_true, y_pred))
    nb_true = len(y_true)

    score = nb_correct / nb_true

    return score

In [188]:
y_true = [i for j in y_true for i in j]
y_pred = [i for j in y_pred for i in j]

In [210]:
c = []
for i in zip(y_true, y_pred):
    if len(i[0]) != len(i[1]):
        print(' '.join(i[0]))

O O O B_mpaa_rating O O B_cast I_cast movie.starring.actor
O O O O O B_producer movie.produced_by
O O O B_char I_char movie.starring.character
O O O O O O B_movie I_movie actor.gender
O O O O O O O O B_movie I_movie I_movie actor.gender
O O O O O O O O O O B_movie actor.gender
O O O O O B_movie actor.gender
O O O O O B_release_year B_genre O movie.initial_release_date
O B_language O
B_genre B_mpaa_rating O movie.genre
B_language O O
O O O O O O gr.amount
O O O B_director I_director O O O O O B_movie movie.directed_by
B_country O movie.language
O O B_language O
O O O O O O gr.amount
O B_director I_director O O O B_mpaa_rating movie.directed_by
O B_mpaa_rating O O O O movie.initial_release_date


0.5560584958217271

In [206]:
data['x'] = data.apply(lambda x: 1 if len(x['ner_pred'].split()) != len(x['IOB Slot tags'].split()) else 0, axis=1)

In [207]:
data[data.x==1]

Unnamed: 0,utterances,IOB Slot tags,Core Relations,ner_pred,rel_pred,x


In [213]:
data[data['IOB Slot tags'] == 'O B_language O']

Unnamed: 0,utterances,IOB Slot tags,Core Relations,ner_pred,rel_pred,x
221,find english movies,O B_language O,movie.language,O B_language O,movie.country movie.language,0


In [15]:
data['x'] = data.apply(lambda x: 1 if len(x['ner_pred'].split()) == len(x['IOB Slot tags'].split()) else 0, axis=1)

In [16]:
data[data.x == 0]


Unnamed: 0,utterances,IOB Slot tags,Core Relations,ner_pred,rel_pred,x
