In [1]:
!pip install transformers==4.28.0 pytorch-crf --quiet

In [2]:
# Root folders used by the rest of the notebook. Both end in '/' because later cells
# build file paths from them by plain string concatenation.
# NOTE(review): Kaggle usually mounts datasets under lowercase '/kaggle/input/' — confirm
# that this capitalised path exists in the target environment.
input_path='/kaggle/Input/'
output_path='/kaggle/working/'

In [3]:
import pandas as pd
import numpy as np
import re
import pickle
import torch

In [4]:
# BIO tag inventory. A predicted class index is turned back into its tag string by
# indexing this list, so the order must match the label ids the models were trained with.
# NOTE(review): TSK / MTD / DST presumably stand for task / method / dataset — confirm.
label_list= ['O', 'B-TSK','I-TSK','B-MTD','I-MTD','B-DST','I-DST']

In [9]:
# Registry of the available taggers. "name" has six entries but "checkpoint" only three:
# name i is paired with checkpoint i % 3, so every "-based" name reuses the checkpoint of
# the plain name three positions before it.
MODELS={"name":["SciBERT","SciNCL","SPECTER","SciBERT-based","SciNCL-based","SPECTER-based"],
       "checkpoint":["allenai/scibert_scivocab_uncased","malteos/scincl","allenai/specter2_base"]}
# Index into MODELS["name"] selecting the model to run. Later cells load indices below 3
# with AutoModelForTokenClassification and the others as BiLSTMAttentionCRF.
# NOTE(review): `id` shadows the Python builtin of the same name for the whole notebook.
id=5
# Matching index into MODELS["checkpoint"].
ID=id%3

In [None]:
# Folder holding the saved artefacts of the selected model; later cells read
# training_args.bin, config.json and pytorch_model.bin from it.
MODEL_PATH = input_path+"Models/"+MODELS["name"][id]

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the backbone checkpoint paired with the selected model name.
# NOTE(review): add_prefix_space is normally an option of byte-level BPE tokenizers —
# confirm it has any effect for these checkpoints.
tokenizer = AutoTokenizer.from_pretrained(MODELS["checkpoint"][ID], add_prefix_space=True)

In [11]:
from transformers import TrainingArguments
# Restore the argument object saved next to the model; it is later handed to the Trainer
# used for prediction. torch.load unpickles the file, so only load it from a trusted source.
args=torch.load(MODEL_PATH+"/training_args.bin")

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [12]:
import torch.nn as nn
from torchcrf import CRF
import torch.nn.functional as F
from transformers import PretrainedConfig

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        input_dim: Size of the last dimension of the input. The query, key and
            value projections each map ``input_dim -> input_dim``.
    """

    def __init__(self, input_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        # Normalise the scores over the last axis (one distribution per query position).
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        """Re-weight every position of ``x`` by its attention over all positions.

        Args:
            x: 3-D tensor whose last dimension is ``input_dim`` (``torch.bmm``
                requires exactly three dimensions).

        Returns:
            Tensor with the same shape as ``x``.

        Note:
            No padding mask is applied, so every position attends to every other one.
        """
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        # Scale by sqrt(input_dim) before the softmax.
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (self.input_dim ** 0.5)
        attention = self.softmax(scores)
        # The previous version also multiplied the raw scores by the values and
        # discarded the result; that dead matmul has been removed.
        return torch.matmul(attention, values)
    
class BiLSTMAttentionCRF(nn.Module):
    """Sequence tagger: encoder -> BiLSTM -> self-attention -> linear layer -> CRF.

    Args:
        config: Object exposing ``hidden_size``, ``num_layers``, ``dropout_prob``
            and ``num_labels``.
        bert_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask``, must return an object with ``last_hidden_state``,
            and must expose ``config.hidden_size``.
    """

    def __init__(self, config, bert_model):
        super().__init__()
        self.config = config
        self.bert = bert_model
        self.bilstm = nn.LSTM(input_size=self.bert.config.hidden_size,
                              hidden_size=self.config.hidden_size,
                              num_layers=self.config.num_layers,
                              dropout=self.config.dropout_prob,
                              bidirectional=True, batch_first=True)
        # The BiLSTM concatenates both directions, hence hidden_size * 2.
        self.fc = nn.Linear(in_features=self.config.hidden_size * 2, out_features=self.config.num_labels)
        self.crf = CRF(num_tags=self.config.num_labels, batch_first=True)
        # `dropout` and `linear` are not used in forward(); they are kept so the module's
        # attributes and state_dict keys stay the same as before.
        self.dropout = nn.Dropout(self.config.dropout_prob)
        self.linear = nn.Linear(self.bert.config.hidden_size, self.config.num_labels)
        self.attention = SelfAttention(self.config.hidden_size* 2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute emissions and either the CRF loss or the decoded tag sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Mask passed to the encoder and, as a byte tensor, to the CRF.
            labels: Optional gold tag ids. Entries equal to -100 are scored as tag 0;
                the tensor passed in is left unmodified.

        Returns:
            ``{"loss": ..., "logits": emissions}`` when ``labels`` is given, otherwise
            ``{"labels": tensor}`` holding the decoded tag ids.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        lstm_out, _ = self.bilstm(hidden_states)
        lstm_out = self.attention(lstm_out)
        emissions = self.fc(lstm_out)

        if labels is None:
            # Decode only when it is actually returned. No mask is passed, so every
            # sequence decodes to the same length and the result can be stacked.
            return {"labels": torch.tensor(self.crf.decode(emissions))}

        # The CRF cannot index tag -100, so the ignore index is mapped to tag 0.
        # This is done on a copy: writing into `labels` in place would also rewrite the
        # caller's tensor and erase the -100 markers it may still rely on.
        crf_tags = labels.masked_fill(labels == -100, 0)
        # Scored on the per-position log-softmax of the emissions, exactly as before.
        log_probs = torch.log_softmax(emissions, dim=2)
        loss = -self.crf(log_probs, crf_tags, attention_mask.byte(), reduction='mean')
        return {"loss": loss, "logits": emissions}

In [13]:
# Fallback values bound as the defaults of CustomConfig.__init__ below; the real values
# are supplied as keyword arguments when the config is built.
num_labels = len(label_list)
num_layers=0
hidden_size = 0
dropout_prob = 0
class CustomConfig(PretrainedConfig):
    """Configuration carrying the hyper-parameters of the BiLSTM/CRF head.

    Args:
        num_labels: Number of output tags.
        num_layers: Number of stacked LSTM layers.
        hidden_size: Hidden size of the LSTM (per direction).
        dropout_prob: Dropout probability.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """
    def __init__(
        self,
        num_labels=num_labels,
        num_layers=num_layers,
        hidden_size=hidden_size,
        dropout_prob=dropout_prob,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Assigned after the base initialiser, in this fixed order.
        own_fields = (
            ("num_labels", num_labels),
            ("num_layers", num_layers),
            ("hidden_size", hidden_size),
            ("dropout_prob", dropout_prob),
        )
        for field_name, field_value in own_fields:
            setattr(self, field_name, field_value)

In [None]:
from transformers import AutoModelForTokenClassification
from transformers import AutoModel
# Indices below 3 are loaded directly from MODEL_PATH as token classifiers. The others
# are rebuilt as a BiLSTMAttentionCRF around the backbone checkpoint and then filled
# with the saved weights.
if id<3:
    model=AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
else:
    # NOTE(review): -1 is presumably the "not distributed" value of local_rank — confirm.
    args.local_rank=-1
    import json
    # Read and parse the JSON data from the file
    with open(MODEL_PATH+"/config.json", "r") as file:
        config_data = json.load(file)
    config = CustomConfig(**config_data)
    bert_model=AutoModel.from_pretrained(MODELS["checkpoint"][ID])
    model = BiLSTMAttentionCRF(config=config, bert_model=bert_model)
    # map_location="cpu" lets weights that were saved from a GPU load on a machine
    # without one. The freshly built model lives on the CPU at this point anyway.
    # torch.load unpickles the file, so only load weights from a trusted source.
    model.load_state_dict(torch.load(MODEL_PATH+"/pytorch_model.bin", map_location="cpu"))

In [15]:
# Pick the compute device: the plain string 'cpu' when no GPU is visible, otherwise a
# CUDA torch.device (whose name is printed for the record).
if not torch.cuda.is_available():
    device ='cpu'
else:
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(device))

Tesla P100-PCIE-16GB


In [16]:
from transformers import DataCollatorForTokenClassification, Trainer
# Trainer used only for prediction: it wraps the loaded model and the restored
# arguments, and the collator built here is passed to it for batching.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
pred_trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [17]:
# Load the sentence-split corpus and expose the row index as an explicit 'ID' column,
# which later cells use to select and track documents.
df_acl = (
    pd.read_parquet(input_path+'Data/target_sentences_full.parquet')
    .assign(ID=lambda frame: list(frame.index))
)
df_acl

Unnamed: 0,url,year,month,text,sentences,ID
0,https://aclanthology.org/O02-2002,2002,August,A Study on Word Similarity using Context Vecto...,[A Study on Word Similarity using Context Vect...,0
1,https://aclanthology.org/R13-1042,2013,September,"Headerless, Quoteless, but not Hopeless? Using...","[Headerless, Quoteless, but not Hopeless?, Usi...",1
2,https://aclanthology.org/W05-0819,2005,June,Aligning Words in English-Hindi Parallel Corpo...,[Aligning Words in English-Hindi Parallel Corp...,2
3,https://aclanthology.org/R13-1044,2013,September,Recognizing semantic relations within Polish n...,[Recognizing semantic relations within Polish ...,3
4,https://aclanthology.org/W05-0818,2005,June,LIHLA: Shared Task System Description\nIn this...,[LIHLA: Shared Task System Description\nIn thi...,4
...,...,...,...,...,...,...
59198,https://aclanthology.org/P99-1002,1999,June,Automatic Speech Recognition and Its Applicati...,[Automatic Speech Recognition and Its Applicat...,59198
59199,https://aclanthology.org/P00-1009,2000,October,An Improved Parser for Data-Oriented Lexical-F...,[An Improved Parser for Data-Oriented Lexical-...,59199
59200,https://aclanthology.org/P99-1056,1999,June,The grapho-phonological system of written Fren...,[The grapho-phonological system of written Fre...,59200
59201,https://aclanthology.org/P99-1051,1999,June,Acquiring Lexical Generalizations from Corpora...,[Acquiring Lexical Generalizations from Corpor...,59201


In [18]:
import os
import random

# Draw 100 document IDs with a dedicated, seeded generator so that a fresh
# "Restart & Run All" reproduces the same sample without touching the global RNG state.
SAMPLE_SEED = 42
sampled_ids = random.Random(SAMPLE_SEED).sample(list(df_acl.ID.unique()), 100)
# The 'Data' sub-folder is not guaranteed to exist under the output folder.
os.makedirs(output_path+"Data", exist_ok=True)
with open(output_path+"Data/100_random_ids.pkl", 'wb') as file:
    pickle.dump(sampled_ids, file)

In [19]:
import pickle
# Replace sampled_ids with the ID list stored under the input folder. This is a different
# location from output_path, where the sampling cell writes its pickle.
# NOTE(review): pickle.load can execute arbitrary code — only use it on a trusted file.
with open(input_path+"Data/100_random_ids.pkl", "rb") as file:
    sampled_ids=pickle.load(file)

In [20]:
# Keep only the documents whose ID is in the sampled list, then display the result.
df_acl = df_acl.loc[df_acl['ID'].isin(sampled_ids)]
df_acl

Unnamed: 0,url,year,month,text,sentences,ID
54,https://aclanthology.org/R13-1041,2013,September,A Boosting-based Algorithm for Classification ...,[A Boosting-based Algorithm for Classification...,54
178,https://aclanthology.org/W13-1204,2013,June,Event representation across genre\nThis paper ...,[Event representation across genre\nThis paper...,178
1179,https://aclanthology.org/W03-2912,2003,April,Russian Morphology: Ressources and Java Softwa...,[Russian Morphology: Ressources and Java Softw...,1179
2157,https://aclanthology.org/2020.emnlp-main.671,2020,November,Towards More Accurate Uncertainty Estimation I...,[Towards More Accurate Uncertainty Estimation ...,2157
2943,https://aclanthology.org/W19-6738,2019,August,When less is more in Neural Quality Estimation...,[When less is more in Neural Quality Estimatio...,2943
...,...,...,...,...,...,...
58024,https://aclanthology.org/S12-1107,2012,7-8 June,DirRelCond3: Detecting Textual Entailment Acro...,[DirRelCond3: Detecting Textual Entailment Acr...,58024
58301,https://aclanthology.org/W12-0108,2012,April,PRESEMT: Pattern Recognition-based Statistical...,[PRESEMT: Pattern Recognition-based Statistica...,58301
58730,https://aclanthology.org/K18-2013,2018,October,Turku Neural Parser Pipeline: An End-to-End Sy...,[Turku Neural Parser Pipeline: An End-to-End S...,58730
58827,https://aclanthology.org/2020.nlposs-1.11,2020,November,KLPT -- Kurdish Language Processing Toolkit\nD...,[KLPT -- Kurdish Language Processing Toolkit\n...,58827


In [21]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread word-level labels over the sub-tokens.

    Args:
        examples: Batch mapping with 'tokens' (one list of words per sentence) and
            'dummy_ner_tags' (one list of word-level label ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. The first sub-token of
        each word carries that word's label; special tokens and all following
        sub-tokens of a word carry -100.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True,max_length=512)
    aligned = []
    for row, word_labels in enumerate(examples['dummy_ner_tags']):
        # word id of every sub-token; None marks a special token
        word_ids = encoded.word_ids(batch_index=row)
        # the same sequence shifted right by one, i.e. the preceding sub-token's word id
        preceding = [None] + word_ids[:-1]
        aligned.append([
            word_labels[current] if current is not None and current != before else -100
            for current, before in zip(word_ids, preceding)
        ])

    encoded["labels"] = aligned
    return encoded

In [22]:
def data_extract(tuple_list):
    """Return the entries of ``tuple_list`` whose second element is not 'O'.

    Args:
        tuple_list: Iterable of pairs; the element at index 1 is compared with 'O'.

    Returns:
        A new list with the kept entries in their original order.
    """
    return [pair for pair in tuple_list if pair[1] != 'O']

In [None]:
import math
from datasets import Dataset
# Number of documents processed (and written to one result file) per iteration.
window=1000
for i in range(0,math.ceil(len(df_acl)/window)):
    start=i*window
    end=min((i+1)*window,len(df_acl))
    RANGE = str(start)+"-"+str(end)
    print(RANGE)
    # change the dataset selection range as needed
    # One row per sentence; 'id' numbers the sentences, 'ID' still identifies the document.
    data=df_acl[start:end].explode(['sentences']).reset_index(drop=True)
    data['id'] = [i for i in data.index]
    data['tokens'] = data['sentences'].apply(lambda x: x.split())
    # Placeholder labels (all 0): only their alignment to sub-tokens is used below.
    data['dummy_ner_tags'] = data['tokens'].apply(lambda x: [0 for tok in x])
    dataset = Dataset.from_pandas(data[['id','ID','tokens','dummy_ner_tags']])
    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
    # Extract the predictions
    # 1000 docs take about 45min
    predictions, labels, _ = pred_trainer.predict(tokenized_dataset)
    if id<3:
        predictions = np.argmax(predictions, axis=2)
    else:
        predictions=model.crf.decode(torch.tensor(predictions).to(device))
    data['predictions'] = list(predictions)
    # Remove ignored index (special tokens)
    # Filter with the dataset's own alignment labels rather than the label_ids returned
    # by predict(): those pass through the model's forward, which may rewrite the -100
    # ignore index before the Trainer hands them back. The dataset labels are unpadded,
    # so zip() also drops the padding positions of each prediction row.
    alignment_labels = tokenized_dataset['labels']
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, alignment_labels)
    ]
    data['true_predictions'] = true_predictions
    # Pair every word with its predicted tag, then keep only the non-'O' pairs.
    data['check_pred'] = list(list(zip(a,b)) for a,b in zip(data['tokens'], data['true_predictions']))
    data['data_tuples'] = data['check_pred'].apply(data_extract)
    # Drop sentences in which nothing was tagged.
    data=data[data['data_tuples'].str.len() != 0]
    data[['id','ID','true_predictions','data_tuples']].to_parquet(output_path+'Results_'+MODELS["name"][id]+'_'+str(RANGE)+'.parquet')
    print("done")