In [21]:
#!pip install -U torch
#!pip install -U datasets
#!pip install -U transformers
#!pip install spacy 
#!pip install accelerate -U

In [3]:
import os, re, math, random, json, string
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en import English
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import TrainerCallback, AdamW, get_cosine_schedule_with_warmup
from transformers import DataCollatorForTokenClassification, PreTrainedModel, RobertaTokenizerFast
from datasets import load_dataset, ClassLabel, Sequence, load_metric

from accelerate import Accelerator
# Instantiate an Accelerator with default settings.
# NOTE(review): none of the visible cells use this object, and the name is rebound to an
# empty tuple in a later cell — confirm whether it is still needed.
accelerator=Accelerator()

In [4]:
# Ask the user for the path of the text file to run inference on.
# NOTE(review): input() blocks a non-interactive "Restart & Run All"; consider a
# configuration constant instead.
test_dir=input("Enter location of text file for testing")

Enter location of text file for testing BLUEFLYINC_03_27_2002-EX-10.27-e-business Hosting Agreement.txt


In [5]:
def preprocess_text(text):
    """Normalise raw document text before tokenisation.

    Steps, in order: drop the literal two-character sequence "/n", turn
    non-breaking spaces and form feeds into plain spaces, collapse " . " to ".",
    turn underscores and runs of two or more dashes into a space, collapse runs
    of asterisks to a single "*", collapse runs of spaces to one space, and
    strip leading/trailing whitespace.

    Args:
        text: The raw text as a str.

    Returns:
        The cleaned text as a str. Newline characters are left in place.
    """
    # NOTE(review): this removes the characters "/" + "n", not newline characters.
    # Kept unchanged on purpose; confirm whether stripping newlines was intended.
    text = text.replace('/n', '')
    text = text.replace("\xa0", " ")
    text = text.replace("\x0c", " ")

    # Raw-string patterns avoid the invalid-escape warnings that "\ " and "\*"
    # trigger in ordinary string literals; they match exactly the same text.
    text = re.sub(r" \. ", ".", text)   # " . " -> "."
    text = re.sub(r"_", " ", text)      # Get rid of underscores
    text = re.sub(r"--+", " ", text)    # Get rid of multiple dashes
    text = re.sub(r"\*+", "*", text)    # Get rid of multiple stars
    text = re.sub(r" +", " ", text)     # Get rid of multiple spaces

    return text.strip()  # Strip leading and trailing whitespace

In [6]:
# Use the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
def text_data(test_dir, print_text=False, clean_text=True, max_len=3000):
    """Read one text file and return its full and truncated contents.

    Args:
        test_dir: Path of the text file to read (a file path, despite the name).
        print_text: Accepted for backward compatibility; currently has no effect.
        clean_text: When True the raw text is passed through preprocess_text;
            when False the raw text is used unchanged.
        max_len: Maximum number of characters kept in the truncated text.

    Returns:
        A one-element list ``[[full_text, short_text, len_text]]`` where
        ``short_text`` is the first ``max_len`` characters of ``full_text`` and
        ``len_text`` is ``len(short_text)``.
    """
    # Context manager guarantees the file handle is closed even if read() fails.
    with open(test_dir) as handle:
        raw_text = handle.read()

    # Fall back to the raw text instead of an empty string when cleaning is off.
    full_text = preprocess_text(raw_text) if clean_text else raw_text
    short_text = full_text[:max_len]

    return [[full_text, short_text, len(short_text)]]

In [8]:
#test_dir = TEST_FILE_PATH
# Read and clean the file; only the first 1000 characters form the short text.
data = text_data(test_dir, clean_text=True, max_len=1000)

# One row per document: full text, truncated text, truncated length, then the source path.
columns = ['Full_Text', 'Short_Text', 'Length_Of_Short_Text']
text_df = pd.DataFrame(data, columns=columns).assign(file_name=test_dir)

In [9]:
# Plain-text dump of the frame, followed by the complete Full_Text of row 0 as the cell output.
print(text_df)
text_df['Full_Text'][0]

                                           Full_Text  \
0  Exhibit 10.27\n\n e-business Hosting Agreement...   

                                          Short_Text  Length_Of_Short_Text  \
0  Exhibit 10.27\n\n e-business Hosting Agreement...                  1000   

                                           file_name  
0  BLUEFLYINC_03_27_2002-EX-10.27-e-business Host...  


'Exhibit 10.27\n\n e-business Hosting Agreement\n\n between\n\n Bluefly, Inc.\n\n and\n\n International Business Machines Corporation\n\n 1\n\n e-business Hosting Agreement\n\nUnder this e-business Hosting Agreement ("Agreement") between International Business Machines Corporation ("IBM") and Bluefly, Inc. ("Customer"), IBM will provide Web hosting and related services ("Services") to Customer. The Agreement includes these terms and conditions and the documents referenced herein ("Base Terms"), e-business hosting services order forms accepted by IBM ("Order Forms"), and the following attachments:\n\na. Attachment A: Facilities Services;\n\nb. Service Option Attachment for Facilities Services; and\n\nc. all other applicable attachments referenced in the Order Forms for Services options selected by Customer ("Service Option Attachments").\n\nIn the event of a conflict between the Base Terms and an attachment, the Base Terms will govern, except where an attachment or a provision contained

In [10]:
# Tokenise the truncated text with spaCy's blank English pipeline
nlp = English()
text_df['tokens'] = text_df['Short_Text'].apply(lambda passage: nlp(passage))

# Plain-string token list per document, plus an all-zero placeholder tag per token
text_df['split_tokens'] = text_df['tokens'].apply(lambda doc: [token.text for token in doc])
text_df['dummy_ner_tags'] = text_df['tokens'].apply(lambda doc: [0] * len(doc))

# Serialise only the two columns the dataset loader needs, then drop the placeholder column
export_columns = ['split_tokens', 'dummy_ner_tags']
export_df = text_df[export_columns]
export_df.to_json('test1.json', orient="table", index=False)
text_df = text_df.drop(columns=['dummy_ner_tags'])

# Read the JSON back as a dataset; the records sit under the "data" field of the file
datasets = load_dataset('json', data_files='test1.json', field='data')
print(datasets)
text_df

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dummy_ner_tags', 'split_tokens'],
        num_rows: 1
    })
})


Unnamed: 0,Full_Text,Short_Text,Length_Of_Short_Text,file_name,tokens,split_tokens
0,Exhibit 10.27\n\n e-business Hosting Agreement...,Exhibit 10.27\n\n e-business Hosting Agreement...,1000,BLUEFLYINC_03_27_2002-EX-10.27-e-business Host...,"(Exhibit, 10.27, \n\n , e, -, business, Hostin...","[Exhibit, 10.27, \n\n , e, -, business, Hostin..."


In [11]:
# Load the tag names; a tag's position in the list is the index used to look it up later.
with open('feature_class_labels.json', 'r') as label_file:
    label_list = json.load(label_file)

# Show each index next to its tag name
for n, tag in enumerate(label_list):
    print(n, tag)

0 B-AGMT_DATE
1 B-DOC_NAME
2 B-PARTY
3 I-AGMT_DATE
4 I-DOC_NAME
5 I-PARTY
6 O


In [12]:
# Fast RoBERTa tokenizer for "roberta-base" with add_prefix_space=True; it is later called on
# pre-split word lists (is_split_into_words=True).
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base",add_prefix_space=True)

In [13]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenise pre-split words and align the word-level tags to sub-word tokens.

    Args:
        examples: Batch mapping with "split_tokens" (lists of words) and
            "dummy_ner_tags" (one tag id per word).
        label_all_tokens: When truthy, every sub-token of a word carries the
            word's tag; otherwise only the first sub-token does and the rest
            get -100.

    Returns:
        The tokenizer output with an added "labels" entry. Positions whose word
        id is None are labelled -100.
    """
    encoded = tokenizer(examples["split_tokens"],
                        truncation=True,
                        is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["dummy_ner_tags"]):
        row_labels = []
        prev_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # No source word for this position: use the ignore value.
                row_labels.append(-100)
            elif word != prev_word or label_all_tokens:
                # First sub-token of a word, or every sub-token when requested.
                row_labels.append(word_labels[word])
            else:
                # Continuation sub-token that should be ignored.
                row_labels.append(-100)
            prev_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [14]:
# Run tokenize_and_align_labels over the dataset in batches (label_all_tokens keeps its default False).
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [15]:
# Rebind `accelerator` to an empty tuple, dropping the reference it held before.
accelerator = ()

In [16]:
# Restore the token-classification model from "model_saved" and move it to the chosen device
loaded_model = AutoModelForTokenClassification.from_pretrained("model_saved")
loaded_model.to(device)

# Collator for token-classification batches, built from the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer settings: output directory, batch sizes and seed
args = TrainingArguments(
    output_dir="test_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    seed=37,
)

In [17]:
# Trainer wrapper around the loaded model; no datasets are attached here.
pred_trainer = Trainer(
    model=loaded_model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [18]:
# Run the model over the tokenised split; the third returned element is discarded.
predictions, labels, _ = pred_trainer.predict(tokenized_datasets["train"])
# Highest-scoring class index along the last axis
predictions = np.argmax(predictions, axis=2)
text_df['predictions'] = list(predictions)

# Keep only positions whose label is not the ignore value (-100) and map ids to tag names
true_predictions = [
    [label_list[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
text_df['true_predictions'] = true_predictions

def data_extract(tuple_list):
    """Keep only the tuples whose tag is not 'O'.

    Args:
        tuple_list: Iterable of (token, tag) tuples.

    Returns:
        A new list with the tuples whose second element differs from 'O',
        in their original order.
    """
    # A stray statement pasted after the `if` made the original a syntax error;
    # the filter itself is unchanged.
    return [tup for tup in tuple_list if tup[1] != 'O']

# Pair every word with its predicted tag, row by row
text_df['check_pred'] = [
    list(zip(words, tags))
    for words, tags in zip(text_df['split_tokens'], text_df['true_predictions'])
]
# Keep only the pairs that carry an entity tag
text_df['data_tuples'] = text_df['check_pred'].apply(data_extract)

text_df[['file_name', 'true_predictions']].head()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Unnamed: 0,file_name,true_predictions
0,BLUEFLYINC_03_27_2002-EX-10.27-e-business Host...,"[O, O, O, O, O, O, O, O, O, O, O, B-PARTY, I-P..."


In [19]:

def extract_agreement_date(tuple_list):
    """Return the first agreement-date span found in the tagged tuples.

    The span starts at the first "B-AGMT_DATE" (or a leading "I-AGMT_DATE"
    with no preceding "B-") and continues over the immediately following
    "I-AGMT_DATE" tuples. Tokens are joined with single spaces.

    Args:
        tuple_list: Sequence of (token, tag) tuples.

    Returns:
        The joined date string, or None when no date tag is present.
    """
    date_tokens = []
    for d in tuple_list:
        tag = d[1]
        if tag == "B-AGMT_DATE":
            if date_tokens:
                # A second date starts here; keep only the first one.
                break
            date_tokens.append(d[0])
        elif tag == "I-AGMT_DATE":
            # Continuation token (also tolerated as the start of a span).
            date_tokens.append(d[0])
        elif date_tokens:
            # Any other tag ends the span that is being collected.
            break

    # Returning after the loop is the fix: the early return inside the loop
    # used to cut the date down to its first token.
    return " ".join(date_tokens) if date_tokens else None

# Derive the agreement date of each row from its entity-tagged (token, tag) tuples.
text_df['agmt_date'] = text_df['data_tuples'].apply(extract_agreement_date)

def extract_agreement_name(tuple_list):
    """Return the document name assembled from the tagged tuples.

    Every "B-DOC_NAME" restarts the name, and every "I-DOC_NAME" appends its
    token with a single space, so the last "B-DOC_NAME" span wins. An
    "I-DOC_NAME" with no preceding "B-DOC_NAME" starts the name itself.

    Args:
        tuple_list: Iterable of (token, tag) tuples.

    Returns:
        The joined name string, or None when no document-name tag is present.
    """
    # Initialising here avoids an UnboundLocalError when no name tag occurs.
    temp_name = None
    for n in tuple_list:
        if n[1] == "B-DOC_NAME":
            temp_name = n[0]
        elif n[1] == "I-DOC_NAME":
            temp_name = n[0] if temp_name is None else temp_name + " " + n[0]
    return temp_name

# Derive the agreement name of each row from its entity-tagged (token, tag) tuples.
text_df['agmt_name'] = text_df['data_tuples'].apply(extract_agreement_name)

def extract_agreement_parties(tuple_list):
    """Collect the distinct party names found in the tagged tuples.

    A "B-PARTY" token starts a name and each "I-PARTY" token is appended with a
    single space. The name built so far is recorded whenever the next tuple is
    not "I-PARTY" or the list ends. An "I-PARTY" with no earlier "B-PARTY"
    starts a name itself.

    Args:
        tuple_list: Sequence of (token, tag) tuples.

    Returns:
        A list of party names without duplicates, in order of first appearance.
    """
    parties = []
    # Initialising here avoids an UnboundLocalError on a leading "I-PARTY".
    temp_party = None
    last_index = len(tuple_list) - 1

    for i, p in enumerate(tuple_list):
        tag = p[1]
        if tag == "B-PARTY":
            temp_party = p[0]
        elif tag == "I-PARTY":
            temp_party = p[0] if temp_party is None else temp_party + " " + p[0]
        else:
            continue

        # Record the name once the span cannot be extended by the next tuple.
        if i == last_index or tuple_list[i + 1][1] != "I-PARTY":
            parties.append(temp_party)

    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(parties))

# Derive the list of distinct party names of each row from its entity-tagged tuples.
text_df['agmt_parties'] = text_df['data_tuples'].apply(extract_agreement_parties)

# Create a dataframe with just the information we want to keep: the file name, the
# extracted agreement name, date and parties, and the full text. `.copy()` makes it
# independent of text_df. Note this rebinds the export_df name used in an earlier cell.
export_df = text_df[['file_name', 'agmt_name', 'agmt_date', 'agmt_parties', 'Full_Text']].copy()

# Let's have a look
export_df

Unnamed: 0,file_name,agmt_name,agmt_date,agmt_parties,Full_Text
0,BLUEFLYINC_03_27_2002-EX-10.27-e-business Host...,e - business Hosting Agreement,,"[Bluefly , Inc., International Business Machin...",Exhibit 10.27\n\n e-business Hosting Agreement...


In [20]:
# NOTE(review): this shell command only creates (or updates the timestamp of) a file named
# "export_df" in the working directory; it does not write the export_df dataframe — confirm intent.
!touch export_df

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
