In [70]:
import pandas as pd

# Read the train and test CSVs from their fixed locations on disk.
train_csv_path = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\data\train_data.csv"
test_csv_path = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\data\test_data.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Stack both tables row-wise into a single frame with a fresh 0..n-1 index.
combined = pd.concat([train, test], axis=0, ignore_index=True)

In [71]:
def get_excerpts_by_source(df, source):
    """Return the 'text' values of every row whose 'source' equals `source`.

    Args:
        df: DataFrame with at least 'source' and 'text' columns.
        source: Value compared for equality against the 'source' column.

    Returns:
        list: The matching 'text' entries in row order; empty when no row
        matches.
    """
    matches_source = df['source'] == source
    return df.loc[matches_source, 'text'].tolist()

In [72]:
import os

# List every entry (of any extension) in the contract text directory.
contract_txt_dir = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\CUAD_v1\full_contract_txt"
txt_files = os.listdir(contract_txt_dir)

In [73]:
# Keep only the entries whose final dot-separated component is exactly 'txt'.
txt_files = [name for name in txt_files if name.rsplit('.', 1)[-1] == 'txt']

In [74]:
import re
from tqdm import tqdm
import numpy as np
from difflib import SequenceMatcher

def word_order_distance(w, excerpt):
    """Score how closely two texts agree as ordered sequences of words.

    Both strings are split on whitespace and the resulting word lists are
    compared with difflib.SequenceMatcher. Despite the name, the value is a
    similarity ratio, not a distance: 1.0 for identical word sequences and
    0.0 when no word matches.

    Args:
        w: First text (a candidate window of the document).
        excerpt: Second text to compare against.

    Returns:
        float: SequenceMatcher.ratio() of the two word lists, in [0, 1].
    """
    return SequenceMatcher(None, w.split(), excerpt.split()).ratio()

def excerpt_splitter(text, characters_to_split_on=['\n\n','\n','.']):
    """Cut `text` into candidate excerpts of more than 10 words.

    The text is split independently on each separator that occurs in it and
    the pieces from all of those splits are accumulated in separator order.
    The same passage can therefore appear more than once, at different
    granularities. Separators that do not occur in the text contribute
    nothing, so a text containing none of them yields an empty list.

    Args:
        text: The string to split.
        characters_to_split_on: Separators to try, in order. The default list
            is never mutated here.

    Returns:
        list[str]: Pieces with more than 10 whitespace-separated words.
    """
    pieces = []
    for separator in characters_to_split_on:
        if separator not in text:
            continue
        pieces.extend(text.split(separator))
    # More than 10 words already implies a non-empty piece longer than 10
    # characters, so the word-count check is the only filter needed.
    return [piece for piece in pieces if len(piece.split()) > 10]


# Build the importance dataset: for every contract text file, the known
# important excerpts become positive rows, and whatever remains of the
# document after removing their best-matching passages is split into negative
# rows.

# Input directory of contract text files and output location of the dataset.
CONTRACT_TXT_DIR = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\CUAD_v1\full_contract_txt"
IMPORTANCE_DATASET_CSV = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\data\importance_dataset.csv"

# Minimum word-sequence similarity for a document window to be treated as the
# excerpt and removed from the document.
MATCH_THRESHOLD = 0.75

# Compiled once up front; they do not depend on the file being processed.
# ' (Page n)' and ' (Pages n-n)' markers, n being 1 to 3 digits.
PAGE_REF_PATTERN = re.compile(r' \(Page \d{1,3}\)')
PAGE_RANGE_PATTERN = re.compile(r' \(Pages \d{1,3}-\d{1,3}\)')
# Whole 'Source: ...' lines, including their trailing newline.
SOURCE_LINE_PATTERN = re.compile(r'Source: .*\n')


def _clean_excerpt(excerpt):
    """Strip page markers and '<omitted> ' tags and flatten newlines to spaces."""
    excerpt = PAGE_REF_PATTERN.sub('', excerpt)
    excerpt = PAGE_RANGE_PATTERN.sub('', excerpt)
    return excerpt.replace('<omitted> ', '').replace('\n\n', ' ').replace('\n', ' ')


def _candidate_windows(doc_words, min_chars):
    """Return one window per start word: the shortest run of consecutive words
    whose space-joined length reaches `min_chars`, or the run to the end of
    the document when that length is never reached."""
    windows = []
    n_words = len(doc_words)
    for start in range(n_words):
        end = start + 1
        # Running length of ' '.join(doc_words[start:end]), kept incrementally
        # so the slice is not re-joined on every growth step.
        joined_len = len(doc_words[start])
        while joined_len < min_chars and end < n_words:
            joined_len += 1 + len(doc_words[end])
            end += 1
        windows.append(' '.join(doc_words[start:end]))
    return windows


# Per-file frames are collected here and concatenated once after the loop,
# rather than re-copying the growing dataset on every iteration.
per_file_frames = []
for file in tqdm(txt_files):

    file_pdf = file.replace('.txt', '.pdf')
    important_excerpts = [
        _clean_excerpt(x) for x in get_excerpts_by_source(combined, file_pdf)
    ]

    # NOTE(review): open() uses the platform default encoding here — confirm
    # it matches the encoding of the contract files.
    with open(os.path.join(CONTRACT_TXT_DIR, file), 'r') as f:
        txt = f.read()

    for excerpt in important_excerpts:
        # Re-split on every pass: txt shrinks whenever a match is removed.
        windows = _candidate_windows(txt.split(' '), len(excerpt))
        scores = [word_order_distance(window, excerpt) for window in windows]
        best = int(np.argmax(scores))
        if scores[best] >= MATCH_THRESHOLD:
            # Removes every occurrence of the best-matching window text.
            txt = txt.replace(windows[best], '')

    txt = SOURCE_LINE_PATTERN.sub('', txt)

    excerpts = excerpt_splitter(txt)

    negative_data = {'text': excerpts, 'source': [file_pdf]*len(excerpts), 'label':'not_important'}
    positive_data = {'text': important_excerpts, 'source': [file_pdf]*len(important_excerpts), 'label':'important'}

    negative_df = pd.DataFrame(negative_data)
    positive_df = pd.DataFrame(positive_data)

    # Within each file the negatives come first, then the positives.
    per_file_frames.append(pd.concat([negative_df, positive_df], axis=0))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were processed.
if per_file_frames:
    dataset = pd.concat(per_file_frames, axis=0).reset_index(drop=True)
else:
    dataset = pd.DataFrame()

dataset.to_csv(IMPORTANCE_DATASET_CSV, index=False)

100%|██████████| 510/510 [7:03:10<00:00, 49.79s/it]     


In [77]:
# Number of rows per label in the dataset built above.
label_counts = dataset['label'].value_counts()
label_counts

label
not_important    180435
important          3440
Name: count, dtype: int64

In [78]:
from sklearn.model_selection import train_test_split

# Read the saved importance dataset back from its CSV.
importance_dataset_path = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\data\importance_dataset.csv"
df = pd.read_csv(importance_dataset_path)

# Absolute (non-normalised) number of rows per label.
df['label'].value_counts(normalize=False)

label
not_important    180435
important          3440
Name: count, dtype: int64

In [79]:
# Balance the two classes by downsampling the 'not_important' rows to the
# number of 'important' rows.
not_important = df[df['label']=='not_important']
important = df[df['label']=='important']

# Seeded so the same negatives are drawn on every run; without a seed the
# balanced dataset (and the train/test files derived from it) would change
# each time this cell is executed. 42 matches the seed used for the split.
not_important = not_important.sample(n=important.shape[0], random_state=42)

df = pd.concat([not_important, important], axis=0).reset_index(drop=True)

# Total number of rows after balancing.
print(df.shape[0])

6880


In [80]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable. Note: this rebinds `train` and `test`, which earlier in the
# notebook held the tables loaded from train_data.csv and test_data.csv.
train, test = train_test_split(df, random_state=42, test_size=0.2)

importance_train_path = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\data\importance_train.csv"
importance_test_path = r"C:\Users\paulw\Documents\QuantSpark\semantic_scanner\data\importance_test.csv"
train.to_csv(importance_train_path, index=False)
test.to_csv(importance_test_path, index=False)