In [1]:
import json
import time
import pickle
import re, string
import os
from os import path, listdir
from pathlib import Path
from os.path import isfile, join
from types import new_class
from typing import List
from lxml import etree 
from contextlib import ExitStack
import sklearn.feature_extraction.text
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer, TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

# Among the larger bills is samples/congress/116/BILLS-116s1790enr.xml (~ 10MB)

PATH_116_USLM = 'data/samples/congress/116/uslm'
PATH_116_USLM_TRAIN = 'data/samples/congress/116/train'
PATH_116_TEXT = 'data/samples/congress/116/txt'

BILLS_SAMPLE = [f'BILLS-116hr{number}ih.xml' for number in range(100, 300)]
BIG_BILLS = ['BILLS-116s1790enr.xml', 'BILLS-116hjres31enr.xml']
BIG_BILLS_PATHS = [path.join(PATH_116_USLM, bill) for bill in (BIG_BILLS + BILLS_SAMPLE)]

SAMPLE_BILL_PATHS_TRAIN = [join(PATH_116_USLM_TRAIN, f) for f in listdir(PATH_116_USLM) if isfile(join(PATH_116_USLM_TRAIN, f))]
SAMPLE_BILL_PATHS = [join(PATH_116_USLM, f) for f in listdir(PATH_116_USLM) if isfile(join(PATH_116_USLM, f))]


def getEnum(section) -> str:
  enumpath = section.xpath('enum')  
  if len(enumpath) > 0:
    return enumpath[0].text
  return ''

def getHeader(section) -> str:
  headerpath = section.xpath('header')  
  if len(headerpath) > 0:
    return headerpath[0].text
  return ''

def text_to_vect(txt: str , ngram_size: int = 4):
    """
    Gets ngrams from text
    """
    # See https://stackoverflow.com/a/32128803/628748
    tokenizer = PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(txt)
    #vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size),
    #    tokenizer=TreebankWordTokenizer().tokenize, lowercase=True)
    vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size),
        tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    vect.fit(sentences)
    # ngrams = vect.get_feature_names_out()
    # print('{1}-grams: {0}'.format(ngrams, ngram_size))
    #print(vect.vocabulary_)
    return vect # list of text documents

def xml_to_sections(xml_path: str):
    """
    Parses the xml file into sections 
    """
    try:
        billTree = etree.parse(xml_path)
    except:
        raise Exception('Could not parse bill')
    sections = billTree.xpath('//section')
    if len(sections) == 0:
        return []
    return [{
            'section_number': getEnum(section) ,
            'section_header':  getHeader(section),
            'section_text': etree.tostring(section, method="text", encoding="unicode"),
            'section_xml': etree.tostring(section, method="xml", encoding="unicode")
        } if (section.xpath('header') and len(section.xpath('header')) > 0  and section.xpath('enum') and len(section.xpath('enum'))>0) else
        {
            'section_number': '',
            'section_header': '', 
            'section_text': etree.tostring(section, method="text", encoding="unicode"),
            'section_xml': etree.tostring(section, method="xml", encoding="unicode")
        } 
        for section in sections ]

def xml_to_text(xml_path: str) -> str:
    """
    Parses the xml file and returns the text of the body element, if any
    """
    try:
        billTree = etree.parse(xml_path)
    except:
        raise Exception('Could not parse bill')
    parse_text =  etree.tostring(billTree, method="text", encoding="unicode")
    
    return parse_text
    bodies = billTree.xpath('//body')
    
    if len(sections) == 0:
        return '' 
    return etree.tostring(bodies[0], method="text", encoding="unicode"),

def xml_to_vect(xml_paths: List[str], ngram_size: int = 4):
    """
    Parses the xml file and returns the text of the body element, if any
    """
    total_str = '\n'.join([xml_to_text(xml_path) for xml_path in xml_paths])
    return text_to_vect(total_str, ngram_size=ngram_size)

    # to get the vocab dict: vect.vocabulary_

def combine_vocabs(vocabs: List[CountVectorizer]):
    """
    Combines one or more vocabs into one
    """
    vocab_keys = list(set([list(v.vocabulary_.keys()) for v in vocabs]))
    vocab = {vocab_key: str(i) for i, vocab_key in enumerate(vocab_keys)}
    return vocab

def get_combined_vocabs(xml_paths: List[str] = SAMPLE_BILL_PATHS, ngram_size: int = 4):
    """
    Gets the combined vocabulary of all the xml files
    """
    return xml_to_vect(xml_paths, ngram_size=ngram_size)

def getSampleText():
    return xml_to_text(BIG_BILLS_PATHS[0])

def transform_text(text: str, vocab: dict, ngram_size: int = 4):
    """
    Transforms text into a vector using the vocab
    """
    return CountVectorizer(vocabulary=vocab).fit_transform([text])

def train_count_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Trains a count vectorizer on the training data
    """
    vectorizer = CountVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    with ExitStack() as stack:
        files = [
            stack.enter_context(open(filename))
            for filename in train_data 
        ]
        X = vectorizer.fit_transform(files)
    return vectorizer, X 

def train_hashing_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Trains a hashing vectorizer on the training data
    """
    vectorizer = HashingVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    with ExitStack() as stack:
        files = [
            stack.enter_context(open(filename))
            for filename in train_data 
        ]
        X = vectorizer.fit_transform(files)
    return vectorizer, X

def test_hashing_vectorizer(vectorizer: HashingVectorizer, test_data: List[str]):
    return vectorizer.transform(test_data)

# TODO: Add a function to parse the bill (text) into paragraphs 

# TODO: create a streaming hash vectorizer. See 
# https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py


### Utility function for Text Cleaning

In [2]:
#clean text 
def text_cleaning(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

# Data Loading & Data Pre-processing

In [3]:
#get all files into one list

doc_corpus_data=[]
section_corpus_data = []

#get sections file from the section text files folder into list
paragraph_text_files = [f for f in os.listdir(PATH_116_TEXT) if f.endswith('.txt')]

for i in range(0, len(paragraph_text_files)):
    para_text_file = paragraph_text_files[i]
    
    
    delimiter = "\n"
    with open(os.path.join(PATH_116_TEXT, para_text_file), "r", encoding='UTF-8') as f:
        doc_content = f.read() 
        
        doc_corpus_data.append([Path(para_text_file).stem[:-9], text_cleaning(doc_content)])
        
        #using the string methods we split it
        paragraphs = doc_content.split(delimiter)
        
    for j in range(len(paragraphs)):   
        #for now sentence id is sentence number in document
        #append file id and clean text
        section_corpus_data.append([Path(para_text_file).stem[:-9], j,  text_cleaning(paragraphs[j])])
    

#get only whole document content from doc_corpus_data list
only_doc_data = [row[1] for row in doc_corpus_data]

#get only section content from section_corpus_data list
only_section_data = [row[2] for row in section_corpus_data]


#get length of only_doc_data list
print(len(only_doc_data))

#get length of only_section_data list
print(len(only_section_data))


239
12355


# NLP Modeling

## Model Training

In [4]:
#record training time for both vectorizer
start = time.time()


# Vectorizer to convert a collection of raw documents to a matrix 
doc_count_vectorizer = CountVectorizer(ngram_range=(4,4), tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
#Fit count vectorize instance on document level corpus
cv_doc_matrix = doc_count_vectorizer.fit_transform(only_doc_data)

# Vectorizer to convert a collection of sections to a matrix 
sec_count_vectorizer = CountVectorizer(ngram_range=(4,4), tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
#Fit count vectorize instance on section level corpus
cv_section_matrix = sec_count_vectorizer.fit_transform(only_section_data)


done = time.time()
elapsed = done - start
print("Time took in training of both vectorizer(s) ", elapsed)

Time took in training of both vectorizer(s)  12.192900657653809


## Model Saving & Loading

In [5]:
# save count vectorize instance for only_doc_data
pickle.dump(doc_count_vectorizer, open("doc_count_vectorizer.pickel", "wb"))
# load count vectorize instance for only_doc_data
doc_count_vectorizer = pickle.load(open("doc_count_vectorizer.pickel", "rb"))

#save count vectorize instance for only_section_data
pickle.dump(sec_count_vectorizer, open("sec_count_vectorizer.pickel", "wb"))
# load count vectorize instance for only_section_data
sec_count_vectorizer = pickle.load(open("sec_count_vectorizer.pickel", "rb"))



## Pick any Document A to any Document B from list for measuring similarity score


In [6]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-116hr866enr'
B_doc_name = 'BILLS-116s2107enr'

A_doc = [i[1] for i in doc_corpus_data if A_doc_name ==i[0]][0]
B_doc = [i[1] for i in doc_corpus_data if B_doc_name ==i[0]][0]

A_section_doc = [i[2] for i in section_corpus_data if A_doc_name ==i[0]]
B_section_doc = [i[2] for i in section_corpus_data if B_doc_name ==i[0]]

## Transform Document A & B into vectorized space to perform cosine similarity

In [7]:
A_doc_vectorized = doc_count_vectorizer.transform([A_doc])
B_doc_vectorized = doc_count_vectorizer.transform([B_doc])

A_section_doc_vectorized = sec_count_vectorizer.transform(A_section_doc)
B_section_doc_vectorized = sec_count_vectorizer.transform(B_section_doc)

## Measure Document Similarity Score of any Document A to any Document B

In [8]:
#record time for computing similarity 
start = time.time()

doc_sim_score =  cosine_similarity(A_doc_vectorized, B_doc_vectorized)

done = time.time()
elapsed = done - start
print("Time took in computing document to document similarity (s) ", elapsed)

Time took in computing document to document similarity (s)  0.0070037841796875


In [9]:
print(doc_sim_score)

[[0.01597025]]


## Measure Pairwise similraity between Sections of Document A & Sections of Document B

In [10]:
# record time for computing similarity 
start = time.time()

sec_doc_sim_score =  cosine_similarity(A_section_doc_vectorized, B_section_doc_vectorized)

done = time.time()
elapsed = done - start
print("Time took in computing section to section similarity of document A & document B(s) ", elapsed)

Time took in computing section to section similarity of document A & document B(s)  0.007002830505371094


## Rendering results of similarity b/w Document A & Document B and their Sections

In [11]:
#create result list
res_list = []

#create empty list
temp=[]
temp.append("ORIGINAL DOCUMENT ID: " + A_doc_name)
temp.append("MATCHED DOCUMENT ID: " + B_doc_name)
temp.append("DOCUMENT SIMILARITY SCORE: " + str(doc_sim_score[0][0]))

for i in range(0, sec_doc_sim_score.shape[0]):    
    temp.append("ORIGINAL SENTENCE ID: " + str(i+1))
    for j in range(0, sec_doc_sim_score.shape[1]):
        temp.append({"MATCHED DOCUMENT ID":  B_doc_name, "MATCHED SENTENCE ID": j+1 , "SENTENCE SIMILARITY SCORE":  sec_doc_sim_score[i][j]})
        
    res_list.append(temp)


In [12]:
#print pretty json
r = json.dumps(res_list)
parsed = json.loads(r)
print(json.dumps(parsed, indent=5))



[
     [
          "ORIGINAL DOCUMENT ID: BILLS-116hr866enr",
          "MATCHED DOCUMENT ID: BILLS-116s2107enr",
          "DOCUMENT SIMILARITY SCORE: 0.01597025441533637",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-116s2107enr",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.4803844614152615
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116s2107enr",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116s2107enr",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116s2107enr",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116s2107enr",
           

## Measure pairwise similarity between document A and all documents in the corpus

In [13]:
#record time for computing similarity 
start = time.time()

doc_to_corpus_sim_score =  cosine_similarity(A_doc_vectorized, cv_doc_matrix)

done = time.time()
elapsed = done - start
print("Time took in computing similarity of document A and all documents in the corpus  (s) ", elapsed)

Time took in computing similarity of document A and all documents in the corpus  (s)  0.20244097709655762


## Rendering results of similarity b/w document A and all documents in the corpus

In [14]:
#create result list
res_list = []

#get document names list from main doc_corpus_data list
corpus_doc_name_only = [row[0] for row in doc_corpus_data]

for i in range(0, doc_to_corpus_sim_score.shape[0]):
    #create empty list
    temp=[]
    temp.append("ORIGINAL DOCUMENT ID: " + A_doc_name)
    for j in range(0, doc_to_corpus_sim_score.shape[1]):
        temp.append({"MATCHED DOCUMENT ID": corpus_doc_name_only[j] , "DOCUMENT SIMILARITY SCORE":  doc_to_corpus_sim_score[i][j]})
        
    res_list.append(temp)


In [15]:
#print pretty json
r = json.dumps(res_list)
parsed = json.loads(r)
print(json.dumps(parsed, indent=5))



[
     [
          "ORIGINAL DOCUMENT ID: BILLS-116hr866enr",
          {
               "MATCHED DOCUMENT ID": "BILLS-116hconres105enr",
               "DOCUMENT SIMILARITY SCORE": 0.012605195009606678
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116hconres106enr",
               "DOCUMENT SIMILARITY SCORE": 0.014745497924853898
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116hconres16enr",
               "DOCUMENT SIMILARITY SCORE": 0.01690898948255138
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116hconres19enr",
               "DOCUMENT SIMILARITY SCORE": 0.02614916723506242
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116hconres21enr",
               "DOCUMENT SIMILARITY SCORE": 0.013500268742399507
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-116hconres31enr",
               "DOCUMENT SIMILARITY SCORE": 0.014253842401696592
          },
          {
        