In [1]:
# python
import json
import time
import pickle
import re, string
import os
from os import path, listdir
from pathlib import Path
from os.path import isfile, join
from types import new_class
from typing import List
from lxml import etree 
from contextlib import ExitStack
import sklearn.feature_extraction.text
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer, TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

# Among the larger bills is samples/congress/116/BILLS-116s1790enr.xml (~ 10MB)

# Directory paths used throughout the notebook.
PATH_116_USLM = '/usr/local/share/xcential/public/data/116/uslm'
PATH_117_USLM = '/usr/local/share/xcential/public/data/117/uslm'
PATH_116_USLM_TRAIN = 'samples/congress/116/train'
PATH_116_TEXT = 'samples/congress/116/txt'

# File names (and full paths under PATH_116_USLM) of a fixed sample of bills.
BILLS_SAMPLE = [f'BILLS-116hr{number}ih.xml' for number in range(100, 300)]
BIG_BILLS = ['BILLS-116s1790enr.xml', 'BILLS-116hjres31enr.xml']
BIG_BILLS_PATHS = [path.join(PATH_116_USLM, bill) for bill in (BIG_BILLS + BILLS_SAMPLE)]

# List the training directory itself. The previous version listed PATH_116_USLM
# and then looked each name up in the training directory, which silently
# dropped training files that were not also present in PATH_116_USLM and
# required that unrelated directory to exist.
SAMPLE_BILL_PATHS_TRAIN = [join(PATH_116_USLM_TRAIN, f) for f in listdir(PATH_116_USLM_TRAIN) if isfile(join(PATH_116_USLM_TRAIN, f))]
SAMPLE_BILL_PATHS = [join(PATH_117_USLM, f) for f in listdir(PATH_117_USLM) if isfile(join(PATH_117_USLM, f))]


# Prefix -> namespace URI mapping passed to the namespaced XPath queries below.
NAMESPACES = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0'}


def get_filepaths(dirpath: str, reMatch = r'\.xml$') -> List[str]:
    """
    List the regular files directly inside ``dirpath`` whose name matches ``reMatch``.

    Args:
        dirpath: directory to scan (not recursive).
        reMatch: regular expression searched for in each file name. The
            default matches names ending in the literal suffix ``.xml``
            (the dot is escaped so names such as ``fooxml`` are not picked up).

    Returns:
        The matching paths, each joined with ``dirpath``, in ``listdir`` order.
    """
    pattern = re.compile(reMatch)
    return [
        join(dirpath, f)
        for f in listdir(dirpath)
        if pattern.search(f) and isfile(join(dirpath, f))
    ]

def getEnum(section) -> str:
  """
  Return the text of the first ``enum`` child of ``section``.

  Returns an empty string when there is no ``enum`` child, or when that
  child's ``.text`` is None (e.g. an empty element), so the result is
  always a ``str`` as annotated.
  """
  enumpath = section.xpath('enum')
  if len(enumpath) > 0:
    return enumpath[0].text or ''
  return ''

def getHeader(section) -> str:
  """
  Return the text of the first ``header`` child of ``section``.

  Returns an empty string when there is no ``header`` child, or when that
  child's ``.text`` is None (e.g. an empty element), so the result is
  always a ``str`` as annotated.
  """
  headerpath = section.xpath('header')
  if len(headerpath) > 0:
    return headerpath[0].text or ''
  return ''

def text_to_vect(txt: str , ngram_size: int = 4):
    """
    Fit a word n-gram CountVectorizer on the sentences of ``txt``.

    The text is split into sentences with Punkt; each sentence is tokenized
    on ``\\w+`` runs and n-grams of exactly ``ngram_size`` tokens are counted.

    Returns the fitted vectorizer (its ``vocabulary_`` holds the n-grams),
    not a list of n-grams or documents.
    """
    # See https://stackoverflow.com/a/32128803/628748
    sentence_list = PunktSentenceTokenizer().tokenize(txt)
    word_tokenize = RegexpTokenizer(r"\w+").tokenize
    ngram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(ngram_size, ngram_size),
        tokenizer=word_tokenize,
        lowercase=True,
    )
    ngram_vectorizer.fit(sentence_list)
    return ngram_vectorizer

def xml_to_sections(xml_path: str):
    """
    Parse a bill XML file into one dict per ``uslm:section`` element.

    Each dict has the keys ``section_number``, ``section_header``,
    ``section_text`` (text content of the section) and ``section_xml``
    (the section serialized as XML). Number and header are filled from
    getEnum/getHeader only when the section has both a ``header`` and an
    ``enum`` child; otherwise both are ''.

    Returns an empty list (after printing 'No sections found') when the
    document has no ``uslm:section`` elements.

    Raises:
        Exception: 'Could not parse bill' when the file cannot be parsed;
            the underlying error is chained as ``__cause__``.
    """
    try:
        billTree = etree.parse(xml_path)
    except Exception as err:
        # Same exception type and message as before, but keep the root cause
        # and let KeyboardInterrupt/SystemExit propagate untouched.
        raise Exception('Could not parse bill') from err
    sections = billTree.xpath('//uslm:section', namespaces=NAMESPACES)
    if len(sections) == 0:
        print('No sections found')
        return []
    parsed_sections = []
    for section in sections:
        # Number and header are only reported together.
        has_both = bool(section.xpath('header')) and bool(section.xpath('enum'))
        parsed_sections.append({
            'section_number': getEnum(section) if has_both else '',
            'section_header': getHeader(section) if has_both else '',
            'section_text': etree.tostring(section, method="text", encoding="unicode"),
            'section_xml': etree.tostring(section, method="xml", encoding="unicode")
        })
    return parsed_sections

def xml_to_text(xml_path: str, level: str = 'section', separator: str = '\n*****\n') -> str:
    """
    Extract the text of every ``uslm:<level>`` element of an XML file.

    Args:
        xml_path: path of the XML file to parse.
        level: local name of the USLM element to extract, e.g. 'section'
            (default) or 'body' for the whole body element.
        separator: string placed between the texts of consecutive elements.

    Returns:
        The element texts joined with ``separator``, or '' (after printing
        'No sections found') when no element matches.

    Raises:
        Exception: 'Could not parse bill' when the file cannot be parsed;
            the underlying error is chained as ``__cause__``.
    """
    try:
        billTree = etree.parse(xml_path)
    except Exception as err:
        # Same exception type and message as before, but keep the root cause
        # and let KeyboardInterrupt/SystemExit propagate untouched.
        raise Exception('Could not parse bill') from err
    # Use 'body' for level to get the whole body element
    sections = billTree.xpath('//uslm:'+level, namespaces=NAMESPACES)
    if len(sections) == 0:
        print('No sections found')
        return ''
    return separator.join(etree.tostring(section, method="text", encoding="unicode") for section in sections)

def xml_to_vect(xml_paths: List[str], ngram_size: int = 4):
    """
    Fit one n-gram vectorizer on the combined text of several XML files.

    Each file is converted with xml_to_text (default level and separator),
    the results are joined with newlines, and the combined string is handed
    to text_to_vect. Returns whatever text_to_vect returns.
    """
    texts = []
    for xml_path in xml_paths:
        texts.append(xml_to_text(xml_path))
    combined_text = '\n'.join(texts)
    # The vocabulary dict is available as ``.vocabulary_`` on the returned object.
    return text_to_vect(combined_text, ngram_size=ngram_size)

def combine_vocabs(vocabs: List["CountVectorizer"]):
    """
    Merge the vocabularies of one or more fitted vectorizers into one.

    Args:
        vocabs: objects exposing a ``vocabulary_`` mapping (term -> index).

    Returns:
        A dict mapping every distinct term to a fresh integer index
        (0..n-1, assigned in sorted term order so the result is
        deterministic). Integer indices are what CountVectorizer expects
        for its ``vocabulary`` argument.
    """
    # The previous implementation built a set of *lists*, which raises
    # TypeError (lists are unhashable); collect the individual terms instead.
    merged_terms = set()
    for v in vocabs:
        merged_terms.update(v.vocabulary_.keys())
    return {term: index for index, term in enumerate(sorted(merged_terms))}

def get_combined_vocabs(xml_paths: List[str] = SAMPLE_BILL_PATHS, ngram_size: int = 4):
    """
    Fit a single n-gram vectorizer over all the given XML files.

    Thin wrapper around xml_to_vect; the combined vocabulary is the
    ``vocabulary_`` of the returned vectorizer, not the return value itself.
    """
    combined_vectorizer = xml_to_vect(xml_paths, ngram_size=ngram_size)
    return combined_vectorizer

def getSampleText(level = 'body'):
    """
    Return the text of the first bill in BIG_BILLS_PATHS.

    Args:
        level: USLM element name forwarded to xml_to_text. It was
            previously accepted but ignored, so the text was always
            extracted at xml_to_text's default level ('section').
    """
    return xml_to_text(BIG_BILLS_PATHS[0], level=level)

def transform_text(text: str, vocab: dict, ngram_size: int = 4):
    """
    Count the occurrences of the n-grams of ``vocab`` in ``text``.

    Args:
        text: the document to vectorize.
        vocab: mapping n-gram -> column index (e.g. a ``vocabulary_``).
        ngram_size: number of tokens per n-gram; must match the size the
            vocabulary was built with.

    Returns:
        The 1-row count matrix produced by CountVectorizer.
    """
    # Use the same analysis as text_to_vect (``\w+`` tokens, fixed n-gram
    # size). With the defaults CountVectorizer only produces unigrams, so
    # no multi-word vocabulary entry could ever be counted.
    vectorizer = CountVectorizer(
        vocabulary=vocab,
        ngram_range=(ngram_size, ngram_size),
        tokenizer=RegexpTokenizer(r"\w+").tokenize,
        lowercase=True,
    )
    return vectorizer.fit_transform([text])

def train_count_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Fit a word n-gram CountVectorizer on a set of bill XML files.

    Args:
        train_data: paths of the XML files to train on.
        ngram_size: number of tokens per n-gram.

    Returns:
        ``(vectorizer, X)``: the fitted vectorizer and the count matrix of
        the training files.
    """
    # NOTE(review): per the scikit-learn docs a custom ``preprocessor``
    # overrides the built-in lowercasing, so ``lowercase=True`` has no effect
    # here - confirm whether case-sensitive n-grams are intended.
    vectorizer = CountVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    # xml_to_text parses a path itself, so hand it the paths directly
    # instead of keeping one open file handle per training file alive at the
    # same time (which can exhaust the process's file-descriptor limit).
    X = vectorizer.fit_transform(train_data)
    return vectorizer, X

def train_hashing_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Build a word n-gram HashingVectorizer and apply it to bill XML files.

    Args:
        train_data: paths of the XML files to vectorize.
        ngram_size: number of tokens per n-gram.

    Returns:
        ``(vectorizer, X)``: the vectorizer and the hashed feature matrix of
        the given files.
    """
    # NOTE(review): per the scikit-learn docs a custom ``preprocessor``
    # overrides the built-in lowercasing, so ``lowercase=True`` has no effect
    # here - confirm whether case-sensitive n-grams are intended.
    vectorizer = HashingVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    # xml_to_text parses a path itself, so hand it the paths directly
    # instead of keeping one open file handle per training file alive at the
    # same time (which can exhaust the process's file-descriptor limit).
    X = vectorizer.fit_transform(train_data)
    return vectorizer, X

def test_hashing_vectorizer(vectorizer: HashingVectorizer, test_data: List[str]):
    """Apply ``vectorizer.transform`` to ``test_data`` and return the result."""
    transformed = vectorizer.transform(test_data)
    return transformed

def xml_samples_to_text(dirpath: str, level: str = 'section', separator: str = '\n*****\n'):
    """
    Convert the XML files of a directory to text files next to them.

    For each file returned by get_filepaths(dirpath), the text extracted by
    xml_to_text is written to ``<name without extension>-<level>s.txt``
    (UTF-8 encoded).

    Args:
        dirpath: directory containing the XML files.
        level: USLM element name forwarded to xml_to_text.
        separator: separator forwarded to xml_to_text.
    """
    for xfile in get_filepaths(dirpath):
        # Extract first so a parse failure does not leave an empty .txt behind.
        text = xml_to_text(xfile, level=level, separator=separator)
        # Strip only the file extension. str.replace('.xml', ...) also rewrote
        # '.xml' inside directory names and, when the name contained no
        # '.xml' at all, returned the source path itself - which was then
        # opened for writing and truncated.
        stem, _ext = os.path.splitext(xfile)
        with open(f'{stem}-{level}s.txt', 'w', encoding='utf-8') as f:
            f.write(text)

# TODO: Add a function to parse the bill (text) into paragraphs 

# TODO: create a streaming hash vectorizer. See 
# https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py

### Utility function for Text Cleaning

In [2]:
#clean text
def text_cleaning(text):
    """
    Normalize a piece of text before vectorization.

    Steps, in order: convert to ``str`` and lowercase; drop ``[...]`` spans;
    drop http(s)/www URLs; drop ``<...>`` tags; drop ASCII punctuation;
    drop newlines; drop every word that contains a digit.

    NOTE(review): newlines are removed without inserting a space, so two
    words separated only by a line break are joined into one token - confirm
    this is intended.
    """
    # Raw string literals: escapes such as \[ \S \w \d are not valid Python
    # string escapes and trigger warnings in non-raw literals on recent
    # Python versions. The compiled patterns are unchanged.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Data Loading & Data Pre-processing

In [3]:
# Parse every bill XML into a document-level and a section-level text corpus.

# time the whole ETL pass
start = time.time()

doc_corpus_data = []
section_corpus_data = []

# every .xml file in the data directory is a candidate bill
bill_files = [f for f in os.listdir(PATH_117_USLM) if f.endswith('.xml')]

for bill_doc_file in bill_files:
    secs = xml_to_sections(os.path.join(PATH_117_USLM, bill_doc_file))

    # bills without any section contribute nothing to either corpus
    if not secs:
        continue

    bill_id = Path(bill_doc_file).stem
    cleaned_sections = [text_cleaning(section['section_text']) for section in secs]

    # one row per section: [bill id, 0-based section index, cleaned text]
    for s_number, sec_text in enumerate(cleaned_sections):
        section_corpus_data.append([bill_id, s_number, sec_text])

    # document text = the cleaned sections, each followed by a single space
    doc_content = "".join(cleaned + " " for cleaned in cleaned_sections)
    doc_corpus_data.append([bill_id, doc_content])

# plain text columns used to build the vectorized matrices
only_doc_data = [row[1] for row in doc_corpus_data]
only_section_data = [row[2] for row in section_corpus_data]

# corpus sizes: documents, then sections
print(len(only_doc_data))
print(len(only_section_data))

done = time.time()
elapsed = done - start
print('Time took in ETL with {} xml data files is {}'.format(len(only_doc_data), elapsed))

No sections found
No sections found
9166
55007
Time took in ETL with 9166 xml data files is 56.63388204574585


# NLP Modeling

## Model Training

In [4]:
# Build both hashing vectorizers and time the vectorization of the two corpora.
start = time.time()

# Document level: hashed word 4-grams of each whole-document text.
doc_hash_vectorizer = HashingVectorizer(
    ngram_range=(4, 4),
    tokenizer=RegexpTokenizer(r"\w+").tokenize,
    lowercase=True,
)
hv_doc_matrix = doc_hash_vectorizer.fit_transform(only_doc_data)

# Section level: same settings, applied to each section text.
sec_hash_vectorizer = HashingVectorizer(
    ngram_range=(4, 4),
    tokenizer=RegexpTokenizer(r"\w+").tokenize,
    lowercase=True,
)
hv_section_matrix = sec_hash_vectorizer.fit_transform(only_section_data)

done = time.time()
elapsed = done - start
print("Time took in training of both vectorizer(s) ", elapsed)



Time took in training of both vectorizer(s)  38.91292405128479


## Model Saving & Loading

In [5]:
# Save and reload both vectorizers. Each file is opened in a `with` block so
# the handle is closed (and the data flushed) before the file is read back.

# document-level vectorizer
with open("doc_hash_vectorizer.pickle", "wb") as fh:
    pickle.dump(doc_hash_vectorizer, fh)
with open("doc_hash_vectorizer.pickle", "rb") as fh:
    doc_hash_vectorizer = pickle.load(fh)

# section-level vectorizer
with open("sec_hash_vectorizer.pickle", "wb") as fh:
    pickle.dump(sec_hash_vectorizer, fh)
with open("sec_hash_vectorizer.pickle", "rb") as fh:
    sec_hash_vectorizer = pickle.load(fh)



## Get Document and Sections Text from XML File


In [6]:
#get document and section text from xml file for testing purpose
def get_document_and_section_from_xml_file(file_path):
    """
    Load one bill XML file as cleaned document text plus cleaned section texts.

    Args:
        file_path: path of the XML file, parsed with xml_to_sections.

    Returns:
        ``(doc_text, section_texts)`` where ``section_texts`` is the list of
        section texts after text_cleaning and ``doc_text`` is those texts
        concatenated, each followed by a single space. A file without
        sections yields ``("", [])``; previously this case raised
        UnboundLocalError because the document string was only initialised
        inside the "has sections" branch.
    """
    t_section_data = [text_cleaning(section['section_text']) for section in xml_to_sections(file_path)]
    t_doc_content = "".join(sec_text + " " for sec_text in t_section_data)
    return t_doc_content, t_section_data
    

In [7]:
# Pick the two bills to compare and load their cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hr200ih', 'BILLS-117hr201ih'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

## Transform Document A & B into vectorized space to perform cosine similarity

In [8]:
def document_hash_vectorized_transformation(document, doc_hash_vectorizer):
    """Vectorize a single document: wrap it in a one-element list and call ``transform``."""
    return doc_hash_vectorizer.transform([document])

def section_doc_hash_vectorized_transformation(section_doc, sec_hash_vectorizer):
    """Vectorize a collection of section texts by passing it straight to ``transform``."""
    return sec_hash_vectorizer.transform(section_doc)


In [9]:
# Whole-document vectors for A and B (document-level vectorizer).
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

# Per-section vectors for A and B (section-level vectorizer).
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

## Calculate Cosine Similarity

In [10]:
def cosine_pairwise_sim(a_vectorized, b_vectorized):
    """
    Compute the cosine similarity between two vectorized inputs and time it.

    Returns:
        ``(elapsed, sim_score)``: the wall-clock seconds spent in
        ``cosine_similarity`` and the value it returned.
    """
    started_at = time.time()
    similarity = cosine_similarity(a_vectorized, b_vectorized)
    seconds_taken = time.time() - started_at
    return seconds_taken, similarity

## Measure Document Similarity Score of any Document A to any Document B

In [11]:
# Document-level similarity of A vs. B; `elapsed` is the time spent in the cosine computation.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)

## Measure Pairwise Similarity between Sections of Document A & Sections of Document B

In [12]:
# Section-level similarity of A vs. B (sections of A against sections of B); overwrites `elapsed`.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

In [13]:
# Display the value last assigned to `elapsed` (seconds spent in the cosine computation).
elapsed

0.003020048141479492

## Rendering results of similarity between Document A & Document B and their Sections

In [14]:
def create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score):
    """
    Render document- and section-level similarity scores as pretty JSON.

    Args:
        A_doc_name: id of the original document.
        B_doc_name: id of the matched document.
        doc_sim_score: 2-D scores; only ``doc_sim_score[0][0]`` is reported.
        sec_doc_sim_score: 2-D scores, row ``i`` holding the scores of
            section ``i`` of A against every section of B.

    Returns:
        A JSON string (indent 5) containing a single list: three header
        strings, then for every section of A an "ORIGINAL SENTENCE ID"
        string (1-based) followed by one dict per section of B, sorted by
        descending score (ties keep their original order).
    """
    entries = [
        "ORIGINAL DOCUMENT ID: " + A_doc_name,
        "MATCHED DOCUMENT ID: " + B_doc_name,
        "DOCUMENT SIMILARITY SCORE: " + str(doc_sim_score[0][0]),
    ]

    for i, section_scores in enumerate(sec_doc_sim_score):
        entries.append("ORIGINAL SENTENCE ID: " + str(i+1))

        # rank the sections of B for this section of A, best match first
        ranked = sorted(enumerate(section_scores), key=lambda pair: pair[1], reverse=True)
        for j, sim_score in ranked:
            entries.append({
                "MATCHED DOCUMENT ID": B_doc_name,
                "MATCHED SENTENCE ID": j+1,
                # float() keeps numpy scalars of any float width JSON-serializable
                "SENTENCE SIMILARITY SCORE": float(sim_score),
            })

    # A single dumps is enough; the former dumps -> loads -> dumps round
    # trip produced the same text.
    return json.dumps([entries], indent=5)

    

In [15]:
# Names of the two documents whose scores are being reported.
A_doc_name, B_doc_name = 'BILLS-117hr200ih', 'BILLS-117hr201ih'

# Render the similarity scores computed above as pretty-printed JSON.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr200ih",
          "MATCHED DOCUMENT ID: BILLS-117hr201ih",
          "DOCUMENT SIMILARITY SCORE: 0.721456637430483",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr201ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.521749194749951
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr201ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr201ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.7517730496453908
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr201ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


# Testing 

## Case # 1

In [15]:
# Case 1: choose the document pair and load cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hjres27enr', 'BILLS-117hjres27ih'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)


In [16]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [17]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hjres27enr",
          "MATCHED DOCUMENT ID: BILLS-117hjres27ih",
          "DOCUMENT SIMILARITY SCORE: 1.000000000000003",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hjres27ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.000000000000003
          }
     ]
]


## Case # 2

In [18]:
# Case 2: choose the document pair and load cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hr293eh', 'BILLS-117hr293ih'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

In [19]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [20]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr293eh",
          "MATCHED DOCUMENT ID: BILLS-117hr293ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000002",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          },
          {
               "MATCHED D

## Case # 3

In [21]:
# Case 3: choose the document pair and load cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hr335enr', 'BILLS-117hr335ih'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

In [22]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [23]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr335enr",
          "MATCHED DOCUMENT ID: BILLS-117hr335ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999997",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr335ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999997
          }
     ]
]


## Case # 4

In [24]:
# Case 4: choose the document pair and load cleaned document/section texts.
# NOTE(review): this is the same document pair as Case # 3 - confirm whether a different pair was intended.
A_doc_name, B_doc_name = 'BILLS-117hr335enr', 'BILLS-117hr335ih'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

In [25]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [26]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr335enr",
          "MATCHED DOCUMENT ID: BILLS-117hr335ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999997",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr335ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999997
          }
     ]
]


## Case # 5

In [27]:
# Case 5: choose the document pair and load cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hr1195eh', 'BILLS-117hr1195rh'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

In [28]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [29]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1195eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1195rh",
          "DOCUMENT SIMILARITY SCORE: 0.9835438921537075",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 8,
               "SENTENCE SIMILARITY SCORE": 0.16805349924838137
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.03724711252952215
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.033633639699815615
          },
          {
               "MATCHED DOCUMENT ID":

## Case # 6

In [30]:
# Case 6: choose the document pair and load cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hr1205eh', 'BILLS-117hr1205ih'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

In [31]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [32]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1205eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1205ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999984",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000024
          },
          {
               "MATCHED DOCUMENT I

## Case # 7

In [33]:
# Case 7: choose the document pair and load cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hr1251eh', 'BILLS-117hr1251ih'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

In [34]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [35]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1251eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1251ih",
          "DOCUMENT SIMILARITY SCORE: 0.9447072707453308",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000027
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 10,
               "SENTENCE SIMILARITY SCORE": 0.09738982245370617
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 5,
               "SENTENCE SIMILARITY SCORE": 0.03208561247205913
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 6,
               "SENTENCE SIMILARITY SCORE": 0.007452007585104285
          },
          {
               "MATCH

## Case # 8

In [36]:
# Case 8: choose the document pair and load cleaned document/section texts.
A_doc_name, B_doc_name = 'BILLS-117hr1257eh', 'BILLS-117hr1257rfs'

A_xml_path = os.path.join(PATH_117_USLM, A_doc_name+".xml")
B_xml_path = os.path.join(PATH_117_USLM, B_doc_name+".xml")
A_doc, A_section_doc = get_document_and_section_from_xml_file(A_xml_path)
B_doc, B_section_doc = get_document_and_section_from_xml_file(B_xml_path)

In [37]:
# Whole-document vectors, then per-section vectors, for A and B.
A_doc_vectorized = document_hash_vectorized_transformation(document=A_doc, doc_hash_vectorizer=doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(document=B_doc, doc_hash_vectorizer=doc_hash_vectorizer)

A_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=A_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(section_doc=B_section_doc, sec_hash_vectorizer=sec_hash_vectorizer)

In [38]:
# Score A against B at document level, then section by section.
# `elapsed` is assigned twice, so it ends up holding only the section-level timing.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as pretty-printed JSON.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1257eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1257rfs",
          "DOCUMENT SIMILARITY SCORE: 1.000000000000042",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000018
          },
          {
               

## Case # 9

In [39]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1195eh', 'BILLS-117hr1195rh'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [40]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [41]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1195eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1195rh",
          "DOCUMENT SIMILARITY SCORE: 0.9835438921537075",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 8,
               "SENTENCE SIMILARITY SCORE": 0.16805349924838137
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.03724711252952215
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.033633639699815615
          },
          {
               "MATCHED DOCUMENT ID":

## Case # 10

In [42]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1262eh', 'BILLS-117hr1262ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [43]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [44]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1262eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1262ih",
          "DOCUMENT SIMILARITY SCORE: 0.9463086319471281",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9449850221132888
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 11

In [45]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1280eh', 'BILLS-117hr1280ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [46]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [47]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1280eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1280ih",
          "DOCUMENT SIMILARITY SCORE: 0.9998983537453022",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000004
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 45,
               "SENTENCE SIMILARITY SCORE": 0.07596714068284574
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 7,
               "SENTENCE SIMILARITY SCORE": 0.05838949433949799
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 58,
               "SENTENCE SIMILARITY SCORE": 0.05793153045042203
          },
          {
               "MATCH

## Case # 12

In [48]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1314eh', 'BILLS-117hr1314ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [49]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [50]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1314eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1314ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999992",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000009
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000013
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 13

In [51]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1318enr', 'BILLS-117hr1318eh'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [52]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [53]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1318enr",
          "MATCHED DOCUMENT ID: BILLS-117hr1318eh",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999976",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT 

## Case # 14

In [54]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1324eh', 'BILLS-117hr1324ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [55]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [56]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1324eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1324ih",
          "DOCUMENT SIMILARITY SCORE: 0.998017598788533",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.011348442494136541
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0058617009654030276
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9980838041641086
          },
          {


## Case # 15

In [57]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1333eh', 'BILLS-117hr1333ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [58]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [59]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1333eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1333ih",
          "DOCUMENT SIMILARITY SCORE: 0.978476771957055",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr

## Case # 16

In [60]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1392eh', 'BILLS-117hr1392ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [61]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [62]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1392eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1392ih",
          "DOCUMENT SIMILARITY SCORE: 0.9306825354576475",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
            

## Case # 17

In [63]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1602eh', 'BILLS-117hr1602ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [64]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [65]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1602eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1602ih",
          "DOCUMENT SIMILARITY SCORE: 0.9973812893999008",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9973515659015918
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 18

In [66]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1651enr', 'BILLS-117hr1651ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [67]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [68]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1651enr",
          "MATCHED DOCUMENT ID: BILLS-117hr1651ih",
          "DOCUMENT SIMILARITY SCORE: 0.7402106774910214",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.18257418583505552
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.8699075413138943
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.21535276082326638
          }

## Case # 19

In [69]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1711eh', 'BILLS-117hr1711ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [70]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [71]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1711eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1711ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999994",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT I

## Case # 20

In [72]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1833eh', 'BILLS-117hr1833ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [73]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [74]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1833eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1833ih",
          "DOCUMENT SIMILARITY SCORE: 0.7785835671892932",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.013819749820569537
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.7732879190347287
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.009166984970282082
          

## Case # 21

In [75]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2008eh', 'BILLS-117hr2008ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [76]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [77]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2008eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2008ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000002",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 22

In [78]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2016rh', 'BILLS-117hr2016ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [79]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [80]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2016rh",
          "MATCHED DOCUMENT ID: BILLS-117hr2016ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999982",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000009
          },
          {
               "MAT

## Case # 23

In [81]:
# Pick any Document A and Document B from the data lists (each should have more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2027eh', 'BILLS-117hr2027ih'

# Load both XML files from PATH_117_USLM; each call yields a (document, section document) pair.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [82]:
# Vectorize both whole documents with doc_hash_vectorizer.
A_doc_vectorized, B_doc_vectorized = (
    document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer),
    document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer),
)

# Vectorize both section-level documents with sec_hash_vectorizer.
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer),
    section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer),
)

In [83]:
# Document-level score first, then section-level; `elapsed` keeps only the second call's value.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized, B_doc_vectorized
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized, B_section_doc_vectorized
)

# Build the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2027eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2027ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000022",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
            

## Case # 24

In [84]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`eh` vs `rfs`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117hr2062eh'
B_doc_name = 'BILLS-117hr2062rfs'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [85]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [86]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
# NOTE(review): the saved output shows a score slightly above 1.0 -- presumably floating-point rounding.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2062eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2062rfs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000029",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
      

## Case # 25

In [87]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`eh` vs `ih`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117hr1573eh'
B_doc_name = 'BILLS-117hr1573ih'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [88]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [89]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1573eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1573ih",
          "DOCUMENT SIMILARITY SCORE: 0.9419354339119264",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9409949123808377
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 26

In [90]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`eh` vs `ih`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117hr2332eh'
B_doc_name = 'BILLS-117hr2332ih'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [91]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [92]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2332eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2332ih",
          "DOCUMENT SIMILARITY SCORE: 0.7540527991706186",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.7435701586071991
          },
          {
               "MAT

## Case # 27

In [93]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`ih` vs `rh`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117hr2467ih'
B_doc_name = 'BILLS-117hr2467rh'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [94]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [95]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
# NOTE(review): the saved output shows a score slightly above 1.0 -- presumably floating-point rounding.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2467ih",
          "MATCHED DOCUMENT ID: BILLS-117hr2467rh",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000069",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 17,
               "SENTENCE SIMILARITY SCORE": 0.1525862131963526
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 12,
               "SENTENCE SIMILARITY SCORE": 0.0917370968279732
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 9,
               "SENTENCE SIMILARITY SCORE": 0.05347371793934068
          },
          {
               "MATCHED

## Case # 28

In [96]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`eh` vs `ih`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117hr2485eh'
B_doc_name = 'BILLS-117hr2485ih'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [97]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [98]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2485eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2485ih",
          "DOCUMENT SIMILARITY SCORE: 0.9988266353769495",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
            

## Case # 29

In [99]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`eh` vs `ih`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117hr2523eh'
B_doc_name = 'BILLS-117hr2523ih'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [100]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [101]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2523eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2523ih",
          "DOCUMENT SIMILARITY SCORE: 0.9824861031052958",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000009
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
            

## Case # 30

In [102]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`ih` vs `rh`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117hr3007ih'
B_doc_name = 'BILLS-117hr3007rh'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [103]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [104]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr3007ih",
          "MATCHED DOCUMENT ID: BILLS-117hr3007rh",
          "DOCUMENT SIMILARITY SCORE: 0.9657708021584301",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9652103098677537
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 31

In [105]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`rfh` vs `rs`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117s658rfh'
B_doc_name = 'BILLS-117s658rs'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [106]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [107]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117s658rfh",
          "MATCHED DOCUMENT ID: BILLS-117s658rs",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999986",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-

## Case # 32

In [108]:
# Inputs for this case: two file stems that share the resolution id and differ
# only in the version suffix (`is` vs `es`). Any other pair from the data lists
# can be substituted here.
# NOTE: the saved output for this pair lists a single section on each side.
A_doc_name = 'BILLS-117sjres13is'
B_doc_name = 'BILLS-117sjres13es'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [109]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [110]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sjres13is",
          "MATCHED DOCUMENT ID: BILLS-117sjres13es",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999996",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sjres13es",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999996
          }
     ]
]


## Case # 33

In [111]:
# Inputs for this case: two file stems that share the resolution id and differ
# only in the version suffix (`is` vs `rs`). Any other pair from the data lists
# can be substituted here.
# NOTE: the saved output for this pair lists a single section on each side.
A_doc_name = 'BILLS-117sres107is'
B_doc_name = 'BILLS-117sres107rs'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [112]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [113]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres107is",
          "MATCHED DOCUMENT ID: BILLS-117sres107rs",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999996",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres107rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999996
          }
     ]
]


## Case # 34

In [114]:
# Inputs for this case: two file stems that share the resolution id and differ
# only in the version suffix (`is` vs `ats`). Any other pair from the data lists
# can be substituted here.
# NOTE: the saved output for this pair lists a single section on each side.
A_doc_name = 'BILLS-117sres120is'
B_doc_name = 'BILLS-117sres120ats'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [115]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [116]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres120is",
          "MATCHED DOCUMENT ID: BILLS-117sres120ats",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999984",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres120ats",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999984
          }
     ]
]


## Case # 35

In [117]:
# Inputs for this case: two file stems that share the resolution id and differ
# only in the version suffix (`ats` vs `is`). Any other pair from the data lists
# can be substituted here.
# NOTE: the saved output for this pair lists a single section on each side.
A_doc_name = 'BILLS-117sres81ats'
B_doc_name = 'BILLS-117sres81is'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [118]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [119]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
# NOTE(review): the saved output shows a score slightly above 1.0 -- presumably floating-point rounding.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres81ats",
          "MATCHED DOCUMENT ID: BILLS-117sres81is",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000007",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres81is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          }
     ]
]


## Case # 36

In [120]:
# Inputs for this case: two file stems that share the resolution id and differ
# only in the version suffix (`is` vs `rs`). Any other pair from the data lists
# can be substituted here.
# NOTE: the saved output for this pair lists one section for A and two for B.
A_doc_name = 'BILLS-117sres37is'
B_doc_name = 'BILLS-117sres37rs'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [121]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [122]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
# NOTE(review): the saved output shows a score slightly above 1.0 -- presumably floating-point rounding.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres37is",
          "MATCHED DOCUMENT ID: BILLS-117sres37rs",
          "DOCUMENT SIMILARITY SCORE: 0.9874880466425388",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres37rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.000000000000006
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres37rs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9579080043578716
          }
     ]
]


## Case # 37

In [123]:
# Inputs for this case: two file stems that share the resolution id and differ
# only in the version suffix (`is` vs `rs`). Any other pair from the data lists
# can be substituted here.
# NOTE: the saved output for this pair lists a single section on each side.
A_doc_name = 'BILLS-117sres22is'
B_doc_name = 'BILLS-117sres22rs'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [124]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [125]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
# NOTE(review): the saved output shows a score slightly above 1.0 -- presumably floating-point rounding.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres22is",
          "MATCHED DOCUMENT ID: BILLS-117sres22rs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000089",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres22rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000089
          }
     ]
]


## Case # 38

In [126]:
# Inputs for this case: two file stems that share the bill id and differ only
# in the version suffix (`es` vs `is`). Any other pair from the data lists
# can be substituted here.
A_doc_name = 'BILLS-117s1910es'
B_doc_name = 'BILLS-117s1910is'

# Read <PATH_117_USLM>/<stem>.xml for each side; the loader hands back a
# (document, section-document) pair that the next cell vectorizes.
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [127]:
# Document-level vectors: both bills go through the same `doc_hash_vectorizer`
# object, which is created in an earlier cell.
A_doc_vectorized = document_hash_vectorized_transformation(A_doc, doc_hash_vectorizer)
B_doc_vectorized = document_hash_vectorized_transformation(B_doc, doc_hash_vectorizer)

# Section-level vectors use a separate vectorizer, `sec_hash_vectorizer`.
# NOTE(review): presumably one row per section -- confirm against the helper's definition.
A_section_doc_vectorized = section_doc_hash_vectorized_transformation(A_section_doc, sec_hash_vectorizer)
B_section_doc_vectorized = section_doc_hash_vectorized_transformation(B_section_doc, sec_hash_vectorizer)

In [128]:
# Whole-document similarity. `cosine_pairwise_sim` returns a 2-tuple; the first
# item is bound to `elapsed` (presumably a timing) and is overwritten by the next call.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
# Section-against-section similarity between the two bills.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from the two names and both score sets, then print it.
# NOTE(review): the printed report labels entries "SENTENCE" although the inputs are section vectors.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117s1910es",
          "MATCHED DOCUMENT ID: BILLS-117s1910is",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999996",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]
