In [1]:
# python
import json
import time
import pickle
import re, string
import os
from os import path, listdir
from pathlib import Path
from os.path import isfile, join
from types import new_class
from typing import List
from lxml import etree 
from contextlib import ExitStack
import sklearn.feature_extraction.text
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer, TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

# Among the larger bills is samples/congress/116/BILLS-116s1790enr.xml (~ 10MB)

PATH_116_USLM = '/usr/local/share/xcential/public/data/116/uslm'
PATH_117_USLM = '/usr/local/share/xcential/public/data/117/uslm'
PATH_116_USLM_TRAIN = 'samples/congress/116/train'
PATH_116_TEXT = 'samples/congress/116/txt'

BILLS_SAMPLE = [f'BILLS-116hr{number}ih.xml' for number in range(100, 300)]
BIG_BILLS = ['BILLS-116s1790enr.xml', 'BILLS-116hjres31enr.xml']
BIG_BILLS_PATHS = [path.join(PATH_116_USLM, bill) for bill in (BIG_BILLS + BILLS_SAMPLE)]

SAMPLE_BILL_PATHS_TRAIN = [join(PATH_116_USLM_TRAIN, f) for f in listdir(PATH_116_USLM) if isfile(join(PATH_116_USLM_TRAIN, f))]
SAMPLE_BILL_PATHS = [join(PATH_117_USLM, f) for f in listdir(PATH_117_USLM) if isfile(join(PATH_117_USLM, f))]


NAMESPACES = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0'}


def get_filepaths(dirpath: str, reMatch = r'.xml$') -> List[str]:
    return [join(dirpath, f) for f in listdir(dirpath) if (len(re.findall(reMatch, f)) > 0) and isfile(join(dirpath, f))]

def getEnum(section) -> str:
  enumpath = section.xpath('enum')  
  if len(enumpath) > 0:
    return enumpath[0].text
  return ''

def getHeader(section) -> str:
  headerpath = section.xpath('header')  
  if len(headerpath) > 0:
    return headerpath[0].text
  return ''

def text_to_vect(txt: str , ngram_size: int = 4):
    """
    Gets ngrams from text
    """
    # See https://stackoverflow.com/a/32128803/628748
    tokenizer = PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(txt)
    #vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size),
    #    tokenizer=TreebankWordTokenizer().tokenize, lowercase=True)
    vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size),
        tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    vect.fit(sentences)
    # ngrams = vect.get_feature_names_out()
    # print('{1}-grams: {0}'.format(ngrams, ngram_size))
    #print(vect.vocabulary_)
    return vect # list of text documents

def xml_to_sections(xml_path: str):
    """
    Parses the xml file into sections 
    """
    try:
        billTree = etree.parse(xml_path)
    except:
        raise Exception('Could not parse bill')
    sections = billTree.xpath('//uslm:section', namespaces=NAMESPACES)
    if len(sections) == 0:
        print('No sections found')
        return []
    return [{
            'section_number': getEnum(section) ,
            'section_header':  getHeader(section),
            'section_text': etree.tostring(section, method="text", encoding="unicode"),
            'section_xml': etree.tostring(section, method="xml", encoding="unicode")
        } if (section.xpath('header') and len(section.xpath('header')) > 0  and section.xpath('enum') and len(section.xpath('enum'))>0) else
        {
            'section_number': '',
            'section_header': '', 
            'section_text': etree.tostring(section, method="text", encoding="unicode"),
            'section_xml': etree.tostring(section, method="xml", encoding="unicode")
        } 
        for section in sections ]

def xml_to_text(xml_path: str, level: str = 'section', separator: str = '\n*****\n') -> str:
    """
    Parses the xml file and returns the text of the body element, if any
    """
    try:
        billTree = etree.parse(xml_path)
    except:
        raise Exception('Could not parse bill')
    #return etree.tostring(billTree, method="text", encoding="unicode")
    # Use 'body' for level to get the whole body element
    sections = billTree.xpath('//uslm:'+level, namespaces=NAMESPACES)
    if len(sections) == 0:
        print('No sections found')
        return '' 
    return separator.join([etree.tostring(section, method="text", encoding="unicode") for section in sections])

def xml_to_vect(xml_paths: List[str], ngram_size: int = 4):
    """
    Parses the xml file and returns the text of the body element, if any
    """
    total_str = '\n'.join([xml_to_text(xml_path) for xml_path in xml_paths])
    return text_to_vect(total_str, ngram_size=ngram_size)

    # to get the vocab dict: vect.vocabulary_

def combine_vocabs(vocabs: List[CountVectorizer]):
    """
    Combines one or more vocabs into one
    """
    vocab_keys = list(set([list(v.vocabulary_.keys()) for v in vocabs]))
    vocab = {vocab_key: str(i) for i, vocab_key in enumerate(vocab_keys)}
    return vocab

def get_combined_vocabs(xml_paths: List[str] = SAMPLE_BILL_PATHS, ngram_size: int = 4):
    """
    Gets the combined vocabulary of all the xml files
    """
    return xml_to_vect(xml_paths, ngram_size=ngram_size)

def getSampleText(level = 'body'):
    return xml_to_text(BIG_BILLS_PATHS[0])

def transform_text(text: str, vocab: dict, ngram_size: int = 4):
    """
    Transforms text into a vector using the vocab
    """
    return CountVectorizer(vocabulary=vocab).fit_transform([text])

def train_count_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Trains a count vectorizer on the training data
    """
    vectorizer = CountVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    with ExitStack() as stack:
        files = [
            stack.enter_context(open(filename))
            for filename in train_data 
        ]
        X = vectorizer.fit_transform(files)
    return vectorizer, X 

def train_hashing_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Trains a hashing vectorizer on the training data
    """
    vectorizer = HashingVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    with ExitStack() as stack:
        files = [
            stack.enter_context(open(filename))
            for filename in train_data 
        ]
        X = vectorizer.fit_transform(files)
    return vectorizer, X

def test_hashing_vectorizer(vectorizer: HashingVectorizer, test_data: List[str]):
    return vectorizer.transform(test_data)

def xml_samples_to_text(dirpath: str, level: str = 'section', separator: str = '\n*****\n'):
    """
    Converts xml files in a directory to txt files
    """
    xfiles = get_filepaths(dirpath)
    for xfile in xfiles:
        with open(xfile.replace('.xml', f'-{level}s.txt'), 'w') as f:
            f.write(xml_to_text(xfile, level=level, separator=separator))

# TODO: Add a function to parse the bill (text) into paragraphs 

# TODO: create a streaming hash vectorizer. See 
# https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py

### Utility function for Text Cleaning

In [2]:
#clean text 
def text_cleaning(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

# Data Loading & Data Pre-processing

In [3]:
#xml document and section level parsing

#record training time for both vectorizer
start = time.time()


doc_corpus_data=[]
section_corpus_data = []

#get all xml files from data directory for parsing
bill_files = [f for f in os.listdir(PATH_117_USLM) if f.endswith('.xml')]

#iterate over all bill files
for i in range(0, len(bill_files)):
    
    #indexing bill document file
    bill_doc_file = bill_files[i]
    
    #parse xml into sections
    secs = xml_to_sections(os.path.join(PATH_117_USLM, bill_doc_file))
    
    #check  of sections should be 1 or more than 1
    if(len(secs)>0):  
        
        #intialize string variable for document content
        doc_content = ""
        
        #iterate over all parse sections text of bill doc file
        for s_number, section in enumerate(secs):  
            
            #text cleaning applied on each section text
            sec_text = text_cleaning(section['section_text'])
            
            #concatenate section text to doc content 
            doc_content = doc_content + sec_text + " "
            
             #for now sentence id is sentence number in document
            section_corpus_data.append([Path(bill_doc_file).stem[:], s_number, sec_text ])

        doc_corpus_data.append([Path(bill_doc_file).stem[:], doc_content])
    

#get only whole document content from doc_corpus_data list
only_doc_data = [row[1] for row in doc_corpus_data]

#get only section content from section_corpus_data list
only_section_data = [row[2] for row in section_corpus_data]


#get length of only_doc_data list
print(len(only_doc_data))

#get length of only_section_data list
print(len(only_section_data))

done = time.time()
elapsed = done - start
print('Time took in ETL with {} xml data files is {}'.format(len(only_doc_data), elapsed))           

No sections found
No sections found
9166
55007
Time took in ETL with 9166 xml data files is 59.5397789478302


# NLP Modeling

## Model Training

In [4]:
#record training time for both vectorizer
start = time.time()


# Vectorizer to convert a collection of raw documents to a matrix 
doc_count_vectorizer = CountVectorizer(ngram_range=(4,4), tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
#Fit count vectorize instance on document level corpus
cv_doc_matrix = doc_count_vectorizer.fit_transform(only_doc_data)

# Vectorizer to convert a collection of sections to a matrix 
sec_count_vectorizer = CountVectorizer(ngram_range=(4,4), tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
#Fit count vectorize instance on section level corpus
cv_section_matrix = sec_count_vectorizer.fit_transform(only_section_data)


done = time.time()
elapsed = done - start
print("Time took in training of both vectorizer(s) ", elapsed)

Time took in training of both vectorizer(s)  106.84038186073303


## Model Saving & Loading

In [5]:
# save count vectorize instance for only_doc_data
pickle.dump(doc_count_vectorizer, open("doc_count_vectorizer.pickle", "wb"))
# load count vectorize instance for only_doc_data
doc_count_vectorizer = pickle.load(open("doc_count_vectorizer.pickle", "rb"))

#save count vectorize instance for only_section_data
pickle.dump(sec_count_vectorizer, open("sec_count_vectorizer.pickle", "wb"))
# load count vectorize instance for only_section_data
sec_count_vectorizer = pickle.load(open("sec_count_vectorizer.pickle", "rb"))



## Get Document and Sections Text from XML File


In [6]:
# Get document and section text from xml file for testing purpose 
def get_document_and_section_from_xml_file(file_path):
    
    #create empty t_section_data list
    t_section_data=[] 
    
    #get sections fron xml file
    t_secs = xml_to_sections(file_path)
    
    #check if length of section is more than 0  
    if(len(t_secs)>0): 
        
            #intialize string variable for document content
            t_doc_content = ""

            #iterate over all parse sections text of bill doc file
            for s_number, section in enumerate(t_secs):  

                #text cleaning applied on each section text
                sec_text = text_cleaning(section['section_text'])

                #concatenate section text to doc content 
                t_doc_content = t_doc_content + sec_text + " "

                 #for now sentence id is sentence number in document
                t_section_data.append(sec_text)

    return t_doc_content, t_section_data
    

## Measure Pairwise similarity between Sections of Document A & Sections of Document B

In [25]:
#choose document A file name & document B file name 
A_doc_name = 'BILLS-117hr3684pcs'
B_doc_name = 'BILLS-117hr3684eh'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [26]:
def document_count_vectorized_transformation(document, doc_count_vectorizer):
    
    doc_vectorized = doc_count_vectorizer.transform([document])
    return doc_vectorized

def section_doc_count_vectorized_transformation(section_doc, sec_count_vectorizer):
    
    section_doc_vectorized = sec_count_vectorizer.transform(section_doc)
    return section_doc_vectorized


## Transform Document A & B into vectorized space to perform cosine similarity

In [31]:
#transform document A content and document B content
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

#transform document A section content and  document B section content
A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)
print('Done vectorizing')

Done vectorizing


## Calculate Cosine Similarity

In [32]:
def cosine_pairwise_sim(a_vectorized, b_vectorized):
    
    #record time for computing similarity 
    start = time.time()

    sim_score =  cosine_similarity(a_vectorized, b_vectorized)

    done = time.time()
    elapsed = done - start
    return elapsed, sim_score

## Measure Document Similarity Score of any Document A to any Document B

In [33]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
print(elapsed)

0.03469109535217285


In [34]:
elapsed

0.03469109535217285

In [39]:
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

In [40]:
elapsed, sec_doc_sim_score

(0.11475586891174316,
 array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 1.        , 0.        , ..., 0.        , 0.00208401,
         0.        ],
        [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 1.        , 0.03659155,
         0.        ],
        [0.        , 0.00208401, 0.        , ..., 0.03659155, 1.        ,
         0.18456235],
        [0.        , 0.        , 0.        , ..., 0.        , 0.18456235,
         1.        ]]))

## Rendering results of similarity b/w Document A & Document B and their Sections

In [37]:
def create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score):
    
    #create result list
    res_list = []

    #create empty list
    temp=[]
    temp.append("ORIGINAL DOCUMENT ID: " + A_doc_name)
    temp.append("MATCHED DOCUMENT ID: " + B_doc_name)
    temp.append("DOCUMENT SIMILARITY SCORE: " + str(doc_sim_score[0][0]))

    #iterate over sec_doc_sim_score list 
    for i, section_score_list in enumerate(sec_doc_sim_score):
        
        #add original document sentence id number
        temp.append("ORIGINAL SENTENCE ID: " + str(i+1))
           
        #sort similarity score of sections list
        section_score_list = list(enumerate(section_score_list))
        sorted_section_score_list = sorted(section_score_list, key=lambda x: x[1], reverse=True)
        
        #iterate over section level score only 
        for j, sim_score in sorted_section_score_list:
            temp.append({"MATCHED DOCUMENT ID":  B_doc_name, "MATCHED SENTENCE ID": j+1 , "SENTENCE SIMILARITY SCORE":  sim_score})

    res_list.append(temp)
        
    #return pretty json
    r = json.dumps(res_list)
    parsed = json.loads(r)
    return json.dumps(parsed, indent=5)

    

In [38]:
#put document A file name & document B file name
#A_doc_name = 'BILLS-117hconres11eh'
#B_doc_name = 'BILLS-117hconres11ih'

#get json response of newly computed similarity score of document A and document B
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(A_doc_name + " vs. " + B_doc_name)
print(response)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=2000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Testing 

## Case # 1

In [20]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-116hr2ih'
B_doc_name = 'BILLS-116hr6379ih' # Two very large bills


A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_116_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_116_USLM, B_doc_name+".xml"))


In [21]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [22]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=2000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Case # 2

In [23]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr293eh'
B_doc_name = 'BILLS-117hr293ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [24]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [25]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr293eh",
          "MATCHED DOCUMENT ID: BILLS-117hr293ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999982",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999992
          },
          {
               "MATCHED D

## Case # 3

In [26]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr335enr'
B_doc_name = 'BILLS-117hr335ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [27]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [28]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr335enr",
          "MATCHED DOCUMENT ID: BILLS-117hr335ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000027",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr335ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000027
          }
     ]
]


## Case # 4

In [29]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr335enr'
B_doc_name = 'BILLS-117hr335ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [30]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [31]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr335enr",
          "MATCHED DOCUMENT ID: BILLS-117hr335ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000027",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr335ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000027
          }
     ]
]


## Case # 5

In [32]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1195eh'
B_doc_name = 'BILLS-117hr1195rh'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [33]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [34]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1195eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1195rh",
          "DOCUMENT SIMILARITY SCORE: 0.9835255892533872",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 8,
               "SENTENCE SIMILARITY SCORE": 0.16805349924838123
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.03724711252952238
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.03363363969981562
          },
          {
               "MATCHED DOCUMENT ID": 

NOTE: This appears to take about 17 elapsed seconds to run

## Case # 6

In [35]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1205eh'
B_doc_name = 'BILLS-117hr1205ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [36]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [37]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1205eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1205ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999956",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000038
          },
          {
               "MATCHED DOCUMENT I

## Case # 7

In [34]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1251eh'
B_doc_name = 'BILLS-117hr1251ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [35]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [36]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1251eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1251ih",
          "DOCUMENT SIMILARITY SCORE: 0.9446525515396096",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999983
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 10,
               "SENTENCE SIMILARITY SCORE": 0.09738982245370613
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 5,
               "SENTENCE SIMILARITY SCORE": 0.032085612472059315
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 6,
               "SENTENCE SIMILARITY SCORE": 0.0074520075851043
          },
          {
               "MATCHE

## Case # 8

In [37]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1257eh'
B_doc_name = 'BILLS-117hr1257rfs'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [38]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [39]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1257eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1257rfs",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999865",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000004
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000058
          },
          {
              

## Case # 9

In [40]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1195eh'
B_doc_name = 'BILLS-117hr1195rh'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [41]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [42]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1195eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1195rh",
          "DOCUMENT SIMILARITY SCORE: 0.9835255892533872",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 8,
               "SENTENCE SIMILARITY SCORE": 0.16805349924838123
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.03724711252952238
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.03363363969981562
          },
          {
               "MATCHED DOCUMENT ID": 

## Case # 10

In [43]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1262eh'
B_doc_name = 'BILLS-117hr1262ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [44]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [45]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1262eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1262ih",
          "DOCUMENT SIMILARITY SCORE: 0.9463086319471228",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9449850221132841
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 11

In [46]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1280eh'
B_doc_name = 'BILLS-117hr1280ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [47]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [48]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1280eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1280ih",
          "DOCUMENT SIMILARITY SCORE: 0.9998981656302678",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000064
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 45,
               "SENTENCE SIMILARITY SCORE": 0.07575757575757577
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 7,
               "SENTENCE SIMILARITY SCORE": 0.05822841956548624
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 58,
               "SENTENCE SIMILARITY SCORE": 0.05777171902747662
          },
          {
               "MATCH

## Case # 12

In [49]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1314eh'
B_doc_name = 'BILLS-117hr1314ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [50]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [51]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1314eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1314ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999971",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999993
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999937
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 13

In [52]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1318enr'
B_doc_name = 'BILLS-117hr1318eh'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [53]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [54]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1318enr",
          "MATCHED DOCUMENT ID: BILLS-117hr1318eh",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000122",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT 

## Case # 14

In [55]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1324eh'
B_doc_name = 'BILLS-117hr1324ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [56]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [57]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1324eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1324ih",
          "DOCUMENT SIMILARITY SCORE: 0.998017598788528",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.011348442494136593
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0058617009654030015
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9980838041641102
          },
          {


## Case # 15

In [58]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1333eh'
B_doc_name = 'BILLS-117hr1333ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [59]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [60]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1333eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1333ih",
          "DOCUMENT SIMILARITY SCORE: 0.9784388878937144",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117h

## Case # 16

In [61]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1392eh'
B_doc_name = 'BILLS-117hr1392ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [62]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [63]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1392eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1392ih",
          "DOCUMENT SIMILARITY SCORE: 0.9306825354576423",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
            

## Case # 17

In [64]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1602eh'
B_doc_name = 'BILLS-117hr1602ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [65]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [66]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1602eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1602ih",
          "DOCUMENT SIMILARITY SCORE: 0.9973812893998977",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9973515659015977
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 18

In [67]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1651enr'
B_doc_name = 'BILLS-117hr1651ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [68]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [69]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1651enr",
          "MATCHED DOCUMENT ID: BILLS-117hr1651ih",
          "DOCUMENT SIMILARITY SCORE: 0.7402106774910209",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.1825741858350554
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.8699075413138926
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.21535276082326624
          },

## Case # 19

In [70]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1711eh'
B_doc_name = 'BILLS-117hr1711ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [71]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [72]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1711eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1711ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999952",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT I

## Case # 20

In [73]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1833eh'
B_doc_name = 'BILLS-117hr1833ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [74]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [75]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1833eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1833ih",
          "DOCUMENT SIMILARITY SCORE: 0.778583567189285",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999999
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.013819749820569553
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.7732879190347327
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.009166984970282115
          }

## Case # 21

In [76]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2008eh'
B_doc_name = 'BILLS-117hr2008ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [77]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [78]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2008eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2008ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000002",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999999
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 22

In [79]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2016rh'
B_doc_name = 'BILLS-117hr2016ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [80]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [81]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2016rh",
          "MATCHED DOCUMENT ID: BILLS-117hr2016ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999938",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.000000000000001
          },
          {
               "MATC

## Case # 23

In [82]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2027eh'
B_doc_name = 'BILLS-117hr2027ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [83]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [84]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2027eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2027ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000229",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999999
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
            

## Case # 24

In [85]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2062eh'
B_doc_name = 'BILLS-117hr2062rfs'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [86]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [87]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2062eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2062rfs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000038",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
      

## Case # 25

In [88]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr1573eh'
B_doc_name = 'BILLS-117hr1573ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [89]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [90]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1573eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1573ih",
          "DOCUMENT SIMILARITY SCORE: 0.9419354339119238",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9409949123808353
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 26

In [91]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2332eh'
B_doc_name = 'BILLS-117hr2332ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [92]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [93]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2332eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2332ih",
          "DOCUMENT SIMILARITY SCORE: 0.7540527991706187",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999999
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.7435701586072027
          },
          {
               "MAT

## Case # 27

In [94]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2467ih'
B_doc_name = 'BILLS-117hr2467rh'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [95]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [96]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2467ih",
          "MATCHED DOCUMENT ID: BILLS-117hr2467rh",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000273",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999991
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 17,
               "SENTENCE SIMILARITY SCORE": 0.15258621319635257
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 12,
               "SENTENCE SIMILARITY SCORE": 0.09173709682797307
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 9,
               "SENTENCE SIMILARITY SCORE": 0.053473717939340706
          },
          {
               "MATC

## Case # 28

In [97]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2485eh'
B_doc_name = 'BILLS-117hr2485ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [98]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [99]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2485eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2485ih",
          "DOCUMENT SIMILARITY SCORE: 0.9988293824992519",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
            

## Case # 29

In [100]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr2523eh'
B_doc_name = 'BILLS-117hr2523ih'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [101]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [102]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2523eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2523ih",
          "DOCUMENT SIMILARITY SCORE: 0.9824661839161795",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999993
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
            

## Case # 30

In [103]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117hr3007ih'
B_doc_name = 'BILLS-117hr3007rh'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [104]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [105]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr3007ih",
          "MATCHED DOCUMENT ID: BILLS-117hr3007rh",
          "DOCUMENT SIMILARITY SCORE: 0.9657708021584326",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9652103098677866
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 31

In [106]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117s658rfh'
B_doc_name = 'BILLS-117s658rs'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [107]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [108]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117s658rfh",
          "MATCHED DOCUMENT ID: BILLS-117s658rs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000058",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-

## Case # 32

In [109]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117sjres13is'
B_doc_name = 'BILLS-117sjres13es'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [110]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [111]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sjres13is",
          "MATCHED DOCUMENT ID: BILLS-117sjres13es",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999996",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sjres13es",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999996
          }
     ]
]


## Case # 33

In [112]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117sres107is'
B_doc_name = 'BILLS-117sres107rs'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [113]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [114]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres107is",
          "MATCHED DOCUMENT ID: BILLS-117sres107rs",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999992",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres107rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999992
          }
     ]
]


## Case # 34

In [115]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117sres120is'
B_doc_name = 'BILLS-117sres120ats'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [116]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [117]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres120is",
          "MATCHED DOCUMENT ID: BILLS-117sres120ats",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999963",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres120ats",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999963
          }
     ]
]


## Case # 35

In [118]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117sres81ats'
B_doc_name = 'BILLS-117sres81is'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [119]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [120]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres81ats",
          "MATCHED DOCUMENT ID: BILLS-117sres81is",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000007",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres81is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          }
     ]
]


## Case # 36

In [121]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117sres37is'
B_doc_name = 'BILLS-117sres37rs'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [122]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [123]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres37is",
          "MATCHED DOCUMENT ID: BILLS-117sres37rs",
          "DOCUMENT SIMILARITY SCORE: 0.9874880466425378",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres37rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999974
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres37rs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9579080043578713
          }
     ]
]


## Case # 37

In [124]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117sres22is'
B_doc_name = 'BILLS-117sres22rs'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [125]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [126]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres22is",
          "MATCHED DOCUMENT ID: BILLS-117sres22rs",
          "DOCUMENT SIMILARITY SCORE: 1.000000000000002",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres22rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.000000000000002
          }
     ]
]


## Case # 38

In [127]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
A_doc_name = 'BILLS-117s1910es'
B_doc_name = 'BILLS-117s1910is'

A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [128]:
A_doc_vectorized = document_count_vectorized_transformation(A_doc, doc_count_vectorizer)
B_doc_vectorized = document_count_vectorized_transformation(B_doc, doc_count_vectorizer)

A_section_doc_vectorized = section_doc_count_vectorized_transformation(A_section_doc, sec_count_vectorizer)
B_section_doc_vectorized = section_doc_count_vectorized_transformation(B_section_doc, sec_count_vectorizer)

In [129]:
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117s1910es",
          "MATCHED DOCUMENT ID: BILLS-117s1910is",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999976",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999963
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]
