In [1]:
# python
import json
import time
import pickle
import re, string
import os
from os import path, listdir
from pathlib import Path
from os.path import isfile, join
from types import new_class
from typing import List
from lxml import etree 
from contextlib import ExitStack
import sklearn.feature_extraction.text
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer, TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

# Among the larger bills is samples/congress/116/BILLS-116s1790enr.xml (~ 10MB)

# Absolute directories holding the USLM XML files (one per data set: 116 and 117).
PATH_116_USLM = '/usr/local/share/xcential/public/data/116/uslm'
PATH_117_USLM = '/usr/local/share/xcential/public/data/117/uslm'
# Relative sample directories; they resolve against the current working directory.
PATH_116_USLM_TRAIN = 'samples/congress/116/train'
PATH_116_TEXT = 'samples/congress/116/txt'

# File names BILLS-116hr100ih.xml .. BILLS-116hr299ih.xml (range end is exclusive).
BILLS_SAMPLE = [f'BILLS-116hr{number}ih.xml' for number in range(100, 300)]
BIG_BILLS = ['BILLS-116s1790enr.xml', 'BILLS-116hjres31enr.xml']
# Despite the name, this holds the big bills followed by every BILLS_SAMPLE entry.
BIG_BILLS_PATHS = [path.join(PATH_116_USLM, bill) for bill in (BIG_BILLS + BILLS_SAMPLE)]

# NOTE(review): names are listed from PATH_116_USLM but joined and checked against
# PATH_116_USLM_TRAIN, so only names present in both directories are kept.
# Presumably listdir(PATH_116_USLM_TRAIN) was intended; confirm before changing.
# Both listings run when this cell executes and fail if the listed directory is missing.
SAMPLE_BILL_PATHS_TRAIN = [join(PATH_116_USLM_TRAIN, f) for f in listdir(PATH_116_USLM) if isfile(join(PATH_116_USLM_TRAIN, f))]
SAMPLE_BILL_PATHS = [join(PATH_117_USLM, f) for f in listdir(PATH_117_USLM) if isfile(join(PATH_117_USLM, f))]


# Prefix map passed to the namespaced xpath() queries ('uslm:' prefix).
NAMESPACES = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0'}


def get_filepaths(dirpath: str, reMatch = r'\.xml$') -> List[str]:
    """
    Lists the regular files directly inside a directory whose names match a pattern.

    dirpath: directory to scan (not recursive).
    reMatch: regular expression searched for in each file name. The default
        keeps names ending in the literal suffix ".xml" (the dot is escaped so
        a name such as "fooxml" is no longer accepted).
    Returns the matching names joined onto dirpath, in listdir() order.
    """
    pattern = re.compile(reMatch)
    matches = []
    for name in listdir(dirpath):
        full_path = join(dirpath, name)
        # Test the name first so isfile() only touches the filesystem for candidates.
        if pattern.search(name) and isfile(full_path):
            matches.append(full_path)
    return matches

def getEnum(section) -> str:
    """
    Returns the leading text of the section's first 'enum' child, or '' if absent.

    section: an element exposing xpath().
    NOTE(review): the query is unprefixed while sections are selected with the
    'uslm:' prefix elsewhere in this file; verify it matches namespaced children.
    """
    enumpath = section.xpath('enum')
    if not enumpath:
        return ''
    # .text is None when the element is empty or starts with a child element;
    # normalise to '' so the declared str return type always holds.
    return enumpath[0].text or ''

def getHeader(section) -> str:
    """
    Returns the leading text of the section's first 'header' child, or '' if absent.

    section: an element exposing xpath().
    NOTE(review): the query is unprefixed while sections are selected with the
    'uslm:' prefix elsewhere in this file; verify it matches namespaced children.
    """
    headerpath = section.xpath('header')
    if not headerpath:
        return ''
    # .text is None when the element is empty or starts with a child element;
    # normalise to '' so the declared str return type always holds.
    return headerpath[0].text or ''

def text_to_vect(txt: str , ngram_size: int = 4):
    """
    Fits a word n-gram CountVectorizer on the sentences of a text.

    The text is split into sentences with PunktSentenceTokenizer; each sentence
    is one document for the vectorizer. Words are runs of \\w characters and
    only n-grams of exactly ngram_size words are counted.
    Returns the fitted vectorizer (its vocabulary is in vect.vocabulary_).
    """
    # See https://stackoverflow.com/a/32128803/628748
    sentence_splitter = PunktSentenceTokenizer()
    sentences = sentence_splitter.tokenize(txt)

    word_tokenize = RegexpTokenizer(r"\w+").tokenize
    vect = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(ngram_size, ngram_size),
        tokenizer=word_tokenize,
        lowercase=True,
    )
    vect.fit(sentences)
    return vect

def xml_to_sections(xml_path: str):
    """
    Parses the xml file into sections.

    Returns one dict per uslm:section element with the keys 'section_number',
    'section_header', 'section_text' (text serialization) and 'section_xml'
    (xml serialization). Number and header are filled in only when the section
    has both an 'enum' and a 'header' child; otherwise both are ''.
    Prints 'No sections found' and returns [] when the file has no sections.
    Raises Exception('Could not parse bill') when the file cannot be parsed.
    """
    try:
        billTree = etree.parse(xml_path)
    except Exception as err:
        # Same exception type and message as before, but the cause is chained and
        # KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
        raise Exception('Could not parse bill') from err
    sections = billTree.xpath('//uslm:section', namespaces=NAMESPACES)
    if len(sections) == 0:
        print('No sections found')
        return []

    parsed = []
    for section in sections:
        # Number and header are reported together or not at all.
        has_both = bool(section.xpath('header')) and bool(section.xpath('enum'))
        parsed.append({
            'section_number': getEnum(section) if has_both else '',
            'section_header': getHeader(section) if has_both else '',
            'section_text': etree.tostring(section, method="text", encoding="unicode"),
            'section_xml': etree.tostring(section, method="xml", encoding="unicode")
        })
    return parsed

def xml_to_text(xml_path: str, level: str = 'section', separator: str = '\n*****\n') -> str:
    """
    Parses the xml file and returns the text of every uslm:<level> element.

    xml_path: source handed to etree.parse.
    level: local element name to select; use 'body' to get the whole body element.
    separator: string placed between the texts of consecutive matches.
    Returns the joined text, or '' (after printing 'No sections found') when
    nothing matches.
    Raises Exception('Could not parse bill') when the file cannot be parsed.
    """
    try:
        billTree = etree.parse(xml_path)
    except Exception as err:
        # Same exception type and message as before, but the cause is chained and
        # KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
        raise Exception('Could not parse bill') from err
    sections = billTree.xpath('//uslm:'+level, namespaces=NAMESPACES)
    if len(sections) == 0:
        print('No sections found')
        return ''
    return separator.join(
        etree.tostring(section, method="text", encoding="unicode") for section in sections
    )

def xml_to_vect(xml_paths: List[str], ngram_size: int = 4):
    """
    Fits a single n-gram vectorizer over the combined text of several xml files.

    Each file is converted with xml_to_text (default level and separator), the
    results are joined with newlines, and the whole string goes to text_to_vect.
    """
    texts = []
    for xml_path in xml_paths:
        texts.append(xml_to_text(xml_path))
    combined_text = '\n'.join(texts)
    return text_to_vect(combined_text, ngram_size=ngram_size)

    # to get the vocab dict: vect.vocabulary_

def combine_vocabs(vocabs: List["CountVectorizer"]):
    """
    Combines one or more vocabs into one.

    vocabs: fitted vectorizers (anything exposing a vocabulary_ mapping).
    Returns a dict mapping every distinct term to a contiguous integer index
    (0..n-1). Terms are sorted so the indices are reproducible between runs.
    """
    # Flatten the keys of all vocabularies into one set. The previous code built
    # a set of lists, which raises TypeError because lists are unhashable.
    terms = set()
    for v in vocabs:
        terms.update(v.vocabulary_)
    # Integer indices, as CountVectorizer(vocabulary=...) expects.
    return {term: index for index, term in enumerate(sorted(terms))}

def get_combined_vocabs(xml_paths: List[str] = SAMPLE_BILL_PATHS, ngram_size: int = 4):
    """
    Builds one vectorizer covering the text of all the given xml files.

    Delegates to xml_to_vect, so the result is the fitted vectorizer itself;
    the combined vocabulary is available on it as vocabulary_.
    """
    combined_vectorizer = xml_to_vect(xml_paths, ngram_size=ngram_size)
    return combined_vectorizer

def getSampleText(level = 'body'):
    """
    Returns the text of the first entry of BIG_BILLS_PATHS at the given level.

    level: element name forwarded to xml_to_text. It was previously accepted
        but ignored, so the call always fell back to section-level text.
    """
    return xml_to_text(BIG_BILLS_PATHS[0], level=level)

def transform_text(text: str, vocab: dict, ngram_size: int = 4):
    """
    Transforms text into a count vector using the vocab.

    text: a single document.
    vocab: term -> column index mapping used as the fixed vocabulary.
    ngram_size: n-gram length to extract. It was previously ignored, so only
        single words were extracted and an n-gram vocabulary never matched.
    Returns the one-row matrix produced by CountVectorizer.fit_transform.
    """
    vectorizer = CountVectorizer(
        vocabulary=vocab,
        ngram_range=(ngram_size, ngram_size),
        # Same tokenization as the vectorizers built elsewhere in this file, so
        # extracted n-grams line up with vocabularies produced by them.
        tokenizer=RegexpTokenizer(r"\w+").tokenize,
        lowercase=True,
    )
    return vectorizer.fit_transform([text])

def train_count_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Trains a count vectorizer on the training data.

    train_data: paths of the xml files to learn from.
    ngram_size: length of the word n-grams that are counted.
    Returns (vectorizer, X): the fitted CountVectorizer and the document-term
    matrix for train_data.
    """
    # NOTE(review): scikit-learn documents that a custom preprocessor overrides
    # the built-in lowercase step, so lowercase=True likely has no effect here.
    vectorizer = CountVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    # xml_to_text parses from a path, so the paths are handed over directly.
    # Opening every file up front kept one descriptor per document open for the
    # whole fit and could exhaust the process limit on large corpora.
    X = vectorizer.fit_transform(train_data)
    return vectorizer, X

def train_hashing_vectorizer(train_data: List[str], ngram_size: int = 4):
    """
    Trains a hashing vectorizer on the training data.

    train_data: paths of the xml files to vectorize.
    ngram_size: length of the word n-grams that are hashed.
    Returns (vectorizer, X): the HashingVectorizer and the matrix for train_data.
    """
    # NOTE(review): scikit-learn documents that a custom preprocessor overrides
    # the built-in lowercase step, so lowercase=True likely has no effect here.
    vectorizer = HashingVectorizer(ngram_range=(ngram_size,ngram_size), preprocessor=xml_to_text, tokenizer=RegexpTokenizer(r"\w+").tokenize, lowercase=True)
    # xml_to_text parses from a path, so the paths are handed over directly.
    # Opening every file up front kept one descriptor per document open for the
    # whole run and could exhaust the process limit on large corpora.
    X = vectorizer.fit_transform(train_data)
    return vectorizer, X

def test_hashing_vectorizer(vectorizer: HashingVectorizer, test_data: List[str]):
    """
    Applies an existing hashing vectorizer to the test data.

    Returns whatever vectorizer.transform(test_data) produces.
    """
    transformed = vectorizer.transform(test_data)
    return transformed

def xml_samples_to_text(dirpath: str, level: str = 'section', separator: str = '\n*****\n'):
    """
    Converts xml files in a directory to txt files.

    For every path returned by get_filepaths(dirpath), writes the output of
    xml_to_text next to it as '<name>-<level>s.txt' (e.g. a.xml -> a-sections.txt).
    level and separator are forwarded to xml_to_text.
    """
    xml_suffix = '.xml'
    for xfile in get_filepaths(dirpath):
        # Strip only a trailing '.xml'. str.replace rewrote every occurrence in
        # the path (directory names included), and for a name without '.xml' it
        # left the path unchanged, so the source file itself was overwritten.
        stem = xfile[:-len(xml_suffix)] if xfile.endswith(xml_suffix) else xfile
        # Extract first so a parse failure does not leave an empty output file.
        text = xml_to_text(xfile, level=level, separator=separator)
        with open(f'{stem}-{level}s.txt', 'w') as f:
            f.write(text)

# TODO: Add a function to parse the bill (text) into paragraphs 

# TODO: create a streaming hash vectorizer. See 
# https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py

### Utility function for Text Cleaning

In [2]:
#clean text 
def text_cleaning(text):
    """
    Normalises raw text before vectorization.

    The input is converted with str() and lower-cased, then the following are
    removed in this order: bracketed spans, http(s)/www URLs, angle-bracket
    tags, ASCII punctuation, newlines, and any word containing a digit.
    Removed spans are replaced by nothing, so surrounding spaces are kept and
    words separated only by a newline are joined.
    Returns the cleaned string.
    """
    # Raw strings: '\[', '\S', '\w' and '\d' are invalid escape sequences in
    # ordinary string literals and trigger warnings on recent Python versions.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Data Loading & Data Pre-processing

In [3]:
#xml document and section level parsing

#record elapsed time of the parsing and cleaning step
start = time.time()

#rows of [bill id, cleaned document text]
doc_corpus_data = []
#rows of [bill id, section index within the bill, cleaned section text]
section_corpus_data = []

#get all xml files from data directory for parsing
bill_files = [f for f in os.listdir(PATH_117_USLM) if f.endswith('.xml')]

for bill_doc_file in bill_files:

    #parse xml into sections
    secs = xml_to_sections(os.path.join(PATH_117_USLM, bill_doc_file))

    #bills without any section contribute nothing to either corpus
    if not secs:
        continue

    bill_id = Path(bill_doc_file).stem

    #clean every section once, then reuse it for both corpora
    cleaned_sections = [text_cleaning(section['section_text']) for section in secs]

    #for now sentence id is the section's position in the document
    for s_number, sec_text in enumerate(cleaned_sections):
        section_corpus_data.append([bill_id, s_number, sec_text])

    #document text is every cleaned section followed by a single space
    doc_content = "".join(sec_text + " " for sec_text in cleaned_sections)
    doc_corpus_data.append([bill_id, doc_content])


#keep only the text column of each corpus
only_doc_data = [doc_text for _, doc_text in doc_corpus_data]
only_section_data = [sec_text for _, _, sec_text in section_corpus_data]


#number of documents, then number of sections
print(len(only_doc_data))
print(len(only_section_data))

done = time.time()
elapsed = done - start
print('Time took in ETL with {} xml data files is {}'.format(len(only_doc_data), elapsed))

No sections found
No sections found
12805
81642
Time took in ETL with 12805 xml data files is 105.39807391166687


# NLP Modeling

## Model Training

In [4]:
#record elapsed time for fitting both vectorizers
start = time.time()


#document-level model: TF-IDF over word 4-grams, one row per whole document
doc_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(4, 4),
    tokenizer=RegexpTokenizer(r"\w+").tokenize,
    lowercase=True,
)
tv_doc_matrix = doc_tfidf_vectorizer.fit_transform(only_doc_data)

#section-level model: same settings, one row per section
sec_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(4, 4),
    tokenizer=RegexpTokenizer(r"\w+").tokenize,
    lowercase=True,
)
tv_section_matrix = sec_tfidf_vectorizer.fit_transform(only_section_data)


done = time.time()
elapsed = done - start
print("Time took in training of both vectorizer(s) ", elapsed)

Time took in training of both vectorizer(s)  185.00698709487915


## Model Saving & Loading

In [5]:
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were written by this notebook.

# save tfidf vectorize instance for only_doc_data
# (with-blocks close each handle so the dump is flushed before it is read back)
with open("doc_tfidf_vectorizer.pickle", "wb") as pickle_file:
    pickle.dump(doc_tfidf_vectorizer, pickle_file)
# load tfidf vectorize instance for only_doc_data
with open("doc_tfidf_vectorizer.pickle", "rb") as pickle_file:
    doc_tfidf_vectorizer = pickle.load(pickle_file)

#save tfidf vectorize instance for only_section_data
with open("sec_tfidf_vectorizer.pickle", "wb") as pickle_file:
    pickle.dump(sec_tfidf_vectorizer, pickle_file)
# load tfidf vectorize instance for only_section_data
with open("sec_tfidf_vectorizer.pickle", "rb") as pickle_file:
    sec_tfidf_vectorizer = pickle.load(pickle_file)



## Get Document and Sections Text from XML File


In [6]:
#get document and section text from xml file for testing purpose
def get_document_and_section_from_xml_file(file_path):
    """
    Loads one bill and returns its cleaned text at document and section level.

    file_path: xml file handed to xml_to_sections.
    Returns (t_doc_content, t_section_data): t_section_data holds the cleaned
    text of each section in document order, and t_doc_content is those texts
    concatenated with a single space after each one. A file without sections
    yields ('', []); previously that case raised UnboundLocalError because the
    document string was only initialised when sections were found.
    """
    #text cleaning applied on each section text
    t_section_data = [
        text_cleaning(section['section_text'])
        for section in xml_to_sections(file_path)
    ]

    #concatenate section texts to form the document content
    t_doc_content = "".join(sec_text + " " for sec_text in t_section_data)

    return t_doc_content, t_section_data
    

## Calculate Cosine Similarity

In [13]:
#document pair to compare (file stems under PATH_117_USLM)
A_doc_name = 'BILLS-117hr2547ih'
B_doc_name = 'BILLS-117hr2547rfs'

#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

#show one section (index 6) as a sanity check
print("A section doc[6]:", A_section_doc[6], sep="\n")

print("A number of sections:", len(A_section_doc), sep="\n")

A section doc[6]:
sec  enhanced protection against debt collector harassment of servicemembersa communication in connection with debt collection—section  of the fair debt collection practices act  usc  is amended by adding at the end the following“e communications concerning servicemember debts—“ definition—in this subsection the term ‘covered member’ means—“a a covered member or a dependent as defined in section  of title  united states code and“bi an individual who was separated discharged or released from duty described in such section  but only during the  period beginning on the date of separation discharge or release or“ii a person with respect to an individual described in clause i described in subparagraph a d e or i of section  of title  united states code“ prohibitions—a debt collector may not in connection with the collection of any debt of a covered member—“a threaten to have the covered member reduced in rank“b threaten to have the covered member’s security clearance revok

## Transform Document A & B into vectorized space to perform cosine similarity

In [11]:
def document_tfidf_vectorized_transformation(document, doc_tfidf_vectorizer):
    """
    Vectorizes a single document with the given vectorizer.

    The document is wrapped in a one-element list, so the result of
    transform() describes exactly one document.
    """
    return doc_tfidf_vectorizer.transform([document])

def section_doc_tfidf_vectorized_transformation(section_doc, sec_tfidf_vectorizer):
    """
    Vectorizes a collection of section texts with the given vectorizer.

    section_doc is passed to transform() unchanged, one entry per section.
    """
    return sec_tfidf_vectorizer.transform(section_doc)


In [16]:
#record elapsed time for vectorizing both documents
start = time.time()

#transform document A content and document B content
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

#transform document A section content and  document B section content
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

#show form of vectorized doc
print("A_section_doc_vectorized (a sparse matrix; see https://stackoverflow.com/q/15115765/628748 and https://stackoverflow.com/a/49738773/628748)")
print(A_section_doc_vectorized)

print("Length of A_section_doc_vectorized")
#shape[0] is the row count (one row per section); calling todense() just to
#take len() allocated a full rows x vocabulary dense matrix
print(A_section_doc_vectorized.shape[0])

done = time.time()
elapsed = done - start
print("Time took in vectorizing both documents ", elapsed)

A_section_doc_vectorized (a sparse matrix; see https://stackoverflow.com/q/15115765/628748 and https://stackoverflow.com/a/49738773/628748)
  (0, 5565248)	0.08121005883378808
  (0, 5564171)	0.08121005883378808
  (0, 5559976)	0.08121005883378808
  (0, 5538452)	0.08121005883378808
  (0, 5391527)	0.08121005883378808
  (0, 5373082)	0.08121005883378808
  (0, 5348028)	0.07287241830948726
  (0, 5257695)	0.08121005883378808
  (0, 5216068)	0.08121005883378808
  (0, 5216067)	0.08121005883378808
  (0, 5216048)	0.08121005883378808
  (0, 5216047)	0.08121005883378808
  (0, 5216015)	0.08121005883378808
  (0, 5215991)	0.08121005883378808
  (0, 5215927)	0.08121005883378808
  (0, 5215910)	0.08121005883378808
  (0, 5214041)	0.04433951528035008
  (0, 5213692)	0.05549940085775523
  (0, 5213688)	0.0466655543936124
  (0, 5159123)	0.03769340254983612
  (0, 5158961)	0.04473872310657974
  (0, 5102390)	0.03729186256515947
  (0, 5061960)	0.07287241830948726
  (0, 4948039)	0.05803248811455758
  (0, 4903499)	0.0812

In [19]:
def cosine_pairwise_sim(a_vectorized, b_vectorized):
    """
    Computes the pairwise cosine similarity between two sets of vectors and times it.

    a_vectorized, b_vectorized: inputs forwarded to cosine_similarity.
    Returns (elapsed, sim_score): the duration of the computation in seconds
    and the similarity matrix returned by cosine_similarity.
    """
    #perf_counter is monotonic and high-resolution, which suits short intervals;
    #the wall clock used before can jump if the system time is adjusted
    start = time.perf_counter()

    sim_score = cosine_similarity(a_vectorized, b_vectorized)

    elapsed = time.perf_counter() - start
    return elapsed, sim_score

## Measure Document Similarity Score of any Document A to any Document B

In [23]:
#similarity of the two whole documents (a 1x1 matrix) and how long it took
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)

#`elapsed1` was never defined and raised NameError; print the value returned above
print("elapsed: " + str(elapsed))
print(doc_sim_score)

elapsed 1: 0.014885902404785156
[[0.8490097]]
elapsed 2: 0.014529943466186523
[[0.8490097]]


## Measure Pairwise similarity between Sections of Document A & Sections of Document B

In [24]:
# Pairwise scores between the section vectors of A and the section vectors of B;
# `elapsed` is rebound to the duration of this call.
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

In [25]:
# Show the timing and the section-by-section score matrix computed in the previous cell.
print(elapsed)
print(sec_doc_sim_score)

0.01469278335571289
[[0.71088467 0.03478385 0.         ... 0.         0.00418193 0.        ]
 [0.02527629 1.         0.         ... 0.         0.23520597 0.        ]
 [0.01527173 0.         0.84355816 ... 0.         0.         0.05656838]
 ...
 [0.04155805 0.26936992 0.         ... 0.         0.20619126 0.        ]
 [0.01467514 0.         0.00294065 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.55972105 0.         0.05080109]]


## Rendering the similarity results between Document A & Document B and their Sections

In [38]:
def create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score):
    """
    Renders document-level and pairwise section-level scores as pretty-printed JSON.

    A_doc_name, B_doc_name: identifiers of the original and the matched document.
    doc_sim_score: 2-D score structure; only element [0][0] is reported.
    sec_doc_sim_score: iterable of rows, one per section of document A, each
        holding that section's scores against every section of document B.
    Returns a JSON string (indent=5) holding a single list: three header
    strings, then for each row an "ORIGINAL SENTENCE ID" marker followed by one
    object per column, ordered by descending score. All ids are 1-based.
    """
    entries = [
        "ORIGINAL DOCUMENT ID: " + A_doc_name,
        "MATCHED DOCUMENT ID: " + B_doc_name,
        "DOCUMENT SIMILARITY SCORE: " + str(doc_sim_score[0][0]),
    ]

    for i, section_scores in enumerate(sec_doc_sim_score):

        #add original document sentence id number
        entries.append("ORIGINAL SENTENCE ID: " + str(i+1))

        #best match first; sorted() is stable, so ties keep column order
        ranked = sorted(enumerate(section_scores), key=lambda pair: pair[1], reverse=True)

        for j, sim_score in ranked:
            entries.append({
                "MATCHED DOCUMENT ID": B_doc_name,
                "MATCHED SENTENCE ID": j + 1,
                #float() lets NumPy scalar types such as float32 serialize
                "SENTENCE SIMILARITY SCORE": float(sim_score),
            })

    #serialize once; the earlier dumps -> loads -> dumps round trip was redundant
    return json.dumps([entries], indent=5)

    

In [39]:
#put document A file name & document B file name
#(both assignments are commented out, so the A_doc_name / B_doc_name values
# already defined in the session are the ones used below)
#A_doc_name = 'BILLS-117hr200ih'
#B_doc_name = 'BILLS-117hr201ih'

#get json response of newly computed similarity score of document A and document B
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2547ih",
          "MATCHED DOCUMENT ID: BILLS-117hr2547rfs",
          "DOCUMENT SIMILARITY SCORE: 0.8508737724037765",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2547rfs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.7117384009449566
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2547rfs",
               "MATCHED SENTENCE ID": 19,
               "SENTENCE SIMILARITY SCORE": 0.10299852574797382
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2547rfs",
               "MATCHED SENTENCE ID": 6,
               "SENTENCE SIMILARITY SCORE": 0.1017117632842342
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2547rfs",
               "MATCHED SENTENCE ID": 25,
               "SENTENCE SIMILARITY SCORE": 0.08183411472716967
          },
          {
               "M

# Testing 

## Case # 1

In [16]:
#document pair for this case (file stems under PATH_117_USLM)
A_doc_name = 'BILLS-117hjres27enr'
B_doc_name = 'BILLS-117hjres27ih'


#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))


In [17]:
#project both whole documents with the document-level TF-IDF model
A_doc_vectorized, B_doc_vectorized = (
    document_tfidf_vectorized_transformation(doc_text, doc_tfidf_vectorizer)
    for doc_text in (A_doc, B_doc)
)

#project each document's section list with the section-level TF-IDF model
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_tfidf_vectorized_transformation(section_texts, sec_tfidf_vectorizer)
    for section_texts in (A_section_doc, B_section_doc)
)

In [18]:
# Score the whole documents, then every section of A against every section of B.
# `elapsed` is overwritten by the second call, so only the section-level timing is kept.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as JSON and show them.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hjres27enr",
          "MATCHED DOCUMENT ID: BILLS-117hjres27ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999998",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hjres27ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          }
     ]
]


## Case # 2

In [19]:
#document pair for this case (file stems under PATH_117_USLM)
A_doc_name = 'BILLS-117hr293eh'
B_doc_name = 'BILLS-117hr293ih'

#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [20]:
#project both whole documents with the document-level TF-IDF model
A_doc_vectorized, B_doc_vectorized = (
    document_tfidf_vectorized_transformation(doc_text, doc_tfidf_vectorizer)
    for doc_text in (A_doc, B_doc)
)

#project each document's section list with the section-level TF-IDF model
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_tfidf_vectorized_transformation(section_texts, sec_tfidf_vectorizer)
    for section_texts in (A_section_doc, B_section_doc)
)

In [21]:
# Score the whole documents, then every section of A against every section of B.
# `elapsed` is overwritten by the second call, so only the section-level timing is kept.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as JSON and show them.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr293eh",
          "MATCHED DOCUMENT ID: BILLS-117hr293ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999997",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000004
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr293ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          },
          {
               "MATCHED D

## Case # 3

In [22]:
#document pair for this case (file stems under PATH_117_USLM)
A_doc_name = 'BILLS-117hr335enr'
B_doc_name = 'BILLS-117hr335ih'

#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [23]:
#project both whole documents with the document-level TF-IDF model
A_doc_vectorized, B_doc_vectorized = (
    document_tfidf_vectorized_transformation(doc_text, doc_tfidf_vectorizer)
    for doc_text in (A_doc, B_doc)
)

#project each document's section list with the section-level TF-IDF model
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_tfidf_vectorized_transformation(section_texts, sec_tfidf_vectorizer)
    for section_texts in (A_section_doc, B_section_doc)
)

In [24]:
# Score the whole documents, then every section of A against every section of B.
# `elapsed` is overwritten by the second call, so only the section-level timing is kept.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as JSON and show them.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr335enr",
          "MATCHED DOCUMENT ID: BILLS-117hr335ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000036",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr335ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          }
     ]
]


## Case # 4

In [25]:
#document pair for this case (file stems under PATH_117_USLM)
#NOTE(review): this is the same pair as Case # 3; presumably a different pair was intended
A_doc_name = 'BILLS-117hr335enr'
B_doc_name = 'BILLS-117hr335ih'

#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [26]:
#project both whole documents with the document-level TF-IDF model
A_doc_vectorized, B_doc_vectorized = (
    document_tfidf_vectorized_transformation(doc_text, doc_tfidf_vectorizer)
    for doc_text in (A_doc, B_doc)
)

#project each document's section list with the section-level TF-IDF model
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_tfidf_vectorized_transformation(section_texts, sec_tfidf_vectorizer)
    for section_texts in (A_section_doc, B_section_doc)
)

In [27]:
# Score the whole documents, then every section of A against every section of B.
# `elapsed` is overwritten by the second call, so only the section-level timing is kept.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as JSON and show them.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr335enr",
          "MATCHED DOCUMENT ID: BILLS-117hr335ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000036",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr335ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          }
     ]
]


## Case # 5

In [28]:
#document pair for this case (file stems under PATH_117_USLM)
A_doc_name = 'BILLS-117hr1195eh'
B_doc_name = 'BILLS-117hr1195rh'

#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [29]:
#project both whole documents with the document-level TF-IDF model
A_doc_vectorized, B_doc_vectorized = (
    document_tfidf_vectorized_transformation(doc_text, doc_tfidf_vectorizer)
    for doc_text in (A_doc, B_doc)
)

#project each document's section list with the section-level TF-IDF model
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_tfidf_vectorized_transformation(section_texts, sec_tfidf_vectorizer)
    for section_texts in (A_section_doc, B_section_doc)
)

In [30]:
# Score the whole documents, then every section of A against every section of B.
# `elapsed` is overwritten by the second call, so only the section-level timing is kept.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as JSON and show them.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1195eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1195rh",
          "DOCUMENT SIMILARITY SCORE: 0.977852488703694",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 8,
               "SENTENCE SIMILARITY SCORE": 0.2168741428291012
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.04678072486075494
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0395936129650904
          },
          {
               "MATCHED DOCUMENT ID": "BI

## Case # 6

In [31]:
#document pair for this case (file stems under PATH_117_USLM)
A_doc_name = 'BILLS-117hr1205eh'
B_doc_name = 'BILLS-117hr1205ih'

#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{A_doc_name}.xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, f"{B_doc_name}.xml"))

In [32]:
#project both whole documents with the document-level TF-IDF model
A_doc_vectorized, B_doc_vectorized = (
    document_tfidf_vectorized_transformation(doc_text, doc_tfidf_vectorizer)
    for doc_text in (A_doc, B_doc)
)

#project each document's section list with the section-level TF-IDF model
A_section_doc_vectorized, B_section_doc_vectorized = (
    section_doc_tfidf_vectorized_transformation(section_texts, sec_tfidf_vectorizer)
    for section_texts in (A_section_doc, B_section_doc)
)

In [33]:
# Score the whole documents, then every section of A against every section of B.
# `elapsed` is overwritten by the second call, so only the section-level timing is kept.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Render both score sets as JSON and show them.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1205eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1205ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999863",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1205ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999997
          },
          {
               "MAT

## Case # 7

In [34]:
#pick any Document A & any Document B from data lists (at least that have more than 1 section)
#The two assignments were commented out, so a top-to-bottom run silently reused
#the pair left over from the previous case instead of the hr1251 pair.
A_doc_name = 'BILLS-117hr1251eh'
B_doc_name = 'BILLS-117hr1251ih'

#cleaned whole-document text and list of cleaned section texts for each file
A_doc, A_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, A_doc_name+".xml"))
B_doc, B_section_doc = get_document_and_section_from_xml_file(os.path.join(PATH_117_USLM, B_doc_name+".xml"))

In [35]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [36]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1251eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1251ih",
          "DOCUMENT SIMILARITY SCORE: 0.9289690386554342",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000004
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 10,
               "SENTENCE SIMILARITY SCORE": 0.1161022452774018
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 5,
               "SENTENCE SIMILARITY SCORE": 0.03667330745972546
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1251ih",
               "MATCHED SENTENCE ID": 6,
               "SENTENCE SIMILARITY SCORE": 0.008909819256373155
          },
          {
               "MATCHE

## Case # 8

In [37]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1257eh'
B_doc_name = 'BILLS-117hr1257rfs'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [38]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [39]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1257eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1257rfs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000009",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1257rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUM

## Case # 9

In [40]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1195eh'
B_doc_name = 'BILLS-117hr1195rh'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [41]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [42]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1195eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1195rh",
          "DOCUMENT SIMILARITY SCORE: 0.977852488703694",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 8,
               "SENTENCE SIMILARITY SCORE": 0.2168741428291012
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.04678072486075494
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1195rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0395936129650904
          },
          {
               "MATCHED DOCUMENT ID": "BI

## Case # 10

In [43]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1262eh'
B_doc_name = 'BILLS-117hr1262ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [44]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [45]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1262eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1262ih",
          "DOCUMENT SIMILARITY SCORE: 0.9407637689064062",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9408737010231012
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1262ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 11

In [46]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1280eh'
B_doc_name = 'BILLS-117hr1280ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [47]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [48]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1280eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1280ih",
          "DOCUMENT SIMILARITY SCORE: 0.9998036377384408",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 45,
               "SENTENCE SIMILARITY SCORE": 0.05517825493007554
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 58,
               "SENTENCE SIMILARITY SCORE": 0.0533766438665852
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1280ih",
               "MATCHED SENTENCE ID": 42,
               "SENTENCE SIMILARITY SCORE": 0.046864767212212805
          },
          {
               "MATC

## Case # 12

In [49]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1314eh'
B_doc_name = 'BILLS-117hr1314ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [50]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [51]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1314eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1314ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000024",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000007
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1314ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 13

In [52]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1318enr'
B_doc_name = 'BILLS-117hr1318eh'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [53]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [54]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1318enr",
          "MATCHED DOCUMENT ID: BILLS-117hr1318eh",
          "DOCUMENT SIMILARITY SCORE: 1.000000000000002",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1318eh",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117h

## Case # 14

In [55]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1324eh'
B_doc_name = 'BILLS-117hr1324ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [56]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [57]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1324eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1324ih",
          "DOCUMENT SIMILARITY SCORE: 0.9994893123971358",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.012047491668665698
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.006203334326419799
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1324ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9993872413549618
          

## Case # 15

In [58]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1333eh'
B_doc_name = 'BILLS-117hr1333ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [59]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [60]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1333eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1333ih",
          "DOCUMENT SIMILARITY SCORE: 0.9689457170872295",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1333ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117h

## Case # 16

In [61]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1392eh'
B_doc_name = 'BILLS-117hr1392ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [62]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [63]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1392eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1392ih",
          "DOCUMENT SIMILARITY SCORE: 0.9018812516324968",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1392ih",
            

## Case # 17

In [64]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1602eh'
B_doc_name = 'BILLS-117hr1602ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [65]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [66]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1602eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1602ih",
          "DOCUMENT SIMILARITY SCORE: 0.9958174662617439",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.995896995612144
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1602ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 18

In [67]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1651enr'
B_doc_name = 'BILLS-117hr1651ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [68]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [69]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1651enr",
          "MATCHED DOCUMENT ID: BILLS-117hr1651ih",
          "DOCUMENT SIMILARITY SCORE: 0.8230106276686338",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.2922331359232185
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.892049726610896
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1651ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.3312603995008717
          },
          "ORIGI

## Case # 19

In [70]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1711eh'
B_doc_name = 'BILLS-117hr1711ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [71]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [72]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1711eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1711ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999841",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1711ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117h

## Case # 20

In [73]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr1833eh'
B_doc_name = 'BILLS-117hr1833ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [74]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [75]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1833eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1833ih",
          "DOCUMENT SIMILARITY SCORE: 0.7345620477499715",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.019232336411061347
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.7305223234833066
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1833ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.012818069894104991
          

## Case # 21

In [76]:
# Choose the two bills to compare (each should contain more than one section).
A_doc_name = 'BILLS-117hr2008eh'
B_doc_name = 'BILLS-117hr2008ih'

# Load both bills from PATH_117_USLM; the helper's result is unpacked into
# document-level and section-level data (per the variable names).
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [77]:
# Vectorize both bills' document-level data with the same doc_tfidf_vectorizer.
# NOTE(review): both vectorizers come from earlier cells; presumably already fitted — confirm.
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize both bills' section-level data with the separate sec_tfidf_vectorizer.
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [78]:
# cosine_pairwise_sim's result is unpacked as (elapsed, similarity result).
# NOTE(review): `elapsed` is reassigned by the second call, so the value from
# the document-level call is discarded.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Build the report from both bill names and both similarity results, then print it.
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2008eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2008ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000002",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2008ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 22

In [79]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2016rh', 'BILLS-117hr2016ih'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [80]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [81]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2016rh",
          "MATCHED DOCUMENT ID: BILLS-117hr2016ih",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999883",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2016ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999902
          },
          {
               "MATCHED DOCUMENT I

## Case # 23

In [82]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2027eh', 'BILLS-117hr2027ih'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [83]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [84]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2027eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2027ih",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000078",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2027ih",
            

## Case # 24

In [85]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2062eh', 'BILLS-117hr2062rfs'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [86]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [87]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2062eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2062rfs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000036",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2062rfs",
               "MATCH

## Case # 25

In [88]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr1573eh', 'BILLS-117hr1573ih'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [89]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [90]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr1573eh",
          "MATCHED DOCUMENT ID: BILLS-117hr1573ih",
          "DOCUMENT SIMILARITY SCORE: 0.928393975216383",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9292457930874963
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr1573ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 26

In [91]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2332eh', 'BILLS-117hr2332ih'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [92]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [93]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2332eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2332ih",
          "DOCUMENT SIMILARITY SCORE: 0.7037812302306787",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2332ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.7066945664036552
          },
          {
               "MAT

## Case # 27

In [94]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2467ih', 'BILLS-117hr2467rh'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [95]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [96]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2467ih",
          "MATCHED DOCUMENT ID: BILLS-117hr2467rh",
          "DOCUMENT SIMILARITY SCORE: 0.9999999999999997",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000009
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 17,
               "SENTENCE SIMILARITY SCORE": 0.1692170180717253
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 12,
               "SENTENCE SIMILARITY SCORE": 0.11127584710523475
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2467rh",
               "MATCHED SENTENCE ID": 9,
               "SENTENCE SIMILARITY SCORE": 0.05201327668990666
          },
          {
               "MATCHE

## Case # 28

In [97]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2485eh', 'BILLS-117hr2485ih'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [98]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [99]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2485eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2485ih",
          "DOCUMENT SIMILARITY SCORE: 0.998935637661856",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2485ih",
               "MATCHED SENT

## Case # 29

In [100]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr2523eh', 'BILLS-117hr2523ih'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [101]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [102]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr2523eh",
          "MATCHED DOCUMENT ID: BILLS-117hr2523ih",
          "DOCUMENT SIMILARITY SCORE: 0.9753038133066074",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999998
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr2523ih",
            

## Case # 30

In [103]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117hr3007ih', 'BILLS-117hr3007rh'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [104]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [105]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117hr3007ih",
          "MATCHED DOCUMENT ID: BILLS-117hr3007rh",
          "DOCUMENT SIMILARITY SCORE: 0.9670147514339097",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.9999999999999999
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9668822467123528
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117hr3007rh",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]


## Case # 31

In [106]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117s658rfh', 'BILLS-117s658rs'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [107]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [108]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117s658rfh",
          "MATCHED DOCUMENT ID: BILLS-117s658rs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000009",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000002
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 3,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s658rs",
               "MATCHED SENTENCE ID": 4,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-

## Case # 32

In [109]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117sjres13is', 'BILLS-117sjres13es'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [110]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [111]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sjres13is",
          "MATCHED DOCUMENT ID: BILLS-117sjres13es",
          "DOCUMENT SIMILARITY SCORE: 1.0",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sjres13es",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000004
          }
     ]
]


## Case # 33

In [112]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117sres107is', 'BILLS-117sres107rs'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [113]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [114]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres107is",
          "MATCHED DOCUMENT ID: BILLS-117sres107rs",
          "DOCUMENT SIMILARITY SCORE: 1.000000000000003",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres107rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000027
          }
     ]
]


## Case # 34

In [115]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117sres120is', 'BILLS-117sres120ats'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [116]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [117]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres120is",
          "MATCHED DOCUMENT ID: BILLS-117sres120ats",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000007",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres120ats",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000038
          }
     ]
]


## Case # 35

In [118]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117sres81ats', 'BILLS-117sres81is'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [119]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [120]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres81ats",
          "MATCHED DOCUMENT ID: BILLS-117sres81is",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000004",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres81is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0000000000000013
          }
     ]
]


## Case # 36

In [121]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117sres37is', 'BILLS-117sres37rs'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [122]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [123]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres37is",
          "MATCHED DOCUMENT ID: BILLS-117sres37rs",
          "DOCUMENT SIMILARITY SCORE: 0.9869431500006636",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres37rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.000000000000001
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres37rs",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.9548968217785122
          }
     ]
]


## Case # 37

In [124]:
# Choose Document A and Document B from the data lists (preferably bills with more than one section).
A_doc_name, B_doc_name = 'BILLS-117sres22is', 'BILLS-117sres22rs'

# Each file is read from PATH_117_USLM as "<name>.xml"; the loader's two return
# values are unpacked into the whole-document and per-section variables.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, A_doc_name + ".xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, B_doc_name + ".xml")
)

In [125]:
# Run both whole-document inputs through doc_tfidf_vectorizer (defined outside this cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(
    A_doc, doc_tfidf_vectorizer
)
B_doc_vectorized = document_tfidf_vectorized_transformation(
    B_doc, doc_tfidf_vectorizer
)

# Run both section-level inputs through sec_tfidf_vectorizer (defined outside this cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    A_section_doc, sec_tfidf_vectorizer
)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(
    B_section_doc, sec_tfidf_vectorizer
)

In [126]:
# cosine_pairwise_sim returns a pair; its first element lands in `elapsed`,
# which the second call overwrites, so only the two scores are used below.
elapsed, doc_sim_score = cosine_pairwise_sim(
    A_doc_vectorized,
    B_doc_vectorized,
)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(
    A_section_doc_vectorized,
    B_section_doc_vectorized,
)

# Assemble the response from both document names and both scores, then print it.
response = create_json_response(
    A_doc_name,
    B_doc_name,
    doc_sim_score,
    sec_doc_sim_score,
)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117sres22is",
          "MATCHED DOCUMENT ID: BILLS-117sres22rs",
          "DOCUMENT SIMILARITY SCORE: 1.0000000000000016",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117sres22rs",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.000000000000007
          }
     ]
]


## Case # 38

In [127]:
# Case inputs: document A and document B, identified by file stem. Any pair
# from the data lists may be substituted (the original note asks for
# documents that have more than one section).
A_doc_name = 'BILLS-117s1910es'
B_doc_name = 'BILLS-117s1910is'

# Each loader call is unpacked into a (document, section document) pair read
# from the matching "<stem>.xml" file under PATH_117_USLM.
A_doc, A_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{A_doc_name}.xml")
)
B_doc, B_section_doc = get_document_and_section_from_xml_file(
    os.path.join(PATH_117_USLM, f"{B_doc_name}.xml")
)

In [128]:
# Vectorize both whole documents with the shared document-level vectorizer
# (doc_tfidf_vectorizer is created in an earlier cell; presumably already
# fitted -- confirm against that cell).
A_doc_vectorized = document_tfidf_vectorized_transformation(A_doc, doc_tfidf_vectorizer)
B_doc_vectorized = document_tfidf_vectorized_transformation(B_doc, doc_tfidf_vectorizer)

# Vectorize the per-section content of each document with the separate
# section-level vectorizer (sec_tfidf_vectorizer, also from an earlier cell).
A_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(A_section_doc, sec_tfidf_vectorizer)
B_section_doc_vectorized = section_doc_tfidf_vectorized_transformation(B_section_doc, sec_tfidf_vectorizer)

In [129]:
# cosine_pairwise_sim is unpacked as a 2-tuple: (elapsed, score).
# NOTE(review): `elapsed` from the document-level call is overwritten by the
# section-level call on the next line, so only the latter survives this cell.
elapsed, doc_sim_score = cosine_pairwise_sim(A_doc_vectorized, B_doc_vectorized)
elapsed, sec_doc_sim_score = cosine_pairwise_sim(A_section_doc_vectorized, B_section_doc_vectorized)

# Combine both document names and both similarity results into the response
# object and print it (the printed form is shown in the cell output below).
response = create_json_response(A_doc_name, B_doc_name, doc_sim_score, sec_doc_sim_score)
print(response)

[
     [
          "ORIGINAL DOCUMENT ID: BILLS-117s1910es",
          "MATCHED DOCUMENT ID: BILLS-117s1910is",
          "DOCUMENT SIMILARITY SCORE: 1.0",
          "ORIGINAL SENTENCE ID: 1",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 0.0
          },
          "ORIGINAL SENTENCE ID: 2",
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 2,
               "SENTENCE SIMILARITY SCORE": 1.0
          },
          {
               "MATCHED DOCUMENT ID": "BILLS-117s1910is",
               "MATCHED SENTENCE ID": 1,
               "SENTENCE SIMILARITY SCORE": 0.0
          }
     ]
]
