In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import pprint as pp

# text embeddings
import tensorflow as tf
import tensorflow_hub as hub

# keywords extraction
import yake

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.5.0


In [3]:
# Load the Universal Sentence Encoder's TF Hub module
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)

# Module functions

In [4]:
def prepare_data_for_search(path, n_keywords=10):
    """
    converts database of filename -> paragraphs to
    database keyword -> paragraph -> filename

    input:
      path        - as string, path to tab delimited file with columns "file_name" and "paragraph"
                    where each paragraph is extracted and saved in a separate cell with corresponding file name
                    in the same row
      n_keywords  – as integer, number of keywords to extract

    output:
      pandas data frame, ready for search.
    """
    df_paragraphs = pd.read_csv(path, sep="\t")

    print("Extracting keywords...")
    df_paragraphs['keywords'] = df_paragraphs['paragraph'].apply(lambda text: extract_keywords(text,
                                                                                               method=method,
                                                                                               n_keywords=n_keywords))
    print("Transforming to a data frame with a keywordd per row...")
    list_dfs = []

    for i in range(len(df_paragraphs)):
        row = df_paragraphs.iloc[i]
        list_of_keywords = row['keywords']
        paragraph = row['paragraph']
        file_name = row['file_name']

        df_k = pd.DataFrame(list_of_keywords, columns=['keyword'])
        df_k['paragraph'] = paragraph
        df_k['file_name'] = file_name

        list_dfs.append(df_k)

    df_data = pd.concat(list_dfs)
    df_data = df_data[["file_name", "paragraph", "keyword"]]

    return df_data

In [5]:
def extract_keyphrases(text, n_keywords=10, ngram_range=3):
    """
    Extracts keywords/key/phrases from a text using "YAKE!" keyword extraction method

    Intuition:https://towardsdatascience.com/keyword-extraction-methods-the-overview-35557350f8bb
    Research paper: https://www.sciencedirect.com/science/article/abs/pii/S0020025519308588
    Demo: http://yake.inesctec.pt/
    Instalation: https://pypi.org/project/yake/

    Input:
        text        – as string, text of a paragraph to process and extract keywords
        n_keywords  – as integer, number of keywords to extract
        ngram_range – as integer, maximum number of words in extracted phrase. Default is 3.
        
    Output:
        List of strings – extracted keywords/keyphrases

    """
    list_keywords = []

    pyake = yake.KeywordExtractor(lan="en", n=ngram_range)

    result = pyake.extract_keywords(text)
    result.sort(key=lambda tup: tup[1], reverse=True)

    list_keywords = result[:n_keywords]
    list_keywords = [t[0] for t in list_keywords]

    return list_keywords

In [6]:
def get_embeddings(list_of_strings):
    return model(list_of_strings)

***

# Extracting keyphrases
**NOTE:**
- To make sure Universal Sentence Encoder can work with BILLs wording of the text, I copy-pasted a few paragraphs from 2 bills
- Example documents have extracted paragraphs in them tab delimited with bill name (file_name)
- For the illustration, last paragraph of the document A is a copy of the first paragraph in document B

In [7]:
input_folder = "./input_folder/"

doc_A = "doc1.txt"
doc_B = "doc2.txt"

In [8]:
df_A = pd.read_csv(input_folder + doc_A, 
                   encoding="utf-8",
                   encoding_errors='ignore',
                   sep="\t")
df_A

Unnamed: 0,file_name,paragraph
0,BILLS-116hjres31enr,For necessary expenses of U.S. Customs and Bor...
1,BILLS-116hjres31enr,For necessary expenses of U.S. Immigration and...
2,BILLS-116hjres31enr,For necessary expenses of the Coast Guard for ...


In [9]:
df_B = pd.read_csv(input_folder + doc_B, 
                   encoding="utf-8",
                   encoding_errors='ignore',
                   sep="\t")
df_B

Unnamed: 0,file_name,paragraph
0,BILLS-116s47enr,For necessary expenses of the Coast Guard for ...
1,BILLS-116s47enr,"Notwithstanding any other provision of law, if..."
2,BILLS-116s47enr,The University shall pay all costs associated ...
3,BILLS-116s47enr,The Secretary of Agriculture shall permit by s...


### **The Idea:**
Use Universal Sentence Encoder on the string of concatenated key-phrases from each paragraphs to get similarity of the paragraph.
This way legislations that work on the same issues will be captured for comparison. Note, details that make the difference myght be lost during such search.


**Thoughts behind:**<br>
Pretrained Universal Sentence Encoder (USE) is a good choice in comparing short texts (paragraphs).
The drawback is that it is based on BERT which is pretrained on general English texts not Legal or Government specific wording. Hence, working with full paragraph can reduce the quality of comparison. Taking just key-phrases reduces confusion of USE around semantic similarity of the paragraphs.



**Extracting keyphrases for each paragraph:**

In [10]:
df_A['keyphrases'] = df_A['paragraph'].apply(lambda x: extract_keyphrases(x))
df_B['keyphrases'] = df_B['paragraph'].apply(lambda x: extract_keyphrases(x))

In [11]:
#Example:
index=0

text = df_A['paragraph'].iloc[index]
keyphrases = df_A['keyphrases'].iloc[index]

print("TEXT:\n")
pp.pprint(text)

print("\n\nEXTRACTED KEYPHRASES:\n")
pp.pprint(keyphrases)

TEXT:

('For necessary expenses of U.S. Customs and Border Protection for operations '
 'and support, including the transportation of unaccompanied minor aliens; the '
 'provision of air and marine support to Federal, State, and local agencies in '
 'the enforcement or administration of laws enforced by the Department of '
 'Homeland Security; at the discretion of the Secretary of Homeland Security, '
 'the provision of such support to Federal, State, and local agencies in other '
 'law enforcement and emergency humanitarian efforts; the purchase and lease '
 'of up to 7,500 (6,500 for replacement only) police-type vehicles; the '
 'purchase, maintenance, or operation of marine vessels, aircraft, and '
 'unmanned aerial systems; and contracting with individuals for personal '
 'services abroad; $12,179,729,000; of which $3,274,000 shall be derived from '
 'the Harbor Maintenance Trust Fund for administrative expenses related to the '
 'collection of the Harbor Maintenance Fee pursuant 

**concatenating keyphrases in one string separated by whitespace**

In [12]:
df_A['key_phr_string'] = df_A['keyphrases'].apply(lambda x: " ".join(x))
df_B['key_phr_string'] = df_B['keyphrases'].apply(lambda x: " ".join(x))
df_A['key_phr_string'].iloc[index]

'Customs and Border Maintenance Trust Maintenance Fee support to Federal local agencies Security Act Harbor Maintenance administrative expenses related unaccompanied minor aliens emergency humanitarian efforts'

**Getting USE embeddings**

In [13]:
#NOTE: using tensors here might be faster and require less memory space
e_columns = ["e_" + str(i) for i in range(512)]

list_of_strings_A = list(df_A['key_phr_string'])
embeddings = model(list_of_strings_A).numpy()

df_A[e_columns] = embeddings
df_A.head(1)

Unnamed: 0,file_name,paragraph,keyphrases,key_phr_string,e_0,e_1,e_2,e_3,e_4,e_5,...,e_502,e_503,e_504,e_505,e_506,e_507,e_508,e_509,e_510,e_511
0,BILLS-116hjres31enr,For necessary expenses of U.S. Customs and Bor...,"[Customs and Border, Maintenance Trust, Mainte...",Customs and Border Maintenance Trust Maintenan...,-0.001087,0.040324,-0.115766,0.052774,0.038278,-0.027752,...,-0.006531,-0.048225,0.083112,0.042204,0.03136,0.052765,0.048066,-0.084458,0.004189,0.052355


In [14]:
#NOTE: using tensors here might be faster and require less memory space
e_columns = ["e_" + str(i) for i in range(512)]

list_of_strings_B = list(df_B['key_phr_string'])
embeddings = model(list_of_strings_B).numpy()

df_B[e_columns] = embeddings
df_B.head(1)

Unnamed: 0,file_name,paragraph,keyphrases,key_phr_string,e_0,e_1,e_2,e_3,e_4,e_5,...,e_502,e_503,e_504,e_505,e_506,e_507,e_508,e_509,e_510,e_511
0,BILLS-116s47enr,For necessary expenses of the Coast Guard for ...,"[support including, contingent and emergent, u...",support including contingent and emergent unit...,0.005286,-0.079505,0.008594,-0.011146,-0.009077,0.016665,...,-0.038383,-0.015171,0.071899,0.021445,0.03865,0.013235,-0.028723,-0.074453,0.050429,0.020433


# For Doc_A paragraphs, finding most similar paragraph in Doc_B

In [15]:
similarity_matrix = cosine_similarity(df_A[e_columns].values, df_B[e_columns].values)
similarity_matrix

array([[ 0.36875415,  0.01394344,  0.13117695,  0.11275332],
       [ 0.10659477,  0.02777174, -0.01563178,  0.04973911],
       [ 1.0000002 ,  0.0902687 ,  0.26971418,  0.09652793]],
      dtype=float32)

The cosine similarity is bounded in the interval [-1, 1]<br>
To make it [0,1], we can replace any values below 0 with 0, since it does not matter how dissimilar paragraphs are.<br><br>
**NOTE:** 
- similarity matrix displays similarity scores between paragraphs in doc_A (rows) with doc_B (columns)
- you can experiment with different input data for this notebook before putting this intu module functions

**Observations:**
- last paragraph of document A was correctly identified to be a copy of first paragraph of doc B

In [16]:
#printing text of paragraps by indexies in the similarity_matrix
row = 1 #doc_A paragraph number
column = 1 #doc_A paragraph number

print("Doc_A TEXT:")
pp.pprint(df_A['paragraph'].iloc[row])
print("\n\nDoc_B TEXT:")
pp.pprint(df_B['paragraph'].iloc[column])

Doc_A TEXT:
('For necessary expenses of U.S. Immigration and Customs Enforcement for '
 'operations and support, including the purchase and lease of up to 3,790 '
 '(2,350 for replacement only) police-type vehicles; overseas vetted units; '
 'and maintenance, minor construction, and minor leasehold improvements at '
 'owned and leased facilities; $7,542,153,000; of which $6,000,000 shall '
 'remain available until expended for efforts to enforce laws against forced '
 'child labor; of which $75,448,000 shall remain available until September 30, '
 '2020; of which $1,500,000 is for paid apprenticeships for participants in '
 'the Human Exploitation Rescue Operative Child-Rescue Corps; of which not '
 'less than $15,000,000 shall be available for investigation of intellectual '
 'property rights violations, including operation of the National Intellectual '
 'Property Rights Coordination Center; and of which not less than '
 '$4,273,857,000 shall be for enforcement, detention, and remova