In [1]:
import pandas as pd
import swifter
import numpy as np
import re
from nltk.stem import WordNetLemmatizer



In [2]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
import pke

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
def extract_keywords_tokens(sentence):
    try:
        extractor = pke.unsupervised.TopicalPageRank()
        extractor.load_document(input=sentence,
                            language='en',
                            normalization=None)
        pos = {'NOUN', 'PROPN', 'ADJ'}
        grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
        extractor.candidate_selection(grammar=grammar)
        extractor.candidate_weighting(window=10,
                                  pos=pos,
                                  lda_model='lda-by-section-text-1-gram-1000-topics.gz')
        keywords = extractor.get_n_best(n=30)
        return [x[0] for x in keywords]
    except Exception as e:
        print(e)
        return []

In [7]:
def keyword_search(query, query_keywords, keyword_df):
    query_terms = list()
    for _query in query_keywords:
        for term in _query.split(' '):
            query_terms.append(term)
    if len(query_terms) > 0:
        query_terms = list(set(query_terms))
        
    print(query_keywords)
    print(query)
    query_terms.append(query)
    query_terms.extend(query_keywords)
    
    if len(query_terms) > 0:
        query_terms = list(set(query_terms))
        
    print(query_terms)
    for idx, term in enumerate(query_terms):
        keyword_df[term] = keyword_df['keywords'].apply(lambda x: term in x)

    keyword_df['terms_found'] = keyword_df[query_terms].sum(axis=1) / len(query_terms)

    keyword_df['all_terms_found'] = keyword_df[query_terms].sum(axis=1) / len(query_terms)
    keyword_df['query_in_chapter_header'] = keyword_df['chapter_header'].str.contains(query, case=False)
    keyword_df['query_in_section_header'] = keyword_df['section_header'].str.contains(query, case=False)
    keyword_df['query_in_headers'] = keyword_df[['query_in_chapter_header','query_in_section_header']].sum(axis=1) / 2
    query_terms.extend(['query_in_section_header','query_in_chapter_header'])
    keyword_df['all_matching'] = keyword_df[query_terms].sum(axis=1) / len(query_terms)
    return keyword_df

In [8]:
def lemmatize_section_text(text):
    split_text = [lemmatizer.lemmatize(word) for word in text.split(' ')]
    return ' '.join(split_text)

def lemmatize_keywords(keyword_set):
    lemma_keywords = list()
    for _keyword in keyword_set:
        lemma_keywords.append(lemmatize_section_text(_keyword))
    return set(lemma_keywords)

In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
df = pd.read_parquet('USCS Codes Keywords 1000 topics 1gram.parquet')

In [11]:
term_columns = [x for x in df.columns if ('term' in x) & ('value' not in x)]

In [12]:
keyword_set = df[term_columns].apply(lambda x: set(x.dropna().values), axis=1)

In [13]:
keyword_set.head()

0    {instrumentality, agency, united states govern...
1                         {term, vessel, title, barge}
2    {line, boundary line, u.s.c., february, act, s...
3    {nationality act, reference, a, immigration, u...
4    {regulations, united states government, consul...
dtype: object

In [14]:
keyword_set = keyword_set.apply(lemmatize_keywords)

In [15]:
all_keywords = list(set().union(*keyword_set.values))
len(all_keywords)

151318

In [16]:
keyword_set = keyword_set.to_frame()
keyword_set.columns = ['keywords']

In [17]:
df['keywords'] = keyword_set['keywords']

In [18]:
term_value_cols = [x for x in df.columns if 'term_value' in x]
df[term_value_cols] = df[term_value_cols].astype(float)

In [19]:
df['section_text'] = df['section_text'].parallel_apply(lemmatize_section_text)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5082), Label(value='0 / 5082'))), …

In [20]:
keyword_tfidf = TfidfVectorizer(ngram_range=(1,4), vocabulary=all_keywords)
keyword_sentences = df['keywords'].apply(lambda x: ' '.join(x))
keyword_tfidf.fit(keyword_sentences)
keyword_vectors = keyword_tfidf.transform(keyword_sentences)

In [21]:
keyword_section_tfidf = TfidfVectorizer(ngram_range=(1,4), vocabulary=all_keywords)
keyword_section_tfidf.fit(df['section_text'])
keyword_section_vectors = keyword_section_tfidf.transform(df['section_text'])

In [22]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_df=0.3, min_df=5, max_features=100000, stop_words=["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"])
tfidf.fit(df['section_text'])
section_vecs = tfidf.transform(df['section_text'])



In [23]:
all_keywords[:25]

['telecommunication terminal equipment',
 'draining',
 'prior grant',
 'certain ocean shoreline',
 'voyage repair',
 'more alaska',
 'private loss',
 'descriptor',
 'other appropriate group',
 'such infant formula',
 'national tire fuel efficiency information program',
 'section l',
 'foot west',
 'brices cross road',
 'federal capital crime',
 'fish processing industry',
 'similar farming operation',
 'such conservatorship',
 'subsection claim',
 'civics',
 'written certification',
 'sustainable level',
 'state primacy agency',
 'beautiful national park quarter dollar coin act',
 'witness desire']

In [402]:
query = 'age discrimination'
lemma_query = [lemmatizer.lemmatize(word) for word in query.split(' ')]
lemma_query = ' '.join(lemma_query)

In [403]:
lemma_query

'age discrimination'

In [404]:
unigram = lemma_query.split(' ')
bigram = [' '.join(x) for x in zip(unigram, unigram[1:])]
trigram = [' '.join(x) for x in zip(bigram, unigram[2:])]
fourgram = [' '.join(x) for x in zip(trigram, unigram[3:])]
keyword_phrases = set(unigram).union(set(bigram))
keyword_phrases = keyword_phrases.union(set(trigram))
keyword_phrases = keyword_phrases.union(set(fourgram))

In [405]:
keyword_phrases

{'age', 'age discrimination', 'discrimination'}

In [406]:
keyword_phrases = keyword_phrases.intersection(set(all_keywords))

In [407]:
keyword_phrases

{'age', 'age discrimination', 'discrimination'}

In [408]:
query_keyword_vec = keyword_tfidf.transform([lemma_query])
query_keyword_section_vec = keyword_section_tfidf.transform([lemma_query])
query_vec = tfidf.transform([lemma_query])

In [409]:
keyword_df = keyword_search(lemma_query, list(keyword_phrases), df.copy())

['age discrimination', 'age', 'discrimination']
age discrimination
['age discrimination', 'age', 'discrimination']


In [410]:
keyword_df

Unnamed: 0,chapter_id,chapter_number,chapter_header,section_id,section_number,section_header,section_text,term_0,term_value_0,term_1,...,keywords,age discrimination,age,discrimination,terms_found,all_terms_found,query_in_chapter_header,query_in_section_header,query_in_headers,all_matching
0,/us/usc/t46/stI/ch1,1,DEFINITIONS,/us/usc/t46/s101,101,Agency,"In this title, the term “agency” mean a depart...",united states government,0.292345,agency,...,"{instrumentality, agency, term, united state g...",False,False,False,0.0,0.0,False,False,0.0,0.0
1,/us/usc/t46/stI/ch1,1,DEFINITIONS,/us/usc/t46/s102,102,Barge,"In this title, the term “barge” mean a non-sel...",barge,0.308230,term,...,"{term, vessel, title, barge}",False,False,False,0.0,0.0,False,False,0.0,0.0
2,/us/usc/t46/stI/ch1,1,DEFINITIONS,/us/usc/t46/s103,103,Boundary Line,"In this title, the term “Boundary Line” mean a...",boundary line,0.300177,line,...,"{b, line, boundary line, u.s.c., february, act...",False,False,False,0.0,0.0,False,False,0.0,0.0
3,/us/usc/t46/stI/ch1,1,DEFINITIONS,/us/usc/t46/s104,104,Citizen of the United States,"In this title, the term “citizen of the United...",united states,0.251406,nationality act,...,"{title, nationality act, reference, immigratio...",False,False,False,0.0,0.0,False,False,0.0,0.0
4,/us/usc/t46/stI/ch1,1,DEFINITIONS,/us/usc/t46/s105,105,Consular officer,"In this title, the term “consular officer” mea...",consular officer,0.297160,united states government,...,"{regulation, visa, consular officer, employee,...",False,False,False,0.0,0.0,False,False,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58943,/us/usc/t19/ch29,29,UNITED STATES–MEXICO–CANADA AGREEMENT IMPLEMEN...,/us/usc/t19/s4713,4713,Monitoring actions,"\n shall review the factual record; and may,...",trade representative,0.089337,mexico,...,"{information, purpose, enforcement action, fin...",False,False,False,0.0,0.0,False,False,0.0,0.0
58945,/us/usc/t19/ch29,29,UNITED STATES–MEXICO–CANADA AGREEMENT IMPLEMEN...,/us/usc/t19/s4715,4715,Other monitoring and enforcement actions,The Secretary of Commerce ha authority to take...,enforcement actions,0.127453,appropriate monitoring,...,"{appropriate monitoring, wild bird conservatio...",False,False,False,0.0,0.0,False,False,0.0,0.0
58947,/us/usc/t19/ch29,29,UNITED STATES–MEXICO–CANADA AGREEMENT IMPLEMEN...,/us/usc/t19/s4717,4717,Regulations,The head of any Federal agency described in th...,interagency environment committee,0.281967,federal agency,...,"{federal agency, consultation, interagency env...",False,False,False,0.0,0.0,False,False,0.0,0.0
58948,/us/usc/t19/ch29,29,UNITED STATES–MEXICO–CANADA AGREEMENT IMPLEMEN...,/us/usc/t19/s4731,4731,Border water infrastructure improvement authority,The Administrator of the Environmental Protect...,high priority treatment works,0.121739,international transboundary water flows,...,"{portion, operation, international transbounda...",False,False,False,0.0,0.0,False,False,0.0,0.0


In [411]:
keyword_df['keyword_similarity'] = cosine_similarity(query_keyword_vec, keyword_vectors).T
keyword_df['keyword_section_similarity'] = cosine_similarity(query_keyword_section_vec, keyword_section_vectors).T
keyword_df['query_section_similarity']  = cosine_similarity(query_vec, section_vecs).T

In [412]:
term_values = list()
for i in range(30):
    term_in_keyword = keyword_df['term_{}'.format(i)].apply(lambda x: x in keyword_phrases)
    term_value = keyword_df['term_value_{}'.format(i)].where(term_in_keyword, np.nan)
    term_values.append(term_value)
term_values = pd.concat(term_values, axis=1)
term_values.index = keyword_df.index
value_columns = term_values.columns
term_values['term_min'] = term_values[value_columns].min(axis=1)
term_values['term_max'] = term_values[value_columns].max(axis=1)
term_values['term_mean'] = term_values[value_columns].mean(axis=1)
term_values['term_std'] = term_values[value_columns].std(axis=1)
term_values['term_sum'] = term_values[value_columns].sum(axis=1)

In [413]:
term_values[['term_min','term_max','term_std','term_mean','term_sum']].describe()

Unnamed: 0,term_min,term_max,term_std,term_mean,term_sum
count,278.0,278.0,14.0,278.0,40653.0
mean,0.035371,0.036346,0.013077,0.035845,0.000264
std,0.028924,0.029989,0.018761,0.029226,0.004323
min,0.009017,0.009017,0.000109,0.009017,0.0
25%,0.017602,0.017878,0.002474,0.017799,0.0
50%,0.024555,0.024756,0.007364,0.024555,0.0
75%,0.043522,0.045316,0.013225,0.044375,0.0
max,0.227075,0.227075,0.073125,0.227075,0.264942


In [414]:
keyword_df[['term_min','term_max','term_std','term_mean','term_sum']] = term_values[['term_min','term_max','term_std','term_mean','term_sum']]

In [415]:
keyword_df.describe()

Unnamed: 0,term_value_0,term_value_1,term_value_2,term_value_3,term_value_4,term_value_5,term_value_6,term_value_7,term_value_8,term_value_9,...,query_in_headers,all_matching,keyword_similarity,keyword_section_similarity,query_section_similarity,term_min,term_max,term_std,term_mean,term_sum
count,39437.0,39410.0,39174.0,38838.0,38369.0,37771.0,36895.0,36019.0,35014.0,33913.0,...,40653.0,40653.0,40653.0,40653.0,40653.0,278.0,278.0,14.0,278.0,40653.0
mean,0.188987,0.145609,0.1188,0.100428,0.086823,0.07619,0.067698,0.060795,0.055181,0.050532,...,0.000271,0.001584,0.000958,0.001216,0.000962,0.035371,0.036346,0.013077,0.035845,0.000264
std,0.106392,0.068891,0.048909,0.037659,0.030779,0.026402,0.023009,0.020848,0.01914,0.017992,...,0.012146,0.019323,0.01181,0.011174,0.008441,0.028924,0.029989,0.018761,0.029226,0.004323
min,0.029804,0.020904,0.002392,0.003437,0.0026,0.003479,0.002346,0.003258,0.001358,0.001139,...,0.0,0.0,0.0,0.0,0.0,0.009017,0.009017,0.000109,0.009017,0.0
25%,0.117487,0.09755,0.084211,0.074197,0.065921,0.058833,0.052933,0.047646,0.043315,0.03959,...,0.0,0.0,0.0,0.0,0.0,0.017602,0.017878,0.002474,0.017799,0.0
50%,0.163529,0.130687,0.10956,0.093918,0.08174,0.071696,0.063714,0.057098,0.051647,0.04708,...,0.0,0.0,0.0,0.0,0.0,0.024555,0.024756,0.007364,0.024555,0.0
75%,0.230772,0.176678,0.142781,0.119194,0.101416,0.088084,0.077617,0.069123,0.062359,0.056914,...,0.0,0.0,0.0,0.0,0.0,0.043522,0.045316,0.013225,0.044375,0.0
max,1.112789,0.804801,0.627384,0.604467,0.505407,0.496967,0.46446,0.428567,0.423511,0.420345,...,1.0,0.8,0.487955,0.477401,0.372884,0.227075,0.227075,0.073125,0.227075,0.264942


In [416]:
keyword_df = keyword_df.drop(['term_0',
       'term_value_0', 'term_1', 'term_value_1', 'term_2', 'term_value_2',
       'term_3', 'term_value_3', 'term_4', 'term_value_4', 'term_5',
       'term_value_5', 'term_6', 'term_value_6', 'term_7', 'term_value_7',
       'term_8', 'term_value_8', 'term_9', 'term_value_9', 'term_10',
       'term_value_10', 'term_11', 'term_value_11', 'term_12', 'term_value_12',
       'term_13', 'term_value_13', 'term_14', 'term_value_14', 'term_15',
       'term_value_15', 'term_16', 'term_value_16', 'term_17', 'term_value_17',
       'term_18', 'term_value_18', 'term_19', 'term_value_19', 'term_20',
       'term_value_20', 'term_21', 'term_value_21', 'term_22', 'term_value_22',
       'term_23', 'term_value_23', 'term_24', 'term_value_24', 'term_25',
       'term_value_25', 'term_26', 'term_value_26', 'term_27', 'term_value_27',
       'term_28', 'term_value_28', 'term_29', 'term_value_29'], axis=1)

In [417]:
keyword_df[(keyword_df['all_terms_found'] > 0.5) &
          (keyword_df['keyword_similarity'] > 0.0) &
          (keyword_df['keyword_section_similarity'] > 0.0) &
          (keyword_df['query_section_similarity'] > 0.0)].sort_values(by=['keyword_section_similarity','all_terms_found','terms_found'])

Unnamed: 0,chapter_id,chapter_number,chapter_header,section_id,section_number,section_header,section_text,keywords,age discrimination,age,...,query_in_headers,all_matching,keyword_similarity,keyword_section_similarity,query_section_similarity,term_min,term_max,term_std,term_mean,term_sum
5454,/us/usc/t42/ch68,68,DISASTER RELIEF,/us/usc/t42/s5151,5151,Nondiscrimination in disaster assistance,"The President shall issue, and may alter and a...","{other organization, relief effort, sex, relig...",False,True,...,0.0,0.4,0.124358,0.094708,0.073121,0.018224,0.021517,0.002329,0.019871,0.039741
18553,/us/usc/t47/ch16,16,BROADBAND ACCESS,/us/usc/t47/s1726,1726,General provisions,"No individual in the United States may, on the...","{paragraph, judicial review, sex, extent, reli...",False,True,...,0.0,0.4,0.15624,0.106234,0.067324,0.015315,0.018082,0.001957,0.016698,0.033396
15607,/us/usc/t26/stF/ch76,76,JUDICIAL PROCEEDINGS,/us/usc/t26/s7471,7471,Employees,\n \n \n \n \n \n prohibit discrimination on ...,"{sex, code, special counsel, religion, adverse...",False,True,...,0.0,0.4,0.120893,0.107318,0.07511,0.018384,0.022501,0.002911,0.020443,0.040885
16665,/us/usc/t29/ch28,28,FAMILY AND MEDICAL LEAVE,/us/usc/t29/s2651,2651,Effect on other laws,Nothing in this Act or any amendment made by t...,"{state law, state, sex, religion, local law, r...",False,True,...,0.0,0.4,0.201149,0.141507,0.103077,0.04378,0.048243,0.003156,0.046012,0.092023
3881,/us/usc/t42/ch21,21,CIVIL RIGHTS,/us/usc/t42/s2000e–16a,2000e–16a,Short title; purpose; definition,Sections e–a to e–c of this title may be cited...,"{section e–a, sex, purpose, religion, term, di...",False,True,...,0.0,0.4,0.175825,0.185627,0.106488,0.038931,0.039085,0.000109,0.039008,0.078015
53608,/us/usc/t20/ch44,44,CAREER AND TECHNICAL EDUCATION,/us/usc/t20/s2396,2396,Federal laws guaranteeing civil rights,Nothing in this chapter shall be construed to ...,"{sex, federal program, race, age, service, nat...",False,True,...,0.0,0.4,0.260087,0.223506,0.134965,0.066564,0.069,0.001722,0.067782,0.135564
18604,/us/usc/t3/ch5,5,EXTENSION OF CERTAIN RIGHTS AND PROTECTIONS TO...,/us/usc/t3/s411,411,Rights and protections under title VII of the ...,"race, color, religion, sex, or national origi...","{such liquidated damage, disability act, advis...",True,True,...,0.5,0.6,0.271246,0.253979,0.18726,0.015501,0.027239,0.0083,0.02137,0.04274
5790,/us/usc/t42/ch76,76,AGE DISCRIMINATION IN FEDERALLY ASSISTED PROGRAMS,/us/usc/t42/s6102,6102,Prohibition of discrimination,Pursuant to regulation prescribed under sectio...,"{activity, regulation, person, discrimination,...",False,True,...,0.5,0.6,0.365936,0.294172,0.1412,0.034786,0.138201,0.073125,0.086493,0.172987
16198,/us/usc/t29/ch14,14,AGE DISCRIMINATION IN EMPLOYMENT,/us/usc/t29/s633a,633a,Nondiscrimination on account of age in Federal...,\n be responsible for the review and evaluati...,"{recommendation, complaint, operation, interes...",True,True,...,0.5,0.8,0.388009,0.322331,0.247334,0.035088,0.081804,0.02432,0.054536,0.163609
16188,/us/usc/t29/ch14,14,AGE DISCRIMINATION IN EMPLOYMENT,/us/usc/t29/s624,624,Study by Secretary of Labor; reports to Presid...,an examination of the effect of the amendment...,"{paragraph, secretary, contract, teaching pers...",True,True,...,0.5,0.6,0.394599,0.342135,0.279237,0.069216,0.087958,0.013253,0.078587,0.157174


In [418]:
keyword_df['all_matching'].describe()

count    40653.000000
mean         0.001584
std          0.019323
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.800000
Name: all_matching, dtype: float64

In [419]:
keyword_df['query_in_headers'].describe()

count    40653.000000
mean         0.000271
std          0.012146
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: query_in_headers, dtype: float64

In [420]:
keyword_df['all_terms_found'].describe()

count    40653.000000
mean         0.002460
std          0.030023
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: all_terms_found, dtype: float64

In [421]:
keyword_df['keyword_similarity'].describe()

count    40653.000000
mean         0.000958
std          0.011810
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.487955
Name: keyword_similarity, dtype: float64

In [422]:
keyword_df['keyword_section_similarity'].describe()

count    40653.000000
mean         0.001216
std          0.011174
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.477401
Name: keyword_section_similarity, dtype: float64

In [423]:
keyword_df['query_section_similarity'].describe()

count    40653.000000
mean         0.000962
std          0.008441
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.372884
Name: query_section_similarity, dtype: float64

In [424]:
keyword_df.columns

Index(['chapter_id', 'chapter_number', 'chapter_header', 'section_id',
       'section_number', 'section_header', 'section_text', 'keywords',
       'age discrimination', 'age', 'discrimination', 'terms_found',
       'all_terms_found', 'query_in_chapter_header', 'query_in_section_header',
       'query_in_headers', 'all_matching', 'keyword_similarity',
       'keyword_section_similarity', 'query_section_similarity', 'term_min',
       'term_max', 'term_std', 'term_mean', 'term_sum'],
      dtype='object')

In [425]:
keyword_df['similarity_sum'] = keyword_df[['keyword_similarity', 'keyword_section_similarity','query_section_similarity']].sum(axis=1)
keyword_df['similarity_mean'] = keyword_df[['keyword_similarity', 'keyword_section_similarity','query_section_similarity']].mean(axis=1)
keyword_df['similarity_max'] = keyword_df[['keyword_similarity', 'keyword_section_similarity','query_section_similarity']].max(axis=1)
keyword_df['similarity_min'] = keyword_df[['keyword_similarity', 'keyword_section_similarity','query_section_similarity']].min(axis=1)

In [440]:
filtered_results = keyword_df[(keyword_df['term_sum'] >= 0.1) &  
                               (keyword_df['similarity_mean'] >= 0.1) & 
                             (keyword_df['keyword_similarity'] > 0.0) & 
                             (keyword_df['keyword_section_similarity'] > 0.0)].round(2).sort_values(by=['term_sum','similarity_sum','keyword_section_similarity',
                                                                                       'keyword_similarity', 'query_section_similarity',
                                                                                                        'all_matching','query_in_headers',
                                                                                       'all_terms_found','terms_found'],
                                                                                  ascending=False)

In [441]:
filtered_results.shape

(16, 29)

In [442]:
filtered_results

Unnamed: 0,chapter_id,chapter_number,chapter_header,section_id,section_number,section_header,section_text,keywords,age discrimination,age,...,query_section_similarity,term_min,term_max,term_std,term_mean,term_sum,similarity_sum,similarity_mean,similarity_max,similarity_min
5789,/us/usc/t42/ch76,76,AGE DISCRIMINATION IN FEDERALLY ASSISTED PROGRAMS,/us/usc/t42/s6101,6101,Statement of purpose,It is the purpose of this chapter to prohibit ...,"{activity, purpose, age, program, basis, discr...",False,True,...,0.2,0.13,0.14,0.01,0.13,0.26,0.9,0.3,0.36,0.2
28166,/us/usc/t14/stII/ch23,23,PERSONNEL; ENLISTED,/us/usc/t14/s2304,2304,Compulsory retirement at age of sixty-two,Any enlisted member who ha reached the age of ...,"{age, enlisted member, active service}",False,True,...,0.08,0.23,0.23,,0.23,0.23,0.44,0.15,0.21,0.08
2609,/us/usc/t42/ch7,7,SOCIAL SECURITY,/us/usc/t42/s427,427,Transitional insured status for purposes of ol...,\n quarter of coverage if such surviving spo...,"{quarter, coverage, age, spouse attains}",False,True,...,0.1,0.18,0.18,,0.18,0.18,0.38,0.13,0.18,0.1
5790,/us/usc/t42/ch76,76,AGE DISCRIMINATION IN FEDERALLY ASSISTED PROGRAMS,/us/usc/t42/s6102,6102,Prohibition of discrimination,Pursuant to regulation prescribed under sectio...,"{activity, regulation, person, discrimination,...",False,True,...,0.14,0.03,0.14,0.07,0.09,0.17,0.8,0.27,0.37,0.14
16188,/us/usc/t29/ch14,14,AGE DISCRIMINATION IN EMPLOYMENT,/us/usc/t29/s624,624,Study by Secretary of Labor; reports to Presid...,an examination of the effect of the amendment...,"{paragraph, secretary, contract, teaching pers...",True,True,...,0.28,0.07,0.09,0.01,0.08,0.16,1.02,0.34,0.39,0.28
16198,/us/usc/t29/ch14,14,AGE DISCRIMINATION IN EMPLOYMENT,/us/usc/t29/s633a,633a,Nondiscrimination on account of age in Federal...,\n be responsible for the review and evaluati...,"{recommendation, complaint, operation, interes...",True,True,...,0.25,0.04,0.08,0.02,0.05,0.16,0.96,0.32,0.39,0.25
3853,/us/usc/t42/ch21,21,CIVIL RIGHTS,/us/usc/t42/s2000c–8,2000c–8,Personal suits for relief against discriminati...,Nothing in this subchapter shall affect advers...,"{person, relief, court, public education, disc...",False,False,...,0.15,0.15,0.15,,0.15,0.15,0.63,0.21,0.26,0.15
3843,/us/usc/t42/ch21,21,CIVIL RIGHTS,/us/usc/t42/s2000b–2,2000b–2,Personal suits for relief against discriminati...,Nothing in this subchapter shall affect advers...,"{facility, person, relief, court, discriminati...",False,False,...,0.14,0.14,0.14,,0.14,0.14,0.69,0.23,0.31,0.14
53608,/us/usc/t20/ch44,44,CAREER AND TECHNICAL EDUCATION,/us/usc/t20/s2396,2396,Federal laws guaranteeing civil rights,Nothing in this chapter shall be construed to ...,"{sex, federal program, race, age, service, nat...",False,True,...,0.13,0.07,0.07,0.0,0.07,0.14,0.62,0.21,0.26,0.13
7774,/us/usc/t42/ch130,130,NATIONAL AFFORDABLE HOUSING,/us/usc/t42/s12832,12832,Nondiscrimination,No person in the United States shall on the gr...,"{sex, religion, prohibition, secretary, such p...",False,True,...,0.32,0.05,0.08,0.02,0.06,0.13,1.23,0.41,0.48,0.32


In [443]:
for idx, row in filtered_results[['all_matching','query_in_headers','all_terms_found',
                                  'terms_found','section_text','section_number',
                                  'keyword_similarity','keyword_section_similarity','query_section_similarity']].head(10).iterrows():
    print(idx)
    print(row[['all_matching','query_in_headers','all_terms_found','terms_found','section_number',
               'keyword_similarity','keyword_section_similarity','query_section_similarity']])
    print(row['section_text'])
    print('------')

5789
all_matching                   0.6
query_in_headers               0.5
all_terms_found               0.67
terms_found                   0.67
section_number                6101
keyword_similarity            0.36
keyword_section_similarity    0.34
query_section_similarity       0.2
Name: 5789, dtype: object
It is the purpose of this chapter to prohibit discrimination on the basis of age in program or activity receiving Federal financial assistance.
------
28166
all_matching                   0.2
query_in_headers               0.0
all_terms_found               0.33
terms_found                   0.33
section_number                2304
keyword_similarity            0.21
keyword_section_similarity    0.16
query_section_similarity      0.08
Name: 28166, dtype: object
Any enlisted member who ha reached the age of sixty-two shall be retired from active service.
------
2609
all_matching                   0.2
query_in_headers               0.0
all_terms_found               0.33
terms_found   

In [444]:
for idx, row in filtered_results[['all_matching','query_in_headers','all_terms_found',
                                  'terms_found','section_text','section_number',
                                  'keyword_similarity','keyword_section_similarity','query_section_similarity']].tail(10).iterrows():
    print(idx)
    print(row[['all_matching','query_in_headers','all_terms_found','terms_found','section_number',
               'keyword_similarity','keyword_section_similarity','query_section_similarity']])
    print(row['section_text'])
    print('------')

3853
all_matching                      0.2
query_in_headers                  0.0
all_terms_found                  0.33
terms_found                      0.33
section_number                2000c–8
keyword_similarity               0.26
keyword_section_similarity       0.22
query_section_similarity         0.15
Name: 3853, dtype: object
Nothing in this subchapter shall affect adversely the right of any person to sue for or obtain relief in any court against discrimination in public education.
------
3843
all_matching                      0.2
query_in_headers                  0.0
all_terms_found                  0.33
terms_found                      0.33
section_number                2000b–2
keyword_similarity               0.31
keyword_section_similarity       0.24
query_section_similarity         0.14
Name: 3843, dtype: object
Nothing in this subchapter shall affect adversely the right of any person to sue for or obtain relief in any court against discrimination in any facility covered by

In [445]:
query_df = keyword_df[keyword_df['query_section_similarity'] > 0].sort_values(by=['query_section_similarity'], ascending=False)

In [446]:
query_df['query_section_similarity'].describe()

count    1394.000000
mean        0.028056
std         0.036310
min         0.000574
25%         0.008133
50%         0.016924
75%         0.032310
max         0.372884
Name: query_section_similarity, dtype: float64

In [447]:
query_df['keyword_similarity'].describe()

count    1394.000000
mean        0.027819
std         0.057593
min         0.000000
25%         0.000000
50%         0.000000
75%         0.044889
max         0.487955
Name: keyword_similarity, dtype: float64

In [448]:
query_df['keyword_section_similarity'].describe()

count    1394.000000
mean        0.035471
std         0.049270
min         0.000623
25%         0.009348
50%         0.019601
75%         0.040089
max         0.477401
Name: keyword_section_similarity, dtype: float64

In [449]:
overlap_index = query_df.index.intersection(filtered_results.index)

In [455]:
sorted_results = query_df.loc[overlap_index].round(2).sort_values(by=['similarity_sum','term_sum','keyword_section_similarity','keyword_similarity',
                                                              'query_section_similarity', 'all_matching','query_in_headers',
                                                                'all_terms_found','terms_found'],
                                                                                  ascending=False)
sorted_results

Unnamed: 0,chapter_id,chapter_number,chapter_header,section_id,section_number,section_header,section_text,keywords,age discrimination,age,...,query_section_similarity,term_min,term_max,term_std,term_mean,term_sum,similarity_sum,similarity_mean,similarity_max,similarity_min
7774,/us/usc/t42/ch130,130,NATIONAL AFFORDABLE HOUSING,/us/usc/t42/s12832,12832,Nondiscrimination,No person in the United States shall on the gr...,"{sex, religion, prohibition, secretary, such p...",False,True,...,0.32,0.05,0.08,0.02,0.06,0.13,1.23,0.41,0.48,0.32
16188,/us/usc/t29/ch14,14,AGE DISCRIMINATION IN EMPLOYMENT,/us/usc/t29/s624,624,Study by Secretary of Labor; reports to Presid...,an examination of the effect of the amendment...,"{paragraph, secretary, contract, teaching pers...",True,True,...,0.28,0.07,0.09,0.01,0.08,0.16,1.02,0.34,0.39,0.28
16198,/us/usc/t29/ch14,14,AGE DISCRIMINATION IN EMPLOYMENT,/us/usc/t29/s633a,633a,Nondiscrimination on account of age in Federal...,\n be responsible for the review and evaluati...,"{recommendation, complaint, operation, interes...",True,True,...,0.25,0.04,0.08,0.02,0.05,0.16,0.96,0.32,0.39,0.25
5789,/us/usc/t42/ch76,76,AGE DISCRIMINATION IN FEDERALLY ASSISTED PROGRAMS,/us/usc/t42/s6101,6101,Statement of purpose,It is the purpose of this chapter to prohibit ...,"{activity, purpose, age, program, basis, discr...",False,True,...,0.2,0.13,0.14,0.01,0.13,0.26,0.9,0.3,0.36,0.2
5790,/us/usc/t42/ch76,76,AGE DISCRIMINATION IN FEDERALLY ASSISTED PROGRAMS,/us/usc/t42/s6102,6102,Prohibition of discrimination,Pursuant to regulation prescribed under sectio...,"{activity, regulation, person, discrimination,...",False,True,...,0.14,0.03,0.14,0.07,0.09,0.17,0.8,0.27,0.37,0.14
3843,/us/usc/t42/ch21,21,CIVIL RIGHTS,/us/usc/t42/s2000b–2,2000b–2,Personal suits for relief against discriminati...,Nothing in this subchapter shall affect advers...,"{facility, person, relief, court, discriminati...",False,False,...,0.14,0.14,0.14,,0.14,0.14,0.69,0.23,0.31,0.14
3853,/us/usc/t42/ch21,21,CIVIL RIGHTS,/us/usc/t42/s2000c–8,2000c–8,Personal suits for relief against discriminati...,Nothing in this subchapter shall affect advers...,"{person, relief, court, public education, disc...",False,False,...,0.15,0.15,0.15,,0.15,0.15,0.63,0.21,0.26,0.15
53608,/us/usc/t20/ch44,44,CAREER AND TECHNICAL EDUCATION,/us/usc/t20/s2396,2396,Federal laws guaranteeing civil rights,Nothing in this chapter shall be construed to ...,"{sex, federal program, race, age, service, nat...",False,True,...,0.13,0.07,0.07,0.0,0.07,0.14,0.62,0.21,0.26,0.13
7510,/us/usc/t42/ch126,126,EQUAL OPPORTUNITY FOR INDIVIDUALS WITH DISABIL...,/us/usc/t42/s12133,12133,Enforcement,"The remedies, procedures, and right set forth ...","{person, remedy, procedure, disability, basis,...",False,False,...,0.11,0.1,0.1,,0.1,0.1,0.56,0.19,0.25,0.11
33931,/us/usc/t23/ch3,3,GENERAL PROVISIONS,/us/usc/t23/s324,324,Prohibition of discrimination on the basis of sex,No person shall on the ground of sex be exclud...,"{sex, discrim­inatee, rule, person, federal as...",False,False,...,0.13,0.11,0.11,,0.11,0.11,0.52,0.17,0.2,0.13


In [456]:
sorted_results.columns

Index(['chapter_id', 'chapter_number', 'chapter_header', 'section_id',
       'section_number', 'section_header', 'section_text', 'keywords',
       'age discrimination', 'age', 'discrimination', 'terms_found',
       'all_terms_found', 'query_in_chapter_header', 'query_in_section_header',
       'query_in_headers', 'all_matching', 'keyword_similarity',
       'keyword_section_similarity', 'query_section_similarity', 'term_min',
       'term_max', 'term_std', 'term_mean', 'term_sum', 'similarity_sum',
       'similarity_mean', 'similarity_max', 'similarity_min'],
      dtype='object')

In [457]:
query

'age discrimination'

In [459]:
for idx, row in sorted_results.iterrows():
    print(idx)
    print(row[['section_number','chapter_header','section_header','terms_found',
       'all_terms_found', 'query_in_chapter_header', 'query_in_section_header',
       'query_in_headers', 'all_matching', 'keyword_similarity',
       'keyword_section_similarity', 'query_section_similarity', 'term_min',
       'term_max', 'term_std', 'term_mean', 'term_sum', 'similarity_sum',
       'similarity_mean', 'similarity_max', 'similarity_min']])
    print(row['section_text'])
    print('------')

7774
section_number                                      12832
chapter_header                NATIONAL AFFORDABLE HOUSING
section_header                          Nondiscrimination
terms_found                                          0.67
all_terms_found                                      0.67
query_in_chapter_header                             False
query_in_section_header                             False
query_in_headers                                      0.0
all_matching                                          0.4
keyword_similarity                                   0.43
keyword_section_similarity                           0.48
query_section_similarity                             0.32
term_min                                             0.05
term_max                                             0.08
term_std                                             0.02
term_mean                                            0.06
term_sum                                             0.13
similarit