In [2]:
import lancedb
import pandas as pd
import numpy as np
import json
from pathlib import Path
from tqdm.notebook import tqdm
from copy import deepcopy
import gzip
import re

In [3]:
# Root of the local data tree; the other two paths are derived from it.
base_folder = Path('../wonky_data')
# Location passed to lancedb.connect().
index_folder = base_folder / 'indexes'
# Directory that is globbed for the gzipped JSON section files.
source_folder = base_folder / 'index_data/sections'
# Alternate input location, kept for reference:
# source_folder = base_folder.joinpath('parsed_reports/sections')

In [4]:
# Connect to the LanceDB database located at index_folder.
db = lancedb.connect(index_folder)

In [5]:
# Collect every gzipped JSON section file. glob() yields paths in a
# filesystem-dependent order, so sort them: the same file then always seeds the
# table (source_files[0]) and the rest are appended in a stable order.
source_files = sorted(source_folder.glob('*.json.gz'))

In [6]:
# Number of input files discovered.
len(source_files)

109

In [7]:
# Load the first source file; its records become the seed data for the table.
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# encoding (the default for text-mode gzip.open), since the records contain
# non-ASCII characters.
with gzip.open(source_files[0], 'rt', encoding='utf-8') as f:
    _data = json.load(f)
# Give each record a key of the form "<id>_<section_start>_<section_end>".
for record in _data:
    record['section_id'] = f"{record['id']}_{record['section_start']}_{record['section_end']}"

In [8]:
# Turn the list of records from the first file into a DataFrame.
starter_data = pd.DataFrame(_data)

In [9]:
# Columns that are cast to str before the data is written to the table.
string_cols = [
    'id', 'type', 'typeId', 'number', 'topics', 'date', 'title',
    'summary', 'doc_id', 'filename', 'source_file', 'text', 'sections',
]

In [10]:
# Display the seed DataFrame for inspection.
starter_data

Unnamed: 0,id,type,typeId,number,active,topics,date,title,summary,doc_id,filename,source_file,text,sections,section_ids,section_start,section_end,vector,section_id
0,R42373,CRS Report,REPORTS,R42373,True,"[American Law, Foreign Affairs]",2012-02-24,Issues in International Trade Law: Restricting...,Electronic waste (e-waste) is a term that loos...,414169,20120224_R42373_e0af9b8fc8c1464e8e9c4d8fa288ff...,R42373.json,# Issues in International Trade Law: Restricti...,{'1': '# Issues in International Trade Law: Re...,"[1, 2, 3, 4]",1,4,"[0.18518035113811493, 1.7670409679412842, -2.6...",R42373_1_4
1,R42373,CRS Report,REPORTS,R42373,True,"[American Law, Foreign Affairs]",2012-02-24,Issues in International Trade Law: Restricting...,Electronic waste (e-waste) is a term that loos...,414169,20120224_R42373_e0af9b8fc8c1464e8e9c4d8fa288ff...,R42373.json,## International Agreements on Hazardous Waste...,{'5': '## International Agreements on Hazardou...,"[5, 6, 7]",5,7,"[0.474021315574646, 1.617024302482605, -3.0516...",R42373_5_7
2,R42373,CRS Report,REPORTS,R42373,True,"[American Law, Foreign Affairs]",2012-02-24,Issues in International Trade Law: Restricting...,Electronic waste (e-waste) is a term that loos...,414169,20120224_R42373_e0af9b8fc8c1464e8e9c4d8fa288ff...,R42373.json,## Imposing a Ban on E-Waste Exports\n As disc...,{'8': '## Imposing a Ban on E-Waste Exports  A...,[8],8,8,"[0.5606708526611328, 2.607553005218506, -3.253...",R42373_8_8
3,R42373,CRS Report,REPORTS,R42373,True,"[American Law, Foreign Affairs]",2012-02-24,Issues in International Trade Law: Restricting...,Electronic waste (e-waste) is a term that loos...,414169,20120224_R42373_e0af9b8fc8c1464e8e9c4d8fa288ff...,R42373.json,## Imposing a Licensing System on E-Waste Expo...,{'9': '## Imposing a Licensing System on E-Was...,"[9, 10]",9,10,"[0.4774671792984009, 1.927130937576294, -2.882...",R42373_9_10
4,R42373,CRS Report,REPORTS,R42373,True,"[American Law, Foreign Affairs]",2012-02-24,Issues in International Trade Law: Restricting...,Electronic waste (e-waste) is a term that loos...,414169,20120224_R42373_e0af9b8fc8c1464e8e9c4d8fa288ff...,R42373.json,"### Article I\n Article I, also known as the g...","{'11': '### Article I  Article I, also known a...",[11],11,11,"[0.5588755011558533, 1.3099936246871948, -3.00...",R42373_11_11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1664,R43778,CRS Report,REPORTS,R43778,True,[],2014-11-07,Medicaid Prescription Drug Pricing and Policy,Medicaid is a federal-state entitlement progra...,441591,20141107_R43778_411bfd4ca40cda0ae72915dd2ea401...,R43778.json,### Patient Protection and Affordable Care Act...,{'45': '### Patient Protection and Affordable ...,"[45, 46]",45,46,"[0.9020079970359802, 0.9113937020301819, -2.81...",R43778_45_46
1665,R43778,CRS Report,REPORTS,R43778,True,[],2014-11-07,Medicaid Prescription Drug Pricing and Policy,Medicaid is a federal-state entitlement progra...,441591,20141107_R43778_411bfd4ca40cda0ae72915dd2ea401...,R43778.json,## Selected Medicaid Prescription Drug Issues\...,{'47': '## Selected Medicaid Prescription Drug...,"[47, 48]",47,48,"[0.4073198437690735, 1.676227331161499, -2.708...",R43778_47_48
1666,R43778,CRS Report,REPORTS,R43778,True,[],2014-11-07,Medicaid Prescription Drug Pricing and Policy,Medicaid is a federal-state entitlement progra...,441591,20141107_R43778_411bfd4ca40cda0ae72915dd2ea401...,R43778.json,## Medicaid Rebates for Sovaldi\n Gilead parti...,{'49': '## Medicaid Rebates for Sovaldi  Gilea...,"[49, 50]",49,50,"[0.26424339413642883, 1.8337219953536987, -2.9...",R43778_49_50
1667,R43778,CRS Report,REPORTS,R43778,True,[],2014-11-07,Medicaid Prescription Drug Pricing and Policy,Medicaid is a federal-state entitlement progra...,441591,20141107_R43778_411bfd4ca40cda0ae72915dd2ea401...,R43778.json,### ACA Implementation: Pending Final Rule\n C...,{'51': '### ACA Implementation: Pending Final ...,[51],51,51,"[0.6830442547798157, 1.0622128248214722, -3.19...",R43778_51_51


In [11]:
# Cast every column named in string_cols to str in one astype call,
# leaving the remaining columns untouched.
starter_data = starter_data.astype(dict.fromkeys(string_cols, str))

In [12]:
# Running count of rows written to the table.
total_records = 0

In [13]:
# Create the "sections_fts" table from the seed frame (mode='overwrite' is passed,
# so an existing table with this name is expected to be replaced).
tbl = db.create_table("sections_fts", data=starter_data, mode='overwrite')

In [14]:
# Count the seed rows that were just written.
# NOTE(review): this cell is not idempotent; re-running it adds the seed rows again.
total_records += starter_data.shape[0]

In [15]:
# Append the remaining source files to the table, one file per iteration
# (source_files[0] was already used to create the table).
for file in tqdm(source_files[1:]):
    # Decode explicitly as UTF-8 rather than relying on the platform's locale
    # encoding (the default for text-mode gzip.open).
    with gzip.open(file, 'rt', encoding='utf-8') as f:
        _data = json.load(f)
    # Give each record a key of the form "<id>_<section_start>_<section_end>".
    for record in _data:
        record['section_id'] = f"{record['id']}_{record['section_start']}_{record['section_end']}"
    _data = pd.DataFrame(_data)
    # Apply the same str casts as for the seed frame.
    for _col in string_cols:
        _data[_col] = _data[_col].astype(str)

    tbl.add(_data)
    total_records += _data.shape[0]

  0%|          | 0/108 [00:00<?, ?it/s]

In [17]:
# Total number of rows counted while writing to the table.
total_records

180709

In [18]:
# Build the table's vector index with the cosine metric; all other index
# parameters are left at the library defaults.
tbl.create_index(metric="cosine")

In [19]:
# Build a Tantivy-backed full-text index over the text, title and summary columns;
# replace=True is passed so the call can be repeated when an index already exists.
tbl.create_fts_index(["text",'title','summary'], use_tantivy=True, replace=True)

In [20]:
import torch
from sentence_transformers import SentenceTransformer

# Choose the torch device at runtime instead of hard-coding Apple's 'mps'
# backend, so the notebook also runs on CUDA or CPU-only machines. On a machine
# where MPS is available this selects 'mps', exactly as before.
if torch.backends.mps.is_available():
    _device = 'mps'
elif torch.cuda.is_available():
    _device = 'cuda'
else:
    _device = 'cpu'

# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository; keep it only for model sources you trust.
encoder = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', device=_device, trust_remote_code=True)

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
<All keys matched successfully>


In [21]:
# Sample passage used as the query for both the full-text and the vector search.
query = """USAID's headquarters in South Africa supports a "one-stop-shop" approach allowing access to the largest PAWG players: MCC, OPIC, Ex-Im, and USTDA, and the Commerce Department and its U.S. Foreign Commercial Service.  USAID has deployed more than 25 advisors across Africa.  A major technical assistance component of USAID's Power Africa work is the Power Africa Transactions and Reforms Program (PATRP), a three- to five-year, $64 million contract implemented by Tetra Tech and awarded in 2014."""
# Keep only the runs of word characters (dropping punctuation) and re-join
# them with single spaces to form the full-text query string.
_query_tokens = re.findall(r'(\w+)', query)
fts_query = ' '.join(_query_tokens)
# Embed the unmodified passage for the vector search.
query_vector = encoder.encode(query)

In [22]:
# Search with the text query (top 10 hits) and print the text of the
# second-to-last hit.
_fts_hits = tbl.search(fts_query).limit(10).to_pandas()
print(_fts_hits.iloc[-2]['text'])

# Powering Africa: Challenges of and U.S. Aid for Electrification in Africa

 September 14, 2015 (R43593)   [Jump to Main Text of Report](#Content) (R43593(1))
## Introduction
 Sub-Saharan Africa is the most electricity-poor region globally, which has had profound impacts on economic growth and development prospects. In recent years, U.S. policymakers have sought to help increase access to electricity in sub-Saharan Africa in order to spur economic growth, reduce poverty, and for socio-economic development generally in the region; and to expand U.S. and other international trade with and investment in Africa.  Efforts to achieve these goals have taken the form of Power Africa, a major, multi-agency Obama Administration initiative to increase African access to electricity (also termed "power" in this report); and two congressional bills (one in the House and one in the Senate; see below).  The latter seek to establish as a U.S. policy priority a benchmarked, multi-year, market-driven an

In [23]:
# Top 10 full-text hits for the query. query_type is set explicitly so the
# search mode does not depend on LanceDB inferring it from the query's Python
# type (a plain string).
results = tbl.search(fts_query, query_type="fts").limit(10).to_pandas()

In [24]:
# Display the full-text hits (the _score column holds the match score).
results

Unnamed: 0,id,type,typeId,number,active,topics,date,title,summary,doc_id,filename,source_file,text,sections,section_ids,section_start,section_end,vector,section_id,_score
0,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,## Agency Roles: USAID\n Power Africa is led b...,{'26': '## Agency Roles: USAID\n Power Africa ...,"[26, 27]",26,27,"[0.05080948, 1.144739, -3.275504, -0.4763342, ...",R43593_26_27,408.537201
1,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,"## USTDA and USADF\n Under Power Africa, OPIC ...",{'30': '## USTDA and USADF\n Under Power Afric...,"[30, 31]",30,31,"[0.5088026, 1.6286339, -3.574566, -0.3596658, ...",R43593_30_31,302.317017
2,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,## Other Agencies and Power Africa Commitments...,"{'28': ""## Other Agencies and Power Africa Com...","[28, 29]",28,29,"[0.34541905, 1.4823565, -3.8423011, -0.1534712...",R43593_28_29,296.195251
3,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,## Possible Issues and Questions for Congress\...,{'34': '## Possible Issues and Questions for C...,[34],34,34,"[0.7185223, 1.922172, -2.5162842, -0.38139454,...",R43593_34_34,263.290009
4,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,### Power Africa: Key Achievements and Critiqu...,{'5': '### Power Africa: Key Achievements and ...,"[5, 6]",5,6,"[0.5450391, 1.7149588, -3.3914585, -0.63610107...",R43593_5_6,262.873962
5,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,### Goal and Focus\n The initiative is designe...,{'23': '### Goal and Focus\n The initiative is...,"[23, 24, 25]",23,25,"[0.49734694, 1.0396512, -3.1792257, -0.3109190...",R43593_23_25,261.08725
6,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,## Outlook\n Now into its second year of imple...,{'35': '## Outlook\n Now into its second year ...,"[35, 36]",35,36,"[0.5349806, 0.9393154, -2.8325799, -0.28838456...",R43593_35_36,244.265686
7,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,## Commerce and Energy Departments\n The Comme...,{'32': '## Commerce and Energy Departments\n T...,"[32, 33]",32,33,"[-0.18631841, 0.8763114, -3.9369643, -0.228735...",R43593_32_33,243.18544
8,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,# Powering Africa: Challenges of and U.S. Aid ...,{'1': '# Powering Africa: Challenges of and U....,"[1, 2, 3, 4]",1,4,"[0.48626584, 1.4472132, -3.7264488, -0.8857924...",R43593_1_4,231.176651
9,IB95052,CRS Report,REPORT,IB95052,False,"['Foreign Affairs', 'Intelligence and National...",2006-06-19,Africa: U.S. Foreign Assistance Issues,Under the Administration's FY2006 foreign assi...,IB95052_2006Jun19,20060619_IB95052_44d064233dcac904568cea0844e16...,IB95052.json,IB95052 06-19-06 The United States contributes...,{'10': 'IB95052 06-19-06 The United States con...,"[10, 11]",10,11,"[0.9512743, 0.7145364, -2.9581912, -0.36053047...",IB95052_10_11,226.688339


In [25]:
# Top 10 nearest neighbours of the query embedding. The metric is set to cosine
# explicitly so it always matches the metric the vector index was built with
# (tbl.create_index(metric="cosine")) instead of relying on the query default.
vec_results = tbl.search(query_vector).metric("cosine").limit(10).to_pandas()

In [27]:
# Display the vector-search hits (the _distance column holds the distance to the query).
vec_results

Unnamed: 0,id,type,typeId,number,active,topics,date,title,summary,doc_id,filename,source_file,text,sections,section_ids,section_start,section_end,vector,section_id,_distance
0,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,## Agency Roles: USAID\n Power Africa is led b...,{'26': '## Agency Roles: USAID\n Power Africa ...,"[26, 27]",26,27,"[0.05080948, 1.144739, -3.275504, -0.4763342, ...",R43593_26_27,0.37211
1,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,## Other Agencies and Power Africa Commitments...,"{'28': ""## Other Agencies and Power Africa Com...","[28, 29]",28,29,"[0.34541905, 1.4823565, -3.8423011, -0.1534712...",R43593_28_29,0.389696
2,R43593,CRS Report,REPORTS,R43593,True,"['African Affairs', 'Economic Policy', 'Energy...",2015-09-14,Powering Africa: Challenges of and U.S. Aid fo...,The largest infrastructure deficit in sub-Saha...,445388,20150914_R43593_f78bd8014e5835c314f0e1a3d83d32...,R43593.json,"## USTDA and USADF\n Under Power Africa, OPIC ...",{'30': '## USTDA and USADF\n Under Power Afric...,"[30, 31]",30,31,"[0.5088026, 1.6286339, -3.574566, -0.3596658, ...",R43593_30_31,0.418164
3,IB95052,CRS Report,REPORT,IB95052,False,"['Foreign Affairs', 'Intelligence and National...",2006-06-19,Africa: U.S. Foreign Assistance Issues,Under the Administration's FY2006 foreign assi...,IB95052_2006Jun19,20060619_IB95052_44d064233dcac904568cea0844e16...,IB95052.json,IB95052 06-19-06 Africa Enterprise Development...,{'15': 'IB95052 06-19-06 Africa Enterprise Dev...,[15],15,15,"[-0.00518867, 1.3195407, -3.6752663, -0.674701...",IB95052_15_15,0.418188
4,IF11384,CRS In Focus,IF,IF11384,True,"['African Affairs', 'Foreign Affairs']",2020-11-17,The Trump Administration’s Prosper Africa Init...,,IF11384_6_2020-11-17,2020-11-17_IF11384_6ecbb285e4a5fe0205123846047...,IF11384.json,"Updated November 17, 2020 **The Trump Administ...","{'1': 'Updated November 17, 2020 **The Trump A...",[1],1,1,"[0.21241844, 1.6974026, -3.0930176, -0.0020947...",IF11384_1_1,0.431043
5,RL34003,CRS Report,REPORTS,RL34003,False,"['African Affairs', 'Foreign Affairs', 'Intell...",2011-07-22,Africa Command: U.S. Strategic Interests and t...,"In recent years, analysts and U.S. policymaker...",389551,20110722_RL34003_fda3a9182a25d4ccae4fe39536ac1...,RL34003.json,## Headquarters Location\n There has been cons...,"{'14': ""## Headquarters Location\n There has b...",[14],14,14,"[0.6275664, 0.72023827, -3.7677124, -0.6870053...",RL34003_14_14,0.432925
6,R41880,CRS Report,REPORTS,R41880,True,['Foreign Affairs'],2013-10-28,Foreign Assistance: Public-Private Partnership...,The flow of private sector resources to develo...,425223,20131028_R41880_ea28073f5445e71704dfb18376c7cc...,R41880.json,### Other Bilateral Agencies\n While USAID and...,{'12': '### Other Bilateral Agencies\n While U...,"[12, 13]",12,13,"[0.45230317, 1.6280302, -3.2728295, -0.3164047...",R41880_12_13,0.43663
7,IF11384,CRS In Focus,IF,IF11384,True,"['African Affairs', 'Foreign Affairs']",2020-11-17,The Trump Administration’s Prosper Africa Init...,,IF11384_6_2020-11-17,2020-11-17_IF11384_6ecbb285e4a5fe0205123846047...,IF11384.json,The Trump Administration’s Prosper Africa Init...,"{'2': ""The Trump Administration’s Prosper Afri...",[2],2,2,"[0.32424703, 1.8523045, -2.8766463, 0.07541474...",IF11384_2_2,0.437528
8,R45687,CRS Report,R,R45687,True,"['African Affairs', 'Foreign Affairs', 'Nation...",2020-09-17,"South Africa: Current Issues, Economy, and U.S...",,R45687_9_2020-09-17,2020-09-17_R45687_9ac66be2db5a89528302e6194976...,R45687.json,"*South Africa: Current Issues, Economy, and U....","{'9': '*South Africa: Current Issues, Economy,...",[9],9,9,"[0.2895086, 0.2675329, -3.1297789, -0.29616845...",R45687_9_9,0.440551
9,R44117,CRS Report,REPORTS,R44117,True,"['Appropriations', 'Economic Policy', 'Foreign...",2015-07-21,U.S. Agency for International Development (USA...,This report provides background information on...,443235,20150721_R44117_8021de4e0ae1c1185f341b8a174b72...,R44117.json,### Mission and Headquarters Roles\n Organizat...,{'16': '### Mission and Headquarters Roles\n O...,[16],16,16,"[0.015327629, 0.7716815, -3.2985578, -0.158780...",R44117_16_16,0.442825
