# FOIA Logs and Vector DB Metadata Demonstration

In [1]:
import pandas as pd

# FOIA Logs Data Exploration

In [2]:
logs_df = pd.read_csv('logs_tabular.csv')
logs_df = logs_df[['request_id','request','disposition','component']]
ref_df = pd.read_csv('foia_logs_index.csv')
#display a nice data snippet

view_df = logs_df.merge(ref_df, left_on='component', right_on='model_num')[[
    'request_id', 
    'request',
    'disposition',
    'abbreviation',
    'agency']]

view_df

Unnamed: 0,request_id,request,disposition,abbreviation,agency
0,CEQ-AGENCY-000001,Christy Goldfuss schedule from June 1 2015 thr...,,CEQ,AGENCY
1,CEQ-AGENCY-000002,List of special government employees as define...,,CEQ,AGENCY
2,CEQ-AGENCY-000003,FOIA Log Reports for month of June 2015,,CEQ,AGENCY
3,CEQ-AGENCY-000004,Logs of correspondence that record letters fro...,,CEQ,AGENCY
4,CEQ-AGENCY-000005,Ethics waivers in resolving financial conflict...,,CEQ,AGENCY
...,...,...,...,...,...
124381,VHA-VA-002496,LEXIS NEXIS is requesting Police Report #654I...,-,VHA,VA
124382,VHA-VA-002497,Job Announcement Number- CBDU-10626462-20- LP...,Granted/Denied in Part,VHA,VA
124383,VHA-VA-002498,To obtairj state of licensure and license nu...,Granted in Full,VHA,VA
124384,VHA-VA-002499,USA Job Applications scores 1) CBAY - 10509802...,,VHA,VA


#### Info and Shape

In [3]:
logs_df.disposition.value_counts().to_csv("disposition_type.csv")

In [4]:
## a sample of some data filtered to the CDC
logs_df[logs_df['component'] == 65].request

47922    credit counseling and debtor  education applic...
47923    emails the USTP sent or received  from the ema...
47924                      a third party's bankruptcy case
47925           the requester's corporate bankruptcy  case
47926                           credit counseling agencies
                               ...                        
48091                     US trustee performance of duties
48092                   Chapter 7 panel trustee complaints
48093                            a letter sent to the USTP
48094                                    a bankruptcy case
48095                                         IT purchases
Name: request, Length: 174, dtype: object

#### Further Exploration

In [5]:
## further exploration code

# FreqDocs Vector Metadata Exploration

In [6]:
vec_meta_df = pd.read_csv('freqdocs.csv')
vec_meta_df.head()

Unnamed: 0,title,component,abbreviation,parent_abbreviation,href,origination_url
0,American Opportunities Regional Center Inc,U.S. Citizenship & Immigration Services,USCIS,DHS,https://www.uscis.gov/sites/default/files/docu...,no_origin
1,1100.73-4 Reasonable Accommodation Program,Transportation Security Administration,TSA,DHS,https://www.tsa.gov/sites/default/files/foia-r...,no_origin
2,National Organization for Women (NOW),Federal Bureau of Investigation,FBI,DOJ,https://vault.fbi.gov/National%20Organization%...,no_origin
3,Prosperity Regional Center LLC formerly US Pro...,U.S. Citizenship & Immigration Services,USCIS,DHS,https://www.uscis.gov/sites/default/files/docu...,no_origin
4,900 Equal Employment Opportunity Policy Statement,Transportation Security Administration,TSA,DHS,https://www.tsa.gov/sites/default/files/foia-r...,no_origin


#### Info and Shape

In [7]:
vec_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38385 entries, 0 to 38384
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                38385 non-null  object
 1   component            38385 non-null  object
 2   abbreviation         38334 non-null  object
 3   parent_abbreviation  33055 non-null  object
 4   href                 38385 non-null  object
 5   origination_url      38385 non-null  object
dtypes: object(6)
memory usage: 1.8+ MB


#### Further Exploration

In [8]:
## further exploration code 
vec_meta_df.iloc[:5][['title','component','abbreviation','parent_abbreviation']]

Unnamed: 0,title,component,abbreviation,parent_abbreviation
0,American Opportunities Regional Center Inc,U.S. Citizenship & Immigration Services,USCIS,DHS
1,1100.73-4 Reasonable Accommodation Program,Transportation Security Administration,TSA,DHS
2,National Organization for Women (NOW),Federal Bureau of Investigation,FBI,DOJ
3,Prosperity Regional Center LLC formerly US Pro...,U.S. Citizenship & Immigration Services,USCIS,DHS
4,900 Equal Employment Opportunity Policy Statement,Transportation Security Administration,TSA,DHS


In [13]:
## unique agency components in FreqDocs

df = vec_meta_df[["abbreviation","parent_abbreviation"]].value_counts().reset_index()
df.columns = ["abbreviation","parent_abbreviation","count"]
df.head()
print(len(df))

df.to_csv("freqdocs_agencies.csv")

153
