# FOIA Logs and Vector DB Metadata Demonstration

In [1]:
import pandas as pd

# FOIA Logs Data Exploration

In [18]:
# Load the raw FOIA request log, keeping only the fields this notebook uses.
logs_df = pd.read_csv('logs_tabular.csv').loc[
    :, ['request_id', 'request', 'disposition', 'component']
]

# Reference index; its `model_num` column is the join key for `component`.
ref_df = pd.read_csv('foia_logs_index.csv')

# Attach the abbreviation/agency labels from the reference index to each
# request. The join is an inner join, so requests whose `component` has no
# matching `model_num` are not part of the view.
view_df = pd.merge(
    logs_df,
    ref_df,
    how='inner',
    left_on='component',
    right_on='model_num',
).loc[:, ['request_id', 'request', 'disposition', 'abbreviation', 'agency']]

view_df

Unnamed: 0,request_id,request,disposition,abbreviation,agency
0,CRCL-DHS-000001,"All complaints, final reports, and final memos...",,CRCL,DHS
1,CRCL-DHS-000002,Records concerning ICE home-based enforcement ...,,CRCL,DHS
2,CRCL-DHS-000003,"any and all records, which includes but is not...",,CRCL,DHS
3,CRCL-DHS-000004,"any recommendations, guidance, reports, or oth...",,CRCL,DHS
4,CRCL-DHS-000005,"All records regarding the transfer, position d...",,CRCL,DHS
...,...,...,...,...,...
110810,NASA-AGENCY-002496,"proposal submitted by L&M Technologies, Inc. r...",,NASA,AGENCY
110811,NASA-AGENCY-002497,Contract Number NNJ12JB86C,,NASA,AGENCY
110812,NASA-AGENCY-002498,Maritime Observation of Ships at Sea (MOSES) e...,,NASA,AGENCY
110813,NASA-AGENCY-002499,Contract Number NNJ08JA01C,,NASA,AGENCY


#### Info and Shape

In [28]:
# Tally how often each disposition value occurs and save the counts to disk.
# Note: value_counts() leaves out missing (NaN) dispositions by default.
(
    logs_df['disposition']
    .value_counts()
    .to_csv("disposition_type.csv")
)

In [22]:
# Preview the request text for the CDC records (component code 65).
logs_df.loc[logs_df['component'].eq(65), 'request']

43395    copies of proposals and attachments, CDC-RFA-P...
43396    respective records related to the "Outbreak of...
43397    for all information regarding the 2013 CDC mul...
43398    consultation on a FOIA request for Facility's ...
43399                          funded grants for PS10-1003
                               ...                        
45890    Re: IDPB 2015-0940 (Date Range for Record Sear...
45891    1. All documents related to complaints of susp...
45892    respective records showing updates to the CDC'...
45893    Stroke Data: Year, State, Type, Result in Deat...
45894    any documents regarding firefighter occupation...
Name: request, Length: 2500, dtype: object

#### Further Exploration

In [5]:
# TODO: add further exploration of the FOIA logs here.

# FreqDocs Vector Metadata Exploration

In [6]:
# Read the FreqDocs vector metadata export into a DataFrame.
vec_meta_df = pd.read_csv('freqdocs.csv')

# Peek at the first five records.
vec_meta_df.head(n=5)

Unnamed: 0,title,component,abbreviation,parent_abbreviation,href,origination_url
0,American Opportunities Regional Center Inc,U.S. Citizenship & Immigration Services,USCIS,DHS,https://www.uscis.gov/sites/default/files/docu...,no_origin
1,1100.73-4 Reasonable Accommodation Program,Transportation Security Administration,TSA,DHS,https://www.tsa.gov/sites/default/files/foia-r...,no_origin
2,National Organization for Women (NOW),Federal Bureau of Investigation,FBI,DOJ,https://vault.fbi.gov/National%20Organization%...,no_origin
3,Prosperity Regional Center LLC formerly US Pro...,U.S. Citizenship & Immigration Services,USCIS,DHS,https://www.uscis.gov/sites/default/files/docu...,no_origin
4,900 Equal Employment Opportunity Policy Statement,Transportation Security Administration,TSA,DHS,https://www.tsa.gov/sites/default/files/foia-r...,no_origin


#### Info and Shape

In [7]:
# Print a structural summary of the metadata frame: index range, per-column
# non-null counts and dtypes, and approximate memory usage.
vec_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38385 entries, 0 to 38384
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                38385 non-null  object
 1   component            38385 non-null  object
 2   abbreviation         38334 non-null  object
 3   parent_abbreviation  33055 non-null  object
 4   href                 38385 non-null  object
 5   origination_url      38385 non-null  object
dtypes: object(6)
memory usage: 1.8+ MB


#### Further Exploration

In [12]:
# Show just the descriptive fields (title and agency labels, without the URL
# columns) for the first five documents.
vec_meta_df[['title', 'component', 'abbreviation', 'parent_abbreviation']].head(5)

Unnamed: 0,title,component,abbreviation,parent_abbreviation
0,American Opportunities Regional Center Inc,U.S. Citizenship & Immigration Services,USCIS,DHS
1,1100.73-4 Reasonable Accommodation Program,Transportation Security Administration,TSA,DHS
2,National Organization for Women (NOW),Federal Bureau of Investigation,FBI,DOJ
3,Prosperity Regional Center LLC formerly US Pro...,U.S. Citizenship & Immigration Services,USCIS,DHS
4,900 Equal Employment Opportunity Policy Statement,Transportation Security Administration,TSA,DHS


In [14]:
# Count documents per (abbreviation, parent_abbreviation) pair and export the
# tally for use outside the notebook.
#
# dropna=False matters here: DataFrame.value_counts() discards any row that
# contains a NaN by default, and this frame has rows with a missing
# `abbreviation` or `parent_abbreviation` (see the non-null counts reported by
# `vec_meta_df.info()`). Without it those documents would be silently left out
# of the exported counts.
df = (
    vec_meta_df[["abbreviation", "parent_abbreviation"]]
    .value_counts(dropna=False)
    .reset_index()
)

# Default index=True is kept, so the RangeIndex is written as the first
# (unnamed) column of the CSV.
df.to_csv("freqdocs_agencies.csv")