# Notebook for USASpending files

### This notebook will provide the steps for a pipeline script in Python to handle data

In [None]:
'''
Remove rows with product code nn99
'''

In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import datetime
import requests
import sys
import os

# API pull for bulk download file URL
params =  {"agency": 22, "fiscal_year": 2016, "type": "contracts"}
r = requests.post("https://api.usaspending.gov/api/v2/bulk_download/list_monthly_files//", data=params)

# Use this txt as a dict in python
with open('Treasury.txt', 'wb') as file:
    file.write(r.content)

In [7]:
# Create small data set to pull date-parsing column names
col_list = list(pd.read_csv(
    '_Data/TREAS/api_bulk_listmonthlyfiles/2015_020_Contracts_Full_20181212_1.csv', low_memory=False).columns)

# Extract date column names with regular expression
date_list = [col for col in col_list if re.search(r'date\b', col) is not None]

# Build full dataframe with parsed dates
df = pd.read_csv('_Data/TREAS/api_bulk_listmonthlyfiles/2015_020_Contracts_Full_20181212_1.csv',
                 parse_dates=date_list, infer_datetime_format=True, low_memory=False)

# Parse the dates of problematic column: period_of_performance_start_date
df.loc[:, 'period_of_performance_start_date'] = pd.to_datetime(
    df['period_of_performance_start_date'], errors='coerce')

# Forward fill NaN values in parent_award_id with award_id_piid values
df.loc[:, 'parent_award_id'] = df['parent_award_id'].fillna(df['award_id_piid'])

# Create dataframes of vehicle and work
vehicle = pd.read_csv('_Data/TREAS/ContractVehicles.csv', 
                      names=['parent_award_id', 'contract_vehicle'])
work = pd.read_csv('_Data/TREAS/Work.csv')
work.columns = ['parent_award_id', 'award_id_piid', 'work']

# Join vehicle and work to file
df_new = df.merge(vehicle, on='parent_award_id')
df_new = pd.merge(df, work, on=['parent_award_id', 'award_id_piid'], how='left')

# Create fiscal year by adding '1' to months after September
# Need to verify WHICH column is used for fiscal year
def to_fiscal(date_column, index=df_new.index):
    df_new['fiscal_year'] = pd.to_datetime(date_column)
    fiscal_year = []
    for date in date_column:
        if date.month > 9:
            fiscal_year.append(date.year + 1)
        else:
            fiscal_year.append(date.year)
    return pd.Series(fiscal_year, index=index)
df_new['fiscal_year'] = to_fiscal(df_new['action_date'])

# Extract columns names with 't' and 'f' values only
bool_list = []
for col in df_new:
    if col != 'fiscal_year':
        if len(df_new[col].unique()) == 2 and df_new[col].dtype != '<M8[ns]':
            if 't' and 'f' in df[col].unique():
                bool_list.append(str(col))
        if len(df_new[col].unique()) < 2 and df_new[col].dtype != '<M8[ns]':
            if 't' in df[col].unique():
                df_new.loc[:, col] = True
            if 'f' in df[col].unique():
                df_new.loc[:, col] = False
            
# Overwrite columns with only 't' and 'f' with boolean values
for name in bool_list:
    df_new.loc[:, name] = pd.get_dummies(df_new[name]).astype('bool')['t']
    
# Return Excel xlsm file
# df_new.to_excel('Output.xlsm', index=False)



In [9]:
df_new['township_local_government'].value_counts()

False    23640
Name: township_local_government, dtype: int64

In [10]:
df_new['township_local_government']

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
23610    False
23611    False
23612    False
23613    False
23614    False
23615    False
23616    False
23617    False
23618    False
23619    False
23620    False
23621    False
23622    False
23623    False
23624    False
23625    False
23626    False
23627    False
23628    False
23629    False
23630    False
23631    False
23632    False
23633    False
23634    False
23635    False
23636    False
23637    False
23638    False
23639    False
Name: township_local_government, Length: 23640, dtype: bool

In [64]:
class usaspendingobj:
    
    def __init__(self, obj):
        # Create small data set to pull date-parsing column names
        col_list = list(pd.read_csv(
            obj).columns)

        # Extract date column names with regular expression
        date_list = [col for col in col_list if re.search(r'date\b', col) is not None]
        self.file = [col for col in col_list if re.search(r'date\b', col) is not None]

#         # Build full dataframe with parsed dates
#         df = pd.read_csv(obj,
#                          parse_dates=date_list, infer_datetime_format=True)

#         # Parse the dates of problematic column: period_of_performance_start_date
#         df.loc[:, 'period_of_performance_start_date'] = pd.to_datetime(
#             df['period_of_performance_start_date'], errors='coerce')

#         # Forward fill NaN values in parent_award_id with award_id_piid values
#         df.loc[:, 'parent_award_id'] = df['parent_award_id'].fillna(df['award_id_piid'])

#         # Create dataframes of vehicle and work
#         vehicle = pd.read_csv('_Data/TREAS/ContractVehicles.csv', 
#                               names=['parent_award_id', 'contract_vehicle'])
#         work = pd.read_csv('_Data/TREAS/Work.csv')
#         work.columns = ['parent_award_id', 'award_id_piid', 'work']

#         # Join vehicle and work to file
#         df_new = df.merge(vehicle, on='parent_award_id')
#         df_new = pd.merge(df, work, on=['parent_award_id', 'award_id_piid'], how='left')

#         # Create fiscal year by adding '1' to months after September
#         # Need to verify WHICH column is used for fiscal year
#         def to_fiscal(date_column, index=df_new.index):
#             df_new['fiscal_year'] = pd.to_datetime(date_column)
#             fiscal_year = []
#             for date in date_column:
#                 if date.month > 9:
#                     fiscal_year.append(date.year + 1)
#                 else:
#                     fiscal_year.append(date.year)
#             return pd.Series(fiscal_year, index=index)
#         df_new['fiscal_year'] = to_fiscal(df_new['action_date'])

#         # Extract columns names with 't' and 'f' values only
#         bool_list = []
#         for col in df_new:
#             if col != 'fiscal_year':
#                 if len(df_new[col].unique()) == 2 and df_new[col].dtype != '<M8[ns]':
#                     if 't' and 'f' in df[col].unique():
#                         bool_list.append(str(col))
#                 if len(df_new[col].unique()) < 2 and df_new[col].dtype != '<M8[ns]':
#                     if 't' in df[col].unique():
#                         df_new.loc[:, col] = True
#                     if 'f' in df[col].unique():
#                         df_new.loc[:, col] = False

#         # Overwrite columns with only 't' and 'f' with boolean values
#         for name in bool_list:
#             df_new.loc[:, name] = pd.get_dummies(df_new[name]).astype('bool')['t']
        
#         self.obj = df_new
#         # Return Excel xlsm file
#         # df_new.to_excel('Output.xlsm', index=False)

In [65]:
usaspendingobj('_Data/TREAS/api_bulk_listmonthlyfiles/2015_020_Contracts_Full_20181212_1.csv')

FileNotFoundError: File b'_Data/TREAS/api_bulk_listmonthlyfiles/2015_020_Contracts_Full_20181212_1.csv' does not exist

In [None]:
if __name__ == "__main__":

In [45]:
df_new['action_date']

0       2015-03-02
1       2015-01-13
2       2015-04-01
3       2015-01-21
4       2015-03-24
5       2015-08-21
6       2015-08-31
7       2014-11-24
8       2015-08-31
9       2015-07-07
10      2015-03-17
11      2015-08-31
12      2015-07-28
13      2014-10-17
14      2015-07-21
15      2015-09-26
16      2015-04-27
17      2014-11-05
18      2015-09-21
19      2015-07-28
20      2015-05-22
21      2015-09-28
22      2015-08-06
23      2014-12-23
24      2015-01-23
25      2014-10-14
26      2015-06-30
27      2015-01-28
28      2015-03-27
29      2014-12-12
           ...    
23610   2014-10-01
23611   2014-10-01
23612   2014-10-01
23613   2014-10-01
23614   2014-10-01
23615   2014-10-01
23616   2014-10-01
23617   2014-10-01
23618   2014-10-01
23619   2014-10-01
23620   2014-10-01
23621   2014-10-01
23622   2014-10-01
23623   2014-10-01
23624   2014-10-01
23625   2014-10-01
23626   2014-10-01
23627   2014-10-01
23628   2014-10-01
23629   2014-10-01
23630   2014-10-01
23631   2014

In [13]:
df_new.head(5)

Unnamed: 0,award_id_piid,modification_number,transaction_number,parent_award_agency_id,parent_award_agency_name,parent_award_id,parent_award_modification_number,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,...,veterinary_college,dot_certified_disadvantage,self_certified_small_disadvantaged_business,small_disadvantaged_business,c8a_program_participant,historically_underutilized_business_zone_hubzone_firm,sba_certified_8a_joint_venture,last_modified_date,work,fiscal_year
0,TFSAHUD15K0007,0,0,4732.0,FEDERAL ACQUISITION SERVICE,GS06F1197Z,2.0,65943.08,65943.08,65943.08,...,False,False,False,True,True,False,False,2018-12-10 20:40:30,,2015
1,TFSACFP14K0001,2,0,4732.0,FEDERAL ACQUISITION SERVICE,GS06F0806Z,0.0,0.0,3704818.88,0.0,...,False,False,False,True,True,False,False,2018-12-10 20:24:44,,2015
2,TMDN15MX0593,0,0,,,TMDN15MX0593,,5670.0,,5670.0,...,False,False,True,False,False,False,False,2015-04-01 00:00:00,,2015
3,TOSOFR14D00060001,1,0,4732.0,FEDERAL ACQUISITION SERVICE,GS06F0723Z,0.0,73773.76,442851.66,73773.76,...,False,False,False,True,True,False,False,2018-12-10 20:20:26,,2015
4,TFSAADF14K0010,700,0,4732.0,FEDERAL ACQUISITION SERVICE,GS06F0753Z,0.0,0.0,510112.1,0.0,...,False,True,True,True,True,False,False,2018-12-10 20:21:48,,2015


In [36]:
df1 = df_new.groupby(['work', 'award_id_piid'])[['period_of_performance_potential_end_date']].max()

# Shows we need to build a different function for fiscal on the multi-index
df1.index

MultiIndex(levels=[['4003 - IRS HSPD-12 Credential Issuance and Maintenance', '4014 - Electronic Fraud and Detection System O&M', '4019 - Joint Operations Center Support', '4025 - Integrated Financial System Upgrade and O&M Support', '4029 - Computer Security Incident Respons Center', '4045 - Audit Programs Support Services', '4050 - Computer Assisted Publishing System (CAPS)', '4053 - IRS E-Gov Travel Services Support', '4054 - Information Technology & Financial Management Support Services', '4055 - E-Services Integrated Customer Communication Environment (ICCE) Support', '4060 - Teir 1 Interfaces and Payment Distribution O&M Support', '4074 - IRS Safeguards Support', '4076 - Financial Management Information Systes (FMIS) & Redesign of the Revenue Accounting Controls System (RRACS) Support', '4077 - HR Connect Program Office (HRPO) IT Services', '4089 - Technical Support for the Enterprise Systems Testing Domain', '4103 - Patient Protection Affordable Care Act (PPACA) Program Manageme

In [42]:
df_new[df_new['work'] == 'Operations and Maintenance Support for ITI Middleware Components'][['work',
                                                                      'award_id_piid', 
                                                                      'period_of_performance_potential_end_date']]

Unnamed: 0,work,award_id_piid,period_of_performance_potential_end_date
6683,Operations and Maintenance Support for ITI Mid...,TIRNO12D000110029,2016-04-08
9015,Operations and Maintenance Support for ITI Mid...,TIRNO12D000110030,2016-04-01
10509,Operations and Maintenance Support for ITI Mid...,TIRNO12D000110027,2017-12-01
12038,Operations and Maintenance Support for ITI Mid...,TIRNO12D000110029,2016-04-06
12689,Operations and Maintenance Support for ITI Mid...,TIRNO12D000110027,2015-12-01
15061,Operations and Maintenance Support for ITI Mid...,TIRNO12D000110027,2015-12-01


In [43]:
df1.loc['Operations and Maintenance Support for ITI Middleware Components']

Unnamed: 0_level_0,period_of_performance_potential_end_date
award_id_piid,Unnamed: 1_level_1
TIRNO12D000110027,2017-12-01
TIRNO12D000110029,2016-04-08
TIRNO12D000110030,2016-04-01


In [8]:
df_again = pd.merge(df_new, df_new.groupby('work')[['period_of_performance_potential_end_date']].max(),
         on='work', how='left')

df_again