# Notebook for USASpending files

### This notebook will provide the steps for a pipeline script in Python to handle data

In [76]:
'''Turn f and t into booleans
Concat contract vehicles
Other concat in java folder'''

'Turn f and t into booleans\nConcat contract vehicles\nOther concat in java folder'

In [81]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import datetime

# Create small data set to pull date-parsing column names
df = pd.read_csv('_Data/TREAS/api_bulk_listmonthlyfiles/2015_020_Contracts_Full_20181212_1.csv').head()

# Extract date column names with regular expression
date_list = [col for col in df if re.search(r'date\b', col) is not None]

# Build full dataframe with parsed dates
df = pd.read_csv('_Data/TREAS/api_bulk_listmonthlyfiles/2015_020_Contracts_Full_20181212_1.csv',
                 parse_dates=date_list, infer_datetime_format=True)

# Parse the dates of problematic column: period_of_performance_start_date
df.loc[:, 'period_of_performance_start_date'] = pd.to_datetime(
    df['period_of_performance_start_date'], errors='coerce')

# Forward fill NaN values in parent_award_id with award_id_piid values
df.loc[:, 'parent_award_id'] = df['parent_award_id'].fillna(df['award_id_piid'])

# Create dataframes of vehicle and work
vehicle = pd.read_csv('_Data/TREAS/ContractVehicles.csv', 
                      names=['parent_award_id', 'contract_vehicle'])
work = pd.read_csv('_Data/TREAS/Work.csv')
work.columns = ['parent_award_id', 'award_id_piid', 'work']

# Join vehicle and work to file
df_new = df.merge(vehicle, on='parent_award_id')
df_new = pd.merge(df, work, on=['parent_award_id', 'award_id_piid'], how='left')

In [82]:
work

Unnamed: 0,parent_award_id,award_id_piid,work
0,2031JW18P00016,2031JW18P00016,All Other Contracts Under $2M Annually
1,2031JW18P00043,2031JW18P00043,All Other Contracts Under $2M Annually
2,2031ZA17C00080,2031ZA17C00080,All Other Contracts Under $2M Annually
3,2032H318C00010,2032H318C00010,Internal Control and Oversight Support
4,2032H518C00036,2032H518C00036,"Project Planning, Investment, and Governance (..."
5,2032H518C00045,2032H518C00045,All Other Contracts Under $2M Annually
6,2032H518C00056,2032H518C00056,All Other Contracts Under $2M Annually
7,2032H518C00065,2032H518C00065,All Other Contracts Under $2M Annually
8,2032H518C00072,2032H518C00072,Corporate Data Domain Data Services IT Support
9,2032H518C00082,2032H518C00082,All Other Contracts Under $2M Annually


In [78]:
for col in df:
    if len(df[col].unique()) == 2:
        if df[col].unique() != ['f', 't'] and df[col].unique() != ['t', 'f']:
            print('{} \n\n'.format(col), df[col].unique())
        
['f', 't']
['t', 'f']
['N', 'Y']

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [74]:
df['tribally_owned_business'].unique() == ['f', 't']

array([ True,  True])

In [78]:
df['contract_bundling_code'].value_counts()

D    2621
Name: contract_bundling_code, dtype: int64

In [79]:
len(df['contract_bundling_code'])

2626