/
pull_soi_corp.py
177 lines (155 loc) · 7.18 KB
/
pull_soi_corp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
SOI Corp Data (pull_soi_corp.py):
------------------------------------------------------------------------
Module that handles reading in the soi corporate data. Contains a script
that loads in the capital stock information for total corporations and
s corporations. Based on the ratio of assets for total corporations,
data for s corporations is imputed. Finally, using the difference
between the two, the c corporation data can be allocated to all the
industries.
"""
# Packages:
import pandas as pd
# Directory names:
from data_paths import get_paths
globals().update(get_paths())
_DFLT_S_CORP_COLS_DICT = DFLT_S_CORP_COLS_DICT =\
dict([('depreciable_assets', 'DPRCBL_ASSTS'),
('accumulated_depreciation', 'ACCUM_DPR'), ('land', 'LAND'),
('inventories', 'INVNTRY'), ('interest_paid', 'INTRST_PD'),
('Capital_stock', 'CAP_STCK'),
('additional_paid-in_capital', 'PD_CAP_SRPLS'),
('earnings_(rtnd_appr.)', ''),
('earnings_(rtnd_unappr.)', 'COMP_RTND_ERNGS_UNAPPR'),
('cost_of_treasury_stock', 'CST_TRSRY_STCK'),
('depreciation', 'NET_DPR')])
_CORP_FILE_FCTR = 10 ** 3
_NAICS_COL_NM = 'INDY_CD'
_CODE_RANGE = ['32', '33', '45', '49']
_PARENTS = {'32': '31', '33': '31', '45': '44', '49': '48'}
def load_corp_data():
"""
Reads in the total corp and s corp data and calculates the c corp
data.
Args:
None
Returns:
corp_data (dictionary): dictionary containing DataFrames of SOI
corporate capital stock data
"""
cols_dict = _DFLT_S_CORP_COLS_DICT
# Dataframe column names
columns = list(str(x) for x in cols_dict.values())
columns.remove('')
# Opening the soi S-corporate data file:
try:
s_corp = pd.read_csv(_S_CORP_IN_PATH).fillna(0)
s_corp = s_corp.drop(s_corp[s_corp['AC'] > 1.].index)
# drop total across all industries
s_corp = s_corp.drop(s_corp[s_corp['INDY_CD'] == 1.].index)
# put in dollars (data in 1000s)
s_corp[columns] = s_corp[columns] * _CORP_FILE_FCTR
except IOError:
print("IOError: S-Corp soi data file not found.")
raise
# Opening the soi Total-corporate data file:
try:
tot_corp = pd.read_csv(_TOT_CORP_IN_PATH).fillna(0)
tot_corp = tot_corp.drop(tot_corp[tot_corp['AC'] > 1.].index)
# drop total across all industries
tot_corp = tot_corp.drop(tot_corp[tot_corp['INDY_CD'] == 1.].index)
tot_corp = tot_corp[['INDY_CD'] + columns].copy()
# put in dollars (data in 1000s)
tot_corp[columns] = tot_corp[columns] * _CORP_FILE_FCTR
except IOError:
print("IOError: total corp soi data file not found.")
raise
# read in crosswalk for bea and soi industry codes
soi_bea_ind_codes = pd.read_csv(_SOI_BEA_CROSS,
dtype={'bea_ind_code': str})
soi_bea_ind_codes.drop('notes', axis=1, inplace=True)
# drop one repeated minor ind code in crosswalk
soi_bea_ind_codes.drop_duplicates(subset=['minor_code_alt'],
inplace=True)
# merge codes to total corp data
# inner join means that we keep only rows that match in both datasets
# this should keep only unique soi minor industries
# in total corp data - note that s corp data already unique by sector
tot_corp = tot_corp.merge(
soi_bea_ind_codes, how='inner', left_on=['INDY_CD'],
right_on=['minor_code_alt'], suffixes=('_x', '_y'), copy=True)
# apportion s corp data across industries within sectors so has same
# level of industry detail as total corp data
s_corp = calc_proportions(tot_corp, s_corp, columns)
# merge s corp and total corp to find c corp only
c_corp = tot_corp.merge(
s_corp, how='inner', on=['INDY_CD'], suffixes=('_x', '_y'),
copy=True)
# calculate s corp values by minor industry using ratios
for var in columns:
c_corp[var] = c_corp[var + '_x'] - c_corp[var + '_y']
# clean up data by dropping and renaming columns
c_corp.drop(list(x + '_x' for x in columns), axis=1, inplace=True)
c_corp.drop(list(x + '_y' for x in columns), axis=1, inplace=True)
# NOTE:
# totals in s_corp match totals in SOI data
# totals in tot_corp match totals in SOI data if you sum over
# industries - but here and in raw SOI, summing over industries does
# not return value for "all industries". It's within 1%, but
# difference can't be accounted for (sum over industry > totals for
# all industries)
s_corp.rename(columns={"LAND": "Land", "INVNTRY": "Inventories",
"DPRCBL_ASSTS": "Fixed Assets",
"NET_DPR": "Depreciation",
"INDY_CD": "minor_code_alt"}, inplace=True)
c_corp.rename(columns={"LAND": "Land", "INVNTRY": "Inventories",
"DPRCBL_ASSTS": "Fixed Assets",
"NET_DPR": "Depreciation"},
inplace=True)
tot_corp.rename(columns={"LAND": "Land", "INVNTRY": "Inventories",
"DPRCBL_ASSTS": "Fixed Assets",
"NET_DPR": "Depreciation",
"INDY_CD": "minor_code_alt"}, inplace=True)
# Creates a dictionary of a sector : dataframe
corp_data = {'tot_corp': tot_corp, 'c_corp': c_corp, 's_corp': s_corp}
for k, v in corp_data.items():
v.rename({c: str(c) for c in v.columns})
return corp_data
def calc_proportions(tot_corp, s_corp, columns):
"""
Uses the ratio of the minor industry to the major industry to fill
in missing s corp data.
Args:
tot_corp (DataFrame): capital stock for all corporations
s_corp (DataFrame): capital stock for S-corporations
columns (list): names of variables to keep
Returns:
s_corp (DataFrame): capital stock for S-corporations with all
industries filled in
"""
# find ratio of variable in minor industry to variable in sector
# in total corp data
corp_ratios = tot_corp[['INDY_CD', 'sector_code'] + columns].copy()
for var in columns:
corp_ratios[var + '_ratio'] =\
tot_corp.groupby(['sector_code'])[var].apply(lambda x: x /
float(x.sum()))
corp_ratios.drop(columns, axis=1, inplace=True)
# new data w just ratios that will then merge to s corp data by
# sector code (many to one merge)
# first just keep s corp columns want_
# merge ratios to s corp data
s_corp = corp_ratios.merge(
s_corp, how='inner', left_on=['sector_code'],
right_on=['INDY_CD'], suffixes=('_x', '_y'), copy=True,
indicator=True)
# calculate s corp values by minor industry using ratios
for var in columns:
s_corp[var + '_final'] = s_corp[var] * s_corp[var + '_ratio']
# clean up data by dropping and renaming columns
s_corp.drop(['INDY_CD_y', '_merge', 'sector_code'] + columns, axis=1,
inplace=True)
s_corp.drop(list(x + '_ratio' for x in columns), axis=1, inplace=True)
s_corp.rename(columns={"INDY_CD_x": "INDY_CD"}, inplace=True)
s_corp.columns = s_corp.columns.str.replace('_final', '')
return s_corp