### Combines household and population files, and subsets to relevant features

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from collections import Counter

### Data Dictionary (household features)

- SERIALNO: household number
- FES: married, heterosexual couples are 1, 2, 3 or 4
- SSMC: same-sex marriage flag
- HHL: household language (1=English only)
- NOC: number of children in household

In [5]:
# household-level columns to keep (see the data dictionary above)
hh_vars = [
    'SERIALNO',  # household number
    'SSMC',      # same-sex marriage flag
    'FES',       # 1, 2, 3 or 4 = married, heterosexual couple
    'HHL',       # household language (1 = English only)
    'NOC',       # number of children in household
]

In [31]:
def merge_all(year, keep_vars=None):
    '''Merge the household and population files for one year and save a subset.

    Reads the two household files (csv_hus/ss<YY>husa.csv and ...husb.csv) and
    the two population files (csv_pus/ss<YY>pusa.csv and ...pusb.csv), keeps
    married people (MAR == 1) who live in a same-sex or heterosexual married
    household, attaches the household features, keeps only households that
    contribute exactly two such people, and writes the result to sub<YY>.csv
    in the current working directory.

    Parameters
    ----------
    year : int
        Four-digit survey year, e.g. 2014.
    keep_vars : list of str, optional
        Household columns to attach to every person row. Must contain
        'SERIALNO', which is the merge key. Defaults to the notebook-level
        ``hh_vars`` list.

    Returns
    -------
    None
        The result is only written to disk.
    '''
    if keep_vars is None:
        keep_vars = hh_vars

    # two-digit year tag used in the file names; zero-padded so that a year
    # before 2010 gives e.g. '05' rather than '5'
    y = '{:02d}'.format(int(year) % 100)

    def merge_df(file1, file2):
        '''Read two CSV files and stack them into one frame with a fresh index.'''
        frames = [pd.read_csv(path) for path in (file1, file2)]
        return pd.concat(frames, ignore_index=True)

    # household files
    hh = merge_df("csv_hus/ss{}husa.csv".format(y),
                  "csv_hus/ss{}husb.csv".format(y))
    # blank line of output kept from the original cell; written as a call so
    # the statement parses on both Python 2 and Python 3
    print("")

    # household ids per group: same-sex = SSMC flagged (1 or 2) with FES
    # missing; heterosexual = FES in 1..4
    is_ss = hh['SSMC'].isin([1, 2]) & hh['FES'].isnull()
    is_het = hh['FES'].isin([1, 2, 3, 4])
    ss = hh.loc[is_ss, 'SERIALNO']
    het = hh.loc[is_het, 'SERIALNO']

    # population files
    pop = merge_df("csv_pus/ss{}pusa.csv".format(y),
                   "csv_pus/ss{}pusb.csv".format(y))

    # drop every column whose name contains "pwgtp" and keep married people only
    keep_cols = [col for col in pop.columns if "pwgtp" not in col]
    married = pop.loc[pop['MAR'] == 1, keep_cols]

    # .assign returns a new frame, so the flag is never written into a slice
    # of `pop` (avoids SettingWithCopyWarning)
    ss_df = married[married['SERIALNO'].isin(ss)].assign(gay=1)
    het_df = married[married['SERIALNO'].isin(het)].assign(gay=0)

    # same-sex rows first, then heterosexual rows
    pop_2 = pd.concat([ss_df, het_df], ignore_index=True)

    # add the household features back onto each person
    # NOTE(review): assumes SERIALNO is unique in the household files;
    # otherwise this left merge would duplicate people - TODO confirm
    pop_2 = pop_2.merge(hh[keep_vars], on='SERIALNO', how='left')
    pop_2['year'] = year

    # keep only households with exactly two rows, which keeps the spouse's
    # wage calculation simple
    house_counts = pop_2['SERIALNO'].value_counts()
    pop_2 = pop_2[pop_2['SERIALNO'].map(house_counts) == 2]

    pop_2.to_csv('sub{}.csv'.format(y), index=False)  # save out file


In [32]:
merge_all(2014) #create 2014 file: reads csv_hus/ss14hus{a,b}.csv and csv_pus/ss14pus{a,b}.csv, writes sub14.csv




In [33]:
merge_all(2015) # create 2015 file: reads csv_hus/ss15hus{a,b}.csv and csv_pus/ss15pus{a,b}.csv, writes sub15.csv




In [None]:
merge_all(2016) # create 2016 file (sub16.csv); add_spouse_salary('sub16.csv') further down reads this file

In [44]:
def add_spouse_salary(file_path):
    '''Add a `spouse_wage` column to the CSV at `file_path`, in place.

    For every row, `spouse_wage` is the total WAGP of the row's household
    (rows sharing a SERIALNO) minus the row's own WAGP. For a two-person
    household that is the other person's wage.

    Parameters
    ----------
    file_path : str
        Path to a CSV with at least the columns SERIALNO and WAGP. The file
        is overwritten with the extra column added (or refreshed, if the
        column is already present).

    Returns
    -------
    None
    '''
    df = pd.read_csv(file_path)  # read in file

    # household wage total broadcast back onto each row; vectorised, so no
    # per-row Python loop or label lookup is needed
    house_total = df.groupby('SERIALNO')['WAGP'].transform('sum')

    # whatever is left after removing a person's own wage belongs to the spouse
    df['spouse_wage'] = house_total - df['WAGP']

    df.to_csv(file_path, index=False)

In [45]:
add_spouse_salary('sub14.csv') # adds the spouse_wage column and overwrites sub14.csv in place

  if self.run_code(code, result):


In [46]:
add_spouse_salary('sub15.csv') # adds the spouse_wage column and overwrites sub15.csv in place

In [47]:
add_spouse_salary('sub16.csv') # adds the spouse_wage column and overwrites sub16.csv in place; needs the file written by merge_all(2016)