In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

import statsmodels.api as sm

## Load data

In [2]:
## Load data

gss = pd.read_csv('../../GSS_2012/ICPSR_35478/DS0001/35478-0001-Data.tsv', sep='\t')
gss.head()

Unnamed: 0,YEAR,ID,INTID,FEEUSED,FEELEVEL,DATEINTV,LNGTHINV,INTAGE,INTETHN,MODE,...,SAMPCODE,SAMPLE,OVERSAMP,WTSS,WTSSNR,WTSSALL,WTCOMB,WTCOMBNR,VSTRAT,VPSU
0,2012,1,49,1,75,721,69,60,1,1,...,601,10,1,2.621963,2.869532,2.621963,6.402159,7.006659,-1,-1
1,2012,2,150,1,75,624,53,32,1,1,...,601,10,1,3.49595,3.826043,3.49595,6.514477,7.129583,-1,-1
2,2012,3,150,1,75,627,77,32,1,1,...,601,10,1,1.747975,1.913021,1.747975,1.67113,1.82892,-1,-1
3,2012,4,49,1,20,527,78,60,1,1,...,601,10,1,1.235694,1.35237,1.235694,1.18137,1.292917,-1,-1
4,2012,5,235,1,75,620,149,62,1,1,...,601,10,1,0.873988,0.956511,0.873988,0.835565,0.91446,-1,-1


## Select useful features manually

In [3]:
## Most useful features with majority responses

gss_select = pd.DataFrame({
    'age': gss['AGE'], #p60
    'yob': gss['COHORT'],
    'sex': gss['SEX'], #p87
    'race': gss['RACECEN1'], #p291
    'siblings': gss['SIBS'], #p58
    'region': gss['REGION'], #p103
    'place_size_000s': gss['SIZE'], #p105
    'place_size': gss['XNORCSIZ'], #p104
    'beltcode': gss['SRCBELT'], #p105
    'work_status': gss['WRKSTAT'],
    'hours': gss['HRS1'], #p.21
    'self_employed': gss['WRKSLF'],
    'occupation': gss['OCC10'],
    'income': gss['RINCOME'], #p100
    'income_granular': gss['RINCOM06'], #p102
    'highest_school': gss['EDUC'], #p64
    'highest_degree': gss['DEGREE'], #p68
    'major': gss['MAJOR1'], #p70
    'residence_16' : gss['RES16'], #p88
    'region_16': gss['REG16'], #p89
    'family_16': gss['FAMILY16'], #90
    'live_who_16': gss['FAMDIF16'], #91
    'fam_income_16': gss['INCOM16'], #92
    'relig_raised': gss['RELIG16'], #p.186
    'geo_mobility': gss['MOBILE16'], #p90
    'mother_employed': gss['MAWRKGRW'], #92
    'father_school': gss['PAEDUC'], #p65
    'mother_school': gss['MAEDUC'], #p66
    'father_degree': gss['PADEG'], #p68
    'mother_degree': gss['MADEG'], #p69
    'father_occupation': gss['PAOCC10'], #p.45
    'mother_occupation': gss['MAOCC10'], #p51
    'father_self_employed': gss['PAWRKSLF'], #p.45
    'mother_self_employed': gss['MAWRKSLF'], #p51
    'class': gss['CLASS'], #p.236
    'happiness': gss['HAPPY'], #p210
    'pol_party': gss['PARTYID'], #p140
    'pol_views': gss['POLVIEWS'], #p142
    'vote_2008': gss['VOTE08'], #p140
    'pres_2008': gss['PRES08'], #p141
    'if_pres_2008': gss['IF08WHO'], #P141
    'religion': gss['RELIG'], #p.175
    'religion_strength': gss['RELITEN'], #p183
    'fundamentalist': gss['FUND'], #p182
    'rel_attend': gss['ATTEND'], #p182
    'rel_person': gss['RELPERSN'],
    'spi_person': gss['SPRTPRSN'],
    'post_life': gss['POSTLIFE'], #p184
    'pray': gss['PRAY'], #p184
    'bible': gss['BIBLE'], #p206
    'god': gss["GOD"],
    'born_again': gss['REBORN'],
    'save_soul': gss['SAVESOUL'],
    'rel_activities': gss['RELACTIV'],
    'op_road': gss['NATROAD'], #p149
    'op_socsec': gss['NATSOC'], #p150
    'op_masstrans': gss['NATMASS'], #p151
    'op_parks': gss['NATPARK'], #p152
    'op_childcare': gss['NATCHLD'], #p153
    'op_scires': gss['NATSCI'], #p154
    'op_cappun': gss['CAPPUN'], #p173
    'op_gunlaws': gss['GUNLAW'], #173
    'op_criminals': gss['COURTS'] #p174
    
})

In [4]:
## Features that might have label leakage

gss_may_hint = pd.DataFrame({
    'dwelling_type': gss['DWELLING'], #p295
    'children': gss['CHILDS'], #p59
    'age_1st_child': gss['AGEKDBRN'], #P62
    'sex_orient': gss['SEXORNT'] #p277
})

In [5]:
## Outcome variable

gss_marital_status = pd.DataFrame({
    'marital_status' : gss['MARITAL'],
    'ever_divorced' : gss['DIVORCE'],
    'ever_widowed' : gss['WIDOWED']
})

In [6]:
## marital_status: 1-Married, 2-Widowed, 3-Divorced, 4-Separated, 5-Never married, 9-No answer
## ever_divorced: 1-Yes, 2-No, 0-Inapplicable, 8-Don't know, 9-No answer
## ever_widowed: 1-Yes, 2-No, 0-Inapplicable, 8-Don't know, 9-No answer
gss_marital_status.head()

Unnamed: 0,marital_status,ever_divorced,ever_widowed
0,5,0,0
1,5,0,0
2,1,2,2
3,1,1,2
4,4,0,2


In [7]:
## Create binary outcome variable

gss_divsep = pd.DataFrame({
    'ever_divsep': np.where(gss_marital_status['marital_status'].isin([3, 4]), 1, 0) + np.where(gss_marital_status['ever_divorced'] == 1, 1, 0)
})

In [8]:
gss_divsep.head()

Unnamed: 0,ever_divsep
0,0
1,0
2,0
3,1
4,1


In [9]:
## Why so many non responses for income?

gss_income = gss.loc[gss['RINCOM06'] == 0, ]

In [10]:
gss_income.VERSION.value_counts()

2    619
3    577
1    566
Name: VERSION, dtype: int64

In [11]:
gss_income.SAMPTYPE.value_counts()

2012    723
2010    567
2008    472
Name: SAMPTYPE, dtype: int64

## Create features and record NAs

In [12]:
## Create features and record NAs

In [13]:
dummies_needed = []
gss_features = pd.DataFrame()
gss_nas = pd.DataFrame({
    'variable': gss_select.columns,
    'inapplicable': np.nan,
    'dont_know': np.nan,
    'no_answer': np.nan
})

gss_nas.set_index(['variable'], drop=True, inplace=True)
gss_nas.head()

Unnamed: 0_level_0,inapplicable,dont_know,no_answer
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
age,,,
yob,,,
sex,,,
race,,,
siblings,,,


In [14]:
## Filling in gss_nas

def fill_gn(row, col, value):
    gss_nas.loc[row, col] = (gss_select[row] == value).sum()
    

#### Age

In [15]:
## Fill in NAs
gss_nas.loc['age', 'dont_know'] = (gss_select['age'] == 98).sum()
fill_gn('age', 'no_answer', 99)

## Create feature with NAs
gss_features['age'] = np.where(gss_select['age'] == 99, np.nan, gss_select['age'])

## Display results
print(gss_nas.head())
print(gss_features.head())

          inapplicable  dont_know  no_answer
variable                                    
age                NaN        0.0       51.0
yob                NaN        NaN        NaN
sex                NaN        NaN        NaN
race               NaN        NaN        NaN
siblings           NaN        NaN        NaN
    age
0  22.0
1  21.0
2  42.0
3  49.0
4  70.0


#### YOB

In [16]:
## Ignoring this as duplicates age

#### Sex (p.87)

In [17]:
## 1-Male, 2-Female in survey
## Code to 0-Female, 1-Male

gss_features['sex'] = np.where(gss_select['sex'] == 2, 0, 1)

#### Race (p.291)

In [18]:
fill_gn('race', 'dont_know', 98)
fill_gn('race', 'no_answer', 99)

## If less than 1% code as other
race_dict = {
    1: 'white',
    2: 'black',
    3: 'american_indian',
    4: 'other', #'asian indian', 
    5: 'other', #'chinese', 
    6: 'other', #'filipino', 
    7: 'other', #'japanese', 
    8: 'other', #'korean', 
    9: 'other', # vietnamese
    10: 'other', #other asian 
    11: 'other', #native hawaiian 
    12: 'other', #Guamanian or Chamorro 
    13: 'other', #Samoan 
    14: 'other', #Other pacific islander 
    15: 'other', #another race 
    16: 'hispanic', 
    98: 'other',
    99: 'other'
}

gss_features['race'] = gss_select['race'].replace(race_dict)
dummies_needed.append('race')

## Can code 98 and 99 as 'other' because if obvious interviewer codes it.

#### Siblings p.58

In [19]:
fill_gn('siblings', 'dont_know', 98)
fill_gn('siblings', 'no_answer', 99)
gss_features['siblings'] = np.where(gss_select['siblings'].isin([98, 99]), np.nan, gss_select['siblings'])

#### Region p.103

In [20]:
region_dict = {
    1: 'New_England',
    2: 'Middle_Atlantic',
    3: 'East_North_Central',
    4: 'West_North_Central',
    5: 'South_Atlantic',
    6: 'East_South_Central',
    7: 'West_South_Central',
    8: 'Mountain',
    9: 'Pacific',
    0: np.nan
}

fill_gn('region', 'no_answer', 0)

gss_features['region'] = gss_select['region'].replace(region_dict)

dummies_needed.append('region')

#### Place size 000s p.105

In [21]:
gss_select.loc[gss_select['place_size_000s'] <0,]

Unnamed: 0,age,yob,sex,race,siblings,region,place_size_000s,place_size,beltcode,work_status,...,rel_activities,op_road,op_socsec,op_masstrans,op_parks,op_childcare,op_scires,op_cappun,op_gunlaws,op_criminals


In [22]:
## No missing values (-1) so no cleaning needed

gss_features['place_size_000s'] = gss_select['place_size_000s']

#### Place size p.104

In [23]:
## Not used - replicates previous

#### Beltcode p.105

In [24]:
## Use this to code urban / suburban / countryside

urban_dict = {
    1: 'urban',
    2: 'urban',
    3: 'suburban',
    4: 'suburban',
    5: 'urban',
    6: 'rural', 
}

gss_features['urban'] = gss_select['beltcode'].replace(urban_dict)

dummies_needed.append('beltcode')

#### Work status p.20

In [25]:
## Potential leakage here from keeping house and part time. Ignore.

#### Hours worked last week p.21

In [26]:
## -1 is inapplicable, 98 is don't know, 99 is no answer

fill_gn('hours', 'inapplicable', -1)
fill_gn('hours', 'dont_know', 98)
fill_gn('hours', 'no_answer', 99)

gss_features['hours_worked'] = gss_select['hours']

gss_features.loc[gss_features['hours_worked'] == -1, 'hours_worked'] = 0

gss_features.loc[gss_features['hours_worked'].isin([98,99]), 'hours_worked'] = np.nan


#### Self employed p.25

In [27]:
## Only 131 inapplicables doesn't make sense with work status results.
## Leave out.

#### Occupation p.26

In [28]:
## The numbers do not tally with the totals for each occupation in 
## appendix F. Maybe Appendix F doesn't include all panels?

len(gss['OCC10'].unique())

421

In [29]:
occs = gss_select['occupation'].value_counts().to_frame('counts').sort_values('counts', ascending=False)
occs['percentage'] = occs['counts'] / occs['counts'].sum()
occs.loc[occs['percentage'] > 0.015,]

Unnamed: 0,counts,percentage
5700,145,0.030083
0,131,0.027178
4760,110,0.022822
4220,110,0.022822
3600,108,0.022407
2310,101,0.020954
9130,96,0.019917
3255,96,0.019917
430,94,0.019502
5240,76,0.015768


In [30]:
## Too small - code by general occupation area.

occ_dict = {
    0: 'none',
    430: 'manager',
    950: 'business',
    1240: 'computers_maths',
    1560: 'engineering',
    1965: 'science',
    2060: 'social_services',
    2160: 'legal',
    2550: 'education',
    2960: 'arts',
    3540: 'healthcare_technical',
    3600: 'healthcare_support',
    3955: 'protective_services',
    4160: 'food_prep_serving',
    4250: 'cleaning_maintenance',
    4650: 'personal_care',
    4965: 'sales',
    5940: 'administrative_support',
    6130: 'farming_fishing_forestry',
    6940: 'construction',
    7630: 'installation',
    8965: 'production',
    9750: 'transportation',
    9830: 'military',
    9999: np.nan
    
}

gss_occs = pd.DataFrame()
gss_occs['occs'] = gss_select['occupation']
gss_occs['occs_summary'] = gss_occs['occs']

for occ in occ_dict:
    for row in np.arange(0, len(gss_occs)):
        if(isinstance(gss_occs.loc[row, 'occs_summary'], str)): 
            continue
        if(gss_occs.loc[row, 'occs_summary'] <= occ): 
                gss_occs.loc[row, 'occs_summary'] = occ_dict[occ]
    
gss_features['occupation'] = gss_occs['occs_summary']
dummies_needed.append('occupation')

#### Income p.102

In [31]:
## Use mid point of ranges

income_dict = {
    1: 500,
    2: 1500,
    3: 3500,
    4: 4500,
    5: 5500,
    6: 6500,
    7: 7500,
    8: 9000,
    9: 11250,
    10: 13750,
    11: 16250,
    12: 18750,
    13: 21250,
    14: 23750,
    15: 27500,
    16: 32500,
    17: 37500,
    18: 45000,
    19: 55000,
    20: 67500,
    21: 82500,
    22: 100000,
    23: 120000,
    24: 140000,
    25: 160000 #guess because uncapped
}

In [32]:
## 1762 inapplicable, 170 refused and 98 don't know. 
## Impute using logical rules.


## Separate the complete from the incomplete
## The complete are used to predict the incomplete

gss_income_impute = gss_select.loc[gss_select['income_granular'].isin([0, 26, 98]), ['age', 'occupation', 'income_granular', 'hours']]
gss_income_impute['occs_summary'] = gss_occs.loc[gss_occs.index.isin(gss_income_impute.index), 'occs_summary']
gss_income_complete = gss_select.loc[~gss_select.index.isin(gss_income_impute.index), ]
gss_income_complete['occs_summary'] = gss_occs.loc[gss_occs.index.isin(gss_income_complete.index), 'occs_summary']
gss_income_impute.reset_index(inplace=True)
gss_income_impute.rename(columns={'index': 'old_index'}, inplace=True)
gss_income_complete.reset_index(inplace=True)
gss_income_complete.rename(columns={'index': 'old_index'}, inplace = True)
gss_income_impute['income_impute'] = np.nan

gss_income_complete['income_granular'] = gss_income_complete['income_granular'].replace(income_dict)
gss_income_complete.loc[gss_income_complete['hours'].isin([-1, 98, 99]), 'hours'] = np.nan
gss_income_impute.loc[gss_income_impute['hours'].isin([-1, 98, 99]), 'hours'] = np.nan
gss_income_complete['income_per_hourwk'] = gss_income_complete['income_granular'] / gss_income_complete['hours']

len(gss_income_impute.loc[gss_income_impute['income_impute'].isnull(), ])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexi

1988

In [33]:
gss_income_impute.head()

Unnamed: 0,old_index,age,occupation,income_granular,hours,occs_summary,income_impute
0,0,22,4800,0,15.0,sales,
1,1,21,2900,0,30.0,arts,
2,3,49,800,0,,business,
3,4,70,3800,0,,protective_services,
4,5,50,1300,0,,engineering,


In [34]:
gss_income_complete.income_per_hourwk.isnull().value_counts()

False    2468
True      364
Name: income_per_hourwk, dtype: int64

In [35]:
occ_counts = pd.DataFrame(gss_income_complete['occupation'].value_counts())
occ_counts.head()

Unnamed: 0,occupation
4760,65
430,65
4220,65
3255,63
5700,62


In [36]:
occ_counts_age = pd.DataFrame(gss_income_complete.groupby(['age'])['occupation'].value_counts())
occ_counts_age.columns = ['counts']
occ_counts_age.reset_index(inplace=True)
occ_counts_age['age_occ'] = occ_counts_age['age'].astype('str') +'_'+ occ_counts_age['occupation'].astype('str')
occ_counts_age.head()

Unnamed: 0,age,occupation,counts,age_occ
0,18,4050,1,18_4050
1,18,4610,1,18_4610
2,18,5240,1,18_5240
3,18,5400,1,18_5400
4,18,7630,1,18_7630


In [37]:
## Is there someone with same job?

gss_income_impute['same_job'] = np.nan

for row in np.arange(0, len(gss_income_impute)):
    try:
        if(occ_counts.loc[gss_income_impute.loc[row, 'occupation'], ].item() > 0):
            gss_income_impute.loc[row, 'same_job'] = True
    except KeyError:
        continue
    
gss_income_impute.head(10)

Unnamed: 0,old_index,age,occupation,income_granular,hours,occs_summary,income_impute,same_job
0,0,22,4800,0,15.0,sales,,True
1,1,21,2900,0,30.0,arts,,True
2,3,49,800,0,,business,,True
3,4,70,3800,0,,protective_services,,True
4,5,50,1300,0,,engineering,,True
5,6,35,4760,0,,sales,,True
6,7,24,0,0,,none,,
7,8,28,0,0,,none,,
8,10,55,6200,0,,construction,,True
9,12,28,0,0,,none,,


In [38]:
## Now do same job and same age

gss_income_impute['same_age_and_job'] = np.nan

for row in np.arange(0, len(gss_income_impute)):
    row_ind = gss_income_impute.loc[row, 'age'].astype('str') + '_' + gss_income_impute.loc[row, 'occupation'].astype('str')
    if(occ_counts_age.loc[occ_counts_age['age_occ'] == row_ind, 'counts'].empty):
        continue
    elif(occ_counts_age.loc[occ_counts_age['age_occ'] == row_ind, 'counts'].item() > 0):
        gss_income_impute.loc[row, 'same_age_and_job'] = True

In [39]:
len(gss_income_impute.loc[gss_income_impute['same_age_and_job'] == True])

441

In [40]:
## Now do same job and similar age (+/- 3 years)

gss_income_impute['sim_age_same_job'] = np.nan

for row in np.arange(2, len(gss_income_impute)):
    for age in np.arange(gss_income_impute.loc[row, 'age'].item() - 3, gss_income_impute.loc[row, 'age'].item() + 4):
        if(gss_income_impute.loc[row, 'sim_age_same_job'] == True):
            continue
        else:
            row_ind = age.astype('str') + '_' + gss_income_impute.loc[row, 'occupation'].astype('str')
            if(occ_counts_age.loc[occ_counts_age['age_occ'] == row_ind, 'counts'].empty):
                continue
            elif(occ_counts_age.loc[occ_counts_age['age_occ'] == row_ind, 'counts'].item() > 0):
                gss_income_impute.loc[row, 'sim_age_same_job'] = True

## Now do same job and similar age but +/- 10 years

gss_income_impute['sim_age_same_job_20'] = np.nan

for row in np.arange(2, len(gss_income_impute)):
    for age in np.arange(gss_income_impute.loc[row, 'age'].item() - 10, gss_income_impute.loc[row, 'age'].item() + 11):
        if(gss_income_impute.loc[row, 'sim_age_same_job_20'] == True):
            continue
        else:
            row_ind = age.astype('str') + '_' + gss_income_impute.loc[row, 'occupation'].astype('str')
            if(occ_counts_age.loc[occ_counts_age['age_occ'] == row_ind, 'counts'].empty):
                continue
            elif(occ_counts_age.loc[occ_counts_age['age_occ'] == row_ind, 'counts'].item() > 0):
                gss_income_impute.loc[row, 'sim_age_same_job'] = True

In [41]:
print(gss_income_impute.loc[gss_income_impute['sim_age_same_job']==True,].shape)
gss_income_impute.loc[gss_income_impute['occupation'] == 4020, ].head()

(1391, 11)


Unnamed: 0,old_index,age,occupation,income_granular,hours,occs_summary,income_impute,same_job,same_age_and_job,sim_age_same_job,sim_age_same_job_20
324,811,72,4020,0,,food_prep_serving,,True,,True,
438,1106,23,4020,0,20.0,food_prep_serving,,True,True,True,
510,1256,29,4020,0,25.0,food_prep_serving,,True,True,True,
526,1285,22,4020,0,,food_prep_serving,,True,True,True,
577,1418,33,4020,0,,food_prep_serving,,True,,True,


In [42]:
## Imputing the income

gss_income_impute['income_impute'] = np.nan

## 9810 is first line enlisted military supervisors - lots missing
## Average salary is $63,738 : https://datausa.io/profile/soc/552010/
## Code as $67,500 as within that band
gss_income_impute.loc[gss_income_impute['occupation'] == 9810, 'income_impute'] = 67500

## If no hours then zero pay
gss_income_impute.loc[gss_income_impute['occupation'].isin([0, 9999]), 'income_impute'] = 0

## If same age and job then same income, weighted by hours
for row in np.arange(2, 3):#len(gss_income_impute)):
    if (np.isnan(gss_income_impute.loc[row, 'income_impute'])):
        age = gss_income_impute.loc[row, 'age'].item()
        occ = gss_income_impute.loc[row, 'occupation'].item()
        if(gss_income_impute.loc[row, 'same_age_and_job'] == True):
            replicate_df = gss_income_complete.loc[(gss_income_complete['age'] == age) & (gss_income_complete['occupation'] == occ), ]
            replicate_df.dropna(subset=['hours'], inplace=True)
            av_income_per_hourwk = replicate_df['income_per_hourwk'].mean()
            av_hours = replicate_df['hours'].mean()
            if(np.isnan(gss_income_impute.loc[row, 'hours'])):
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * av_hours
            else:
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * gss_income_impute.loc[row, 'hours'].item()


## If similar age and same job then same income, weighted by hours
for row in np.arange(0, len(gss_income_impute)):
    if(np.isnan(gss_income_impute.loc[row, 'income_impute'])):
        ages = np.arange(gss_income_impute.loc[row, 'age'].item() - 3, gss_income_impute.loc[row, 'age'].item() +4)
        occ = gss_income_impute.loc[row, 'occupation'].item()
        if(gss_income_impute.loc[row, 'sim_age_same_job'] == True):
            replicate_df = gss_income_complete.loc[(gss_income_complete['age'].isin(ages)) & (gss_income_complete['occupation'] == occ), ]
            replicate_df.dropna(subset=['hours'], inplace=True)
            av_income_per_hourwk = replicate_df['income_per_hourwk'].mean()
            av_hours = replicate_df['hours'].mean()
            if(np.isnan(gss_income_impute.loc[row, 'hours'])):
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * av_hours
            else:
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * gss_income_impute.loc[row, 'hours'].item()

                
## If age within 20 yrs and same job then same income, weighted by hours
for row in np.arange(0, len(gss_income_impute)):
    if(np.isnan(gss_income_impute.loc[row, 'income_impute'])):
        ages = np.arange(gss_income_impute.loc[row, 'age'].item() - 10, gss_income_impute.loc[row, 'age'].item() +11)
        occ = gss_income_impute.loc[row, 'occupation'].item()
        if(gss_income_impute.loc[row, 'sim_age_same_job_20'] == True):
            replicate_df = gss_income_complete.loc[(gss_income_complete['age'].isin(ages)) & (gss_income_complete['occupation'] == occ), ]
            replicate_df.dropna(subset=['hours'], inplace=True)
            av_income_per_hourwk = replicate_df['income_per_hourwk'].mean()
            av_hours = replicate_df['hours'].mean()
            if(np.isnan(gss_income_impute.loc[row, 'hours'])):
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * av_hours
            else:
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * gss_income_impute.loc[row, 'hours'].item()

                
## If same job then same income, weighted by hours
for row in np.arange(0, len(gss_income_impute)):
    if(np.isnan(gss_income_impute.loc[row, 'income_impute'])):
        occ = gss_income_impute.loc[row, 'occupation'].item()
        if(gss_income_impute.loc[row, 'same_job'] == True):
            replicate_df = gss_income_complete.loc[gss_income_complete['occupation'] == occ, ]
            replicate_df.dropna(subset=['hours'], inplace=True)
            av_income_per_hourwk = replicate_df['income_per_hourwk'].mean()
            av_hours = replicate_df['hours'].mean()
            if(np.isnan(gss_income_impute.loc[row, 'hours'])):
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * av_hours
            else:
                gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * gss_income_impute.loc[row, 'hours'].item()

                
## If same job then same income, unable to weight by hours
for row in np.arange(0, len(gss_income_impute)):
    if(np.isnan(gss_income_impute.loc[row, 'income_impute'])):
        occ = gss_income_impute.loc[row, 'occupation'].item()
        if(gss_income_impute.loc[row, 'same_job'] == True):
            replicate_df = gss_income_complete.loc[gss_income_complete['occupation'] == occ, ]
            av_income = replicate_df['income_granular'].mean()
            gss_income_impute.loc[row, 'income_impute'] = av_income

            
## If same industry then same mean income, weighted by hours
for row in np.arange(0, len(gss_income_impute)):
    if(np.isnan(gss_income_impute.loc[row, 'income_impute'].item())):
        industry = gss_income_impute.loc[row, 'occs_summary']
        replicate_df = gss_income_complete.loc[gss_income_complete['occs_summary'] == industry, ]
        replicate_df.dropna(subset=['hours'], inplace=True)
        av_income_per_hourwk = replicate_df['income_per_hourwk'].mean()
        av_hours = replicate_df['hours'].mean()
        if (np.isnan(gss_income_impute.loc[row, 'hours'])):
            gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * av_hours
        else:
            gss_income_impute.loc[row, 'income_impute'] = av_income_per_hourwk * gss_income_impute.loc[row, 'hours'].item()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [43]:
len(gss_income_impute.loc[gss_income_impute['income_impute'].isnull(),])

0

In [44]:
gss_income_impute.head()

Unnamed: 0,old_index,age,occupation,income_granular,hours,occs_summary,income_impute,same_job,same_age_and_job,sim_age_same_job,sim_age_same_job_20
0,0,22,4800,0,15.0,sales,21207.724567,True,,,
1,1,21,2900,0,30.0,arts,50510.204082,True,,,
2,3,49,800,0,,business,67500.0,True,True,True,
3,4,70,3800,0,,protective_services,40505.032206,True,,,
4,5,50,1300,0,,engineering,100000.0,True,,True,


In [45]:
gss_select['imputed_income'] = np.nan

for row in np.arange(0, len(gss_select)):
    if (gss_select.iloc[gss_select.index == row, ].index.item() in gss_income_impute['old_index'].values):
        gss_select.loc[row, 'imputed_income'] = gss_income_impute.loc[gss_income_impute['old_index'] == row, 'income_impute'].item()
    else:
        gss_select.loc[row, 'imputed_income'] = gss_select.loc[row, 'income_granular']

In [46]:
gss_select.loc[gss_select['imputed_income'].isnull(), ]

Unnamed: 0,age,yob,sex,race,siblings,region,place_size_000s,place_size,beltcode,work_status,...,op_road,op_socsec,op_masstrans,op_parks,op_childcare,op_scires,op_cappun,op_gunlaws,op_criminals,imputed_income


In [47]:
gss_features['income'] = gss_select['imputed_income']

In [48]:
gss_features.head()

Unnamed: 0,age,sex,race,siblings,region,place_size_000s,urban,hours_worked,occupation,income
0,22.0,1,white,1.0,New_England,14,suburban,15.0,sales,21207.724567
1,21.0,1,white,2.0,New_England,14,suburban,30.0,arts,50510.204082
2,42.0,1,hispanic,1.0,New_England,14,suburban,60.0,engineering,23.0
3,49.0,0,white,2.0,New_England,14,suburban,0.0,business,67500.0
4,70.0,0,black,0.0,Middle_Atlantic,24,urban,0.0,protective_services,40505.032206


#### Highest school p.64 and highest degree p.68

In [51]:
## Just use highest degree because school is number of years and
## degree includes those who didn't finish school

gss_select.loc[0:10, ['highest_school', 'highest_degree']]

Unnamed: 0,highest_school,highest_degree
0,16,3
1,12,1
2,12,1
3,13,1
4,16,3
5,19,3
6,15,2
7,11,0
8,9,0
9,17,3


In [53]:
degree_dict = {
    0: 'dnf_school',
    1: 'high_school',
    2: 'junior_college',
    3: 'bachelor',
    4: 'graduate',
}

gss_features['education'] = gss_select['highest_degree'].replace(degree_dict)
dummies_needed.append('education')

#### Major p.70

In [54]:
## Ignore as only relevant for 700 respondents and likely to overlap
## occupation

#### Residence age 16 p.88

In [56]:
fill_gn('residence_16', 'dont_know', 8)
fill_gn('residence_16', 'no_answer', 9)

residence16_dict = {
    1: 'rural',
    2: 'rural',
    3: 'urban',
    4: 'urban',
    5: 'suburban',
    6: 'urban',
    8: np.nan,
    9: np.nan
}

gss_features['residence_16'] = gss_select['residence_16'].replace(residence16_dict)
dummies_needed.append('residence_16')

#### Region age 16 p.89

In [57]:
region16_dict = {
    1: 'New_England',
    2: 'Middle_Atlantic',
    3: 'East_North_Central',
    4: 'West_North_Central',
    5: 'South_Atlantic',
    6: 'East_South_Central',
    7: 'West_South_Central',
    8: 'Mountain',
    9: 'Pacific',
    0: 'foreign'
}

gss_features['region_16'] = gss_select['region_16'].replace(region16_dict)
dummies_needed.append('region_16')

#### Family age 16 p.90

In [58]:
fill_gn('family_16', 'no_answer', 9)

family16_dict = {
    0: 'other',
    1: 'both_parents',
    2: 'father_stepmother',
    3: 'mother_stepfather',
    4: 'father_only',
    5: 'mother_only',
    6: 'male_relative',
    7: 'female_relative',
    8: 'm_f_relatives',
    9: np.nan
}

gss_features['family_16'] = gss_select['family_16'].replace(family16_dict)
dummies_needed.append('family_16')

In [59]:
gss_features.head()

Unnamed: 0,age,sex,race,siblings,region,place_size_000s,urban,hours_worked,occupation,income,education,residence_16,region_16,family_16
0,22.0,1,white,1.0,New_England,14,suburban,15.0,sales,21207.724567,bachelor,urban,New_England,both_parents
1,21.0,1,white,2.0,New_England,14,suburban,30.0,arts,50510.204082,high_school,urban,New_England,mother_only
2,42.0,1,hispanic,1.0,New_England,14,suburban,60.0,engineering,23.0,high_school,urban,Middle_Atlantic,mother_only
3,49.0,0,white,2.0,New_England,14,suburban,0.0,business,67500.0,high_school,suburban,New_England,both_parents
4,70.0,0,black,0.0,Middle_Atlantic,24,urban,0.0,protective_services,40505.032206,bachelor,urban,Middle_Atlantic,mother_stepfather


### Code dummies

In [49]:
## Race
#gss_race_dummies = pd.get_dummies(gss_features['race']).rename(columns=lambda x: 'race_' + str(x))
#gss_features = pd.concat([gss_features, gss_race_dummies], axis=1)
#gss_features.drop(['race'], inplace=True, axis=1)

# Region
#gss_region_dummies = pd.get_dummies(gss_features['region']).rename(columns=lambda x: 'region_' + str(x))
#gss_features = pd.concat([gss_features, gss_region_dummies], axis=1)
#gss_features.drop(['region'], inplace=True, axis=1)

#Beltcode
#gss_beltcode_dummies = pd.get_dummies(gss_features['urban']).rename(columns=lambda x: 'beltcode_' + str(x))
#gss_features = pd.concat([gss_features, gss_beltcode_dummies], axis=1)
#gss_features.drop(['urban'], inplace=True, axis=1)

# Occupation
#gss_occupation_dummies = pd.get_dummies(gss_features['occupation']).rename(columns=lambda x: 'occ_' + str(x))
#gss_features = pd.concat([gss_features, gss_occupation_dummies], axis=1)
#gss_features.drop(['occupation'], inplace=True, axis=1)

# Education
#gss_education_dummies = pd.get_dummies(gss_features['education']).rename(columns=lambda x: 'educ_' + str(x))
#gss_features = pd.concat([gss_features, gss_education_dummies], axis=1)
#gss_features.drop(['education'], inplace=True, axis=1)

# Residence 16
#gss_residence_16_dummies = pd.get_dummies(gss_features['residence_16']).rename(columns=lambda x: 'res16_' + str(x))
#gss_features = pd.concat([gss_features, gss_residence_16_dummies], axis=1)
#gss_features.drop(['residence_16'], inplace=True, axis=1)

# Region 16
#gss_region_16_dummies = pd.get_dummies(gss_features['region_16']).rename(columns=lambda x: 'reg16_' + str(x))
#gss_features = pd.concat([gss_features, gss_region_16_dummies], axis=1)
#gss_features.drop(['region_16'], inplace=True, axis=1)

# Family 16
#gss_family_16_dummies = pd.get_dummies(gss_features['family_16']).rename(columns=lambda x: 'fam16_' + str(x))
#gss_features = pd.concat([gss_features, gss_family_16_dummies], axis=1)
#gss_features.drop(['family_16'], inplace=True, axis=1)