## Predict income shock risk

This notebook lets users enter their own data and see their predicted risk of an income shock.

This visualization is intended for educational purposes only and should not be considered financial advice.

In [1]:
import re
from ipywidgets import interact, interactive, Layout, interactive_output,VBox
import ipywidgets as widgets
from IPython.display import display, HTML
from ipywidgets import HBox, Label, FloatSlider,Dropdown,IntSlider,RadioButtons, Box, Layout
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import pickle

First, import our data and our prebuilt model.

In [2]:
# Read the two cohort extracts and stack them into a single frame.
cohort_79 = pd.read_csv('data/cohort79_Jun8.csv')
cohort_97 = pd.read_csv('data/cohort97_Jun8.csv')

merged_data = pd.concat([cohort_79, cohort_97], sort=False).drop(axis=1, labels='Unnamed: 0')

# Remove rows whose adjusted income is at or below 1000, then zero-fill missing values.
# NOTE(review): concat keeps each file's own row labels, so labels presumably repeat
# across the two cohorts and dropping by label removes every row that shares a label
# with a low-income row. The filter is left exactly as it was because the pickled
# model was presumably fit on data prepared this way -- confirm before changing.
low_income_labels = merged_data[merged_data["adjusted_income"] <= 1000].index
merged_data = merged_data.drop(low_income_labels)
merged_data = merged_data.fillna(0)

# The model's feature columns are every column except the identifiers, the target
# ("shock") and the raw categorical columns listed below.
predictors = list(merged_data.columns)
vars_to_drop = ["case_id", 'urban_or_rural',"sample_id", "year", "shock", "region","highest_grade", "industry", "occupation", 'marital_status','race','work_amount_limited','work_kind_limited', 'family_size','region_1',"region_2", "region_3", "region_4"]
for var in vars_to_drop:
    # list.remove raises ValueError if a column is missing, which surfaces schema drift early.
    predictors.remove(var)

# Saved predictions used as the background histogram, and the 50 bin edges on [0, 1].
df_test = pd.read_csv("sample_predictions.csv")
bins = np.linspace(0,1, 50)

# SECURITY: unpickling executes code stored in the file; only load a model file you trust.
# The context manager closes the file handle once the model has been read.
with open("finalized_model.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

Next, establish the variables and functions required for our visualization.

In [3]:
# IV: names of the numeric model inputs (presumably "interval variables").
IV = [
    'adjusted_income',
    'age',
    'hours_worked_last_year',
    'weeks_worked_last_year',
    'number_of_kids',
    'prior_income',
    'unemployment',
    'gdp_growth',
    'inflation',
    'regional_unemployment',
]

# CV: names of the 0/1 model inputs (presumably "categorical variables"),
# grouped here by the category they encode. Order is significant to callers
# that index with this list, so the groups are concatenated in a fixed order.
CV = (
    # stand-alone flags
    ['sex', 'curr_pregnant', 'work_limited']
    # highest grade completed
    + [
        'highest_grade_0', 'highest_grade_5', 'highest_grade_8',
        'highest_grade_12', 'highest_grade_13', 'highest_grade_16',
        'highest_grade_17',
    ]
    # industry
    + [
        'industry_10', 'industry_40', 'industry_60', 'industry_100',
        'industry_400', 'industry_500', 'industry_580', 'industry_700',
        'industry_721', 'industry_761', 'industry_800', 'industry_812',
        'industry_900', 'industry_940', 'industry_992',
    ]
    # occupation
    + [
        'occupation_10', 'occupation_500', 'occupation_800',
        'occupation_1000', 'occupation_1300', 'occupation_1550',
        'occupation_1600', 'occupation_2000', 'occupation_2100',
        'occupation_2200', 'occupation_2600', 'occupation_3000',
        'occupation_3600', 'occupation_3700', 'occupation_4000',
        'occupation_4200', 'occupation_4300', 'occupation_4700',
        'occupation_5000', 'occupation_6005', 'occupation_6200',
        'occupation_6800', 'occupation_7000', 'occupation_7700',
        'occupation_9000', 'occupation_9800', 'occupation_9920',
    ]
    # marital status
    + [
        'marital_status_0', 'marital_status_1', 'marital_status_2',
        'marital_status_3', 'marital_status_4',
    ]
    # race
    + ['race_1', 'race_2', 'race_3']
)

In [4]:
def load_model(age, sex, income, prior_income, number_of_kids, hours_per_week, race, marital_status,pregnant,occupation,industry, highest_grade,work_limited,weeks_worked_last_year):
    """Score one user-described respondent and plot where they fall.

    Builds a single feature row from the arguments, asks the already-loaded
    classifier (``loaded_model``) for the probability of class 1, and draws
    that probability as a vertical line on top of the two histograms of
    ``df_test["probability of income shock"]`` (split by ``df_test["shock"]``).

    Parameters
    ----------
    age, income, prior_income, number_of_kids, weeks_worked_last_year : number
        Written to the features 'age', 'adjusted_income', 'prior_income',
        'number_of_kids' and 'weeks_worked_last_year' respectively.
    hours_per_week : number
        Multiplied by ``weeks_worked_last_year`` to fill 'hours_worked_last_year'.
    sex, pregnant, work_limited : int
        Stored unchanged in 'sex', 'curr_pregnant' and 'work_limited'.
    race, marital_status, occupation, industry, highest_grade : str
        Name of the 0/1 feature to switch on (e.g. ``'race_3'``).

    Returns
    -------
    None
        The function's only effect is the figure it shows.
    """
    # Start from a copy of a real row so the feature names and their order come
    # from `predictors`; copying keeps the assignments below off merged_data.
    features = merged_data[predictors].iloc[0].copy()

    # 2018 macroeconomic info
    features['unemployment'] = 0.04
    features['gdp_growth'] = .03
    features['inflation'] = 0.02
    features['regional_unemployment'] = .04

    # interval variables
    features['age'] = age
    features['adjusted_income'] = income
    features['number_of_kids'] = number_of_kids
    features['hours_worked_last_year'] = hours_per_week * weeks_worked_last_year
    features['weeks_worked_last_year'] = weeks_worked_last_year
    features['prior_income'] = prior_income

    # Relative income change, capped at 5; with no prior income the ratio is
    # undefined, so the cap itself is used.
    income_change_cap = 5
    if prior_income == 0:
        features['income_change'] = income_change_cap
    else:
        features['income_change'] = min(income / prior_income - 1, income_change_cap)

    # categorical variables: clear every dummy, then switch on the chosen ones
    features[CV] = 0
    features['sex'] = sex
    features['curr_pregnant'] = pregnant
    features['work_limited'] = work_limited
    for dummy in (race, marital_status, occupation, industry, highest_grade):
        # Assigning an unknown label would append a new entry and change the
        # number of features handed to the model. A choice without its own
        # feature (the industry widget offers 'industry_0', which is not in CV)
        # is therefore left as "all dummies of that group are 0".
        if dummy in features.index:
            features[dummy] = 1

    # Column 1 of predict_proba is the probability of the second class
    # (presumably "shock"); take it as a plain float for plotting.
    shock_probability = float(loaded_model.predict_proba([features])[0, 1])

    # show the histogram
    _, ax = plt.subplots(figsize=(15, 5))
    ax.set_yticks([])
    ax.set_ylabel("Frequency of Predicted Instances")
    ax.hist(df_test.loc[df_test['shock'] == 0, "probability of income shock"], bins, alpha=.9, label='No Shock', color="#259433")
    ax.hist(df_test.loc[df_test['shock'] == 1, "probability of income shock"], bins, alpha=0.7, label='Shock', color="#ff4940")
    # axvline's ymin/ymax are fractions of the axes height; the defaults (0 to 1)
    # already span the whole plot.
    ax.axvline(shock_probability, linewidth=1)
    # The only x tick is the user's own probability.
    ax.set_xticks([shock_probability])
    ax.legend()
    plt.show()

    return

# Shared widget style: let each description take as much width as it needs.
style = {'description_width': "initial"}

# --- sliders for the numeric inputs; continuous_update=False on each one ---
age = widgets.IntSlider(
    value=30,
    min=18,
    max=58,
    description='Age',
    layout=Layout(flex='1 1 auto', width='auto'),
    style=style,
    continuous_update=False,
)
income = widgets.IntSlider(
    value=30000,
    min=1000,
    max=225000,
    description='Income',
    style=style,
    continuous_update=False,
)
number_of_kids = widgets.IntSlider(
    value=0,
    min=0,
    max=10,
    description='Number of Kids',
    style=style,
    continuous_update=False,
)
hours_per_week = widgets.IntSlider(
    value=40,
    min=0.0,
    max=86.0,
    layout=Layout(flex='1 1 auto', width='auto'),
    description='Hours Worked Per Week',
    style=style,
    continuous_update=False,
)
weeks_worked_last_year = widgets.IntSlider(
    value=44,
    min=0.0,
    max=52,
    layout=Layout(flex='1 1 auto', width='auto'),
    description='Weeks Worked Last Year',
    style=style,
    continuous_update=False,
)
prior_income = widgets.IntSlider(
    value=30000,
    min=1000,
    max=225000,
    description='Prior Income',
    layout=Layout(flex='1 1 auto', width='auto'),
    style=style,
    continuous_update=False,
)

# --- radio buttons; each option is a (label shown, value passed on) pair ---
# NOTE(review): sex is coded Female=1 / Male=2 here -- confirm this matches the
# coding of the 'sex' column the model was trained on.
sex = widgets.RadioButtons(
    value=1,
    options=[("Female", 1), ('Male', 2)],
    layout=Layout(flex='1 1 auto', width='auto'),
    description="Sex",
    style=style,
    disabled=False,
)
race = widgets.RadioButtons(
    value='race_3',
    options=[
        ('Black', 'race_1'),
        ("Hispanic", 'race_2'),
        ("Non-Black/Non-Hispanic", 'race_3'),
    ],
    style=style,
    layout=Layout(flex='1 1 auto', width='1'),
    description='Race',
    disabled=False,
)
marital_status = widgets.RadioButtons(
    value='marital_status_0',
    options=[
        ("Never Married", 'marital_status_0'),
        ('Married', 'marital_status_1'),
        ('Separated', 'marital_status_2'),
        ('Divorced', 'marital_status_3'),
        ('Widowed', 'marital_status_4'),
    ],
    layout=Layout(flex='1 1 auto', width='auto'),
    style=style,
    description='Marital Status',
    disabled=False,
)
pregnant = widgets.RadioButtons(
    value=0,
    options=[('Currently Pregnant', 1), ('Not Currently Pregnant', 0)],
    layout=Layout(flex='2 1 auto', width='auto', length='auto'),
    description='Pregnant or Not',
    style=style,
    disabled=False,
)
work_limited = widgets.RadioButtons(
    value=0,
    style=style,
    description="Disability That Limits Amount/Kind of Work",
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'),
    options=[("Yes", 1), ("No", 0)],
)
# Education dropdown: each option's value is the name of a 'highest_grade_*'
# feature; 'highest_grade_12' is preselected.
highest_grade = widgets.Dropdown(
    value='highest_grade_12',
    style=style,
    description="Highest Grade Completed",
    disabled=False,
    layout=Layout(flex='1 1 auto', width='auto'),
    options=[
        ('LESS THAN ELEMENTARY SCHOOL (0-4)', 'highest_grade_0'),
        ('ELEMENTARY SCHOOL (5-7)', 'highest_grade_5'),
        ('MIDDLE SCHOOL (8-11)', 'highest_grade_8'),
        ('HIGH SCHOOL (12)', 'highest_grade_12'),
        ('SOME COLLEGE (13-15)', 'highest_grade_13'),
        ('FOUR-YEAR COLLEGE DEGREE (16)', 'highest_grade_16'),
        ('GRADUATE SCHOOL (17-20)', 'highest_grade_17'),
    ],
)
# Occupation dropdown: each option's value is the name of an 'occupation_*'
# feature; 'occupation_5000' is preselected.
occupation = widgets.Dropdown(
    value='occupation_5000',
    style=style,
    description="Occupation",
    disabled=False,
    options=[
        ("MANAGEMENT, BUSINESS, SCIENCE, AND ARTS", 'occupation_10'),
        ("BUSINESS OPERATIONS SPECIALISTS", 'occupation_500'),
        ('FINANCIAL SPECIALISTS', 'occupation_800'),
        ('COMPUTER AND MATHEMATICAL', 'occupation_1000'),
        ('ARCHITECTURE AND ENGINEERING', 'occupation_1300'),
        ('TECHNICIANS', 'occupation_1550'),
        ('LIFE, PHYSICAL, AND SOCIAL SCIENCE', 'occupation_1600'),
        ('COMMUNITY AND SOCIAL SERVICES', 'occupation_2000'),
        ('LEGAL', 'occupation_2100'),
        ('EDUCATION, TRAINING, AND LIBRARY', 'occupation_2200'),
        ('ARTS, DESIGN, ENTERTAINMENT, SPORTS, AND MEDIA', 'occupation_2600'),
        ('HEALTHCARE PRACTITIONERS AND TECHNICAL', 'occupation_3000'),
        ('HEALTHCARE SUPPORT', 'occupation_3600'),
        ('PROTECTIVE SERVICE', 'occupation_3700'),
        ('FOOD PREPARATION AND SERVING', 'occupation_4000'),
        ('BUILDING AND GROUNDS CLEANING AND MAINTENANCE', 'occupation_4200'),
        ('PERSONAL CARE AND SERVICE', 'occupation_4300'),
        ('SALES AND RELATED', 'occupation_4700'),
        ('OFFICE AND ADMINISTRATIVE SUPPORT', 'occupation_5000'),
        ('FARMING, FISHING, AND FORESTRY', 'occupation_6005'),
        ('CONSTRUCTION', 'occupation_6200'),
        ('EXTRACTION', 'occupation_6800'),
        ('INSTALLATION, MAINTENANCE, AND REPAIR', 'occupation_7000'),
        ('PRODUCTION', 'occupation_7700'),
        ('TRANSPORTATION AND MATERIAL MOVING', 'occupation_9000'),
        ('MILITARY SPECIFIC', 'occupation_9800'),
        ('UNEMPLOYED OR NEVER WORKED', 'occupation_9920'),
    ],
)
# Industry dropdown: each option's value is the name of an 'industry_*'
# feature; 'industry_812' is preselected.
# NOTE(review): 'industry_0' (NOT APPLICABLE) does not appear in the CV list of
# dummy features -- confirm the model actually has a feature with that name.
industry = widgets.Dropdown(
    value='industry_812',
    style=style,
    description="Industry",
    disabled=False,
    options=[
        ('NOT APPLICABLE', 'industry_0'),
        ('AGRICULTURE, FORESTRY, AND FISHERIES', 'industry_10'),
        ('MINING', 'industry_40'),
        ('CONSTRUCTION', 'industry_60'),
        ('MANUFACTURING', 'industry_100'),
        ('TRANSPORTATION, COMMUNICATIONS, AND OTHER PUBLIC UTILITIES', 'industry_400'),
        ('WHOLESALE TRADE', 'industry_500'),
        ('RETAIL TRADE', 'industry_580'),
        ('FINANCE, INSURANCE, AND REAL ESTATE', 'industry_700'),
        ('BUSINESS AND REPAIR SERVICES', 'industry_721'),
        ('PERSONAL SERVICES', 'industry_761'),
        ('ENTERTAINMENT AND RECREATION SERVICES', 'industry_800'),
        ('PROFESSIONAL AND RELATED SERVICES', 'industry_812'),
        ('PUBLIC ADMINISTRATION', 'industry_900'),
        ('ACTIVE DUTY MILITARY', 'industry_940'),
        ('EXPERIENCED UNEMPLOYED NOT CLASSIFIED BY INDUSTRY', 'industry_992'),
    ],
)


Finally, display the visualization. Many of the fields are self-explanatory, but for a few, some definitions are in order:

**Prior income**: This should be the respondent's income from two years earlier. For instance, a respondent using this visualization in 2019 would report their income from 2017.

**Hours worked per week**: This should be the average number of hours worked per week, counting only those weeks in the last year in which the respondent did any paid labor market work.

In [5]:
# Arrange the input widgets in three columns placed side by side.
left_box = VBox([age, number_of_kids, hours_per_week, weeks_worked_last_year])
mid_box = VBox([
    income,
    prior_income,
    occupation,
    industry,
    highest_grade,
    work_limited,
    pregnant,
])
right_box = VBox([sex, race, marital_status])
ui = widgets.HBox([left_box, mid_box, right_box])

# Bind each load_model keyword argument to the widget that supplies its value;
# interactive_output calls load_model again whenever one of them changes.
output = widgets.interactive_output(
    load_model,
    dict(
        age=age,
        income=income,
        prior_income=prior_income,
        weeks_worked_last_year=weeks_worked_last_year,
        hours_per_week=hours_per_week,
        number_of_kids=number_of_kids,
        sex=sex,
        race=race,
        marital_status=marital_status,
        pregnant=pregnant,
        occupation=occupation,
        industry=industry,
        highest_grade=highest_grade,
        work_limited=work_limited,
    ),
)
# Fixed size for the plot area.
output.layout.height = "400px"
output.layout.width = '800px'

# Inputs on top, plot underneath.
box = widgets.VBox([ui, output])
display(box)

VBox(children=(HBox(children=(VBox(children=(IntSlider(value=30, continuous_update=False, description='Age', l…