In [1]:
import pandas as pd
from decouple import config
import os
import numpy as np
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

# Configure Bokeh so that subsequent show() calls render inside the notebook.
output_notebook()
# Enable inline matplotlib rendering.
# NOTE(review): no matplotlib plotting is visible in this notebook — confirm this magic is still needed.
%matplotlib inline

# Load Data

In [2]:
# Resolve the project root once; both cached CSV files are located relative to it.
project_root = config('PYTHONPATH')
target_path = os.path.join(project_root, 'cache/target.csv')
multiclass_path = os.path.join(project_root, 'cache/y_multiclass.csv')

# `y` is the single column labelled '0' of the target file (a Series);
# `y_multiclass` keeps the whole frame.
y = pd.read_csv(target_path)['0']
y_multiclass = pd.read_csv(multiclass_path)

In [3]:
# Column-wise totals of the five class columns labelled '1' through '5'.
class_columns = ['1', '2', '3', '4', '5']
multiclass = y_multiclass[class_columns].sum()

# Visualization

In [4]:
# Histogram of the non-missing values of `y`, restricted to the interval [-1, 1]
# and split into 78 equal-width bins. Values outside that interval are ignored
# by np.histogram.
# With density=True the returned heights are probability densities (the bar
# areas integrate to 1 over the plotted range), not percentages, so the y-axis
# is labelled plain "Density".
tops, edges = np.histogram(y.dropna(), bins=78, range=(-1, 1), density=True)

p = figure(
    title="Income Growth",
    x_axis_label='Annual Income Growth (fraction)',
    y_axis_label='Density',
    plot_height=467,
)

# One rectangle per bin: consecutive edges give the left/right sides.
p.quad(
    top=tops,
    bottom=0,
    left=edges[:-1],
    right=edges[1:],
    color=(230, 54, 54),
    line_color="white",
)

# Anchor the bars on the x-axis and enlarge the text.
p.y_range.start = 0
p.xaxis.axis_label_text_font_size = '16px'
p.yaxis.axis_label_text_font_size = '16px'
p.title.text_font_size = '20pt'

show(p)

In [6]:
# Categorical x-axis labels; they are paired positionally with the entries
# of `multiclass` when the bars are drawn.
x = ['-100% >', '-50% >', '0% >', '50% >', '100% >']
bar_heights = list(multiclass)

p = figure(
    title="Income Growth",
    x_axis_label='Income Growth',
    y_axis_label='Number of Examples',
    x_range=x,
    plot_height=367,
)
p.vbar(x=x, top=bar_heights, width=0.9, color=(230, 54, 54))

# Fixed vertical extent for the bars: 0 to 4000.
p.y_range.start = 0
p.y_range.end = 4000

# Cosmetics: no vertical grid lines, larger title and axis-label text.
p.xgrid.grid_line_color = None
p.title.text_font_size = '16pt'
p.xaxis.axis_label_text_font_size = '16px'
p.yaxis.axis_label_text_font_size = '16px'

show(p)

In [7]:
# Central tendency of `y`, ignoring missing values in both statistics.
# The second line reports the median, so it is labelled accordingly.
print(f'Mean: {np.nanmean(y):.2f}')
print(f'Median: {np.nanmedian(y):.2f}')

Mean: 0.20
Mean: 0.05
