In [1]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.layouts import row, column
from bokeh.layouts import layout
from bokeh.layouts import gridplot
from math import pi
from bokeh.models.widgets import Div

# Route Bokeh output to the notebook instead of a standalone HTML file.
output_notebook()


# Column headers, assigned positionally because the file is read with header=None.
column_names = [
    'age',
    'working_class',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]

# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
df_lab8 = pd.read_csv(
    "C:/Users/saket/Desktop/IFT 533/adult.data",
    header=None,
    names=column_names,
)

# Bare last expression so the notebook renders the frame.
df_lab8

Unnamed: 0,age,working_class,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
# Column headers, assigned positionally because the file is read with header=None.
column_names = [
    'age', 'working_class', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
df = pd.read_csv(
    "C:/Users/saket/Desktop/IFT 533/adult.data",
    header=None,
    names=column_names,
)

# (start_age, end_age) pairs: one wide 0-19 bucket followed by the decades 20-29 ... 90-99.
age_groups = [(0, 19)] + [(decade, decade + 9) for decade in range(20, 100, 10)]

def create_empty_label_figure(labels):
    """Return a short figure whose only visible content is ``labels`` on its x-axis.

    Parameters
    ----------
    labels : iterable of str
        Category names used as the factors of the figure's categorical x-range.
        Any iterable (list, pandas Series/Index, ...) is accepted; it is copied
        into a list before being handed to Bokeh.

    Returns
    -------
    bokeh.plotting.figure
        A 300x50 figure with a visible, vertically rotated x-axis, a hidden
        y-axis and no toolbar.
    """
    # Materialise once so a Series, Index or list are all treated the same way.
    labels = list(labels)

    empty_figure = figure(x_range=labels, height=50, width=300)

    # Fully transparent, zero-size markers: nothing is drawn, but the figure still
    # owns a renderer (presumably to avoid Bokeh's "no renderers" warning).
    # ``scatter`` is used because ``circle(size=...)`` is deprecated since Bokeh 3.4.
    empty_figure.scatter(labels, [0] * len(labels), size=0, alpha=0)

    empty_figure.xaxis.visible = True
    empty_figure.yaxis.visible = False
    # Rotate the category labels by 90 degrees.
    empty_figure.xaxis.major_label_orientation = pi/2
    empty_figure.toolbar.logo = None
    empty_figure.toolbar_location = None
    return empty_figure

def create_distribution_figures(data, attribute, age_ranges):
    """Build one bar chart of ``attribute`` counts per age range, plus a label strip.

    Every chart uses the same category order on its x-axis (the categories of
    ``attribute`` ordered by their frequency in the whole of ``data``), and
    categories that do not occur in an age range are drawn with a count of 0.
    The x-axis of each chart is hidden; the final figure in the returned list
    carries the category labels for all charts above it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'age'`` column and the ``attribute`` column.
    attribute : str
        Name of the categorical column whose value counts are plotted.
    age_ranges : iterable of (start_age, end_age)
        Age bounds, both inclusive, one chart per pair.

    Returns
    -------
    list of bokeh.plotting.figure
        ``len(age_ranges)`` bar charts followed by one label-only figure.
    """
    # Shared category order. Using the Series index directly (instead of
    # ``reset_index()`` + a column lookup) avoids depending on the column names
    # produced by ``value_counts().reset_index()``, which changed in pandas 2.0.
    categories = list(data[attribute].value_counts().index)

    # Same width rule as before (0.03 per category), now constant across charts
    # because every chart shows the same categories.
    bar_width = 0.03 * len(categories)

    distribution_figures = []
    for start_age, end_age in age_ranges:
        in_range = data[(data['age'] >= start_age) & (data['age'] <= end_age)]

        # Align this group's counts to the shared order; absent categories become 0.
        counts = in_range[attribute].value_counts().reindex(categories, fill_value=0)

        source = ColumnDataSource(data=dict(x=categories, top=counts.tolist()))

        dist_figure = figure(x_range=categories, toolbar_location=None, width=300, height=300)
        dist_figure.vbar(x='x', top='top', source=source, width=bar_width, color='grey')

        dist_figure.yaxis.minor_tick_line_color = None
        dist_figure.xgrid.grid_line_color = None
        # Labels are shown once, on the strip appended after the loop.
        dist_figure.xaxis.visible = False

        distribution_figures.append(dist_figure)

    distribution_figures.append(create_empty_label_figure(categories))
    return distribution_figures

# One list of figures per attribute: a chart for every age group plus a trailing label strip.
edu_distribution_figures = create_distribution_figures(df, 'education', age_groups)
wkg_distribution_figures = create_distribution_figures(df, 'working_class', age_groups)
mrt_distribution_figures = create_distribution_figures(df, 'marital_status', age_groups)
inc_distribution_figures = create_distribution_figures(df, 'income', age_groups)

# Row labels are derived from age_groups so they always match the bounds used for
# filtering (the previous hand-written list said '90-100' for the (90, 99) group).
age_group_labels = [
    f'less than {end_age + 1}' if start_age == 0 else f'{start_age}-{end_age}'
    for start_age, end_age in age_groups
]

# Each Div is as tall as one chart (300) so the labels line up with the chart rows.
divs = [Div(text=age_group, height=300) for age_group in age_group_labels]
label_box = column(divs)

# Single grid row: the label column followed by one column of charts per attribute.
grid_lab8 = gridplot([[
    label_box,
    column(edu_distribution_figures),
    column(wkg_distribution_figures),
    column(mrt_distribution_figures),
    column(inc_distribution_figures),
]])

show(grid_lab8)
