In [None]:
import numpy as np
import pandas as pd
import sidetable
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
# Configure Bokeh so that subsequent show() calls render inside the notebook.
output_notebook()


In [None]:
# Data source: https://archive-beta.ics.uci.edu/dataset/2/adult
# Fields are separated by a comma followed by a space; a multi-character
# separator is only supported by pandas' Python parsing engine.
data_path = "usercode/dataset.data"
df = pd.read_csv(
    data_path,
    sep=', ',
    engine="python",
)
df.head()


In [None]:
# Null count per column. A notebook cell only auto-displays its final
# expression, so this intermediate result has to be printed explicitly;
# as a bare expression it was computed and then thrown away.
print(df.isna().sum())

# sidetable's per-column counts summary (displayed as the cell output).
df.stb.counts()

In [None]:
# '?' is treated as the missing-value marker: turn it into NaN so pandas
# recognises it, then drop every row that has at least one missing field.
# Chaining (instead of inplace=True) keeps the cell a single reassignment.
df = df.replace('?', np.nan).dropna()

# Sanity check: the missing-value summary after the rows were dropped.
df.stb.missing()

In [None]:
# Styled frequency table for the target column.
target_columns = ["class"]
df.stb.freq(target_columns, style=True)

In [None]:
# Unstyled frequency table of the target column; kept in `distribution`
# because the bar-chart cell reads its 'class' and 'count' columns.
distribution = df.stb.freq(['class'])

# Show only the category, its count and its share.
shown_columns = ["class", "count", "percent"]
distribution.loc[:, shown_columns]

In [None]:
# Bar chart of the number of adults per income class.
# factor_cmap and Spectral6 come from the notebook's top import cell.
income_classes = distribution['class'].to_list()
income_counts = distribution['count'].to_list()

# Column names 'output' / 'count' are referenced by the tooltip template
# and by the glyph below, so they must stay in sync.
income_source = ColumnDataSource(
    data={'output': income_classes, 'count': income_counts}
)

income_fig = figure(
    x_range=income_classes,  # categorical x axis, one factor per class
    toolbar_location=None,
    tools='hover',
    tooltips="@output:@count",
    title='Adult Income Graph',
)
income_fig.vbar(
    x='output',
    top='count',
    width=0.8,
    line_color='white',
    source=income_source,
    # One palette colour per income class.
    fill_color=factor_cmap('output', palette=Spectral6, factors=income_classes),
)
income_fig.xaxis.axis_label = 'Income Status'
income_fig.yaxis.axis_label = "No. of Adults"
show(income_fig)


## Data Distribution of the Categorical Features

In [None]:
# One sidetable frequency table per categorical column.
# The variable names are kept as-is because the plotting cell reads them.
# (A leading bare `df.head()` was removed: it was not the last expression
# of the cell, so its result was never displayed.)
race = df.stb.freq(['race'])
relationship = df.stb.freq(['relationship'])
marital_status = df.stb.freq(['marital-status'])
workclass = df.stb.freq(['workclass'])
occupation = df.stb.freq(['occupation'])
education = df.stb.freq(['education'])
education_num = df.stb.freq(['education-num'])

# Dump all tables, separated by blank lines.
print(race, '\n', relationship, '\n', marital_status, '\n',workclass, '\n',occupation,'\n', education, '\n', education_num)


## Finding Trends and plotting multiple graphs using gridplot

In [None]:
def make_freq_bar(freq_table, column, title, color):
    """Build a hover-enabled vertical bar chart from a frequency table.

    Parameters
    ----------
    freq_table : pandas.DataFrame
        Frequency table containing the category column `column` and a
        'count' column.
    column : str
        Name of the category column in `freq_table`.
    title : str
        Title shown above the plot.
    color : str
        Fill colour of the bars.

    Returns
    -------
    The configured Bokeh figure (not yet shown).
    """
    # A categorical x_range needs string factors. Casting here also covers
    # numeric categories such as 'education-num' without modifying the
    # frequency table itself.
    categories = freq_table[column].astype(str).to_list()
    counts = freq_table['count'].to_list()

    # 'output' / 'count' are the field names used by the tooltip template.
    source = ColumnDataSource(data={'output': categories, 'count': counts})
    plot = figure(
        x_range=categories,
        toolbar_location=None,
        tools="hover",
        tooltips="@output:@count",
        title=title,
    )
    plot.vbar(
        x='output',
        top='count',
        width=0.9,
        source=source,
        line_color='white',
        color=color,
    )
    # Tilt the category labels (angle in radians) so long names do not overlap.
    plot.xaxis.major_label_orientation = 1.1
    return plot


# One bar chart per categorical feature. plot3 previously set the label
# orientation through a misspelled `bxaxis` attribute, which raised an
# AttributeError and stopped the cell before the grid was displayed.
plot1 = make_freq_bar(race, 'race', 'Race', 'turquoise')
plot2 = make_freq_bar(relationship, 'relationship', "Relationship Graph", 'mediumturquoise')
plot3 = make_freq_bar(marital_status, 'marital-status', 'Marital status Graph', 'darkturquoise')
plot4 = make_freq_bar(workclass, 'workclass', 'Workclass Graph', 'lightseagreen')
plot5 = make_freq_bar(occupation, 'occupation', 'Occupation Graph', 'turquoise')
plot6 = make_freq_bar(education, 'education', 'Education Graph', 'cadetblue')
plot7 = make_freq_bar(education_num, 'education-num', 'Education_num Graph', 'darkcyan')

# Arrange the charts two per row (gridplot comes from the top import cell).
gridplot_output = gridplot(
    [[plot1, plot2], [plot3, plot4], [plot5, plot6], [plot7]],
    toolbar_location=None,
)

show(gridplot_output)


In [None]:
# Scatter plot of weekly working hours against capital gain.
p = figure(
    title='Capital gain vs. hours per week',
    x_axis_label='hours-per-week',
    y_axis_label='capital-gain',
)
# scatter() draws circle markers by default; it replaces circle(size=...),
# which recent Bokeh releases deprecate for screen-sized markers.
p.scatter(df['hours-per-week'], df['capital-gain'], size=10, color='green')
# Show plain numbers on the y axis instead of scientific notation.
p.yaxis.formatter.use_scientific = False
show(p)