## Task 1: Import Libraries


In [4]:
import numpy as np
import pandas as pd
import sidetable  # imported for its side effect: registers the DataFrame `.stb` accessor
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
# Configure Bokeh so that show() renders figures inline in the notebook output.
output_notebook()


## Task 2: Load the Dataset


In [5]:
# Data source: https://archive-beta.ics.uci.edu/dataset/2/adult
# Fields are delimited by a comma followed by a space; a multi-character
# separator is only supported by the python parsing engine.
df = pd.read_csv(
    "dataset.data",
    sep=", ",
    engine="python",
)
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Task 3: Explore the Dataset


In [6]:
# Per-column NaN count. Only the last expression of a cell is rendered
# automatically, so this result has to be displayed explicitly.
# NOTE: at this point missing values are still encoded as the string '?',
# so they are not counted here; they are converted to NaN in the
# missing-value treatment step further down.
display(df.isna().sum())

# Cardinality summary: number of unique values plus the most / least frequent
# value of each column.
df.stb.counts()

Unnamed: 0,count,unique,most_freq,most_freq_count,least_freq,least_freq_count
class,32561,2,<=50K,24720,>50K,7841
sex,32561,2,Male,21790,Female,10771
race,32561,5,White,27816,Other,271
relationship,32561,6,Husband,13193,Other-relative,981
marital-status,32561,7,Married-civ-spouse,14976,Married-AF-spouse,23
workclass,32561,9,Private,22696,Never-worked,7
occupation,32561,15,Prof-specialty,4140,Armed-Forces,9
education,32561,16,HS-grad,10501,Preschool,51
education-num,32561,16,9,10501,1,51
native-country,32561,42,United-States,29170,Holand-Netherlands,1


## Task 4: Treat Missing Values


In [7]:
# The raw file marks unknown values with a literal '?'. Convert those markers
# to NaN so pandas recognises them as missing, then drop every row that has at
# least one missing value. Written as a single chained expression (no
# inplace mutation) so that re-running the cell yields the same frame.
df = df.replace('?', np.nan).dropna()

# Verify that no column has missing values left.
df.stb.missing()

Unnamed: 0,missing,total,percent
age,0,30162,0.0
workclass,0,30162,0.0
fnlwgt,0,30162,0.0
education,0,30162,0.0
education-num,0,30162,0.0
marital-status,0,30162,0.0
occupation,0,30162,0.0
relationship,0,30162,0.0
race,0,30162,0.0
sex,0,30162,0.0


## Task 5: Inspect Class Distribution


In [8]:
# Frequency table of the target column; style=True renders the percentage
# columns as formatted strings (e.g. "75.11%") for presentation.
df.stb.freq(['class'], style=True)

Unnamed: 0,class,count,percent,cumulative_count,cumulative_percent
0,<=50K,22654,75.11%,22654,75.11%
1,>50K,7508,24.89%,30162,100.00%


## Task 6: Display Class Imbalance as Histogram


In [9]:
# Unstyled frequency table of the target column, kept in `distribution`
# because the class bar chart reads its 'class' and 'count' columns.
distribution = df.stb.freq(['class'])

# Show only the label, absolute count and percentage columns.
distribution.loc[:, ['class', 'count', 'percent']]

Unnamed: 0,class,count,percent
0,<=50K,22654,75.107751
1,>50K,7508,24.892249


## Task 7: Explore the Categorical Variables


In [10]:
# Bar chart of the income-class counts taken from the `distribution` table.
class_labels = distribution['class'].to_list()
class_counts = distribution['count'].to_list()

# The column names 'output' and 'count' are referenced by the tooltip
# template, the glyph and the colour mapper below, so they must stay in sync.
class_source = ColumnDataSource(dict(output=class_labels, count=class_counts))

class_plot = figure(
    x_range=class_labels,
    toolbar_location=None,
    tools='hover',
    tooltips="@output:@count",
    title='Adult Income Graph',
)
class_plot.vbar(
    x='output',
    top='count',
    width=0.8,
    line_color='white',
    source=class_source,
    # One palette colour per class label.
    fill_color=factor_cmap('output', palette=Spectral6, factors=class_labels),
)
class_plot.xaxis.axis_label = 'Income Status'
class_plot.yaxis.axis_label = "No. of Adults"
show(class_plot)


## Task 8: Visualize Data Distribution in Categorical Variables


In [11]:
# One frequency table (count, percent, cumulative count / percent) per
# categorical column. The tables are kept under individual names because the
# bar-chart grid cell reads them.
race = df.stb.freq(['race'])
relationship = df.stb.freq(['relationship'])
marital_status = df.stb.freq(['marital-status'])
workclass = df.stb.freq(['workclass'])
occupation = df.stb.freq(['occupation'])
education = df.stb.freq(['education'])
education_num = df.stb.freq(['education-num'])

# Rich display renders each table as a proper notebook table instead of the
# hard-to-read plain-text dump that print() produces.
display(race, relationship, marital_status, workclass, occupation, education, education_num)


                 race  count    percent  cumulative_count  cumulative_percent
0               White  25933  85.979046             25933           85.979046
1               Black   2817   9.339566             28750           95.318613
2  Asian-Pac-Islander    895   2.967310             29645           98.285923
3  Amer-Indian-Eskimo    286   0.948213             29931           99.234136
4               Other    231   0.765864             30162          100.000000 
      relationship  count    percent  cumulative_count  cumulative_percent
0         Husband  12463  41.320204             12463           41.320204
1   Not-in-family   7726  25.615012             20189           66.935216
2       Own-child   4466  14.806710             24655           81.741927
3       Unmarried   3212  10.649161             27867           92.391088
4            Wife   1406   4.661495             29273           97.052583
5  Other-relative    889   2.947417             30162          100.000000 
           

## Task 9: Explore Trends via Continuous Variables


In [12]:
def make_count_bar(freq_table, column, title, color):
    """Build a hover-enabled vertical bar chart from a frequency table.

    Parameters
    ----------
    freq_table : pandas.DataFrame
        Frequency table with the category labels in ``column`` and their
        frequencies in a ``count`` column.
    column : str
        Name of the column in ``freq_table`` that holds the category labels.
    title : str
        Title shown above the chart.
    color : str
        Fill colour of the bars.

    Returns
    -------
    The configured Bokeh figure (not yet shown).
    """
    # A categorical x-range needs string factors. str() leaves text labels
    # unchanged and converts numeric categories (education-num) without
    # modifying the frequency table itself.
    labels = [str(value) for value in freq_table[column]]
    counts = freq_table['count'].to_list()

    # 'output' / 'count' are the column names used by the tooltip template.
    source = ColumnDataSource(dict(output=labels, count=counts))
    plot = figure(
        x_range=labels,
        toolbar_location=None,
        tools="hover",
        tooltips="@output:@count",
        title=title,
    )
    plot.vbar(x='output', top='count', width=0.9, source=source,
              line_color='white', color=color)
    # Tilt the category labels (angle in radians) so long names do not overlap.
    plot.xaxis.major_label_orientation = 1.1
    return plot


# (frequency table, label column, chart title, bar colour) for every chart,
# listed in the order in which they appear in the grid.
chart_specs = [
    (race, 'race', 'Race', 'turquoise'),
    (relationship, 'relationship', "Relationship Graph", 'mediumturquoise'),
    (marital_status, 'marital-status', 'Marital status Graph', 'darkturquoise'),
    (workclass, 'workclass', 'Workclass Graph', 'lightseagreen'),
    (occupation, 'occupation', 'Occupation Graph', 'turquoise'),
    (education, 'education', 'Education Graph', 'cadetblue'),
    (education_num, 'education-num', 'Education_num Graph', 'darkcyan'),
]
plots = [make_count_bar(*spec) for spec in chart_specs]

# Arrange the charts two per row; the last row holds the single leftover chart.
grid_rows = [plots[i:i + 2] for i in range(0, len(plots), 2)]
gridplot_output = gridplot(grid_rows, toolbar_location=None)

show(gridplot_output)


In [19]:
# Scatter plot of two continuous variables: weekly working hours against
# capital gain.
# Only the two plotted columns are handed to Bokeh, so the rendered output
# does not embed the whole DataFrame.
scatter_source = ColumnDataSource(df[['hours-per-week', 'capital-gain']])

scatter_plot = figure(
    title='Capital Gain vs. Hours per Week',
    x_axis_label='Hours per week',
    y_axis_label='Capital gain',
)
scatter_plot.scatter(x='hours-per-week', y='capital-gain', size=10, color='green',
                     marker="circle", source=scatter_source)
# Show plain numbers on the y-axis ticks instead of scientific notation.
scatter_plot.yaxis.formatter.use_scientific = False
show(scatter_plot)
