In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_name in filenames:
        input_path = os.path.join(dirname, input_name)
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Learning from Ken Jee ##

I'll be following Ken's YouTube video (part 2) here with this latest survey.  Ken's video is a year old so his data is different.  I will assume for now that the files have the same structure since they are from multiple years of Kaggle's survey.

I decided I will now be calling these notebooks:

# CAWK Day! #

...Code Along With Ken. :)

Some days might be **CAWD PROF** days.  (Coding along with the Data Professor). :) <br>
...and so on. <br>
You get it.

In [None]:
# import other relevant modules
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# to display more rows and columns to manually look through the data (neat! - i didn't know this was a thing)
# Both display limits get the same generous cap (500) so wide/long frames print in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# Read in the 2021 Kaggle survey responses.
df = pd.read_csv("/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")

# Only the LAST expression of a cell is echoed, so a bare `df.shape` in the
# middle of the cell is evaluated and thrown away. Print it explicitly.
print(df.shape)

# Summary statistics; as the last expression this is rendered as the cell output.
df.describe()

In [None]:
#df.info

In [None]:
df.head()

In [None]:
# The first row holds the question text rather than a response,
# so keep every row after it (all columns).
df_fin = df.iloc[1:]
df_fin.head()

In [None]:
# Fraction of null values in each column of the response rows.
# mean() of the boolean null mask divides by the number of rows in df_fin itself;
# dividing by df.shape[0] would count the question-text row that df_fin dropped.
df_fin.isnull().mean()

This next part is *super* cool.  Ken built a single dictionary that covers every question.  Each key is a question number, and each value is that question's data - for a multi-part question, a *dataframe* (yes, a dataframe!) holding all of its parts.  That makes it easy to pull data for individual questions rather than having to filter all the time.  Some questions have a LOT of parts.

In [None]:
# `questions` will map a question id (e.g. 'Q7') to that question's data.
questions = dict()

# A column for a multi-part question is named like 'Q7_Part_1', so the text
# before the first '_' is the question id. dict.fromkeys() keeps each id once,
# in first-seen order.
qnums = dict.fromkeys(column.split('_')[0] for column in df_fin.columns)
qnums

Super cool way to get rid of duplicates: many questions have multiple parts, so the same question number shows up over and over in the list, but dictionary keys are unique, so `dict.fromkeys()` keeps each question number only once.

In [None]:
# Unpacking a dict yields its keys, leaving a plain list of question ids.
qnums = [*qnums]
qnums

Hmm.  Converting the dictionary to a list just kept the keys.

In [None]:
# Add the data for each question to the dictionary, keyed by question id.
#
# Columns are matched on the exact id before the first '_' (the same rule used
# to build qnums) instead of str.startswith(). A prefix test lumps any two-digit
# question into the single-digit one that shares its first digit, and the old
# guard only protected Q1, Q2 and Q3 (e.g. a Q40 column would land in 'Q4').
for i in qnums:
    if i in ['Q1', 'Q2', 'Q3']:
        # these three stay a single Series, exactly as before
        questions[i] = df_fin[i]
    else:
        matching_columns = [q for q in df_fin.columns if q.split('_')[0] == i]
        questions[i] = df_fin[matching_columns]

# wowwwwww
# so every column that belongs to Q22, no matter which part it is, will be
# included in the dataframe for that question

In [None]:
questions['Q9']

SO COOL!!!!!!!

Okay here I'll take some notes on plotly express vs graph objects.  I'm going to type a summary of what Ken has in his video.

### px ###

 - takes in the data frame as a parameter and you use other parameters to manipulate the columns (maybe better for beginners)
 - works with a full dataframe

### go ###

 - takes in just the data as parameters
 - manipulate the data before passing it in
 - a bit more flexible
 
We'll use `go` on items like Q7, where there is a separate column for each answer choice.

My first step would be to look up histogram in plotly express documentation.  ([Here it is](https://plotly.com/python/histograms/).)

In [None]:
df_fin.Q1

In [None]:
# Histogram built straight from the Q1 column (a Series).
fig = px.histogram(df_fin['Q1'])
fig.show()

In [None]:
# ahhhhhh how easy!!
# Ken passed the whole dataframe and named the column - trying that form to
# see whether the chart looks different (his does).
fig = px.histogram(data_frame=df_fin, x="Q1")
fig.show()

Ohhhh yeah.  He has the 2020 data and I'm using the 2021.

In [None]:
# Display order for the Q1 age brackets, kept in a variable because it is
# needed by more than one chart.
age_orders = {
    'Q1': [
        '18-21', '22-24', '25-29', '30-34', '35-39', '40-44',
        '45-49', '50-54', '55-59', '60-69', '70+',
    ],
}
type(age_orders)


In [None]:
# Display order for the Q6 coding-experience answers, shortest to longest.
code_experience_orders = {
    'Q6': [
        'I have never written code',
        '< 1 years',
        '1-3 years',
        '3-5 years',
        '5-10 years',
        '10-20 years',
        '20+ years',
    ],
}

In [None]:
# Merge the two ordering dicts into one (keys 'Q1' and 'Q6') for the heatmap.
hm_orders = dict(age_orders, **code_experience_orders)
hm_orders

In [None]:
# Age histogram again, this time with the brackets in their natural order and a title.
fig = px.histogram(
    data_frame=df_fin,
    x='Q1',
    category_orders=age_orders,
    title="Ages of survey responders",
)
fig.show()

In [None]:
# now try a density heatmap: age bracket (Q1) against coding experience (Q6)
# category_orders is given as a dict of column name -> ordered values, so the
# merged hm_orders supplies the ordering for both axes at once
fig = px.density_heatmap(
    data_frame=df_fin,
    x='Q1',
    y='Q6',
    category_orders=hm_orders,
)
fig.show()

## Ahhhhhh!!!! ## ^^^

That took about 10 extra minutes (more??) to come up with my own way to do that since I already created two "ordering" dictionaries and I wanted to use them, but now I know how to merge two dictionaries together!!  (party hat emoji dude here)

In [None]:
questions['Q7']

In [None]:
# question 7 asked what programming language the person uses on a regular basis
# let's aggregate that data first before using graph objects
#
# mode() works column by column, so its first row holds the most common answer
# in each of the Q7 columns. Take that one row as a flat list of names:
# list(mode().values) would hand pandas a list of row-arrays instead, and only
# lines up if mode() happens to return exactly one row.
questions['Q7'].columns = questions['Q7'].mode().iloc[0].tolist()
questions['Q7']

Ahhhhhhh!!!  No...so it finds the mode by column - so now the columns are renamed to the language that column was for.  Now we can change our NaNs and probably even make these columns numerical without losing their original meaning.  SO COOL, KEN!

In [None]:
questions['Q7'].count()

In [None]:
# Non-null answers per Q7 column, turned from a Series into a two-column frame.
q7_answered = questions['Q7'].notna()
q7 = q7_answered.sum().reset_index()
q7

Holy s%^& this is cool.

In [None]:
# Give the two columns readable names, then list the most-used languages first.
q7 = (
    q7
    .set_axis(['Language', 'Count'], axis=1)
    .sort_values(by='Count', ascending=False)
)
q7

In [None]:
# now let's use plotly.graph_objects to illustrate this:
# graph objects take the data itself (x and y values), not the dataframe
language_bars = go.Bar(x=q7['Language'], y=q7['Count'])
fig = go.Figure(data=[language_bars])
fig.show()

We could look at so much more - I will after this Ken code.  For now, we are concentrating on his question about roles. <br>
*Question 5* asked what role sounds most similar to the role of the survey taker.  <br>
These 3 eloquent lines will...
- create a dictionary
- find the unique answers to this question
- make those the keys of this dictionary

In [None]:
# One entry per distinct Q5 answer: the key is the role, the value is the
# slice of df_fin whose Q5 equals that role.
roles = {
    role: df_fin[df_fin['Q5'] == role]
    for role in df_fin['Q5'].unique()
}

In [None]:
roles.keys()

In [None]:
roles['Other']

It's cool to see how many rows are in that one.  Hmm...let me check how many respondents are in each role.

In [None]:
# Print how many respondents (rows) fall under each role.
for role, respondents in roles.items():
    print(f"Role: {role} \tNumber of respondents: {len(respondents)}")

Now the first subquestion - How does education vary by role?

In [None]:
# Count of each Q4 (education) answer across all respondents.
edu = df_fin['Q4'].value_counts()
edu

In [None]:
type(edu)

In [None]:
# Remember, for a series, just use .index (the answers) or .values (the counts)
# Here's a graph:
education_bars = go.Bar(x=edu.index, y=edu.values)
fig = go.Figure(data=[education_bars])
fig.show()

In [None]:
# education for JUST data scientists
roles['Data Scientist']

In [None]:
# check out just the data scientists' education (Q4) answer counts
data_scientists = roles['Data Scientist']
ds_edu = data_scientists['Q4'].value_counts()
ds_edu

In [None]:
# Bar chart of the data scientists' education counts.
ds_education_bars = go.Bar(x=ds_edu.index, y=ds_edu.values)
fig = go.Figure(data=[ds_education_bars])
fig.show()

First, I'll copy Ken's first iteration.  I think I only need to change capitalization on Roles vs. roles.  Since he struggled on tweaking code from StackExchange, I'm not really feeling that right now.  :)

In [None]:
# again...this is ALL KEN'S

#########################################
# First Iteration - Basic dropdown 
#########################################

#https://stackoverflow.com/questions/59406167/plotly-how-to-filter-a-pandas-dataframe-using-a-dropdown-menu
#https://plotly.com/python/dropdowns/

# Start with the education counts for all respondents as the only trace.
fig = go.Figure()
fig.add_trace(go.Bar(x=edu.index, y=edu.values))

# Buttons are the entries you see in the dropdown: one per graph we want to show.
# Each 'restyle' button replaces the x/y data of trace 0 with that role's
# education counts (dropdowns can do a lot more than replace data).
buttons = [
    dict(method='restyle',
         label=role_name,
         visible=True,
         args=[{'y': [roles[role_name].Q4.value_counts().values],
                'x': [roles[role_name].Q4.value_counts().index],
                'type': 'bar'}, [0]],
         )
    for role_name in ('Data Scientist', 'Student', 'Data Analyst')
]

# To get a menu to show, the buttons have to be wrapped in an updatemenu entry.
your_menu = {'buttons': buttons, 'direction': 'down', 'showactive': True}
updatemenu = [your_menu]

# add dropdown menus to the figure
fig.update_layout(showlegend=False, updatemenus=updatemenu)
fig.show()

I listened to what changed in the second iteration.  Now I'll **copy and paste** that.

In [None]:
#########################################
# Second Iteration - Comparison Chart vs Baseline 
#########################################

# Added title to the figure
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Comparing Education by Position")))

# Share of the group rather than raw numbers. Both series start out as the
# all-respondents distribution; the dropdown then replaces the first one.
edu_share = edu.values / edu.values.sum()
fig.add_trace(go.Bar(name='Role Selection', x=edu.index, y=edu_share))
# second series of bars: the fixed baseline
fig.add_trace(go.Bar(name='All Data', x=edu.index, y=edu_share))

# Add every role with a loop (previously they were added one by one).
# Iterate over ALL roles: slicing the keys with [1:] silently drops whichever
# role happens to come first. A role with no education answers is skipped
# instead - that also covers a missing-value (NaN) key, whose frame is empty
# because NaN never compares equal to itself.
buttons = []
for i in roles:
    role_share = roles[i].Q4.value_counts(normalize=True)  # computed once per role
    if role_share.empty:
        continue
    buttons.append(dict(method='restyle',
                        label=i,
                        visible=True,
                        args=[{'y': [role_share.values],
                               'x': [role_share.index],
                               'type': 'bar'}, [0]],  # [0] -> restyle the first trace only
                        )
                   )

# The dropdown only appears when the buttons are wrapped in an updatemenu entry.
updatemenu = [dict(buttons=buttons, direction='down', showactive=True)]

# add dropdown menus to the figure
fig.update_layout(updatemenus=updatemenu)

# order axes https://plotly.com/python/categorical-axes/
education_order = ["Doctoral degree",
                   'Master’s degree',
                   'Bachelor’s degree',
                   'Some college/university study without earning a bachelor’s degree',
                   "Professional doctorate",
                   "No formal education past high school",
                   "I prefer not to answer"]
fig.update_xaxes(categoryorder='array', categoryarray=education_order)
fig.show()

# note - I have to change Professional degree to Professional doctorate
# apparently when adding the 2021 survey to the data, someone changed the name of that choice?  
# I would definitely say it should be Professional degree, but I am forced to change it based on my data

Now I almost feel bad haha.  This took Ken *hours*.  I'm about to copy Ken's explanation and then copy his code.  I will move the one statement he talked about moving.

In [None]:
#########################################
# Third Iteration - Two Drop Down Comparison 
#########################################

fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Comparing Education by Position")))

# Both series start as the all-respondents distribution (share of group).
edu_share = edu.values / edu.values.sum()
fig.add_trace(go.Bar(name='Role 1', x=edu.index, y=edu_share))
fig.add_trace(go.Bar(name='Role 2', x=edu.index, y=edu_share))
# ^^ I moved this line up.  Ken explained that he later realized it could go here,
# since each button names the trace it restyles in its args.

# One button list per trace: `buttons` restyles trace 0, `buttons2` restyles trace 1.
# Iterate over ALL roles: slicing the keys with [1:] silently drops whichever
# role happens to come first. A role with no education answers is skipped
# instead - that also covers a missing-value (NaN) key, whose frame is empty
# because NaN never compares equal to itself.
buttons = []
buttons2 = []
for i in roles:
    role_share = roles[i].Q4.value_counts(normalize=True)  # computed once per role
    if role_share.empty:
        continue
    for trace_index, button_list in enumerate((buttons, buttons2)):
        button_list.append(dict(method='restyle',
                                label=i,
                                visible=True,
                                args=[{'y': [role_share.values],
                                       'x': [role_share.index],
                                       'type': 'bar'},
                                      [trace_index]],  # [0] -> first trace, [1] -> second trace
                                )
                           )

# adjusted dropdown placement
# updatemenus takes one dict per dropdown: its buttons plus how/where to draw it
# https://plotly.com/python/dropdowns/
button_layer_1_height = 1.23
updatemenus = [
    dict(buttons=menu_buttons,
         direction="down",
         pad={"r": 10, "t": 10},
         showactive=True,
         x=menu_x,
         xanchor="left",
         y=button_layer_1_height,
         yanchor="top")
    for menu_buttons, menu_x in ((buttons, 0.1), (buttons2, 0.5))
]

fig.update_layout(updatemenus=updatemenus)

education_order = ["Doctoral degree",
                   'Master’s degree',
                   'Bachelor’s degree',
                   'Some college/university study without earning a bachelor’s degree',
                   "Professional doctorate",
                   "No formal education past high school",
                   "I prefer not to answer"]
fig.update_xaxes(categoryorder='array', categoryarray=education_order)
fig.show()

# TODO: add a topline ("all types") option to each dropdown
# TODO: label the dropdowns selection 1 and selection 2

The one thing I changed on the graph was the titles of the bars to Role 1 and Role 2.

Haha now I see Ken did that for the next part.  I'm going to get that now. <br>
Also, this next one brings in the ability to go back to *All samples* after you have made a selection.  I had noticed too that you couldn't do that before.

In [None]:
#########################################
# Final Iteration - Touch-ups
#########################################

def _education_buttons(trace_index):
    """Build the dropdown buttons that restyle one bar trace.

    The first button is 'All Samples' (education share over df_fin); it is
    followed by one button per entry in `roles`. Each button replaces the x/y
    data of the trace at `trace_index` with that group's share of Q4 answers.

    Every role is included: slicing the keys with [1:] silently drops
    whichever role happens to come first. A group with no education answers is
    skipped instead - that also covers a missing-value (NaN) key, whose frame
    is empty because NaN never compares equal to itself.
    """
    options = [('All Samples', df_fin.Q4)]
    options += [(role, frame.Q4) for role, frame in roles.items()]

    built = []
    for label, answers in options:
        share = answers.value_counts(normalize=True)  # computed once per group
        if share.empty:
            continue
        built.append(dict(method='restyle',
                          label=label,
                          visible=True,
                          args=[{'y': [share.values],
                                 'x': [share.index],
                                 'type': 'bar'},
                                [trace_index]],  # which trace this button restyles
                          )
                     )
    return built


fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Comparing Education by Position")))

# changed from role 1 / role 2 to selection 1 / selection 2;
# both series start as the all-respondents distribution
edu_share = edu.values / edu.values.sum()
fig.add_trace(go.Bar(name='Selection 1', x=edu.index, y=edu_share))
fig.add_trace(go.Bar(name='Selection 2', x=edu.index, y=edu_share))

buttons = _education_buttons(0)   # dropdown for the first trace
buttons2 = _education_buttons(1)  # dropdown for the second trace

# adjusted dropdown placement
# updatemenus takes one dict per dropdown: its buttons plus how/where to draw it
# https://plotly.com/python/dropdowns/
button_layer_1_height = 1.23
updatemenus = [
    dict(buttons=menu_buttons,
         direction="down",
         pad={"r": 10, "t": 10},
         showactive=True,
         x=menu_x,
         xanchor="left",
         y=button_layer_1_height,
         yanchor="top")
    for menu_buttons, menu_x in ((buttons, 0.11), (buttons2, 0.71))
]

fig.update_layout(updatemenus=updatemenus)
# added annotations next to dropdowns
fig.update_layout(
    annotations=[
        dict(text="Selection 1", x=0, xref="paper", y=1.15, yref="paper",
             align="left", showarrow=False),
        dict(text="Selection 2", x=0.65, xref="paper", y=1.15,
             yref="paper", showarrow=False)
    ])

education_order = ["Doctoral degree",
                   'Master’s degree',
                   'Bachelor’s degree',
                   'Some college/university study without earning a bachelor’s degree',
                   "Professional doctorate",
                   "No formal education past high school",
                   "I prefer not to answer"]
fig.update_xaxes(categoryorder='array', categoryarray=education_order)
fig.show()

THIS IS SO COOOOOOOOL!!! 😎 <br>
(Also I LOVE this bar on this new Mac from Hannah!! - Emojis? Really?? 😳)