# Who takes the longest to complete the Kaggle Survey? ⏱️

#### This is somewhat of a joke notebook. I was just curious to see if certain groups take longer to fill out the survey compared to others. To make it even more joke-worthy, the figures use Comic Sans MS as the font. 😅

#### A few things to note:
- There are a few values that are extremely high, skewing the mean. Whenever I make comments, I am referring to the median. 

- Because there is a wide range of values, a log scale is used.

- Time is in seconds. For reference: 3,600 seconds = 1 hour and 86,400 seconds (86.4k) = 1 day.

- For exact values, hover over the points.

- Figures should be color-blind friendly

In [None]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode

# Notebook-wide configuration: plotly notebook mode, wide DataFrame display,
# and all warnings silenced.
init_notebook_mode(connected=True)
pd.set_option('display.max_columns', 5000)
warnings.filterwarnings("ignore")

# Load the survey responses. The first data row is promoted to the column
# header (later cells address columns by question text) and then dropped
# from the data.
df = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")
columns = df.iloc[0]
df = df.iloc[1:]
df.columns = columns

In [None]:
dur_col = "Duration (in seconds)"

# Make sure the duration column is integer-typed before plotting.
df[dur_col] = df[dur_col].astype(int)

fig = px.histogram(df, x=dur_col, log_y=True, title="Histogram of Time to Complete Survey")

# Dotted reference lines: (position in seconds, annotation text, annotation anchor).
for marker_x, marker_text, marker_pos in (
    (86400, "1 day", "top right"),
    (604800, "1 week", "top right"),
    (604800 * 2, "2 weeks", "top right"),
    (604800 * 3, "3 weeks", "top right"),
    (2592000, "30 days", "top left"),
):
    fig.add_vline(
        x=marker_x,
        annotation_text=marker_text,
        annotation_position=marker_pos,
        opacity=0.5,
        line_width=2,
        line_dash="dot",
    )

fig.update_layout(font=dict(family="Comic Sans MS", size=18))
fig.show()

In [None]:
durations = df[dur_col].sort_values()

# 1000 log-spaced thresholds between 1e2 and 1e7 seconds.
values = np.logspace(2, 7, num=1000)
# durations is sorted ascending, so the left insertion point of each threshold
# equals the number of respondents whose duration is strictly below it.
sums = np.searchsorted(durations.to_numpy(), values, side="left")
fracs = sums / len(df)

frac_col = "Fraction of respondents <br>that completed the survey"

frac_df = pd.DataFrame(data={dur_col: values, frac_col: fracs})

fig = px.line(frac_df, x=dur_col, y=frac_col, log_x=True, title="Fraction of respondents that completed <br>the survey under a certain amount of time")

# Reference markers: (position in seconds, annotation text, annotation anchor).
for marker_x, marker_text, marker_pos in (
    (600, "10'", "bottom left"),
    (1800, "30'", "bottom left"),
    (3600, "1 hour", "bottom right"),
    (86400, "1 day", "bottom left"),
    (604800, "1 wk", "bottom left"),
    (2592000, "30 days", "bottom right"),
):
    # Each marker is drawn as two lines: a zero-width one at log10(x) that only
    # carries the annotation, and the visible dotted one at x itself.
    # NOTE(review): presumably the annotation needs the log10 coordinate to
    # line up on the log-scaled x axis -- confirm against plotly's behaviour.
    fig.add_vline(
        x=np.log10(marker_x),
        annotation_text=marker_text,
        annotation_position=marker_pos,
        opacity=0.5,
        line_width=0,
        line_dash="dot",
    )
    fig.add_vline(x=marker_x, opacity=0.5, line_width=2, line_dash="dot")

fig.update_layout(font=dict(family="Comic Sans MS", size=18))
fig.show()

In [None]:
# Column name (the survey question text) holding each respondent's age answer.
age_col = "What is your age (# years)?"


def create_chart(df, groupby, other_col, x, y, color, symbol, title, log_y=True, log_x=False, sort_values=None, labels=None, **kwargs):
    """Plot per-group summary statistics of ``other_col`` as a scatter chart.

    ``df`` is grouped by ``groupby`` and ``other_col`` is summarised with
    count, mean, median and std.  The summary is reshaped to long form with
    the columns ``groupby``, ``"metric"`` and ``"value"``; these are the
    column names that ``x``, ``y``, ``color`` and ``symbol`` may refer to.

    Args:
        df: DataFrame containing the ``groupby`` and ``other_col`` columns.
        groupby: Name of the column whose values define the groups.
        other_col: Name of the column to summarise.
        x: Long-form column shown on the x axis.
        y: Long-form column shown on the y axis.
        color: Long-form column that selects the marker colour.
        symbol: Long-form column that selects the marker symbol.
        title: Figure title.
        log_y: Forwarded to ``px.scatter`` as ``log_y``.
        log_x: Forwarded to ``px.scatter`` as ``log_x``.
        sort_values: Optional dict of keyword arguments for
            ``DataFrame.sort_values``, applied to the long-form summary
            before plotting.
        labels: Optional mapping of column name to display label.  It is
            merged over the default label for ``"value"``, so it can also
            override that default.
        **kwargs: Any further keyword arguments for ``px.scatter``.

    Returns:
        The figure produced by ``px.scatter`` after marker and font styling.
    """
    metrics = ["count", "mean", "median", "std"]

    # The group keys form the index of the aggregate; a single reset_index()
    # then exposes them as the `groupby` column that melt() uses as id_vars.
    grouped = df.groupby(groupby)[other_col].agg(metrics).reset_index()
    melted = pd.melt(grouped, id_vars=[groupby], value_vars=metrics, var_name="metric", value_name="value")
    if sort_values:
        melted = melted.sort_values(**sort_values)

    # Caller-supplied labels come last so they win over the default.
    labels = {"value": "Time to complete survey (seconds)", **(labels or {})}

    fig = px.scatter(melted, x=x, y=y, color=color, symbol=symbol, title=title, log_y=log_y, log_x=log_x, labels=labels, **kwargs)
    # Large markers with a thin black outline.
    fig.update_traces(marker=dict(size=15, line=dict(width=1, color='black')))
    fig.update_layout(font=dict(family="Comic Sans MS", size=18))
    return fig

# Duration statistics per age group.
create_chart(
    df=df,
    groupby=age_col,
    other_col=dur_col,
    x=age_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Age Group<br><sup>Almost monotonically increasing with age</sup>",
).show()

In [None]:
gender_col = "What is your gender? - Selected Choice"

# Show the question without the " - Selected Choice" suffix on the axis.
labels = {gender_col: "What is your gender?"}

# Duration statistics per gender answer.
create_chart(
    df=df,
    groupby=gender_col,
    other_col=dur_col,
    x=gender_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Gender",
    labels=labels,
).show()

In [None]:
country_col = "In which country do you currently reside?"

# Shorter display names for the longest country labels.
country_map = {
    "Iran, Islamic Republic of...": "Iran",
    "United States of America": "United States",
    "United Kingdom of Great Britain and Northern Ireland": "UK and Northern Ireland"
}

# Values that are not keys of country_map are left untouched.
df[country_col] = df[country_col].replace(country_map)

# Horizontal layout: statistics on a log-scaled x axis, one row per country,
# with the rows pre-sorted by country (descending) and then by metric.
create_chart(
    df=df,
    groupby=country_col,
    other_col=dur_col,
    x='value',
    y=country_col,
    color="metric",
    symbol="metric",
    log_y=False,
    log_x=True,
    title="Time to Complete by Country",
    height=2000,
    sort_values={"by": [country_col, "metric"], "ascending": [False, True]},
).show()

In [None]:
education_col = "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?"

# Shorten the one answer that is too long for an axis tick.
ed_map = {
    'Some college/university study without earning a bachelor’s degree': "Some college/university-no bachelor's degree"
}

# Values that are not keys of ed_map are left untouched.
df[education_col] = df[education_col].replace(ed_map)

# Explicit axis order for the education answers.
category_orders = {
    education_col: [
        'I prefer not to answer',
        'No formal education past high school',
        "Some college/university-no bachelor's degree",
        'Bachelor’s degree',
        'Master’s degree',
        'Doctoral degree',
        'Professional doctorate',
    ]
}

# Same question text with a line break so the axis title fits.
labels = {
    education_col: "What is the highest level of formal education <br>that you have attained or plan to attain within the next 2 years?"
}

create_chart(
    df=df,
    groupby=education_col,
    other_col=dur_col,
    x=education_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Education<br><sup>Almost monotonically increasing with degree level</sup>",
    height=700,
    labels=labels,
    category_orders=category_orders,
).show()

In [None]:
title_col = 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice'

# Axis title: the question with a line break and without the
# ": - Selected Choice" suffix.
labels = {
    title_col: 'Select the title most similar<br>to your current role (or most recent title if retired)'
}

# Duration statistics per job title.
create_chart(
    df=df,
    groupby=title_col,
    other_col=dur_col,
    x=title_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Job Title",
    height=800,
    labels=labels,
).show()

In [None]:
exp_col = 'For how many years have you been writing code and/or programming?'

# Duration statistics per coding-experience bracket, with the brackets ordered
# from least to most experience. The subtitle refers to experience, the
# variable this chart is grouped by.
create_chart(
    df=df,
    groupby=exp_col,
    other_col=dur_col,
    x=exp_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Experience<br><sup>Monotonically increasing with experience</sup>",
    category_orders={
        exp_col: [
            'I have never written code',
            '< 1 years',
            '1-3 years',
            '3-5 years',
            '5-10 years',
            '10-20 years',
            '20+ years',
        ]
    },
).show()

In [None]:
industry_col = 'In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice'

# Axis title: the question with a line break and without the
# " - Selected Choice" suffix.
labels = {
    industry_col: 'In what industry is your current employer/contract <br>(or your most recent employer if retired)?'
}

# Duration statistics per industry.
create_chart(
    df=df,
    groupby=industry_col,
    other_col=dur_col,
    x=industry_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Industry",
    labels=labels,
    height=800,
).show()

In [None]:
size_col = 'What is the size of the company where you are employed?'

# Explicit axis order, smallest company size first.
category_orders = {
    size_col: [
        '0-49 employees',
        '50-249 employees',
        '250-999 employees',
        '1000-9,999 employees',
        '10,000 or more employees',
    ]
}

# Duration statistics per company-size bracket.
create_chart(
    df=df,
    groupby=size_col,
    other_col=dur_col,
    x=size_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Company Size",
    category_orders=category_orders,
).show()

In [None]:
compensation_col = 'What is your current yearly compensation (approximate $USD)?'

# Explicit axis order, lowest compensation bracket first. The labels are kept
# exactly as written, including the uneven use of the "$" prefix.
category_orders = {
    compensation_col: [
        '$0-999',
        '1,000-1,999',
        '2,000-2,999',
        '3,000-3,999',
        '4,000-4,999',
        '5,000-7,499',
        '7,500-9,999',
        '10,000-14,999',
        '15,000-19,999',
        '20,000-24,999',
        '25,000-29,999',
        '30,000-39,999',
        '40,000-49,999',
        '50,000-59,999',
        '60,000-69,999',
        '70,000-79,999',
        '80,000-89,999',
        '90,000-99,999',
        '100,000-124,999',
        '125,000-149,999',
        '150,000-199,999',
        '200,000-249,999',
        '250,000-299,999',
        '300,000-499,999',
        '$500,000-999,999',
        '>$1,000,000',
    ]
}

# Duration statistics per compensation bracket.
create_chart(
    df=df,
    groupby=compensation_col,
    other_col=dur_col,
    x=compensation_col,
    y="value",
    color="metric",
    symbol="metric",
    title="Time to Complete by Compensation<br><sup>If you make a ton of money, you won't take as much time</sup>",
    height=800,
    category_orders=category_orders,
).show()

# What does this show?

#### Not really anything. Maybe the fact that people take longer if they are older/more experienced/more educated. Oh well, I enjoyed myself!

<iframe src="https://giphy.com/embed/3o7btNRptqBgLSKR2w" width="480" height="480" frameBorder="0" class="giphy-embed" allowFullScreen></iframe>