<center style="font-size:400%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The Typical Kaggle Data Scientist in 2021</b></center>

<br><br>

<center style="font-size:200%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> By @miguelfzzz</center>

<center><img src='https://www.gcppodcast.com/images/post/kaggle.png'></center>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore') 

# Read the 2021 Kaggle survey responses and discard the first record of the file
# (presumably the row holding the question wording rather than an answer -- confirm).
df = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv').iloc[1:, :]

<br>

<center style="font-size:300%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> 0. Introduction</center>

<br>

<div style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> 
    
This notebook will explore the fascinating results obtained from the survey conducted by Kaggle in September 2021. Over 25,000 data scientists and ML engineers participated, providing information on their backgrounds and experience in their occupations. <br>

To increase readability, this report is divided into four sections: </div>
    
1. <div style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif">Demographics & Geographics</div>

2. <div style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif">Education & Occupation </div>

3. <div style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif">Knowledge & Skills</div>

4. <div style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif">Platforms & Media</div>

<br>

***


<center style="font-size:300%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> 1. Demographics & Geographics</center>

In [None]:
# Respondents per age bracket (Q1), ordered by bracket label.
# The columns are named explicitly via rename_axis()/reset_index(name=...) instead of
# renaming the implicit 'index'/'Q1' columns: pandas 2.0 changed what value_counts()
# calls them, which left this frame without an 'Age' column.
age = (
    df['Q1']
    .value_counts()
    .rename_axis('Age')
    .reset_index(name='Count')
    .sort_values(by=['Age'], ascending=True)
)

# Share of all Q1 answers, pre-formatted as the bar label (e.g. '12.34%').
age['percent'] = ((age['Count'] / age['Count'].sum())*100).round(2).astype(str) + '%'

# One colour per bar in plotting order: first three bars light, next two mid, rest dark.
colors = ['#5abbf9'] * 3 + ['#0779c3'] * 2 + ['#033351'] * 6


# Vertical bar chart: one bar per age bracket, labelled with its percentage.
fig = go.Figure(go.Bar(
            y=age['Count'],
            x=age['Age'],
            marker_color=colors,
            cliponaxis = False,  # let the outside labels draw beyond the plot area
            text=age['percent']
                        ))

fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  hovertemplate='<b>Age</b>: %{x}<br>'+
                                '<b>Count</b>: %{y}',
                  textfont_size=12)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# The y tick labels are hidden because every bar already carries its percentage.
fig.update_layout(coloraxis=dict(colorscale='Teal'),
                  showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  height=500,
                  yaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  title_text="<b>Age</b> Distribution",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.25,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0,
                                    y=-0.25,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()


<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b>Over <span style="color:#5abbf9;">55%</span> of respondents are between 18 and 29 years of age.</b></p></blockquote>

In [None]:
# Respondents per gender (Q2). The three minority answers are folded into 'Other' and
# 'Man'/'Woman' are relabelled, then the counts are re-aggregated per final label.
# Columns are named explicitly (rename_axis / reset_index(name=...)) so the result does
# not depend on how the installed pandas version names value_counts() output.
gender = (
    df['Q2']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='Count')
    .replace(['Prefer not to say','Nonbinary','Prefer to self-describe'], 'Other')
    .replace(['Man','Woman'], ['Male', 'Female'])
    .groupby('Gender')
    .sum()
    .reset_index()
)

# Slice colours, applied in row order of `gender` (groupby sorts its keys by default).
# The third entry previously lacked its leading '#', so it was not a valid hex colour.
colors = ['#5abbf9','#033351', '#b9e2fc']

# Donut chart (hole=.4) with one slice per gender label.
fig = go.Figure(data=[go.Pie(labels=gender['Gender'],
                             values=gender['Count'],
                             hole=.4)])

# Show the label on each slice and the percentage on hover.
fig.update_traces(hoverinfo='percent',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=colors,
                              line=dict(color='#000000', width=1)))

fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  paper_bgcolor='#F7F7F7',
                  title_text="<b>Gender</b> Distribution",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=25, color='#000000'))

# Author credit, positioned in paper coordinates below the chart (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=1.1,
                                    y=-0.16,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the chart (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=-0.28,
                                    y=-0.16,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))
fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> Males represent the majority of respondents, with <span style="color:#5abbf9;">79%</span> of the total.</b></p></blockquote>

In [None]:
# Lollipop chart of the most common respondent countries (Q3).
# Work on a copy so the categorical conversion below does not touch `df`.
df1 = df.copy()
df1['Q3'] = df1['Q3'].astype('category')

# Everything after the first 15 entries of value_counts() (which orders by descending
# frequency by default) is collapsed into a single 'Others' label.
others = df1['Q3'].value_counts().index[15:]
label = 'Others'

# The new label has to be registered as a category before it can be assigned.
df1['Q3'] = df1['Q3'].cat.add_categories([label])
df1['Q3'] = df1['Q3'].replace(others, label)

# Count per country; the literal answer 'Other' is merged into 'Others' and the long
# official UK name is shortened for display.
# NOTE(review): the rename below assumes reset_index() yields columns named 'index' and
# 'Q3'; newer pandas releases appear to name value_counts() output differently, and
# replace() on categorical data is also version-sensitive -- confirm against the
# pandas version this notebook runs on.
country = (
    df1['Q3']
    .replace(['Other'], 'Others')
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={'index':'Country', 'Q3':'Count'})
    .sort_values(by=['Count'], ascending=False) 
    .replace(['United Kingdom of Great Britain and Northern Ireland'], 'United Kingdom')
          )  

# Share of all counted answers, pre-formatted for the hover text (e.g. '12.34%').
country['percent'] = ((country['Count'] / country['Count'].sum())*100).round(2).astype(str) + '%'
          
# One colour per row position after the ascending re-sort below; positions 12-14
# are highlighted, every other position stays dark.
colors = ['#033351',] * 16
colors[14] = '#0779c3'
colors[13] = '#5abbf9'
colors[12] = '#5abbf9'


# Re-sort ascending by count, keep at most 16 rows and reset the index so that
# country["Count"][i] in the loop below addresses rows by position 0..len-1.
country = (country
           .sort_values(by = ['Count'])
           .iloc[0:16]
           .reset_index())

# Lollipop heads: one marker per country at its count.
fig = go.Figure(go.Scatter(x = country['Count'], 
                           y = country["Country"],
                           text = country['percent'],
                           mode = 'markers',
                           marker_color =colors,
                           marker_size  = 12))

# Lollipop stems: a horizontal line from x=0 to each count. y is given as the row
# position i, which is expected to line up with the i-th category on the y axis.
for i in range(0, len(country)):
               fig.add_shape(type='line',
                              x0 = 0, y0 = i,
                              x1 = country["Count"][i],
                              y1 = i,
                              line=dict(color=colors[i], width = 4))

# <extra></extra> removes the secondary hover box that would show the trace name.
fig.update_traces(hovertemplate='<b>Country</b>: %{y}<br><extra></extra>'+
                                '<b>Count</b>: %{x}<br>'+
                                '<b>Proportion</b>: %{text}')

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#9f9f9f', ticklabelmode='period')
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='#F7F7F7', 
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  yaxis_title=None,
                  xaxis_title=None,
                  title_text="Most Common <b>Countries</b>",
                  title_x=0.5,
                  height=700,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.155,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0,
                                    y=-0.155,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> India is the most common country among respondents, with over <span style="color:#5abbf9;">28%</span> of the total, followed by the U.S. with <span style="color:#5abbf9;">10%</span>.</b></p></blockquote>

<center style="font-size:300%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> 2. Education & Occupation</center>

In [None]:
# Respondents per education level (Q4), most frequent first (value_counts() order).
# Columns are named explicitly (rename_axis / reset_index(name=...)) so the frame has
# 'Education' and 'Count' columns regardless of how the installed pandas version names
# value_counts() output; renaming the implicit 'index'/'Q4' columns broke on pandas 2.0.
education = (
    df['Q4']
    .value_counts()
    .rename_axis('Education')
    .reset_index(name='Count')
    # Shorten the longest answer so the axis label stays readable.
    .replace(['Some college/university study without earning a bachelor’s degree'], 'University studies - No degree')
)

# Share of all Q4 answers, pre-formatted as the bar label (e.g. '12.34%').
education['percent'] = ((education['Count'] / education['Count'].sum())*100).round(2).astype(str) + '%'

# One colour per row of `education`: top two light, next two mid, remaining three dark.
colors = ['#5abbf9'] * 2 + ['#0779c3'] * 2 + ['#033351'] * 3


# Horizontal bar chart: one bar per education level, labelled with its percentage.
fig = go.Figure(go.Bar(
            x=education['Count'],
            y=education['Education'],
            text=education['percent'],
            orientation='h',
            marker_color=colors
                        ))

# <extra></extra> removes the secondary hover box that would show the trace name.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Education</b>: %{y}<br><extra></extra>'+
                                '<b>Count</b>: %{x}',
                  textfont_size=12)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# 'total ascending' orders the categories by bar length, largest at the top.
fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Education</b> Distribution",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=15, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.21,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0,
                                    y=-0.21,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()



<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> Over <span style="color:#5abbf9;">77% </span>of respondents have a Bachelor's and/or a Master's degree.</b></p></blockquote>

In [None]:
# Respondents per current role (Q5), most frequent first.
# Columns are named explicitly (rename_axis / reset_index(name=...)) so the frame has
# 'Role' and 'Count' columns regardless of how the installed pandas version names
# value_counts() output; renaming the implicit 'index'/'Q5' columns broke on pandas 2.0.
role = (
    df['Q5']
    .value_counts()
    .rename_axis('Role')
    .reset_index(name='Count')
    .sort_values(by=['Count'], ascending=False)
)

# Share of all Q5 answers, pre-formatted as the bar label (e.g. '12.34%').
role['percent'] = ((role['Count'] / role['Count'].sum())*100).round(2).astype(str) + '%'

# One colour per row of `role`: the top role light, the next five mid, the rest dark.
colors = ['#5abbf9'] + ['#0779c3'] * 5 + ['#033351'] * 9


# Horizontal bar chart: one bar per role, labelled with its percentage.
fig = go.Figure(go.Bar(
            x=role['Count'],
            y=role['Role'],
            text=role['percent'],
            orientation='h',
            marker_color=colors
                        ))

# <extra></extra> removes the secondary hover box that would show the trace name.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Role</b>: %{y}<br><extra></extra>'+
                                '<b>Count</b>: %{x}',
                  textfont_size=12)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# 'total ascending' orders the categories by bar length, largest at the top.
fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  height = 600,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Role</b> Distribution",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=15, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.17,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0,
                                    y=-0.17,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> Over a quarter of respondents are students, with <span style="color:#5abbf9;">26.2% </span>of the total.</b></p></blockquote>

In [None]:
# Respondents per employer industry (Q20), most frequent first.
# Columns are named explicitly (rename_axis / reset_index(name=...)) so the frame has
# 'Industry' and 'Count' columns regardless of how the installed pandas version names
# value_counts() output; renaming the implicit 'index'/'Q20' columns broke on pandas 2.0.
industry = (
    df['Q20']
    .value_counts()
    .rename_axis('Industry')
    .reset_index(name='Count')
    .sort_values(by=['Count'], ascending=False)
)

# One colour per row of `industry`: top two light, next five mid, the rest dark.
colors = ['#5abbf9'] * 2 + ['#0779c3'] * 5 + ['#033351'] * 11

# Share of all Q20 answers, pre-formatted as the bar label (e.g. '12.34%').
industry['percent'] = ((industry['Count'] / industry['Count'].sum())*100).round(2).astype(str) + '%'

# Horizontal bar chart: one bar per industry, labelled with its percentage.
fig = go.Figure(go.Bar(
            x=industry['Count'],
            y=industry['Industry'],
            text=industry['percent'],
            orientation='h',
            cliponaxis = False,  # let the outside labels draw beyond the plot area
            marker_color=colors
                        ))

# <extra></extra> removes the secondary hover box that would show the trace name.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  hovertemplate='<b>Industry</b>: %{y}<br><extra></extra>'+
                                '<b>Count</b>: %{x}',
                  textfont_size=12)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# 'total ascending' orders the categories by bar length, largest at the top.
fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  height=700,
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Industry</b> Distribution",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.13,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0,
                                    y=-0.13,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The most common industry is technology, with <span style="color:#5abbf9;">25%</span> of the respondents working in it, followed by the education industry, with <span style="color:#5abbf9;">20%</span>.</b></p></blockquote>

<center style="font-size:300%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> 3. Knowledge & Skills</center>

In [None]:
# Display order of the experience brackets, from least to most experienced.
experience_order = ['No experience', '<1 years',
                    '1-3 years', '3-5 years', '5-10 years',
                    '10-20 years', '20+ years']

# Respondents per coding-experience bracket (Q6).
# Columns are named explicitly (rename_axis / reset_index(name=...)) so the frame has
# 'Experience' and 'Count' columns regardless of how the installed pandas version names
# value_counts() output; renaming the implicit 'index'/'Q6' columns broke on pandas 2.0.
experience = (
    df['Q6']
    .value_counts()
    .rename_axis('Experience')
    .reset_index(name='Count')
    # Only these two answers need relabelling; every other answer already matches
    # its entry in experience_order.
    .replace(['I have never written code', '< 1 years'],
             ['No experience', '<1 years'])
)

# A categorical with explicit categories makes sort_values() follow experience_order
# instead of alphabetical order.
experience['Experience'] = pd.Categorical(experience['Experience'], experience_order)

# Share of all Q6 answers, pre-formatted as the bar label (e.g. '12.34%').
experience['percent'] = ((experience['Count'] / experience['Count'].sum())*100).round(2).astype(str) + '%'

experience = experience.sort_values('Experience')

# One colour per bar in plotting order: brackets 2-3 light, 4-5 mid, the others dark.
colors = ['#033351'] + ['#5abbf9'] * 2 + ['#0779c3'] * 2 + ['#033351'] * 2


# Vertical bar chart: one bar per experience bracket, labelled with its percentage.
fig = go.Figure(go.Bar(
            y=experience['Count'],
            x=experience['Experience'],
            cliponaxis = False,  # let the outside labels draw beyond the plot area
            text=experience['percent'],
            marker_color=colors
                        ))

# <extra></extra> removes the secondary hover box that would show the trace name.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  hovertemplate='<b>Experience</b>: %{x}<br><extra></extra>'+
                                '<b>Count</b>: %{y}',
                  textfont_size=12)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# The y tick labels are hidden because every bar already carries its percentage.
fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  height=500,
                  yaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  title_text="<b>Experience</b> Distribution",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=14, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.24,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=-0.03,
                                    y=-0.24,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))
fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> Over <span style="color:#5abbf9;">52% </span>of respondents have less than 3 years of experience writing code and programming.</b></p></blockquote>


In [None]:
# Multiple-choice question Q17: every answer option lives in its own column.
algorithms_cols = [name for name in df.columns if name.startswith('Q17')]

# Non-null answers per option, paired positionally with a readable label; the labels
# are assumed to follow the order of the Q17 columns in the file -- verify if it changes.
algorithms = pd.DataFrame({
    'Algorithms': ['Linear or Logistic Regression', 'Decision Trees or Random Forests',
                   'Gradient Boosting Machines', 'Bayesian Approaches', 'Evolutionary Approaches',
                   'Dense Neural Networks', 'Convolutional Neural Networks', 'Generative Adversarial Networks',
                   'Recurrent Neural Networks', 'Transformer Networks', 'None', 'Other'],
    'Count': df[algorithms_cols].count().to_numpy(),
}).sort_values(by=['Count'], ascending=False)

# Percentage of ALL respondents (not of answers) that ticked each option.
algorithms['percent'] = algorithms['Count'].div(len(df)).mul(100).round(2).astype(str) + '%'

# Palette by rank: top two light, next two mid, next three darker, remaining five darkest.
colors = ['#5abbf9'] * 2 + ['#066eb0'] * 2 + ['#044a77'] * 3 + ['#033351'] * 5

# Horizontal bars with the percentage printed outside each bar.
fig = go.Figure(go.Bar(
    x=algorithms['Count'],
    y=algorithms['Algorithms'],
    text=algorithms['percent'],
    orientation='h',
    marker_color=colors,
    texttemplate='%{text}',
    textposition='outside',
    cliponaxis=False,
    hovertemplate='<b>Algorithm</b>: %{y}<br><extra></extra>'+
                  '<b>Count</b>: %{x}',
    textfont_size=12,
))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(
    showlegend=False,
    plot_bgcolor='#F7F7F7',
    paper_bgcolor='#F7F7F7',
    margin=dict(pad=20),
    height=600,
    xaxis={'showticklabels': False},
    yaxis={'categoryorder':'total ascending'},
    xaxis_title=None,
    yaxis_title=None,
    title_text="Most Commonly Used <b>Algorithms</b>",
    title_x=0.5,
    title_font_size=35,
    font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=15, color='#000000'),
)

# Footer notes below the plot area: author credit (right) then data source (left).
for note_size, note_x, note_text in (
    (14, 0.98, "@miguelfzzz"),
    (12, 0, "Source: 2021 Kaggle Machine Learning & Data Science Survey"),
):
    fig.add_annotation(font=dict(size=note_size),
                       x=note_x,
                       y=-0.17,
                       showarrow=False,
                       text=note_text,
                       xanchor='left',
                       xref="paper",
                       yref="paper")

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The most commonly used algorithms among data scientists are Linear or Logistic Regression and Decision Trees or Random Forests, with <span style="color:#5abbf9;">53%</span> and <span style="color:#5abbf9;">66%</span> respectively.</b></p></blockquote>

In [None]:
# Multiple-choice question Q7 (programming languages): one column per answer option.
languages_cols = [col for col in df if col.startswith('Q7')]

languages = df[languages_cols]

# Readable labels, assigned positionally; they are assumed to follow the order of the
# Q7 columns in the file -- verify if the column layout changes.
languages.columns = ['Python', 'R', 'SQL', 'C', 'C++', 'Java',
                     'Javascript', 'Julia', 'Swift', 'Bash',
                     'MATLAB', 'None', 'Other']

# Non-null answers per option, most selected first.
languages = (
    languages
    .count()
    .to_frame()
    .reset_index()
    .rename(columns={'index':'Languages', 0:'Count'})
    .sort_values(by=['Count'], ascending=False)
    )

# Percentage of ALL respondents (not of answers) that ticked each option.
languages['percent'] = ((languages['Count'] / len(df))*100).round(2).astype(str) + '%'

# Palette by rank: top two light, next five mid, next three darker, last three darkest.
colors = ['#5abbf9'] * 2 + ['#0779c3'] * 5 + ['#05568a'] * 3 + ['#033351'] * 3


# Horizontal bar chart: one bar per language, labelled with its percentage.
fig = go.Figure(go.Bar(
            x=languages['Count'],
            y=languages['Languages'],
            text=languages['percent'],
            orientation='h',
            marker_color=colors
                        ))

# The hover label is user-facing text; it previously read 'Lenguage'.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Language</b>: %{y}<br><extra></extra>'+
                                '<b>Count</b>: %{x}',
                  textfont_size=12)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# 'total ascending' orders the categories by bar length, largest at the top.
fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  height=700,
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="Most Commonly Used <b>Programming Languages</b>",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.13,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0,
                                    y=-0.13,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()


<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The most commonly used programming languages are Python and SQL, with <span style="color:#5abbf9;">84%</span> and <span style="color:#5abbf9;">41%</span> of all respondents respectively.</b></p></blockquote>

In [None]:
# Respondents per recommended first language (Q8), most frequent first.
# Columns are named explicitly (rename_axis / reset_index(name=...)) so the frame has
# both expected columns regardless of how the installed pandas version names
# value_counts() output; renaming the implicit 'index'/'Q8' columns broke on pandas 2.0.
# The variable and column names keep their original (misspelled) form so that any other
# cell indexing this frame keeps working.
recommend_leng = (
    df['Q8']
    .value_counts()
    .rename_axis('Lenguage')
    .reset_index(name='Count')
    .sort_values(by=['Count'], ascending=False)
)

# Colourway handed to the treemap: four distinct leading shades, then the darkest tone.
colors = ['#5abbf9', '#066eb0', '#044a77'] + ['#043e64'] * 2 + ['#033351'] * 8

# Flat treemap: every language is a top-level tile (empty parent), sized by its count.
fig = go.Figure(go.Treemap(
    labels = recommend_leng['Lenguage'],
    values = recommend_leng['Count'],
    parents = ['']*recommend_leng.shape[0],
    textinfo = "percent root+label+value+text",
))

# The hover label is user-facing text; it previously read 'Lenguage'.
fig.update_traces(hovertemplate='<b>Language</b>: %{label}<br><extra></extra>'+
                                '<b>Count</b>: %{value}')

fig.update_layout(showlegend=False,
                  treemapcolorway = colors,
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  plot_bgcolor='#F7F7F7',
                  height=600,
                  yaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  title_text="Most Recommended <b>Programming Language</b>",
                  title_x=0.5,
                  title_y=0.95,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the chart (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.96,
                                    y=-0.14,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the chart (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0.01,
                                    y=-0.14,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> Python is by far the most recommended programming language for aspiring data scientists, with <span style="color:#5abbf9;">81%</span> of the total.</b></p></blockquote>

In [None]:
# Multiple-choice question Q9 (IDEs): one column per answer option.
ide_cols = [col for col in df if col.startswith('Q9')]

ide = df[ide_cols]

# Readable labels, assigned positionally; they are assumed to follow the order of the
# Q9 columns in the file -- verify if the column layout changes.
ide.columns = ['JupyterLab', 'RStudio', 'Visual Studio', 'VSCode',
               'PyCharm', 'Spyder', 'Notepad++', 'Sublime Text', 'Vim, Emacs, or similar',
               'MATLAB', 'Jupyter Notebook', 'None', 'Other']

# Non-null answers per option, most selected first.
ide = (
    ide
    .count()
    .to_frame()
    .reset_index()
    .rename(columns={'index':'IDE', 0:'Count'})
    .sort_values(by=['Count'], ascending=False)
    )


# Percentage of ALL respondents (not of answers) that ticked each option.
ide['percent'] = ((ide['Count'] / len(df))*100).round(2).astype(str) + '%'

# Palette by rank: top two light, next six mid, remaining five dark.
colors = ['#5abbf9'] * 2 + ['#0779c3'] * 6 + ['#033351'] * 5


# Horizontal bar chart: one bar per IDE. The labels must come from this cell's own
# frame; they were previously taken from role['percent'], so the bars displayed the
# role percentages and the cell depended on the Role cell having been run.
fig = go.Figure(go.Bar(
            x=ide['Count'],
            y=ide['IDE'],
            text=ide['percent'],
            orientation='h',
            marker_color=colors
                        ))

# <extra></extra> removes the secondary hover box that would show the trace name.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>IDE</b>: %{y}<br><extra></extra>'+
                                '<b>Count</b>: %{x}',
                  textfont_size=12)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# 'total ascending' orders the categories by bar length, largest at the top.
fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  height = 600,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="Most Commonly Used <b>IDE's</b>",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=15, color='#000000'),
                  title_font_size=35)

# Author credit, positioned in paper coordinates below the plot (right side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.17,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, positioned in paper coordinates below the plot (left side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0,
                                    y=-0.17,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))


fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The most common IDEs for daily use are Jupyter Notebook and VSCode, with <span style="color:#5abbf9;">26% </span> and <span style="color:#5abbf9;">14% </span> respectively.</b></p></blockquote>


<center style="font-size:300%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> 4. Platforms & Media</center>

In [None]:
# Lollipop chart of the answers to Q11: one row per distinct answer, with its
# frequency and its share of all Q11 answers.

# `rename_axis` + `reset_index(name=...)` names the two columns explicitly, so
# the result does not depend on how a given pandas version labels the output
# of `value_counts()` (the naming changed in pandas 2.0).
platform = (
    df['Q11']
    .value_counts()
    .rename_axis('Platform')
    .reset_index(name='Count')
    # Shorten the two longest answer texts so the y-axis labels stay readable.
    .replace(['A deep learning workstation (NVIDIA GTX, LambdaLabs, etc)',
              'A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)'],
              ['A deep learning workstation', 'A cloud computing platform'])
          )

# Share of each answer among all non-null Q11 answers.
platform['percent'] = ((platform['Count'] / platform['Count'].sum())*100).round(2).astype(str) + '%'

# Ascending order: the most frequent answer is the last row, which ends up at
# the top of the chart. At most 15 rows are kept.
platform = (platform
           .sort_values(by = ['Count'])
           .iloc[0:15]
           .reset_index(drop=True))

# One colour per row, sized from the data instead of a hard-coded length so the
# per-row lookup below cannot run out of entries. The largest row gets the
# highlight tone, the next two the mid tone, everything else the dark base tone.
colors = ['#033351'] * len(platform)
for rank, shade in enumerate(['#5abbf9', '#0779c3', '#0779c3'], start=1):
    if rank <= len(colors):
        colors[-rank] = shade

# Lollipop heads.
fig = go.Figure(go.Scatter(x = platform['Count'],
                           y = platform["Platform"],
                           text = platform['percent'],
                           mode = 'markers',
                           marker_color =colors,
                           marker_size  = 12))

# Lollipop stems: one horizontal line per row, from 0 to the row's count.
# y is the row position on the categorical axis.
for i, (count, shade) in enumerate(zip(platform["Count"], colors)):
    fig.add_shape(type='line',
                  x0 = 0, y0 = i,
                  x1 = count,
                  y1 = i,
                  line=dict(color=shade, width = 4))

fig.update_traces(hovertemplate='<b>Platform</b>: %{y}<br><extra></extra>'+
                                '<b>Count</b>: %{x}<br>'+
                                '<b>Proportion</b>: %{text}')

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#9f9f9f', ticklabelmode='period')
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='#F7F7F7',
                  margin=dict(pad=20),
                  paper_bgcolor='#F7F7F7',
                  yaxis_title=None,
                  xaxis_title=None,
                  title_text="Most Commonly Used <b>Computing Platforms</b>",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
                  title_font_size=35)

# Author credit, anchored in paper coordinates below the plot (right-hand side).
fig.add_annotation(dict(font=dict(size=14),
                                    x=0.98,
                                    y=-0.22,
                                    showarrow=False,
                                    text="@miguelfzzz",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

# Data-source note, anchored in paper coordinates below the plot (left-hand side).
fig.add_annotation(dict(font=dict(size=12),
                                    x=0.04,
                                    y=-0.22,
                                    showarrow=False,
                                    text="Source: 2021 Kaggle Machine Learning & Data Science Survey",
                                    xanchor='left',
                                    xref="paper",
                                    yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The majority of respondents use a laptop for their work, with <span style="color:#5abbf9;">66% </span>of the total.</b></p></blockquote>

In [None]:
# Horizontal bar chart: number of non-null answers in each Q27_A column,
# labelled with that count as a percentage of all rows in `df`.

# Every column whose label begins with Q27_A.
cloud_platform_cols = [name for name in df if name.startswith('Q27_A')]

cloud_platform = df[cloud_platform_cols]

# Display labels, assigned positionally to the selected columns.
cloud_platform.columns = [
    'Amazon Web Services (AWS)',
    'Microsoft Azure',
    'Google Cloud Platform (GCP)',
    'IBM Cloud / Red Hat',
    'Oracle Cloud',
    'SAP Cloud',
    'Salesforce Cloud',
    'VMware Cloud',
    'Alibaba Cloud',
    'Tencent Cloud',
    'None',
    'Other',
]

# Non-null answers per column as a two-column frame, largest first.
cloud_platform = (
    cloud_platform
    .count()
    .rename_axis('Cloud_Platform')
    .reset_index(name='Count')
    .sort_values(by=['Count'], ascending=False)
)

# Percentage of all rows in `df`, formatted as text for the bar labels.
cloud_platform['percent'] = (
    (cloud_platform['Count'] / len(df) * 100).round(2).astype(str) + '%'
)

# First four rows (the largest counts) highlighted, remaining eight in the base tone.
colors = ['#5abbf9'] * 4 + ['#033351'] * 8

# All trace styling is passed to the constructor directly.
fig = go.Figure(
    go.Bar(
        x=cloud_platform['Count'],
        y=cloud_platform['Cloud_Platform'],
        text=cloud_platform['percent'],
        orientation='h',
        cliponaxis=False,
        marker_color=colors,
        texttemplate='%{text}',
        textposition='outside',
        hovertemplate=(
            '<b>Cloud Platform</b>: %{y}<br><extra></extra>'
            '<b>Count</b>: %{x}'
        ),
        textfont_size=12,
    )
)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(
    showlegend=False,
    plot_bgcolor='#F7F7F7',
    paper_bgcolor='#F7F7F7',
    margin=dict(pad=20),
    height=700,
    xaxis={'showticklabels': False},
    yaxis={'categoryorder': 'total ascending'},
    xaxis_title=None,
    yaxis_title=None,
    title_text="Most Commonly Used <b>Cloud Platforms</b>",
    title_x=0.5,
    title_font_size=35,
    font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
)

# Footer annotations in paper coordinates: author credit first, then the data source.
for x_pos, font_size, caption in [
    (0.98, 14, "@miguelfzzz"),
    (0, 12, "Source: 2021 Kaggle Machine Learning & Data Science Survey"),
]:
    fig.add_annotation(dict(font=dict(size=font_size),
                            x=x_pos,
                            y=-0.13,
                            showarrow=False,
                            text=caption,
                            xanchor='left',
                            xref="paper",
                            yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The most commonly used cloud platforms are Amazon Web Services (<span style="color:#5abbf9;">14%</span>), Google Cloud Platform (<span style="color:#5abbf9;">12%</span>) and Microsoft Azure (<span style="color:#5abbf9;">9%</span>).</b></p></blockquote>

In [None]:
# Treemap: number of non-null answers in each Q40 column, one tile per column.

# Every column whose label begins with Q40.
courses_cols = [name for name in df if name.startswith('Q40')]

courses = df[courses_cols]

# Display labels, assigned positionally to the selected columns.
courses.columns = [
    'Coursera',
    'edX',
    'Kaggle Learn Courses',
    'DataCamp',
    'Fast.ai',
    'Udacity',
    'Udemy',
    'LinkedIn Learning',
    'Cloud-certification programs',
    'University Courses',
    'None',
    'Other',
]

# Non-null answers per column as a two-column frame, largest first.
courses = (
    courses
    .count()
    .rename_axis('Courses')
    .reset_index(name='Count')
    .sort_values(by=['Count'], ascending=False)
)

# Treemap colourway: a light-to-dark ramp for the first seven entries, the base
# tone for entries 8-11 and the darkest tone for the last one.
colors = [
    '#5abbf9',
    '#34abf8',
    '#0e9cf6',
    '#0885d6',
    '#066eb0',
    '#05568a',
    '#043e64',
    '#033351',
    '#033351',
    '#033351',
    '#033351',
    '#021b2b',
]

# Flat treemap: every tile hangs directly off the (empty-string) root.
fig = go.Figure(
    go.Treemap(
        labels=courses['Courses'],
        values=courses['Count'],
        parents=[''] * len(courses),
        textinfo="percent root+label+value+text",
        hovertemplate=(
            '<b>Course Platform</b>: %{label}<br><extra></extra>'
            '<b>Count</b>: %{value}'
        ),
    )
)

fig.update_layout(
    showlegend=False,
    plot_bgcolor='#F7F7F7',
    paper_bgcolor='#F7F7F7',
    treemapcolorway=colors,
    margin=dict(pad=20),
    height=600,
    yaxis={'showticklabels': False},
    yaxis_title=None,
    xaxis_title=None,
    title_text="Most Commonly Used <b>Course Platforms</b>",
    title_x=0.5,
    title_y=0.95,
    title_font_size=35,
    font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
)

# Footer annotations in paper coordinates: author credit first, then the data source.
for x_pos, font_size, caption in [
    (0.96, 14, "@miguelfzzz"),
    (0.01, 12, "Source: 2021 Kaggle Machine Learning & Data Science Survey"),
]:
    fig.add_annotation(dict(font=dict(size=font_size),
                            x=x_pos,
                            y=-0.14,
                            showarrow=False,
                            text=caption,
                            xanchor='left',
                            xref="paper",
                            yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> Data scientists' most commonly used course platforms are Coursera and Kaggle, with <span style="color:#5abbf9;">20%</span> and <span style="color:#5abbf9;">18%</span> respectively.</b></p></blockquote>

In [None]:
# Lollipop chart: number of non-null answers in each Q42 column, with that
# count as a percentage of all rows in `df` shown on hover.

# Every column whose label begins with Q42.
media_cols = [name for name in df if name.startswith('Q42')]

media = df[media_cols]

# Display labels, assigned positionally to the selected columns.
media.columns = [
    'Twitter',
    'Email newsletters',
    'Reddit',
    'Kaggle',
    'Course Forums',
    'YouTube',
    'Podcasts',
    'Blogs',
    'Journal Publications',
    'Slack Communities',
    'None',
    'Other',
]

# Non-null answers per column as a two-column frame, largest first.
media = (
    media
    .count()
    .rename_axis('Medias')
    .reset_index(name='Count')
    .sort_values(by=['Count'], ascending=False)
)

# Colours follow the ascending row order built below: four base-tone rows,
# five mid-tone rows, and the three largest rows in the highlight tone.
colors = ['#033351'] * 4 + ['#0779c3'] * 5 + ['#5abbf9'] * 3

# Percentage of all rows in `df`, formatted as text for the hover box.
media['percent'] = (media['Count'] / len(df) * 100).round(2).astype(str) + '%'

# Re-sort ascending so the largest row is drawn at the top; keep at most 15
# rows and renumber them from 0 (the previous index is kept as a column).
media = media.sort_values(by=['Count']).iloc[0:15].reset_index()

# Lollipop heads.
fig = go.Figure(
    go.Scatter(
        x=media['Count'],
        y=media["Medias"],
        text=media['percent'],
        mode='markers',
        marker_color=colors,
        marker_size=12,
        hovertemplate=(
            '<b>Media Source</b>: %{y}<br><extra></extra>'
            '<b>Count</b>: %{x}<br>'
            '<b>Proportion</b>: %{text}'
        ),
    )
)

# Lollipop stems: one horizontal line per row, from 0 to the row's count.
# y is the row position on the categorical axis.
for row_pos, count in enumerate(media["Count"]):
    fig.add_shape(
        type='line',
        x0=0,
        y0=row_pos,
        x1=count,
        y1=row_pos,
        line=dict(color=colors[row_pos], width=4),
    )

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#9f9f9f', ticklabelmode='period')
fig.update_yaxes(showgrid=False)

fig.update_layout(
    showlegend=False,
    plot_bgcolor='#F7F7F7',
    paper_bgcolor='#F7F7F7',
    margin=dict(pad=20),
    height=700,
    yaxis_title=None,
    xaxis_title=None,
    title_text="Most Commonly Used <b>Media Sources</b>",
    title_x=0.5,
    title_font_size=35,
    font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=17, color='#000000'),
)

# Footer annotations in paper coordinates: author credit first, then the data source.
for x_pos, font_size, caption in [
    (0.98, 14, "@miguelfzzz"),
    (0, 12, "Source: 2021 Kaggle Machine Learning & Data Science Survey"),
]:
    fig.add_annotation(dict(font=dict(size=font_size),
                            x=x_pos,
                            y=-0.15,
                            showarrow=False,
                            text=caption,
                            xanchor='left',
                            xref="paper",
                            yref="paper"))

fig.show()

<blockquote><p style="font-size:130%; font-family:Hiragino Kaku Gothic Pro, sans-serif"><b> The most commonly used media sources that report on data science topics are Kaggle (<span style="color:#5abbf9;">44%</span>), YouTube (<span style="color:#5abbf9;">40%</span>) and Blogs (<span style="color:#5abbf9;">31%</span>).</b></p></blockquote>

<center style="font-size:350%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> <b>Takeaways</b></center>

<center style="font-size:300%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> The typical Kaggle data scientist in 2021</center>

<center><img src='https://i.ibb.co/SwLBK3r/Presentation1.jpg'></center>


<center style="font-size:200%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> 🛠 Work in progress 🛠</center>

<br>

<center style="font-size:200%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> If you liked my work, please let me know in the comments and give it a like!</center>

<br>

<center style="font-size:250%; font-family:Hiragino Kaku Gothic Pro, sans-serif"> Thanks for your attention</center>
