# Introduction
In this notebook, we answer the most frequently asked questions (FAQs) about data scientists by analyzing real-world survey data. We use the 2021 Kaggle Machine Learning and Data Science Survey, so every answer to these pressing questions is backed by real-world data. **For consistency in the analysis, we only include data scientists in the USA.**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt 
plt.style.use('ggplot')
import seaborn as sns 

%matplotlib inline 




# Load the raw survey responses. Per the original notes, the first row of the
# export contains the question text rather than an answer, so it is stored
# separately and then removed from the response rows.
df_survey_2021 = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', low_memory=False, encoding='ISO-8859-1')
question = df_survey_2021.iloc[0]  # question text per column, kept for later reference
df_survey_2021 = df_survey_2021.drop([0])

# Keep only respondents whose job title (Q5) is "Data Scientist" and whose
# country (Q3) is the USA.
df_data_scientist = df_survey_2021[df_survey_2021.Q5 == 'Data Scientist']
# .copy() makes this an independent frame: the column added further down would
# otherwise be written into a boolean-mask slice of df_data_scientist
# (SettingWithCopyWarning, hidden here by the global warnings filter).
df_usa_data_scientist = df_data_scientist[df_data_scientist.Q3 == 'United States of America'].copy()

# Q25 compensation brackets, lowest to highest, spelled exactly as they are
# matched against the Q25 column below.
compensation_list_order = ['$0-999',
                           '1,000-1,999',
                           '2,000-2,999',
                           '3,000-3,999',
                           '4,000-4,999',
                           '5,000-7,499',
                           '7,500-9,999',
                           '10,000-14,999',
                           '15,000-19,999',
                           '20,000-24,999',
                           '25,000-29,999',
                           '30,000-39,999',
                           '40,000-49,999',
                           '50,000-59,999',
                           '60,000-69,999',
                           '70,000-79,999',
                           '80,000-89,999',
                           '90,000-99,999',
                           '100,000-124,999',
                           '125,000-149,999',
                           '150,000-199,999',
                           '200,000-249,999',
                           '250,000-299,999',
                           '300,000-499,999',
                           '$500,000-999,999',
                           '>$1,000,000']

# Group the brackets by the number of digits in the salary.
# Everything below 10,000 is treated as "4 figure", including '$0-999'.
salary_4_figures = compensation_list_order[0:7]    # up to 9,999
salary_5_figures = compensation_list_order[7:18]   # 10,000 - 99,999
salary_6_figures = compensation_list_order[18:25]  # 100,000 - 999,999
salary_7_figures = compensation_list_order[25:]    # 1,000,000 and above

# One boolean mask per income band, in the same order as `choices`.
conditions = [
    df_usa_data_scientist['Q25'].isin(salary_4_figures),
    df_usa_data_scientist['Q25'].isin(salary_5_figures),
    df_usa_data_scientist['Q25'].isin(salary_6_figures),
    df_usa_data_scientist['Q25'].isin(salary_7_figures),
]
choices = ['4 Figure Income', '5 Figure Income', '6 Figure Income', '7 Figure Income']

# Rows whose Q25 value is missing or not in any bracket get the placeholder 'N'.
df_usa_data_scientist['salary_conditions'] = np.select(
    conditions, choices, default='N')

# Drop the placeholder rows so only respondents with a known income band remain.
# The trailing .copy() again yields an independent frame, because later cells
# assign columns on df_usa_data_scientist.
df_usa_data_scientist = df_usa_data_scientist.loc[
    df_usa_data_scientist['salary_conditions'].isin(choices)
].copy()





# FAQ 1: What is the salary of a data scientist?

In [None]:
# Horizontal bar chart: share of USA data scientists per Q25 compensation bracket.

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#80807F'
secondary_color_text = '#80807F'

# Percentage of respondents per bracket, sorted ascending so that the most
# common bracket is drawn last, i.e. as the top bar.
temp = df_usa_data_scientist['Q25'].value_counts(normalize=True, ascending=True).mul(100).round(1)

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(8, 14), facecolor='#FFFFFF')
ax.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)

# Write each percentage just to the right of its bar.
for bar, share in zip(ax.patches, temp.values):
    ax.text(bar.get_width() + .19, bar.get_y() + .26, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
# set_frame_on(False) is used instead of the plt.box(on=None) toggle so the
# result does not depend on the frame's previous state.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the three most common brackets (the last three bars). Selecting
# them from the end of ax.patches avoids hard-coded artist indices, which only
# pointed at the right bars when exactly 26 brackets were present.
highlight_count = 3
for bar, tick_label in zip(ax.patches[-highlight_count:],
                           ax.get_yticklabels()[-highlight_count:]):
    bar.set_color(primary_color)
    tick_label.set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('100K-200K is the most popular salary range',
            xy=(-.75, 1.06), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Yearly compensation of Data Scientists in USA',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded; confirm it
# still matches len(df_usa_data_scientist).
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'What is your current yearly compensation (approximate $USD)? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.04), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

In [None]:
# Horizontal bar chart: share of USA data scientists per income band
# (4/5/6/7 figure), taken from the derived salary_conditions column.

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#80807F'
secondary_color_text = '#80807F'

# Percentage of respondents per band, ascending so the largest band is the top bar.
temp = df_usa_data_scientist['salary_conditions'].value_counts(normalize=True, ascending=True).mul(100).round(1)

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(5, 4), facecolor='#FFFFFF')
ax.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)

# Write each percentage just to the right of its bar.
for bar, share in zip(ax.patches, temp.values):
    ax.text(bar.get_width() + 1, bar.get_y() + .26, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most common band. Using the last bar instead of a fixed child
# index keeps this correct when an income band is absent from the data (a
# fixed index would then land on a text label rather than a bar).
if ax.patches:
    ax.patches[-1].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('Data Scientists are 6 figure earners',
            xy=(-.75, 1.15), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Yearly compensation of Data Scientists in USA',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded; confirm it
# still matches len(df_usa_data_scientist).
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'What is your current yearly compensation (approximate $USD)? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

We see that data scientists earn a very good salary: about 69% of them earn six-figure salaries in the USA.

# FAQ 2: Do you need formal education for data science?

In [None]:
# Horizontal bar chart: highest formal education (Q4) of USA data scientists.

# Normalise the Q4 labels. The apostrophe in the raw labels is mis-decoded
# (the file is read as ISO-8859-1), so the keys are regular expressions that
# accept any non-letter character(s) in the apostrophe position instead of one
# exact byte sequence. This matches the garbled form, a typographic apostrophe
# and a plain "'" alike, and re-running the cell is harmless.
dict_education_value_rename = {
    r'^Some college/university study without earning a bachelor[^A-Za-z ]+s degree$': 'Some college/university study',
    r'^Bachelor[^A-Za-z ]+s degree$': "Bachelor's degree",
    r'^Master[^A-Za-z ]+s degree$': "Master's degree",
}
df_usa_data_scientist['Q4'] = df_usa_data_scientist['Q4'].replace(
    dict_education_value_rename, regex=True)

# Respondents who declined to answer are left out of this chart.
plot_df_usa_ds_education = df_usa_data_scientist[df_usa_data_scientist.Q4 != 'I prefer not to answer']

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#80807F'
secondary_color_text = '#80807F'

# Percentage of respondents per education level, ascending so the most common
# level is the top bar.
temp = plot_df_usa_ds_education['Q4'].value_counts(normalize=True, ascending=True).mul(100).round(1)

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(5, 5), facecolor='#FFFFFF')
ax.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)

# Write each percentage just to the right of its bar.
for bar, share in zip(ax.patches, temp.values):
    ax.text(bar.get_width() + .19, bar.get_y() + .26, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most common education level (last bar and its tick label)
# without depending on how many levels appear in the data.
if ax.patches:
    ax.patches[-1].set_color(primary_color)
    ax.get_yticklabels()[-1].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate("Data Scientists thinks formal education are important \nand Master's degree is most popular",
            xy=(-.75, 1.15), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Formal Education attained or plan to attain by Data Scientist in USA',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded and this
# chart additionally excludes "I prefer not to answer"; confirm the number.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'What is the highest level of formal education that you have attained or plan to attain within the next 2 years? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

The majority of data scientists in the USA have attained, or plan to attain, a formal degree — especially a Master's degree, a Doctoral degree, or a Bachelor's degree. So we see that formal education is perceived as important in a data science career.

# FAQ 3: Which industry is the most popular for data scientists?

In [None]:
# Horizontal bar chart: employer industry (Q20) of USA data scientists.

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#80807F'
secondary_color_text = '#80807F'

# Percentage of respondents per industry, ascending so the most common
# industry is the top bar.
temp = df_usa_data_scientist['Q20'].value_counts(normalize=True, ascending=True).mul(100).round(1)

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(5, 8), facecolor='#FFFFFF')
ax.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)

# Write each percentage just to the right of its bar.
for bar, share in zip(ax.patches, temp.values):
    ax.text(bar.get_width() + .16, bar.get_y() + .14, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most common industry (last bar and its tick label) without
# depending on how many industries appear in the data.
if ax.patches:
    ax.patches[-1].set_color(primary_color)
    ax.get_yticklabels()[-1].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('Technology industries are the most popular among data scientists',
            xy=(-.75, 1.15), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Industries for Data Scientists in the USA',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded; confirm it
# still matches len(df_usa_data_scientist).
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'In what industry is your current employer/contract? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

In [None]:
# Side-by-side bar charts of employer industry (Q20): 5 figure earners on the
# left panel, 6 figure earners on the right panel.

# Colour palette: accent colour for highlights, grey for everything else.
primary_color, secondary_color = '#174A7E', '#80807F'
secondary_color_text = '#80807F'

# Split the respondents by derived income band.
income_band = df_usa_data_scientist.salary_conditions
plot_df_5_figure_earner = df_usa_data_scientist[income_band == '5 Figure Income']
plot_df_6_figure_earner = df_usa_data_scientist[income_band == '6 Figure Income']

fig, axes = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(20, 8), facecolor='#FFFFFF')

# Draw one panel per income band: bars first, then a percentage label per bar.
for panel, earners in zip(axes, (plot_df_5_figure_earner, plot_df_6_figure_earner)):
    temp = earners['Q20'].value_counts(normalize=True, ascending=True).mul(100).round(1)
    bar_secondary_colors = [secondary_color] * temp.shape[0]
    panel.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)
    for bar, share in zip(panel.patches, temp.values):
        panel.text(bar.get_width() + .19, bar.get_y() + .26, f'{round(share, 2)}%',
                   fontsize=15, color=secondary_color)

# Strip the chart chrome on both panels: no tick marks, no frame, no x ticks.
for panel in axes:
    panel.yaxis.set_tick_params(length=0,
                                labelsize=15,
                                colors=secondary_color_text)
    panel.set_frame_on(False)
    panel.set_xticks([])

# Highlight selected bars and their tick labels, addressed by position.
# NOTE(review): these positions are tied to the bar order produced by the
# current data; confirm they still point at the intended industries.
highlighted_positions = {0: (16, 13, 11), 1: (17, 16, 15)}
for panel_no, positions in highlighted_positions.items():
    panel = axes[panel_no]
    for position in positions:
        panel.get_children()[position].set_color(primary_color)
        panel.get_yticklabels()[position].set_color(primary_color)

# Headline plus one subtitle per panel, in axes-fraction coordinates.
left_panel, right_panel = axes[0], axes[1]
left_panel.annotate('Data Scientists with 6 figure income mostly work in Technology, Pharma and Finance industries',
                    xy=(-1.5, 1.09), xycoords='axes fraction',
                    fontsize=20, fontweight='medium', color=primary_color)
left_panel.annotate('Employer industries of Data Scientists of 5 figure income in USA',
                    xy=(-.75, 1.002), xycoords='axes fraction',
                    fontsize=15, fontweight='light', color=secondary_color_text)
right_panel.annotate('Employer industries of Data Scientists of 6 figure income in USA',
                     xy=(-.075, 1.002), xycoords='axes fraction',
                     fontsize=15, fontweight='light', color=secondary_color_text)

# Source footnote under the left panel.
footnote = ('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'In what industry is your current employer/contract? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied')
left_panel.annotate(footnote,
                    xy=(-.75, -.09), xycoords='axes fraction',
                    fontsize=11, color=secondary_color_text)

plt.tight_layout()
plt.show()

# FAQ 4: Which types of companies do data scientists work for?

In [None]:
# Horizontal bar chart: employer machine-learning adoption (Q23) among USA
# data scientists.

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#80807F'
secondary_color_text = '#80807F'

# Percentage of respondents per answer, ascending so the most common answer is
# the top bar.
temp = df_usa_data_scientist['Q23'].value_counts(normalize=True, ascending=True).mul(100).round(1)

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(8, 5), facecolor='#FFFFFF')
ax.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)

# Write each percentage just to the right of its bar.
for bar, share in zip(ax.patches, temp.values):
    ax.text(bar.get_width() + 1, bar.get_y() + .26, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most common answer (last bar and its tick label) without
# depending on how many distinct answers appear in the data.
if ax.patches:
    ax.patches[-1].set_color(primary_color)
    ax.get_yticklabels()[-1].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('Data Scientists favor companies that uses Machine Learning in their production',
            xy=(-.75, 1.15), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Data scientists employer machine learning adoption',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded; confirm it
# still matches len(df_usa_data_scientist).
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'Does your current employer incorporate machine learning methods into their business? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

In [None]:
# Side-by-side bar charts of data science team size (Q22): 5 figure earners on
# the left panel, 6 figure earners on the right panel.

# Colour palette: accent colour for highlights, grey for everything else.
primary_color, secondary_color = '#174A7E', '#80807F'
secondary_color_text = '#80807F'

# Split the respondents by derived income band.
income_band = df_usa_data_scientist.salary_conditions
plot_df_5_figure_earner = df_usa_data_scientist[income_band == '5 Figure Income']
plot_df_6_figure_earner = df_usa_data_scientist[income_band == '6 Figure Income']

fig, axes = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(20, 8), facecolor='#FFFFFF')

# Draw one panel per income band: bars first, then a percentage label per bar.
for panel, earners in zip(axes, (plot_df_5_figure_earner, plot_df_6_figure_earner)):
    temp = earners['Q22'].value_counts(normalize=True, ascending=True).mul(100).round(1)
    bar_secondary_colors = [secondary_color] * temp.shape[0]
    panel.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)
    for bar, share in zip(panel.patches, temp.values):
        panel.text(bar.get_width() + .19, bar.get_y() + .26, f'{round(share, 2)}%',
                   fontsize=15, color=secondary_color)

# Strip the chart chrome on both panels: no tick marks, no frame, no x ticks.
for panel in axes:
    panel.yaxis.set_tick_params(length=0,
                                labelsize=15,
                                colors=secondary_color_text)
    panel.set_frame_on(False)
    panel.set_xticks([])

left_panel, right_panel = axes[0], axes[1]

# Highlight one bar and its tick label on the 6 figure panel, by position.
# NOTE(review): position 5 is tied to the bar order produced by the current
# data; confirm it still points at the intended team-size bucket.
highlighted_position = 5
right_panel.get_children()[highlighted_position].set_color(primary_color)
right_panel.get_yticklabels()[highlighted_position].set_color(primary_color)

# Headline plus one subtitle per panel, in axes-fraction coordinates.
left_panel.annotate('Data Scientists with 6 figure incomes favor working \nin companies that have a good data science team ',
                    xy=(-.75, 1.15), xycoords='axes fraction',
                    fontsize=20, fontweight='medium', color=primary_color)
left_panel.annotate('Size of the data science team at the \nworkplace of 5 figure income data scientists in USA',
                    xy=(-.75, 1.002), xycoords='axes fraction',
                    fontsize=15, fontweight='light', color=secondary_color_text)
right_panel.annotate('Size of the data science team at the \nworkplace of 6 figure income data scientists in USA',
                     xy=(-.075, 1.002), xycoords='axes fraction',
                     fontsize=15, fontweight='light', color=secondary_color_text)

# Source footnote under the left panel.
footnote = ('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'Approximately how many individuals are responsible for data science workloads at your place of business? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied')
left_panel.annotate(footnote,
                    xy=(-.75, -.09), xycoords='axes fraction',
                    fontsize=11, color=secondary_color_text)

plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart: employer size (Q21) of USA data scientists.

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#80807F'
secondary_color_text = '#80807F'

# Percentage of respondents per company size, ascending so the most common
# size is the top bar.
temp = df_usa_data_scientist['Q21'].value_counts(normalize=True, ascending=True).mul(100).round(1)

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(5, 5), facecolor='#FFFFFF')
ax.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)

# Write each percentage just to the right of its bar.
for bar, share in zip(ax.patches, temp.values):
    ax.text(bar.get_width() + 1, bar.get_y() + .26, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most common company size (last bar and its tick label) without
# depending on how many size buckets appear in the data.
if ax.patches:
    ax.patches[-1].set_color(primary_color)
    ax.get_yticklabels()[-1].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('Data Scientists favor working in large companies',
            xy=(-.75, 1.15), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Size of employer of data scientists',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded; confirm it
# still matches len(df_usa_data_scientist).
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'What is the size of the company where you are employed? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 5: What does a data scientist do on a daily basis?

In [None]:
# Horizontal bar chart: work activities (Q24, multiple choice) of USA data scientists.
#
# The aggregation technique for multiple-choice answers is taken from
# https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda/

# All Q24 answer columns; respondents who ticked nothing at all are dropped.
df_all_col = df_usa_data_scientist.filter(regex="Q24")
df_all_col = df_all_col.dropna(how='all')

# describe() yields, per column, the most frequent value ("top") and the number
# of non-null entries ("count").
# Assumes each Q24 column holds a single answer option (or NaN), so "top" is
# the option text and "count" the number of respondents who chose it - TODO confirm.
temp = df_all_col.describe()
temp = temp.loc[["top", "count"]].T
temp = temp.set_index("top")
temp = temp.sort_values("count", ascending=False)
temp['count'] = temp['count'].astype(np.double)
# Share of answering respondents who picked each option, in percent.
temp['perc'] = (temp['count'] / df_all_col.shape[0] * 100).round(1)

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#A6A6A5'
secondary_color_text = '#80807F'

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(6, 7), facecolor='#FFFFFF')
ax.barh(width=temp.perc, y=temp.index, color=bar_secondary_colors)
# Data is sorted descending, so flip the axis to show the largest bar on top.
ax.invert_yaxis()

# Write each percentage just to the right of its bar. Values are paired with
# bars by position; the index holds option texts, so integer keys such as
# temp.perc[i] would rely on pandas' deprecated positional fallback.
for bar, share in zip(ax.patches, temp['perc'].to_numpy()):
    ax.text(bar.get_width() + .19, bar.get_y() + .59, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most selected option (first bar and its tick label).
if ax.patches:
    ax.patches[0].set_color(primary_color)
    ax.get_yticklabels()[0].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('Data Scientists are analyzing and understanding data \nto influence product and business decisions',
            xy=(-.75, 1.1), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Data Scientists Job Duties',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded, while the
# percentages use df_all_col.shape[0] as denominator; confirm the number.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'Select any activities that make up an important part of your role at work '
            '\nRespondents can choose multiple answers'
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 6: Which programming language is used by data scientists?

In [None]:
# Horizontal bar chart: programming languages used regularly (Q7, multiple
# choice) by USA data scientists.
#
# The aggregation technique for multiple-choice answers is taken from
# https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda/

# All Q7 answer columns; respondents who ticked nothing at all are dropped.
df_all_col = df_usa_data_scientist.filter(regex="Q7")
df_all_col = df_all_col.dropna(how='all')

# describe() yields, per column, the most frequent value ("top") and the number
# of non-null entries ("count").
# Assumes each Q7 column holds a single answer option (or NaN), so "top" is
# the option text and "count" the number of respondents who chose it - TODO confirm.
temp = df_all_col.describe()
temp = temp.loc[["top", "count"]].T
temp = temp.set_index("top")
temp = temp.sort_values("count", ascending=False)
temp['count'] = temp['count'].astype(np.double)
# Share of answering respondents who picked each option, in percent.
temp['perc'] = (temp['count'] / df_all_col.shape[0] * 100).round(1)

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#A6A6A5'
secondary_color_text = '#80807F'

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(6, 7), facecolor='#FFFFFF')
ax.barh(width=temp.perc, y=temp.index, color=bar_secondary_colors)
# Data is sorted descending, so flip the axis to show the largest bar on top.
ax.invert_yaxis()

# Write each percentage just to the right of its bar. Values are paired with
# bars by position; the index holds option texts, so integer keys such as
# temp.perc[i] would rely on pandas' deprecated positional fallback.
for bar, share in zip(ax.patches, temp['perc'].to_numpy()):
    ax.text(bar.get_width() + .19, bar.get_y() + .65, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most selected option (first bar and its tick label).
if ax.patches:
    ax.patches[0].set_color(primary_color)
    ax.get_yticklabels()[0].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('Python is the most used programming language among data scientists',
            xy=(-.75, 1.1), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Programming Language uses on a regular basis by data scientists',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded, while the
# percentages use df_all_col.shape[0] as denominator; confirm the number.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'What programming languages do you use on a regular basis? '
            '\nRespondents can choose multiple answers'
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 7: Which IDE is used by data scientists?

In [None]:
# Horizontal bar chart: IDEs used regularly (Q9, multiple choice) by USA data
# scientists.
#
# The aggregation technique for multiple-choice answers is taken from
# https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda/

# All Q9 answer columns; respondents who ticked nothing at all are dropped.
df_all_col = df_usa_data_scientist.filter(regex="Q9")
df_all_col = df_all_col.dropna(how='all')

# describe() yields, per column, the most frequent value ("top") and the number
# of non-null entries ("count").
# Assumes each Q9 column holds a single answer option (or NaN), so "top" is
# the option text and "count" the number of respondents who chose it - TODO confirm.
temp = df_all_col.describe()
temp = temp.loc[["top", "count"]].T
temp = temp.set_index("top")
temp = temp.sort_values("count", ascending=False)
temp['count'] = temp['count'].astype(np.double)
# Share of answering respondents who picked each option, in percent.
temp['perc'] = (temp['count'] / df_all_col.shape[0] * 100).round(1)

# Colour palette: one accent colour for highlighted elements, grey for the rest.
primary_color = '#174A7E'
secondary_color = '#A6A6A5'
secondary_color_text = '#80807F'

bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(6, 7), facecolor='#FFFFFF')
ax.barh(width=temp.perc, y=temp.index, color=bar_secondary_colors)
# Data is sorted descending, so flip the axis to show the largest bar on top.
ax.invert_yaxis()

# Write each percentage just to the right of its bar. Values are paired with
# bars by position; the index holds option texts, so integer keys such as
# temp.perc[i] would rely on pandas' deprecated positional fallback.
for bar, share in zip(ax.patches, temp['perc'].to_numpy()):
    ax.text(bar.get_width() + .19, bar.get_y() + .55, f'{share}%',
            fontsize=15, color=secondary_color)

# Strip the chart chrome: no frame, no x ticks, no y tick marks.
ax.set_frame_on(False)
ax.set_xticks([])
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most selected option (first bar and its tick label).
if ax.patches:
    ax.patches[0].set_color(primary_color)
    ax.get_yticklabels()[0].set_color(primary_color)

# Title, subtitle and source footnote, positioned in axes-fraction coordinates.
ax.annotate('Jupyter notebook is the most used IDE among data scientists',
            xy=(-.75, 1.1), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate("IDE's uses on a regular basis by data scientists",
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent count in the footnote is hard-coded, while the
# percentages use df_all_col.shape[0] as denominator; confirm the number.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            "Which of the following integrated development environments (IDE's) do you use on a regular basis? "
            '\nRespondents can choose multiple answers'
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 8: What computer specs do I need for data science?

In [None]:
# FAQ 8 chart: share of each answer to Q11 (most-used computing platform)
# among the data scientists in df_usa_data_scientist, drawn as a horizontal
# bar chart with the most common answer highlighted.

# colours: one accent colour for the highlighted answer, grey for the rest
primary_color = '#174A7E'
secondary_color = '#80807F'
secondary_color_text = '#80807F'

# Percentage share of each Q11 answer. ascending=True puts the most common
# answer in the LAST row; barh draws rows bottom-up, so it ends up on top.
temp = df_usa_data_scientist['Q11'].value_counts(normalize=True, ascending=True).mul(100).round(1)

# starting plotting process
bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(5, 5), facecolor='#FFFFFF')
ax.barh(width=temp.values, y=temp.index, color=bar_secondary_colors)

# write the percentage just to the right of each bar
for bar, label_per in zip(ax.patches, temp.values):
    ax.text(bar.get_width() + 1, bar.get_y() + .26, f'{round(label_per, 2)}%',
            fontsize=15, color=secondary_color)

ax.set_frame_on(False)  # remove the box around the plot
ax.set_xticks([])       # remove x axis ticks
# remove the y axis tick marks (length=0) and style the category labels
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# Highlight the most common answer. It is always the last bar because of the
# ascending sort, so address it as [-1] instead of a hard-coded position that
# is only right for one particular number of answer categories.
if len(ax.patches) > 0:
    ax.patches[-1].set_color(primary_color)
    ax.get_yticklabels()[-1].set_color(primary_color)

# title, subtitle and source footnote, positioned relative to the axes
ax.annotate('No need for fancy computer specs for data science. \nA good laptop is enough',
            xy=(-.75, 1.15), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Most popular computing platforms among data scientists',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent total in the footnote is hard-coded text, not
# derived from the data used above -- confirm it still matches.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'What type of computing platform do you use most often for your data science projects? '
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 9: What data visualization libraries do data scientists use regularly?

In [None]:
# FAQ 9 chart: share of respondents who selected each Q14 option (data
# visualization libraries), as a horizontal bar chart with the two most
# selected options highlighted.
#
# The counting technique for multiple-choice answers is taken from
# https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda/

# one column per Q14 option; keep respondents who selected at least one option
df_all_col = df_usa_data_scientist.filter(regex="Q14")
df_all_col = df_all_col.dropna(how='all')
# drop options nobody selected: describe() would report a NaN label for them,
# which cannot be used as a bar category
df_all_col = df_all_col.dropna(how='all', axis=1)

# describe() gives, per option column, the most frequent value ("top", used
# here as the option label) and the number of non-null answers ("count")
temp = df_all_col.describe()
temp = temp.loc[["top", "count"]].T
temp = temp.set_index("top")
temp = temp.sort_values("count", ascending=False)
temp['count'] = temp['count'].astype(np.double)
# percentage of the respondents kept above who selected each option
temp['perc'] = (temp['count'] / df_all_col.shape[0] * 100).round(1)

# colours: one accent colour for the highlighted answers, grey for the rest
primary_color = '#174A7E'
secondary_color = '#A6A6A5'
secondary_color_text = '#80807F'

# starting plotting process
bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(6, 7), facecolor='#FFFFFF')
ax.barh(width=temp.perc, y=temp.index, color=bar_secondary_colors)
ax.invert_yaxis()  # most selected option on top

# Write the percentage just to the right of each bar. temp is indexed by the
# option labels, so the i-th value has to be read by position with .iloc;
# temp.perc[i] relies on a deprecated integer-key fallback.
for i, bar in enumerate(ax.patches):
    label_per = temp['perc'].iloc[i]
    ax.text(bar.get_width() + .3, bar.get_y() + .55, f'{round(label_per, 2)}%',
            fontsize=15, color=secondary_color)

ax.set_frame_on(False)  # remove the box around the plot
ax.set_xticks([])       # remove x axis ticks
# remove the y axis tick marks (length=0) and style the category labels
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# highlight the two most selected options (bars are in descending order)
for bar, tick_label in zip(ax.patches[:2], ax.get_yticklabels()[:2]):
    bar.set_color(primary_color)
    tick_label.set_color(primary_color)

# title, subtitle and source footnote, positioned relative to the axes
ax.annotate('Matplotlib and Seaborn are the most used data \nvisualization libraries among data scientists',
            xy=(-.75, 1.1), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Data Visualization libraries uses on a regular basis by data scientists',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent total in the footnote is hard-coded text, while
# the percentages above are computed over df_all_col.shape[0] -- confirm the
# two agree.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'What data visualization libraries or tools do you use on a regular basis? '
            '\nRespondents can choose multiple answers'
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 10: Which Machine Learning Algorithms do Data Scientists use on a regular basis?

In [None]:
# FAQ 10 chart: share of respondents who selected each Q17 option (machine
# learning algorithms), as a horizontal bar chart with the two most selected
# options highlighted.
#
# The counting technique for multiple-choice answers is taken from
# https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda/

# one column per Q17 option; keep respondents who selected at least one option
df_all_col = df_usa_data_scientist.filter(regex="Q17")
df_all_col = df_all_col.dropna(how='all')
# drop options nobody selected: describe() would report a NaN label for them,
# which cannot be used as a bar category
df_all_col = df_all_col.dropna(how='all', axis=1)

# describe() gives, per option column, the most frequent value ("top", used
# here as the option label) and the number of non-null answers ("count")
temp = df_all_col.describe()
temp = temp.loc[["top", "count"]].T
temp = temp.set_index("top")
temp = temp.sort_values("count", ascending=False)
temp['count'] = temp['count'].astype(np.double)
# percentage of the respondents kept above who selected each option
temp['perc'] = (temp['count'] / df_all_col.shape[0] * 100).round(1)

# colours: one accent colour for the highlighted answers, grey for the rest
primary_color = '#174A7E'
secondary_color = '#A6A6A5'
secondary_color_text = '#80807F'

# starting plotting process
bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(6, 7), facecolor='#FFFFFF')
ax.barh(width=temp.perc, y=temp.index, color=bar_secondary_colors)
ax.invert_yaxis()  # most selected option on top

# Write the percentage just to the right of each bar. temp is indexed by the
# option labels, so the i-th value has to be read by position with .iloc;
# temp.perc[i] relies on a deprecated integer-key fallback.
for i, bar in enumerate(ax.patches):
    label_per = temp['perc'].iloc[i]
    ax.text(bar.get_width() + .5, bar.get_y() + .55, f'{round(label_per, 2)}%',
            fontsize=15, color=secondary_color)

ax.set_frame_on(False)  # remove the box around the plot
ax.set_xticks([])       # remove x axis ticks
# remove the y axis tick marks (length=0) and style the category labels
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# highlight the two most selected options (bars are in descending order)
for bar, tick_label in zip(ax.patches[:2], ax.get_yticklabels()[:2]):
    bar.set_color(primary_color)
    tick_label.set_color(primary_color)

# title, subtitle and source footnote, positioned relative to the axes
ax.annotate('Simple Machine learning Algorithms like linear regression and \ndecision trees are used on a regular basis by data scientists.',
            xy=(-.75, 1.1), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Machine learning Algorithms uses on a regular basis by data scientists',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent total in the footnote is hard-coded text, while
# the percentages above are computed over df_all_col.shape[0] -- confirm the
# two agree.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'Which of the following ML algorithms do you use on a regular basis? '
            '\nRespondents can choose multiple answers'
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 11: Which Cloud computing platforms do Data Scientists use on a regular basis?

In [None]:
# FAQ 11 chart: share of respondents who selected each Q27_A option (cloud
# computing platforms), as a horizontal bar chart with the three most
# selected options highlighted.
#
# The counting technique for multiple-choice answers is taken from
# https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda/

# one column per Q27_A option; keep respondents who selected at least one
# option, and drop option columns nobody selected
df_all_col = df_usa_data_scientist.filter(regex="Q27_A")
df_all_col = df_all_col.dropna(how='all')
df_all_col = df_all_col.dropna(how='all', axis=1)

# describe() gives, per option column, the most frequent value ("top", used
# here as the option label) and the number of non-null answers ("count")
temp = df_all_col.describe()
temp = temp.loc[["top", "count"]].T
temp = temp.set_index("top")
temp = temp.sort_values("count", ascending=False)
temp['count'] = temp['count'].astype(np.double)
# percentage of the respondents kept above who selected each option
temp['perc'] = (temp['count'] / df_all_col.shape[0] * 100).round(1)

# colours: one accent colour for the highlighted answers, grey for the rest
primary_color = '#174A7E'
secondary_color = '#A6A6A5'
secondary_color_text = '#80807F'

# starting plotting process
bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(6, 7), facecolor='#FFFFFF')
ax.barh(width=temp.perc, y=temp.index, color=bar_secondary_colors)
ax.invert_yaxis()  # most selected option on top

# Write the percentage just to the right of each bar. temp is indexed by the
# option labels, so the i-th value has to be read by position with .iloc;
# temp.perc[i] relies on a deprecated integer-key fallback.
for i, bar in enumerate(ax.patches):
    label_per = temp['perc'].iloc[i]
    ax.text(bar.get_width() + .29, bar.get_y() + .55, f'{round(label_per, 2)}%',
            fontsize=15, color=secondary_color)

ax.set_frame_on(False)  # remove the box around the plot
ax.set_xticks([])       # remove x axis ticks
# remove the y axis tick marks (length=0) and style the category labels
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# highlight the three most selected options (bars are in descending order)
for bar, tick_label in zip(ax.patches[:3], ax.get_yticklabels()[:3]):
    bar.set_color(primary_color)
    tick_label.set_color(primary_color)

# title, subtitle and source footnote, positioned relative to the axes
ax.annotate('AWS, GCP and Azure are popular among data scientists',
            xy=(-.75, 1.1), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Cloud computing platforms use on a regular basis by data scientists',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent total in the footnote is hard-coded text, while
# the percentages above are computed over df_all_col.shape[0] -- confirm the
# two agree.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'Which of the following cloud computing platforms do you use on a regular basis? '
            '\nRespondents can choose multiple answers'
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

# FAQ 12: Which is the most popular data science learning platform among data scientists?

In [None]:
# FAQ 12 chart: share of respondents who selected each Q40 option (platforms
# for data science courses), as a horizontal bar chart with the most selected
# option highlighted.
#
# The counting technique for multiple-choice answers is taken from
# https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda/

# one column per Q40 option; keep respondents who selected at least one option
df_all_col = df_usa_data_scientist.filter(regex="Q40")
df_all_col = df_all_col.dropna(how='all')
# drop options nobody selected: describe() would report a NaN label for them,
# which cannot be used as a bar category
df_all_col = df_all_col.dropna(how='all', axis=1)

# describe() gives, per option column, the most frequent value ("top", used
# here as the option label) and the number of non-null answers ("count")
temp = df_all_col.describe()
temp = temp.loc[["top", "count"]].T
temp = temp.set_index("top")
temp = temp.sort_values("count", ascending=False)
temp['count'] = temp['count'].astype(np.double)
# percentage of the respondents kept above who selected each option
temp['perc'] = (temp['count'] / df_all_col.shape[0] * 100).round(1)

# colours: one accent colour for the highlighted answer, grey for the rest
primary_color = '#174A7E'
secondary_color = '#A6A6A5'
secondary_color_text = '#80807F'

# starting plotting process
bar_secondary_colors = [secondary_color] * temp.shape[0]
fig, ax = plt.subplots(figsize=(6, 7), facecolor='#FFFFFF')
ax.barh(width=temp.perc, y=temp.index, color=bar_secondary_colors)
ax.invert_yaxis()  # most selected option on top

# Write the percentage just to the right of each bar. temp is indexed by the
# option labels, so the i-th value has to be read by position with .iloc;
# temp.perc[i] relies on a deprecated integer-key fallback.
for i, bar in enumerate(ax.patches):
    label_per = temp['perc'].iloc[i]
    ax.text(bar.get_width() + .4, bar.get_y() + .55, f'{round(label_per, 2)}%',
            fontsize=15, color=secondary_color)

ax.set_frame_on(False)  # remove the box around the plot
ax.set_xticks([])       # remove x axis ticks
# remove the y axis tick marks (length=0) and style the category labels
ax.yaxis.set_tick_params(length=0,
                         labelsize=15,
                         colors=secondary_color_text)

# highlight the most selected option (bars are in descending order)
for bar, tick_label in zip(ax.patches[:1], ax.get_yticklabels()[:1]):
    bar.set_color(primary_color)
    tick_label.set_color(primary_color)

# title, subtitle and source footnote, positioned relative to the axes
ax.annotate('Coursera is the most popular learning platform among data scientists',
            xy=(-.75, 1.1), xycoords='axes fraction',
            fontsize=20, fontweight='medium', color=primary_color)
ax.annotate('Educational platforms used by data scientists in USA',
            xy=(-.75, 1.002), xycoords='axes fraction',
            fontsize=15, fontweight='light', color=secondary_color_text)
# NOTE(review): the respondent total in the footnote is hard-coded text, while
# the percentages above are computed over df_all_col.shape[0] -- confirm the
# two agree.
ax.annotate('Source: Kaggle Survey 2021. '
            '\nQuestion: '
            'On which platforms have you begun or completed data science courses? '
            '\nRespondents can choose multiple answers'
            '\nRespondents are filtered by job title=Data Scientist and location=USA'
            '\nTotal 430 respondents after filter applied',
            xy=(-.75, -.19), xycoords='axes fraction',
            fontsize=11, color=secondary_color_text)

plt.show()

Thanks for reading. 

# The End



**References**:
1. [2021 Kaggle Data Science & Machine Learning Survey by PAUL MOONEY](https://www.kaggle.com/paultimothymooney/2021-kaggle-data-science-machine-learning-survey)
2. [How to use 🐼pandas filter🎨 in survey EDA 📊 by CORAZZON](https://www.kaggle.com/corazzon/how-to-use-pandas-filter-in-survey-eda)