In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# in the same order os.walk visits them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of contents:
1. **Importing libraries, loading data and setting pandas options**
2. **Plan**
3. **Questions that I want answers of from this dataset:**
    + 1: What age group does most of the data analysts belong to?
    + 2: What is the gender ratio of data analysts?
    + 3: Which country have the highest number of data analysts?
    + 4: What is the average coding experience of data analysts?
    + 5: What highest education group does data analysts belong to?
    + 6: What is the most favourite programming language of data analysts?
    + 7: What is the most favourite big data product of data analysts?
    + 8: What is the most favourite BI Tool of data analysts?
    + 9: What is the most favourite tool to analyse data of data analysts?
    

# 1. Importing libraries, loading data and setting pandas options

In [None]:
## for geocoding the data
!pip install country-converter

In [None]:
## for data manipulation
import pandas as pd
import json

## for data viz
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px 

## for choropleth map
import folium as fol
from folium import Map as fm

## for geocoding the data
import country_converter as coco
import pycountry as pc # for getting proper country names
from geopy.geocoders import Nominatim # for getting coordinates
# Shared geocoder client: the lat()/long() helpers defined later in this notebook
# call locator.geocode(...) to turn a place name into coordinates.
locator = Nominatim(user_agent = "project use") # Nominatim API is a tool to search through OpenStreetMap

In [None]:
## load the raw survey responses into a single DataFrame
survey_csv_path = "../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv"
data_raw = pd.read_csv(survey_csv_path, low_memory = False)

In [None]:
## raise the column display limit so that wide frames are rendered without truncation
pd.options.display.max_columns = 500
data_raw.head();  # first five rows; the trailing semicolon suppresses the cell output

In [None]:
## number of rows in the raw survey frame
data_raw.shape[0]

# 2. Plan
+ There's no need for data cleaning and manipulation as the dataset is already clean, so I will skip straight to the visualizations.
+ To understand the data, refer to kaggle_survey_2020_methodology.pdf & kaggle_survey_2020_answer_choices.pdf.
+ Since there is so much to infer from this data, I will stick to only those questions (columns) that interest me.
+ The questions that interest me are related to the data analytics field.
+ Overall, the questions will cover topics related to coding, data visualisation, big data and business intelligence.
+ For selecting the questions/columns, I will refer to the PDF that describes the questions asked in the survey.
### The columns which I think will be useful are 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 29-B, 30, 31-B, 32, 37, 38 and 39.

In [None]:
## plain Python list of every column name in the raw frame, kept for later use
col_list = list(data_raw.columns)

In [None]:
## Column names belonging to each multi-part question, kept for later use.
## Every such question follows the same naming scheme:
## <question>_Part_1 ... <question>_Part_N, followed by <question>_OTHER.
def _question_columns(question, n_parts):
    """Return the column names of a multi-part question: N `_Part_i` columns, then `_OTHER`."""
    part_columns = [f'{question}_Part_{part}' for part in range(1, n_parts + 1)]
    return part_columns + [f'{question}_OTHER']

q7_list = _question_columns('Q7', 12)
q9_list = _question_columns('Q9', 11)
q10_list = _question_columns('Q10', 13)
q14_list = _question_columns('Q14', 11)
q37_list = _question_columns('Q37', 11)
q39_list = _question_columns('Q39', 11)
q29_B_list = _question_columns('Q29_B', 17)
q31_B_list = _question_columns('Q31_B', 14)

# 3. Questions that I want answers of from this dataset:
As an aspiring data analyst, I would like to know the answers to the following questions. 

## Question1: What age group does most of the data analysts belong to?

In [None]:
## keep only the respondents whose Q5 answer is "Data Analyst", with the Q1 column used by the chart
filt = data_raw['Q5'].eq('Data Analyst')
fig_data = data_raw.loc[filt, ['Q1', 'Q5']]

In [None]:
## creating the chart
# bars are laid out from the most common to the least common Q1 value
order = list(fig_data['Q1'].value_counts().index)

catplot_options = dict(kind = 'count',
                       x = 'Q1',
                       order = order,
                       height = 7,
                       aspect = 1.5,
                       palette = sns.color_palette('rocket_r'))
fig = sns.catplot(data = fig_data, **catplot_options)

# axis labels and figure title
fig.set_xlabels('Age Group', size = 15)
fig.set_ylabels('Number of Data Analysts', size = 15)
fig.fig.suptitle("Data Analysts by age group", size = 25, fontweight = 'bold')

plt.show()

### Analysis:
This field has become popular in recent times due to increasing job opportunities with high salaries, and young aspirants are aware of it; this must be the reason why many of the data analysts are in their 20s and early 30s.

## Question2: What is the gender ratio of data analysts?

In [None]:
## keep only the respondents whose Q5 answer is "Data Analyst", with the Q2 column used by the chart
filt = data_raw['Q5'].eq('Data Analyst')
fig_data = data_raw.loc[filt, ['Q2', 'Q5']]

In [None]:
## creating the chart
# Derive the slices from the filtered data instead of hard-coding numbers copied
# from an earlier run, so the chart always reflects the data that was loaded.
gender_counts = fig_data['Q2'].value_counts()
count_val = gender_counts.tolist()        # size of each slice
labels = gender_counts.index.tolist()     # Q2 answer shown for each slice

# Answers without an entry in this map fall back to plotly's default colours.
gender_colors = {'Man': '#95d0fc',
                 'Woman':'#ffd1df',
                 'Prefer not to say':'#fcb001',
                 'Prefer to self-describe':'#fcb001',
                 'Nonbinary':'#fcb001'}

fig = px.pie(names = labels,
             values = count_val,
             color = labels,
             color_discrete_map = gender_colors,
             template = 'ggplot2')

fig.update_layout(title = {'text':'Data Analysts by Genders', 'font': {'size': 24, 'color': 'RoyalBlue'}},
                  width = 900)
fig.show()

### Analysis:
This chart is reflective of the broader category of STEM fields. Be it school, university or the workplace, the percentage of women is far lower than that of men in STEM-related institutions. What we can do is encourage all the women around us to break the stereotypes and pursue STEM fields if they are interested in them.

## Question3: Which country have the highest number of data analysts?

In [None]:
## filtering only data analysts
filt = (data_raw.loc[: ,'Q5'] == 'Data Analyst')
fig_data = data_raw.loc[filt, ['Q3','Q5']]

## replacing the following country names as they are unusually long
# The column is reassigned rather than replaced in place through `fig_data.Q3`:
# in-place changes on a column accessed that way are not guaranteed to reach
# fig_data (they warn on recent pandas and are a no-op under Copy-on-Write).
short_country_names = {'United States of America': 'USA',
                       'United Kingdom of Great Britain and Northern Ireland': 'UK'}
fig_data = fig_data.assign(Q3 = fig_data['Q3'].replace(short_country_names))

## removing 'other' row values for countries
filt2 = (fig_data.loc[:, 'Q3'] != 'Other')
fig_data = fig_data.loc[filt2, ['Q3', 'Q5']]

# data for folium map: the ten most frequent countries and their respondent counts
# The frame is built with explicit column names because later cells read the
# country from 'index' and the count from 'Q3'; the names produced by
# value_counts().to_frame().reset_index() differ between pandas versions.
country_counts = fig_data['Q3'].value_counts().head(10)
fig_data_for_map = pd.DataFrame({'index': country_counts.index,
                                 'Q3': country_counts.to_numpy()})
fig_data_for_map;

In [None]:
## Getting proper names of the countries and their coordinates for folium map
# Labels that must not go through the fuzzy search. The short label 'UK' used in
# this notebook was being resolved to 'UKR' (the notebook previously patched every
# 'UKR' back to 'GBR' afterwards, which would also relabel a genuine Ukraine row),
# so it is pinned to the intended code here instead.
_ALPHA_3_OVERRIDES = {'UK': 'GBR'}

# place -> geocoding result, so each place is sent to the geocoder only once
# even though both lat() and long() need it
_geocode_cache = {}


def convert_to_alpha_3(country): # to return alpha_3 code of the country
    """Return the alpha-3 code for ``country``.

    Labels listed in ``_ALPHA_3_OVERRIDES`` are mapped directly; anything else
    uses the first result of pycountry's fuzzy search on ``str(country)``.
    """
    label = str(country)
    if label in _ALPHA_3_OVERRIDES:
        return _ALPHA_3_OVERRIDES[label]
    return pc.countries.search_fuzzy(label)[0].alpha_3


def properName(country): # to return proper name of the country
    """Return the ``name`` of the first pycountry fuzzy-search result for ``country``."""
    return pc.countries.search_fuzzy(str(country))[0].name


def _geocode(place):
    """Return the geocoder's result for ``place``, querying it at most once per place.

    Raises:
        ValueError: if the geocoder returns no result for ``place``.
    """
    if place not in _geocode_cache:
        location = locator.geocode(place)
        if location is None:
            # fail with the offending place name rather than an AttributeError on None
            raise ValueError(f"geocoder returned no result for {place!r}")
        _geocode_cache[place] = location
    return _geocode_cache[place]


def lat(alpha_code): # to return latitude of the country
    """Return the latitude of the geocoded ``alpha_code`` (called below with country names)."""
    return _geocode(alpha_code).latitude


def long(alpha_code): # to return longitude of the country
    """Return the longitude of the geocoded ``alpha_code`` (called below with country names)."""
    return _geocode(alpha_code).longitude


fig_data_for_map['alpha_codes'] = fig_data_for_map['index'].apply(convert_to_alpha_3)
fig_data_for_map['Country_name'] = fig_data_for_map['alpha_codes'].apply(properName)
fig_data_for_map['latitude'] = fig_data_for_map['Country_name'].apply(lat)
fig_data_for_map['longitude'] = fig_data_for_map['Country_name'].apply(long)

fig_data_for_map;

In [None]:
## creating the chart
# the ten most frequent countries, most common first
top_countries = fig_data['Q3'].value_counts().head(10)
order = list(top_countries.index)

catplot_options = dict(kind = 'count',
                       x = 'Q3',
                       order = order,
                       height = 7,
                       aspect = 2,
                       palette = sns.color_palette('GnBu_r',10))
fig = sns.catplot(data = fig_data, **catplot_options)

# axis labels, slightly rotated country names, and figure title
fig.set_xlabels('Country', size = 15)
fig.ax.tick_params(axis = 'x', labelsize = 15, rotation = 20)
fig.set_ylabels('Number of Data Analysts', size = 15)
fig.fig.suptitle("Data Analysts by country", size = 25, fontweight = 'bold')

plt.show()

### Analysis:
A very large number of computer engineering students graduate each year in India; add to that the growing craze for data science all over the world, and India ends up with the highest number of Kagglers who are data analysts. 
What's strange is the low number of Chinese data analysts in this dataset. The reason could be Alibaba Cloud's Tianchi platform, which is a Chinese alternative to Kaggle with a community of over 400,000 members. 

In [None]:
## loading the geojson file for creating boundaries of the countries which will be needed for choropleth map
# Read the country boundaries used by the choropleth layer.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale (GeoJSON files are UTF-8 encoded JSON).
with open("../input/countries-geojson/countries.geojson", mode = 'r', encoding = 'utf-8') as f:
    countries_geojson = json.load(f)

In [None]:
## Creating map using folium

# Base map
world_map = fol.Map(location = (48.792102, -6.610949),
                    zoom_start = 2)

# Adding choropleth map on base map
# countries are shaded by the 'Q3' count column, joined to the geojson on the alpha-3 code
choropleth_layer = fol.Choropleth(geo_data = countries_geojson,
                                  data = fig_data_for_map,
                                  columns = ['alpha_codes', 'Q3'],
                                  key_on = 'feature.properties.ISO_A3',
                                  fill_color = 'OrRd',
                                  fill_opacity = 0.99,
                                  nan_fill_color='white',
                                  legend_name = "Number of Data Analysts")
choropleth_layer.add_to(world_map)

# adding tags on base map
# one marker per row of the table, with the country name and its count as tooltip
marker_rows = zip(fig_data_for_map['Country_name'],
                  fig_data_for_map['Q3'],
                  fig_data_for_map['latitude'],
                  fig_data_for_map['longitude'])
for country_name, analyst_count, latitude, longitude in marker_rows:
    marker = fol.Marker(location = (latitude, longitude),
                        tooltip = f'{country_name} (No. of DA:{analyst_count})')
    marker.add_to(world_map)

world_map

### Analysis:
A very large number of computer engineering students graduate each year in India; add to that the growing craze for data science all over the world, and India ends up with the highest number of Kagglers who are data analysts. 
What's strange is the low number of Chinese data analysts in this dataset. The reason could be Alibaba Cloud's Tianchi platform, which is a Chinese alternative to Kaggle with a community of over 400,000 members. 

## Question4: What is the average coding experience of data analysts?

In [None]:
## filtering for "data analysts" and selecting columns
# rows whose Q5 answer is "Data Analyst", restricted to the Q6 column used by the chart
filt = data_raw['Q5'].eq('Data Analyst')
fig_data = data_raw.loc[filt, ['Q6', 'Q5']]
list(fig_data['Q6'].value_counts().index);  # inspected while exploring; output suppressed

In [None]:
## creating the chart

# bars are laid out from the most common to the least common Q6 value
order = list(fig_data['Q6'].value_counts().index)

catplot_options = dict(kind = 'count',
                       x = 'Q6',
                       order = order,
                       height = 7,
                       aspect = 2,
                       palette = sns.color_palette('Purples_r', 10))
plot1 = sns.catplot(data = fig_data, **catplot_options)

# figure title
title_style = dict(fontsize = 25, weight = 'bold', x = .5, color = 'RoyalBlue')
plot1.fig.suptitle('Coding Experience of Data Analysts', **title_style)

# axis labels
plot1.ax.set_xlabel('Coding Experience in Years',fontsize = 15)
plot1.ax.set_ylabel('Number Of Data Analysts',fontsize = 15)

# tick label sizes on both axes
plot1.set_xticklabels(fontsize = 11)
plot1.set_yticklabels(fontsize = 11)

plt.show()

### Analysis:
As is clear from the chart, most of the data analysts who use Kaggle have between 1 and 5 years of coding experience. This could be an indicator of the rising popularity of Python-based data analysis libraries in recent times, although Python, R and the related libraries in question are by no means "new".

## Question5: What highest education group does data analysts belong to?

In [None]:
## filtering for "data analysts" and selecting columns
# rows whose Q5 answer is "Data Analyst", restricted to the Q4 column used by the chart
filt = data_raw['Q5'].eq('Data Analyst')
fig_data = data_raw.loc[filt, ['Q4', 'Q5']]
fig_data['Q4'].value_counts();  # inspected while exploring; output suppressed

In [None]:
## replacing some values because name is too long
# The column is reassigned rather than replaced in place through `fig_data.Q4`:
# in-place changes on a column accessed that way are not guaranteed to reach
# fig_data (they warn on recent pandas and are a no-op under Copy-on-Write).
short_education_labels = {'Some college/university study without earning a bachelor’s degree': "Some College",
                          "No formal education past high school":"High School"}
fig_data = fig_data.assign(Q4 = fig_data['Q4'].replace(short_education_labels))

In [None]:
## creating the chart
# bars are laid out from the most common to the least common Q4 value
order = list(fig_data['Q4'].value_counts().index)

catplot_options = dict(kind = 'count',
                       x = 'Q4',
                       order = order,
                       height = 7,
                       aspect = 2,
                       palette = sns.color_palette('YlGnBu_r', 10))
fig = sns.catplot(data = fig_data, **catplot_options)

# axis labels and figure title
fig.set_xlabels('Highest Education', size = 16)
fig.set_ylabels('Number of Data Analysts', size = 16)
fig.fig.suptitle("Data Analysts by Highest Education", size = 25, fontweight = 'bold')

# tick label sizes on both axes
fig.set_xticklabels(fontsize = 12)
fig.set_yticklabels(fontsize = 12)

plt.show()

### Analysis:
Data analytics is a maths- and statistics-intensive field which can also require a deep understanding of programming languages, so it's clear that many get a master's or bachelor's degree before they break into the job market. Another factor could be the growing competition among candidates and the growing complexity of this field, which explains why most of them have a master's degree. The reason for the low number of "PhD holder data analysts" might be that those who pursue PhDs are more interested in academia/research than in corporate jobs.

## Question6: What is the most favourite programming language of data analysts?

In [None]:
## filtering and selecting columns
# Each condition is named separately and then combined:
is_data_analyst = data_raw['Q5'].eq('Data Analyst')
# only these three Q6 groups are shown in the chart
in_selected_experience_groups = data_raw['Q6'].isin(['< 1 years','5-10 years', '10-20 years'])
# drop the 'Other' and 'None' answers to Q8
is_specific_answer = ~data_raw['Q8'].isin(['Other', 'None'])

filt = is_data_analyst & in_selected_experience_groups & is_specific_answer

fig_data = data_raw.loc[filt, ['Q8', 'Q6', 'Q5']]

In [None]:
## Creating the figure
# bars are laid out from the most common to the least common Q8 value
order = list(fig_data['Q8'].value_counts().index)

# one bar per Q6 group within each Q8 value; the legend is kept inside the axes
catplot_options = dict(kind = 'count',
                       x = 'Q8',
                       hue = 'Q6',
                       order = order,
                       height = 7,
                       aspect = 1.5,
                       palette = sns.color_palette('Set1'),
                       legend_out = False)
fig = sns.catplot(data = fig_data, **catplot_options)

# axis labels and figure title
fig.set_xlabels('Programming Language', size = 15)
fig.set_ylabels('Number of Data Analysts', size = 15)
fig.fig.suptitle("Language Recommended By Data Analysts", size = 25, fontweight = 'bold')

fig.add_legend(title='Experience in Coding', loc = 'center right', fontsize = 13)

plt.show()

### Analysis:
Python is the clear winner among all the experience groups. The reasons must be the huge community support for almost all problems, the fact that it is a full-fledged general-purpose programming language, its ease of use and ease of learning, and Python's huge collection of data-science-related libraries. SQL comes second because it is still the industry standard when it comes to querying large amounts of data in relational databases. R comes third; it was once the most popular language among statisticians and data analysts but is now losing its popularity.

## Question7: What is the most favourite big data product of data analysts?

In [None]:
## filtering for "data analysts" and selecting columns
# rows whose Q5 answer is "Data Analyst" and whose Q30 answer is not 'Other'
is_data_analyst = data_raw['Q5'].eq('Data Analyst')
is_not_other = data_raw['Q30'].ne('Other')
filt = is_data_analyst & is_not_other
fig_data = data_raw.loc[filt, ['Q30', 'Q5']]

fig_data['Q30'].value_counts();  # inspected while exploring; output suppressed

In [None]:
## Creating the figure
# bars are laid out from the most common to the least common Q30 value
order = list(fig_data['Q30'].value_counts().index)

# horizontal bars (categories on the y axis) drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 10))
countplot_options = dict(y = 'Q30',
                         order = order,
                         palette = sns.color_palette('Blues_r', 25))
sns.countplot(data = fig_data, ax = ax, **countplot_options)

# title is shifted left and lifted above the axes
ax.set_title('Big Data Products used most often by Data Analysts',
             fontsize = '25',
             x = 0.4,
             y = 1.05)

ax.set_ylabel('Big Data Product', fontsize = 18)
ax.set_xlabel('Number of Data Analysts', fontsize = 18)

plt.show()

### Analysis:
With MySQL at the top, followed by SQL Server, PostgreSQL and Oracle Database, these are the most popular relational database management systems used by data analysts. Then there's MongoDB, which is the most popular NoSQL database program among the others of the same category.

## Question8: What is the most favourite BI Tool of data analysts?

In [None]:
## filtering and selecting columns
# rows whose Q5 answer is "Data Analyst" and whose Q32 answer is not 'Other'
is_data_analyst = data_raw['Q5'].eq('Data Analyst')
is_not_other = data_raw['Q32'].ne('Other')
filt = is_data_analyst & is_not_other
fig_data = data_raw.loc[filt, ['Q32', 'Q5']]

In [None]:
## Creating the chart
# bars are laid out from the most common to the least common Q32 value
order = list(fig_data['Q32'].value_counts().index)

# horizontal bars (categories on the y axis) drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 10))
countplot_options = dict(y = 'Q32',
                         order = order,
                         palette = sns.color_palette('PuRd_r', 16))
sns.countplot(data = fig_data, ax = ax, **countplot_options)

# title is shifted left and lifted above the axes
ax.set_title('BI Products used most often by Data Analysts',
             fontsize = '25',
             x = 0.4,
             y = 1.05)

ax.set_ylabel('Business Intelligence Product', fontsize = 18)
ax.set_xlabel('Number of Data Analysts', fontsize = 18)

plt.show()

### Analysis:
Tableau and Power BI are clearly the most popular BI tools used by data analysts, followed by Google Data Studio. All three tools are used to create interactive visualizations and dashboards from different types of data sources.

## Question9: What is the most favourite tool to analyse data of data analysts?

In [None]:
## filtering and selecting columns
# rows whose Q5 answer is "Data Analyst" and whose Q38 answer is not 'Other'
is_data_analyst = data_raw['Q5'].eq('Data Analyst')
is_not_other = data_raw['Q38'].ne('Other')
filt = is_data_analyst & is_not_other
fig_data = data_raw.loc[filt, ['Q38', 'Q5']]

In [None]:
## creating the chart
# bars are laid out from the most common to the least common Q38 value
order = list(fig_data['Q38'].value_counts().index)

# horizontal bars (categories on the y axis) drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 10))
countplot_options = dict(y = 'Q38',
                         order = order,
                         palette = sns.color_palette('YlGn_r', 7))
sns.countplot(data = fig_data, ax = ax, **countplot_options)

# title is shifted left and lifted above the axes
ax.set_title('Data Analyzing tools used most often by Data Analysts',
             fontsize = '25',
             x = 0.25,
             y = 1.05)

ax.set_ylabel('Data Analyzing Tools', fontsize = 18)
ax.set_xlabel('Number of Data Analysts', fontsize = 18)

plt.show()

### Analysis:
With the flexibility of programming languages such as Python and R and their capability to handle large data sources, local development environments are the most popular data analysis tools among data analysts. Second come spreadsheet applications such as Excel and Google Sheets, which make it easy to analyse data without writing code but are limited to a particular amount of data.

# ---