<div style="background-color:#ADD8E6;">
    <h1 style="text-align:center; padding:50px;"> EDA: Kaggle Data Scientists in Europe </h1>
</div>

<h4> &ensp; The main purpose of this notebook is to analyse European Kaggle users and find patterns in their behaviour and preferences. 
    <br><br> &ensp; Since I come from Romania, my aim was to find out what it takes to be a Data Scientist in this country, considering the fact that there are few Data Science job opportunities available in the IT sector. </h4>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and newer
# releases no longer ship the old names, so prefer the new name and fall back to the
# legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw survey responses into `data` and preview the first rows
survey_csv_path = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
data = pd.read_csv(survey_csv_path)
data.head()

In [None]:
# Distinct answers found in the country column (Q3) of the raw data

data.loc[:, 'Q3'].unique()

In [None]:
# Restrict the survey to respondents whose country (Q3) is one of the
# European countries listed below; `.copy()` gives an independent frame.

countries = [
    'Russia',
    'Greece',
    'Belgium',
    'Poland',
    'Italy',
    'Spain',
    'United Kingdom of Great Britain and Northern Ireland',
    'France',
    'Switzerland',
    'Sweden',
    'Netherlands',
    'Ukraine',
    'Romania',
    'Austria',
    'Belarus',
    'Ireland',
    'Portugal',
    'Denmark',
    'Germany',
    'Norway',
    'Czech Republic',
]

is_european = data['Q3'].isin(countries)
df = data.loc[is_european].copy()
df.head()

In [None]:
# Sanity check: which countries survived the filter, and how many there are

european_countries = df['Q3']
print('List of European countries available in the dataset:\n\n', european_countries.unique())
print('\nNumber of unique countries:', european_countries.nunique())

In [None]:
# Report the size of the filtered frame: rows are samples, columns are features

n_samples, n_features = df.shape
print('Number of samples:', n_samples)
print('Number of features per sample:', n_features)

# Exploratory Data Analysis

In [None]:
# Defining a function to plot bar charts

def plot_bar(df, col_name, title, x_label, y_label):
    """Plot how often each value of one column occurs as a bar chart.

    Counts go on the x-axis and the distinct values on the y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to summarise.
    col_name : str
        Column whose value frequencies are plotted.
    title : str
        Title of the chart.
    x_label, y_label : str
        Axis labels.
    """
    # Count once and reuse the result for both axes
    counts = df[col_name].value_counts()

    # x and y are explicit vectors with one entry per category, so `data=` is
    # deliberately not passed: the counts do not share the row layout of `df`.
    sns.barplot(x=counts, y=counts.index, palette='rocket')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)

In [None]:
# Defining a function to plot pie charts

def plot_pie(col_name, title, data=None):
    """Plot the value distribution of one column as a donut chart.

    The legend lists every category together with its percentage share.

    Parameters
    ----------
    col_name : str
        Column whose value frequencies are plotted.
    title : str
        Title of the chart.
    data : pandas.DataFrame, optional
        Table to read `col_name` from. When omitted, the notebook-level `df`
        is used, so existing two-argument calls behave as before.
    """
    source = df if data is None else data

    # Count once; the same counts feed the wedges and the legend
    counts = source[col_name].value_counts()

    fig1, ax1 = plt.subplots()
    ax1.pie(counts, startangle=90)

    # create legend from labels and percentages
    percentages = counts.values / counts.values.sum() * 100
    labels = [f"{l} {s:0.1f}%" for l, s in zip(counts.index, percentages)]
    ax1.legend(labels=labels, title='Categories and Percentages',
               bbox_to_anchor=(1, 1), loc="upper right", fontsize=12,
               bbox_transform=fig1.transFigure)

    # draw center circle (turns the pie into a donut)
    centre_circle = plt.Circle((0, 0), 0.70, fc='white')
    ax1.add_artist(centre_circle)

    ax1.axis('equal')
    # Set the title before tight_layout so the layout pass can account for it
    ax1.set_title(title)
    fig1.tight_layout()

In [None]:
# Defining a function to plot bar chart
# by combining questions with multiple parts

def plot_bar_combined(df, col_name, title, x_label, y_label):
    """Plot one bar per part of a multi-part question.

    Every column whose name contains `col_name` is treated as one part. The bar
    length is the number of non-null cells in that column, and the bar label is
    the most frequent value found in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the question columns.
    col_name : str
        Substring identifying the question's columns (e.g. 'Q7').
    title : str
        Title of the chart.
    x_label, y_label : str
        Axis labels.
    """
    # filter the parts based on question number
    d = df.filter(like=col_name, axis=1)

    # Non-null answers per part
    counts = d.count(axis=0)
    # Column-wise mode: first row holds the most frequent value of each part
    # (NaN for a part nobody answered)
    labels = d.mode().iloc[0].values

    # x and y are explicit per-part vectors, so `data=` is deliberately not passed
    sns.barplot(x=counts, y=labels, palette='rocket')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)

In [None]:
# Age distribution (column Q1) across all European respondents

plot_bar(
    df,
    col_name='Q1',
    title='Age Distribution among European Kagglers',
    x_label='Count',
    y_label='Age',
)

In [None]:
# Number of respondents per country (column Q3)

plot_bar(
    df,
    col_name='Q3',
    title='Country-wise Distribution of European Kagglers',
    x_label='Count',
    y_label='Country',
)

In [None]:
# Education level distribution (column Q4)

plot_bar(
    df,
    col_name='Q4',
    title='Distribution of Education among European Kagglers',
    x_label='Count',
    y_label='Education',
)

In [None]:
# Count Romanian respondents whose education answer (Q4) is a Master's degree or higher

degrees = ["Master’s degree", "Doctoral degree", "Professional doctorate"]
lives_in_romania = df['Q3'].eq("Romania")
has_advanced_degree = df['Q4'].isin(degrees)
res = len(df.loc[lives_in_romania & has_advanced_degree])
print("Number of people in Romania that own a Master's degree or pursued a higher education:", res)

In [None]:
# Distribution of the years-of-experience answers (column Q6)

plot_bar(
    df,
    col_name='Q6',
    title='Distribution based on Years of Experience',
    x_label='Count',
    y_label='Experience',
)

In [None]:
# Gender distribution (column Q2) as a donut chart

plot_pie(col_name='Q2', title='Gender-wise distribution')

In [None]:
# Plotting distribution of utilised programming languages
# (Q7 is a multi-part question: one column per language)

plot_bar_combined(
    df,
    'Q7',
    'Distribution of Used Programming Languages among European Kagglers',  # fixed typo "Distibution"
    'Count',
    'Programming Language',
)

In [None]:
# Plotting distribution of recommended programming languages (column Q8)

plot_pie('Q8', 'Distribution of Recommended Programming Languages')  # fixed typo "Distibution"

In [None]:
# Percentage of respondents per country who program in Python

# Non-null Q7_Part_1 cells per country = respondents who use Python
python_users_per_country = df.groupby('Q3')['Q7_Part_1'].count()
# All respondents per country
respondents_per_country = df['Q3'].value_counts()

python_share = (python_users_per_country / respondents_per_country) * 100
python_share.round(2).sort_values(ascending=False)

In [None]:
# Respondents who selected R (Q7_Part_2 filled in) but not Python (Q7_Part_1 empty)

uses_r = df['Q7_Part_2'].notnull()
skips_python = df['Q7_Part_1'].isnull()
res = len(df[uses_r & skips_python])
print('Number of people who chose programming in R over Python:', res)

In [None]:
# Count respondents from Romania whose job title (Q5) is 'Data Scientist'

is_data_scientist = df['Q5'] == 'Data Scientist'
is_romanian = df['Q3'] == 'Romania'
print('Number of Data Scientists in Romania:',
      len(df[is_data_scientist & is_romanian]))

In [None]:
# Pie chart of gender (Q2) among Romanian Data Scientists, with raw counts in the legend

df2 = df[(df['Q5']=='Data Scientist') & (df['Q3']=='Romania')]
gender_counts = df2['Q2'].value_counts()

plt.pie(gender_counts)
labels = [f'{gender} {count}' for gender, count in zip(gender_counts.index, gender_counts.values)]
plt.legend(labels=labels, title='Gender and Count', bbox_to_anchor=(1,1))
plt.title('Gender distribution among Data Scientists in Romania')

In [None]:
# Categorical plot for Romanian respondents: age (Q1) on x, job title (Q5) on y, coloured by gender (Q2)

romanian_kagglers = df[df['Q3']=='Romania']
sns.catplot(data=romanian_kagglers, x='Q1', y='Q5', hue='Q2', height=5, aspect=4)
plt.xlabel('Age')
plt.ylabel('Job title')
plt.title('Distribution of Romanian Kagglers based on Job title, Age, and Gender')

In [None]:
# What is the distribution of experience with ML methods in Romania?

# Filter the rows first, then name the column: plot_bar expects (DataFrame, column name),
# not (Series, boolean mask).
plot_bar(df[df['Q3']=='Romania'], 'Q15', "Distribution of experience with ML methods in Romania", "Count", "Experience")

In [None]:
# ML algorithms (multi-part question Q17) used by Romanian respondents

plot_bar_combined(
    df[df['Q3']=='Romania'],
    col_name='Q17',
    title='Distribution of used ML algorithms in Romania',
    x_label='Count',
    y_label='ML Algorithm',
)

In [None]:
# Percentage of respondents per country who use CNNs

# Non-null Q17_Part_7 cells per country = respondents who use CNNs
cnn_users_per_country = df.groupby(['Q3'])['Q17_Part_7'].count()
# All respondents per country
respondents_per_country = df['Q3'].value_counts()

cnn_share = (cnn_users_per_country / respondents_per_country) * 100
cnn_share.round(2).sort_values(ascending=False)

In [None]:
# What is the distribution of utilised computing platforms in Romania?

# Filter the rows first, then name the column: plot_bar expects (DataFrame, column name),
# not (Series, boolean mask).
plot_bar(df[df['Q3']=='Romania'], 'Q11', "Distribution of utilised computing platforms in Romania", "Count", "Computing platform")

In [None]:
# What is the most used IDE?

# Steps: - filter the parts (columns) of question Q9
#        - count the non-null cells of every column
#        - sort the counts in descending order
#        - keep the index of the first row: the column name of the most used IDE
#        - take that column's most frequent value with mode(), which is the IDE's name

ide_usage = df.filter(like='Q9', axis=1).count()
col = ide_usage.sort_values(ascending=False).head(1).index
# Read the value directly instead of stripping brackets/quotes from an array repr
print('The most used IDE:', df[col[0]].mode().iloc[0])

In [None]:
# Hosted notebook products (multi-part question Q10) used by European respondents

plot_bar_combined(
    df,
    col_name='Q10',
    title='Distribution of utilised Hosted Notebook Products',
    x_label='Count',
    y_label='Hosted Notebook Platform',
)

In [None]:
# Respondents who selected Google Colab (Q10_Part_2 filled in) but not Kaggle Notebooks (Q10_Part_1 empty)

uses_colab = df['Q10_Part_2'].notnull()
skips_kaggle = df['Q10_Part_1'].isnull()
res = len(df[uses_colab & skips_kaggle])
print('Number of people who use Google Colab instead of Kaggle:', res)

In [None]:
# Platforms on which respondents publicly share their work (multi-part question Q39)

plot_bar_combined(
    df,
    col_name='Q39',
    title='Distribution of Platforms where European Kagglers share their work',
    x_label='Count',
    y_label='Hosting Platform',
)

In [None]:
# What is the most used Business Intelligence Tool?

# value_counts() sorts by frequency, so the first index entry is the most common answer
d = df['Q35'].value_counts().head(1).index.values
# Read the value directly instead of stripping brackets/quotes from an array repr
print('The most used BI Tool:', d[0])

In [None]:
# Respondents who selected Tableau (Q34_A_Part_5 filled in) but not Power BI (Q34_A_Part_2 empty)

uses_tableau = df['Q34_A_Part_5'].notnull()
skips_powerbi = df['Q34_A_Part_2'].isnull()
res = len(df[uses_tableau & skips_powerbi])
print('Number of people who use Tableau instead of PowerBI:', res)

In [None]:
# Respondents who selected both Matplotlib (Q14_Part_1) and Seaborn (Q14_Part_2)

uses_matplotlib = df['Q14_Part_1'].notnull()
uses_seaborn = df['Q14_Part_2'].notnull()
res = len(df[uses_matplotlib & uses_seaborn])
print('Number of people who use both Matplotlib and Seaborn for Data Visualisation:', res)

In [None]:
# What is the most used Big Data Product?

# NOTE(review): assumes Q32 is split into Q32_A_* (products currently used) and
# Q32_B_* (products respondents hope to use), as the Q34_A_* columns used earlier
# suggest; only the "currently used" parts belong in this count. Confirm against
# the column list.
big_data_usage = df.filter(like='Q32_A', axis=1).count()
col = big_data_usage.sort_values(ascending=False).head(1).index
# Read the value directly instead of stripping brackets/quotes from an array repr
print('The most used Big Data Product:', df[col[0]].mode().iloc[0])