# 2021 Kaggle Machine Learning & Data Science Survey (EDA)

# Abstract

This year, Kaggle set out to conduct an industry-wide survey that presents a truly comprehensive view of the state of data science and machine learning and the best ways for new data scientists to break into the field. Survey data provides an overview of the sector on an aggregate scale. The main objective of this project is to identify the most popular programming languages in 2021 and compare the results with previous years, find the relationship between salaries and years of experience, and identify the five countries that are most aware of data science.


# Design
This project originates from the Kaggle survey competition. Kaggle, a subsidiary of Google LLC, is an online community of data scientists and machine learning practitioners. Kaggle allows users to find and publish data sets, explore and build models in a web-based data-science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.


# Data
The dataset is provided in .csv format. It contains 369 columns and 25974 rows. Each row is one response from the Kaggle community. The questionnaire itself has 38 base questions; there are many more columns than questions because every multiple-answer question is split into several columns, one per answer option.

# Algorithms

#### 1-	Checking the nulls

#### 2-	Drop respondents whose gender answer is "Nonbinary", "Prefer not to say" or "Prefer to self-describe"

#### 3-	Keeping the questions answered by at least 50% of the Kaggle community

#### 4-	Abbreviation of long country names

#### 5-	Adding Year column needed for exploring the data.

#### 6-	Comparison with other years

#### 7-	Convert categorical variables to numeric codes so we can calculate correlation
 

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = "svg"

In [None]:
# Load the 2021 survey responses CSV into the working DataFrame.
df = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Remove the first row of df in place.
# NOTE(review): presumably that row holds the question text rather than a response - confirm.
df.drop(df.index[0],inplace=True)

In [None]:
# List the distinct answers present in column Q2.
df["Q2"].unique()

In [None]:
# Remove, in place, every row whose Q2 answer is one of the listed values.
excluded_q2_answers = ['Nonbinary', 'Prefer not to say', 'Prefer to self-describe']
rows_to_drop = df.index[df["Q2"].isin(excluded_q2_answers)]
df.drop(index=rows_to_drop, inplace=True)

In [None]:
# Keep only the questions (columns) that at least half of the respondents in
# df answered. The share of missing values is computed from the current frame
# instead of a hard-coded respondent count, so the threshold stays correct
# after rows have been removed from df.
null_share = df.isnull().mean()
n = list(null_share[null_share > 0.5].index)  # columns that are more than 50% empty

# The first (non-response) row was already removed from df earlier, so no
# further row is dropped here; dropping df.index[0] again would discard a
# genuine response.
df50 = df.drop(columns=n)

# Remaining missing values per kept column.
df50.isna().sum()

# Gender

In [None]:
# Pie chart showing the share of each Q2 answer.
df_g = df['Q2'].value_counts()
labels = df_g.index
colors = ['dodgerblue', 'mediumturquoise']

fig, ax1 = plt.subplots(figsize=(5, 5))
ax1.pie(
    df_g,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
ax1.set_title("Gender", fontsize=26, weight='bold', color="black")
plt.show()

# Age Group

In [None]:
# Horizontal count plot of the Q1 answers, most frequent answer first.
sns.set(style='darkgrid')
plt.figure(figsize=(8, 8))
ax = sns.countplot(
    data=df,
    y='Q1',
    order=df['Q1'].value_counts().index,
    palette="viridis",
)
ax.set_ylabel('Age Group', fontsize=20, weight='bold', color='black')
plt.show()

# Job

In [None]:
# Horizontal count plot of the Q5 answers, most frequent answer first.
sns.set(style='darkgrid')
plt.figure(figsize=(8, 8))
ax = sns.countplot(
    data=df,
    y='Q5',
    order=df['Q5'].value_counts().index,
    palette='viridis',
)
plt.xticks(size=12)
ax.set_ylabel('Job', fontsize=20, weight='bold', color='black')
plt.show()

# Education

In [None]:
def degree_cleaning(x):
    """Return a shortened label for three long education answers.

    Any value that is not one of the three known long answers is returned
    unchanged.
    """
    replacements = (
        ('Some college/university study without earning a bachelor’s degree',
         'College without degree'),
        ('I prefer not to answer', 'No-response'),
        ('No formal education past high school', 'After high school'),
    )
    for long_label, short_label in replacements:
        if x == long_label:
            return short_label
    return x
# Apply the label shortening to every Q4 answer.
df['Q4'] = df['Q4'].map(degree_cleaning)

In [None]:
# Horizontal count plot of the Q4 answers, most frequent answer first.
sns.set(style='darkgrid')
plt.figure(figsize=(8, 8))
education_order = df['Q4'].value_counts().index
ax = sns.countplot(data=df, y='Q4', order=education_order, palette='viridis')
ax.set_ylabel('Education', fontsize=20, weight='bold', color='black')
plt.show()

# Experience

In [None]:
def code_cleaning(x):
    """Map the answer 'I have never written code' to '0years'; return anything else unchanged."""
    return '0years' if x == 'I have never written code' else x
# Apply the relabelling to every Q6 answer.
df['Q6'] = df['Q6'].map(code_cleaning)

In [None]:
# Horizontal count plot of the Q6 answers, most frequent answer first.
sns.set(style='darkgrid')
plt.figure(figsize=(8, 8))
experience_order = df['Q6'].value_counts().index
ax = sns.countplot(data=df, y='Q6', order=experience_order, palette='viridis')
plt.xticks(size=12)
ax.set_ylabel('Years of Experience', fontsize=20, weight='bold', color='black')
plt.show()

# Salary

In [None]:
# Horizontal count plot of the Q25 answers, most frequent answer first.
sns.set(style='darkgrid')
plt.figure(figsize=(8, 8))
ax = sns.countplot(
    data=df,
    y='Q25',
    order=df['Q25'].value_counts().index,
    palette='viridis',
)
plt.xticks(size=12)
ax.set_ylabel('Salary', fontsize=20, weight='bold', color='black')
plt.show()

# Country

In [None]:
def country_cleaning(x):
    """Abbreviate the two long country names to 'USA' / 'UK'; return anything else unchanged."""
    if x == 'United States of America':
        return 'USA'
    if x == 'United Kingdom of Great Britain and Northern Ireland':
        return 'UK'
    return x
# Apply the country abbreviations to every Q3 answer.
df['Q3'] = df['Q3'].map(country_cleaning)

In [None]:
# Horizontal count plot restricted to the ten most frequent Q3 answers.
sns.set(style='darkgrid')
plt.figure(figsize=(7, 7))
country_counts = df['Q3'].value_counts().sort_values(ascending=False)
top_countries = country_counts.head(10).index
ax = sns.countplot(data=df, y='Q3', order=top_countries, palette='viridis')
ax.set_ylabel('Country', fontsize=20, weight='bold', color='black')
plt.show()

## Education&Experience

In [None]:
# Grouped count plot: Q6 answers on the x axis, one bar colour per Q4 answer.
plt.figure(figsize=(10, 8))
ax = sns.countplot(
    data=df,
    x='Q6',
    hue='Q4',
    order=df['Q6'].value_counts().index,
    palette='viridis',
)
sns.despine()
ax.set_title('Education&Experience', fontsize=20, weight='bold', color='black')
ax.set_xlabel('Experience', fontsize=10, weight='bold', color='black')
ax.set_ylabel('Count', fontsize=10, weight='bold', color='black')
plt.show()

In [None]:
# Select every column whose name starts with 'Q7' and show the resulting frame.
Q7 = df.columns[df.columns.str.startswith('Q7')].tolist()
df7 = df[Q7]
df7

## Most used programming languages in 2021

In [None]:
# Rename the Q7 columns to language names, then plot the number of non-null
# answers per column in descending order.
df7.columns = [
    'Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript',
    'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other',
]
plt.figure(figsize=(8, 8))
language_counts = df7.count().sort_values(ascending=False)
language_counts.plot.bar(color=["dodgerblue"])
plt.title('Most used programming languages in 2021', fontsize=20, weight='bold', color='black')
plt.show()

## Most used IDEs in 2021

In [None]:
# Select the columns whose name starts with 'Q9' and keep the first twelve.
Q9 = [name for name in df.columns if name[:2] == 'Q9']
df9 = df[Q9]
df99 = df9.iloc[:, :12]

In [None]:
# Rename the twelve Q9 columns, then plot the number of non-null answers per
# column in descending order.
# NOTE(review): the labels are assumed to match the column order of the CSV - confirm.
df99.columns = [
    'Jupyter (Lab, etc)', ' RStudio', 'Visual Studio', 'VSCode',
    'PyCharm', 'Spyder', 'Notepad++', 'Sublime Text',
    'Vim / Emacs', 'MATLAB', 'Jupyter Notebook', 'Other',
]

plt.figure(figsize=(7, 7))
ide_counts = df99.count().sort_values(ascending=False)
ide_counts.plot.bar(color=["dodgerblue"])
plt.title("Most used (IDE's) for 2021", fontsize=20, weight='bold', color='black')
plt.show()

# Most used notebook products in 2021

In [None]:
# Select the columns whose name starts with 'Q10' and keep the first fifteen.
Q10 = [name for name in df.columns if name[:3] == 'Q10']
df10 = df[Q10]
df100 = df10.iloc[:, :15]
df100

# NOTE(review): the labels are assumed to match the column order of the CSV - confirm.
df100.columns = [
    'Kaggle Notebooks', 'Colab Notebooks', 'Azure Notebooks',
    'Paperspace / Gradient', 'Binder / JupyterHub', 'Code Ocean',
    'IBM Watson Studio', 'Amazon Notebooks', 'Amazon EMR Notebooks',
    'Google Cloud', 'Google Cloud Datalab', 'Databricks Notebooks',
    'Zeppelin / Zepl Notebooks', 'Deepnote Notebooks', 'Observable Notebooks',
]

# Horizontal bars sorted ascending, so the most used product ends up on top.
plt.figure(figsize=(8, 8))
notebook_counts = df100.count().sort_values(ascending=True)
notebook_counts.plot.barh(color=["dodgerblue"])
plt.title("Most used of notebook products in 2021", fontsize=20, weight='bold', color='black')
plt.show()

## Data for correlation

In [None]:
# Read the 2021 responses again into a separate frame, df1, independent of df.
df1 = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")

In [None]:
# Remove the first row of df1 in place.
# NOTE(review): presumably that row holds the question text rather than a response - confirm.
df1.drop(df1.index[0],inplace=True)

In [None]:
# Ordinal encoding of the Q25 salary brackets.
# The brackets are listed from lowest to highest so that the assigned code
# (1..26) grows with the salary; a correlation computed on codes assigned in
# arbitrary order would be meaningless.
_SALARY_BRACKETS = [
    "$0-999",
    "1,000-1,999",
    "2,000-2,999",
    "3,000-3,999",
    "4,000-4,999",
    "5,000-7,499",
    "7,500-9,999",
    "10,000-14,999",
    "15,000-19,999",
    "20,000-24,999",
    "25,000-29,999",
    "30,000-39,999",
    "40,000-49,999",
    "50,000-59,999",
    "60,000-69,999",
    "70,000-79,999",
    "80,000-89,999",
    "90,000-99,999",
    "100,000-124,999",
    "125,000-149,999",
    "150,000-199,999",
    "200,000-249,999",
    "250,000-299,999",
    "300,000-499,999",
    "$500,000-999,999",
    ">$1,000,000",
]

# Maps each bracket label to its 1-based rank.
dc = {bracket: rank for rank, bracket in enumerate(_SALARY_BRACKETS, start=1)}

In [None]:
# Ordinal encoding of the Q6 coding-experience answers.
# Codes grow with experience ("never written code" is the lowest), so that
# they can be used in a rank/linear correlation.
dy = {
    "I have never written code": 1,
    "< 1 years": 2,
    "1-3 years": 3,
    "3-5 years": 4,
    "5-10 years": 5,
    "10-20 years": 6,
    "20+ years": 7,
}

In [None]:
# Replace, in place, the Q25 answers using the dc mapping and the Q6 answers using the dy mapping.
df1.replace({"Q25": dc ,"Q6":dy},inplace=True)

In [None]:
# Drop, in place, the rows that have no Q25 value.
df1.dropna(subset=["Q25"],inplace=True)

# Correlation between experience and salary 

In [None]:
# Rank correlation between the encoded salary bracket (Q25) and the encoded
# coding experience (Q6). Both columns hold ordinal codes, so Spearman's rho is
# used instead of Pearson's r. Values that are missing or were not mapped to a
# number are removed first; otherwise the coefficient would be NaN or the
# computation would fail on non-numeric data.
# The result is only meaningful when the codes increase with salary/experience.
salary_experience = df1[["Q25", "Q6"]].apply(pd.to_numeric, errors="coerce").dropna()
salary_experience.corr(method="spearman")

# Compare 3 years

In [None]:
# Load the three survey response files that are compared below
# (named survey19, survey20 and survey21 after the survey year).
survey19 = pd.read_csv("../input/kaggle/multiple_choice_responses.csv")
survey20 = pd.read_csv('../input/kaggle/kaggle_survey_2020_responses.csv')
survey21 = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')

In [None]:
#2021

# Names of the survey21 columns that start with 'Q7'.
Q7_21 = [col for col in survey21 if col.startswith('Q7')]

# Take an explicit copy without the first row. Mutating the column selection
# directly (in-place drop, new "Year" column) acts on a slice of survey21 and
# triggers pandas' SettingWithCopyWarning.
survey21_Q7 = survey21[Q7_21].iloc[1:].copy()

survey21_Q7.columns = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']

# Tag every row with its survey year so the yearly frames can be combined.
survey21_Q7["Year"] = 2021



In [None]:
#2020

# Names of the survey20 columns that start with 'Q7'.
Q7_20 = [col for col in survey20 if col.startswith('Q7')]

# Take an explicit copy without the first row. Mutating the column selection
# directly (in-place drop, new "Year" column) acts on a slice of survey20 and
# triggers pandas' SettingWithCopyWarning.
survey20_Q7 = survey20[Q7_20].iloc[1:].copy()

survey20_Q7.columns = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']

# Tag every row with its survey year so the yearly frames can be combined.
survey20_Q7["Year"] = 2020



In [None]:
#2019

# Names of the survey19 columns that start with 'Q18' (the 2019 file uses a
# different question number for this selection than the later files).
Q7_19 = [col for col in survey19 if col.startswith('Q18')]

# Take an explicit copy without the first row. Mutating the column selection
# directly (in-place drop, new "Year" column) acts on a slice of survey19 and
# triggers pandas' SettingWithCopyWarning.
survey19_Q7 = survey19[Q7_19].iloc[1:].copy()

# NOTE(review): these labels assume the 2019 Q18 columns list the languages in
# the same order as the Q7 columns of the later surveys - verify against the
# 2019 question text before trusting the per-language labels.
survey19_Q7.columns = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']

# Tag every row with its survey year so the yearly frames can be combined.
survey19_Q7["Year"] = 2019



In [None]:
# Stack the three yearly frames into one long frame. ignore_index=True gives
# the result a fresh 0..n-1 index; without it the row labels of the three
# inputs repeat, and any label-based operation would hit several rows at once.
dfs = [survey21_Q7, survey20_Q7, survey19_Q7]
dfs_ = pd.concat(dfs, ignore_index=True)

# First eleven columns only.
dfs__ = dfs_.iloc[:, :11]

In [None]:
# Display the combined frame.
dfs_

In [None]:
# Count plot of the non-null "Python" answers, one bar per survey year.
plt.figure(figsize=(5, 5))
ax = sns.countplot(
    data=dfs_,
    x="Python",
    hue='Year',
    order=dfs_["Python"].value_counts().index,
    palette='viridis',
)
sns.despine()
# Title and axis labels are deliberately left empty.
ax.set_title('')
ax.set_xlabel('')
ax.set_ylabel('')
plt.show()

# Another way to compare 3 years

In [None]:
#2021
# Names of the survey21 columns that start with 'Q7'.
Q7_2021 = [col for col in survey21 if col.startswith('Q7')]
Q7_2021

df7_2021 = survey21[Q7_2021]
# Non-null answers per column. The first row is skipped because the rest of
# the notebook drops it before analysing this file; counting it would inflate
# every total by one.
# NOTE(review): presumably that row holds the question text - confirm.
Q2021 = list(df7_2021.iloc[1:].count())

In [None]:
#2020
# Names of the survey20 columns that start with 'Q7'.
Q7_2020 = [col for col in survey20 if col.startswith('Q7')]
Q7_2020

df7_2020 = survey20[Q7_2020]
# Non-null answers per column. The first row is skipped because the rest of
# the notebook drops it before analysing this file; counting it would inflate
# every total by one.
# NOTE(review): presumably that row holds the question text - confirm.
Q2020 = list(df7_2020.iloc[1:].count())

In [None]:
#2019
# Names of the survey19 columns that start with 'Q18'.
Q7_2019 = [col for col in survey19 if col.startswith('Q18')]
Q7_2019

df7_2019 = survey19[Q7_2019]
# Non-null answers per column. The first row is skipped because the rest of
# the notebook drops it before analysing this file; counting it would inflate
# every total by one.
# NOTE(review): presumably that row holds the question text - confirm.
Q2019 = list(df7_2019.iloc[1:].count())

In [None]:
# Build one row per language with the first eleven per-column counts of each
# year, ordered by the 2021 count (largest first).
# NOTE(review): the index labels assume all three yearly count lists follow
# this same language order - verify for the 2019 file.
_language_labels = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB']
_yearly_counts = {
    "2019": Q2019[:11],
    "2020": Q2020[:11],
    "2021": Q2021[:11],
}
plotdata = pd.DataFrame(_yearly_counts, index=_language_labels)
plotdata = plotdata.sort_values(by='2021', ascending=False)

# Grouped bar chart: one group per language, one bar per year.
plotdata.plot.bar(color=["dodgerblue", "mediumturquoise", "silver"], figsize=(10, 6))

plt.xlabel("programming languages", fontsize=10, weight='bold', color='black')
plt.title("Most used programming languages 2019-2021", fontsize=20, weight='bold', color='black')


In [None]:
# NOTE: Jupyter only renders a bare expression when it is the last statement
# of a cell, so this line does not display the table from this position.
plotdata

#2020
# NOTE(review): this repeats the 2020 preparation already done in an earlier
# cell and rebuilds survey20_Q7 from scratch - confirm whether it is still needed.

# Names of the survey20 columns that start with 'Q7'.
Q7_20 = [col for col in survey20 if col.startswith('Q7')]

# Take an explicit copy without the first row. Mutating the column selection
# directly (in-place drop, new "Year" column) acts on a slice of survey20 and
# triggers pandas' SettingWithCopyWarning.
survey20_Q7 = survey20[Q7_20].iloc[1:].copy()

survey20_Q7.columns = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']

# Tag every row with its survey year.
survey20_Q7["Year"] = 2020



In [None]:
# Job