# About:
1. Welcome to Kaggle's annual Machine Learning and Data Science Survey competition!
2. The survey was live from 09/01/2021 to 10/04/2021, and after cleaning the data we finished with 25,973 responses!
3. The challenge objective: tell a data story about a subset of the data science community represented in this survey


> **We will go together step by step to explore the insights behind each question**

In [None]:
# the important modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt  # data Viz
import seaborn as sns

import os

> # Read the data

In [None]:
df = pd.read_csv("/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")

In [None]:
df.info()

> **the number of questions**

In [None]:
questions = df.iloc[0,:]

> # Wrangle the data

In [None]:
# the count of null values
df.isnull().sum().sum()

In [None]:
nulls = df.isnull().sum()
null_columns = nulls[nulls > 0]
not_null_columns = nulls[nulls == 0]

> **The (Q1:Q6) have no null values** \
> **The (Q7 : Last Question) have null values**

**Explore the percentage of the null values**

In [None]:
# fraction of null values per column: nulls in each column / number of responses.
# df still holds the question-text row at index 0 at this point (it is dropped
# later), so that row is excluded from the response count.
null_percentages = null_columns / (len(df) - 1)

> **Show the distributions of null values percentages**

In [None]:
plt.figure(figsize=(10, 8))
null_percentages.hist(bins=20)
plt.title("the freq of null percentages for each null-column", fontsize=15)
plt.show()

> **Dropping the null values would make us lose a lot of insights, so we'll fill them with: *Not_Specified***

In [None]:
df.fillna("Not_Specified", inplace=True)

> **Check the duplicated values**

In [None]:
df.duplicated().sum()

In [None]:
# Remove the question row
df.drop(0, inplace=True)

**We are ready now to explore the data**

> # EDA: Let's explore it

> **The Main Functions**

In [None]:
# bar plot function
def bar_plot(column, title='', xlabel='',ylabel='',  bar_kind='bar', figure_size=(15,8), index=0, df=df):
    """Plot the value counts of ``df[column]`` as a single-colour bar chart.

    Parameters:
        column: name of the column whose values are counted.
        title, xlabel, ylabel: texts for the figure title and the axis labels.
        bar_kind: pandas plot kind passed through to ``Series.plot``
            (for example ``'bar'`` or ``'barh'``).
        figure_size: ``(width, height)`` handed to ``plt.figure``.
        index: when truthy, only the first ``index`` entries of
            ``value_counts()`` are drawn; ``0`` draws all of them.
        df: frame to read from; defaults to the module-level ``df``.

    The bar colour is picked at random from a fixed five-colour palette, and
    the figure is shown immediately.
    """
    palette = ['darkblue', 'darkred', 'darkorange', 'red', 'gold']
    chosen_color = palette[np.random.randint(0, 5)]

    plt.figure(figsize=figure_size)

    # Count once, then optionally truncate to the leading `index` entries.
    counts = df[column].value_counts()
    if index:
        counts = counts[:index]
    counts.plot(kind=bar_kind, color=chosen_color)

    for set_text, text in ((plt.title, title), (plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_text(text, fontsize=15)
    plt.show()

In [None]:
# pie plot function
def pie_plot(column, title='', xlabel='',ylabel='', figure_size=(15,10), index=0, df=df):
    """Plot the value counts of ``df[column]`` as a pie chart with percentage labels.

    Parameters:
        column: name of the column whose values are counted.
        title, xlabel, ylabel: texts for the figure title and the axis labels.
        figure_size: ``(width, height)`` passed as ``figsize`` to the plot call.
        index: when truthy, only the first ``index`` entries of
            ``value_counts()`` are drawn; ``0`` draws all of them.
        df: frame to read from; defaults to the module-level ``df``.

    The figure is shown immediately.
    """
    wedge_colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', "red", "lightblue", "yellow"]
    pie_options = {
        'kind': 'pie',
        'autopct': '%1.1f%%',
        'figsize': figure_size,
        'shadow': True,
        'colors': wedge_colors,
    }

    # Count once, then optionally truncate to the leading `index` entries.
    counts = df[column].value_counts()
    if index:
        counts = counts[:index]
    counts.plot(**pie_options)

    for set_text, text in ((plt.title, title), (plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_text(text, fontsize=15)
    plt.show()

> **Starting by the duration column**

In [None]:
# Rename the Duration column
df.rename(columns = {"Time from Start to Finish (seconds)": "Duration"}, inplace=True)

In [None]:
# convert from seconds to minutes (the column was recorded in seconds)
df.Duration = pd.to_numeric(df.Duration)
df.Duration = df.Duration / 60.0

> **The summary statistics about the Duration column**

In [None]:
df.Duration.describe()

> **Let's move on to the age: Q1**

In [None]:
# Rename the age column
df.rename(columns = {"Q1": "Age"}, inplace=True)

In [None]:
bar_plot("Age", "The Distributions Of The Age", "The Age", "The Count of the Age", "barh")

**Most respondents fall in the age groups 25-29, 18-21, 22-24, and 30-34**

> **Dive into the gender**

In [None]:
# Rename the Gender column
df.rename(columns = {"Q2": "Gender"}, inplace=True)

In [None]:
bar_plot("Gender", "The Count Of Each Gendar", bar_kind='barh')

In [None]:
age_df = df[np.logical_or(df.Gender == 'Man', df.Gender == 'Woman')]

In [None]:
plt.figure(figsize=(15, 8))
sns.histplot(x = age_df.Age,zorder=4,linewidth=0.5, hue=age_df.Gender, multiple='dodge',)
plt.title("The Age freq according to the gender", fontsize=15)
plt.show()

> **What about the Country**

In [None]:
# Rename the Country column
df.rename(columns = {"Q3": "Country"}, inplace=True)

In [None]:
bar_plot("Country","The Frequancy of Countries", figure_size=(20,10))

In [None]:
countries = df.Country.value_counts()
top_countries = countries[:10]

> **The top 5 countries are: India, US, Others, Japan, and China**

In [None]:
plt.figure(figsize=(15, 8))
top_countries.plot(kind='bar', color = 'darkorange')
plt.title("the distributions of top 10 countries", fontsize= 15)
plt.show()

> **Education is a really good indicator, isn't it?**

In [None]:
# Rename the Education column
df.rename(columns = {"Q4": "Education"}, inplace=True)

In [None]:
pie_plot("Education", "The Percentage of each Educational level", figure_size=(15, 10))

In [None]:
bar_plot("Education", "The Frequancy of each Educational level")

> Edu according to Gender

In [None]:
# Keep only respondents who answered 'Man' or 'Woman'. An explicit copy is taken
# because age_df has its Education column reassigned afterwards; without it that
# assignment targets a filtered selection of df (SettingWithCopyWarning).
age_df = df[df.Gender.isin(['Man', 'Woman'])].copy()

In [None]:
# Shorten the long Education labels (presumably to keep the plot's tick labels compact).
# One mapping replaces the four separate replace() calls, and assign() builds a new
# frame instead of writing column-by-column into age_df.
short_education_labels = {
    'Professional doctorate': 'doctorate',
    'I prefer not to answer': 'not to answer',
    'No formal education past high school': 'No Formal',
    'Some college/university study without earning a bachelor’s degree': 'Some college',
}
age_df = age_df.assign(Education=age_df['Education'].replace(short_education_labels))

In [None]:
plt.figure(figsize=(12, 8))
sns.histplot(x = age_df.Education,zorder=4,linewidth=0.5, hue=age_df.Gender, multiple='dodge')
plt.title("The EDU freq according to the gender", fontsize=15)
plt.show()

> **Current role: JOB JOB JOB ...**

In [None]:
# Rename the Role column
df.rename(columns = {"Q5": "Role"}, inplace=True)

In [None]:
bar_plot("Role", "The Count Of the Roles","The Role", bar_kind="barh")

In [None]:
bar_plot("Role", "The Frequancy OF Top 5 Roles", "The Role", "The Count", index=5)

In [None]:
# index=10 draws the ten most frequent roles, so the title says "Top 10"
bar_plot("Role", "The Frequancy OF Top 10 Roles", "The Role", "The Count", index=10)

> **The Years of writing code: code_experience**

In [None]:
# Rename the code_experience column
df.rename(columns = {"Q6": "code_experience"}, inplace=True)

In [None]:
pie_plot("code_experience", "The Percentage of each # of years coding experience")

In [None]:
plt.figure(figsize=(15, 8))
# age_df was selected from df before Q6 was renamed to code_experience on df,
# so inside age_df the column is still called Q6.
sns.histplot(x = age_df.Q6, zorder=4,linewidth=0.5, hue=age_df.Gender, multiple='dodge', palette='winter')
plt.title("The Code Experience Freq according to the Gender", fontsize=15)
plt.show()

> **The 13 Selected programming langs**

In [None]:
def get_df(index1, index2, df=df):
    """Summarise a multi-column question as ``{answer: count}``.

    For every column in the positional range ``index1:index2`` of ``df`` the
    "Not_Specified" filler is ignored, and the most frequent remaining value
    and its count are recorded. Columns with nothing left after that filter
    are skipped.

    Parameters:
        index1: positional index of the first column to include.
        index2: positional index one past the last column to include.
        df: frame to read from; defaults to the module-level ``df``.

    Returns:
        A DataFrame with one column per recorded answer. The answers are also
        used as the row index, so every row holds the same counts; callers
        read a single row with ``.iloc[0, :]``.
    """
    temp_dic = {}
    for column in df.columns[index1:index2]:
        answered = df[column][df[column] != "Not_Specified"].value_counts()
        # An all-"Not_Specified" column has no top answer to record. This is the
        # case the former bare `except:` was skipping; other errors now surface.
        if answered.empty:
            continue
        temp_dic[answered.index[0]] = answered.values[0]
    temp_df = pd.DataFrame(temp_dic, index=temp_dic.keys())
    return temp_df

In [None]:
langs_df = get_df(7, 20)

In [None]:
langs_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 8), color='darkorange')
plt.title("The Freq of each Programming language", fontsize=15)
plt.xlabel("The language", fontsize=15)
plt.ylabel("The count", fontsize=15)
plt.show()

In [None]:
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', "red"]
langs_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentage of Top 5 regular basis programming languages", fontsize=15)
plt.show()

> **Which programming language would you recommend?**

In [None]:
# Rename the recommend  programming language column
df.rename(columns = {"Q8": "recommend_lang"}, inplace=True)
recommend_lang = df.recommend_lang.value_counts()
recommend_lang.drop("Not_Specified", inplace=True)

In [None]:
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', "red"]
recommend_lang.sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentage of Top 5 recommended programming languages", fontsize=15)
plt.show()

> **regular basis integrated development environments (IDES)**

In [None]:
ide_df = get_df(21,30)

In [None]:
ide_df.iloc[0,:].sort_values(ascending=False).plot(kind='bar', figsize=(15, 8), color='darkorange')
plt.title("The Freq of regular IDEs", fontsize=15)
plt.xlabel("The IDE", fontsize=15)
plt.ylabel("The count", fontsize=15)
plt.show()

In [None]:
# Plot the IDE counts (ide_df); this chart previously re-plotted the programming-language counts
ide_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 regular IDEs", fontsize=15)
plt.show()

> **Regular Basis hosted notebooks**

In [None]:
note_df = get_df(34, 51)

In [None]:
note_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 10), color='darkorange')
plt.title("The Freq of regular Hosted notebooks", fontsize=15)
plt.xlabel("The IDE", fontsize=15)
plt.ylabel("The count", fontsize=15)
plt.show()

In [None]:
# Plot the hosted-notebook counts (note_df); this chart previously re-plotted the programming-language counts
note_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 regular Hosted Notebooks", fontsize=15)
plt.ylabel("")
plt.show()

> **Q11: computing platform**

In [None]:
pie_plot("Q11", "The Percentages of Top 3 computing platform", index=3, figure_size=(15, 8))

> **Q12: regular specialized hardware**

In [None]:
hard_df = get_df(52, 58)

In [None]:
hard_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 regular specialized hardware", fontsize=15)
plt.ylabel("")
plt.show()

> __Q13: TPU__

In [None]:
bar_plot("Q13", "the # of times which TPU used", bar_kind='barh')

> **data visualization libraries: Q14**

In [None]:
viz_df = get_df(59, 71)

In [None]:
viz_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 Data Viz libraries", fontsize=15)
plt.ylabel("")
plt.show()

> __ML Frameworks: Q15__

In [None]:
frame_df = get_df(72, 90)

In [None]:
frame_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 10), color='darkblue')
plt.title("The # of ML Frameworks", fontsize=15)
plt.show()

In [None]:
# Plot the ML-framework counts (frame_df); this chart previously re-plotted the programming-language counts
frame_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 ML Frameworks", fontsize=15)
plt.ylabel("")
plt.show()

> __ML Algos: Q17__

In [None]:
algos_df = get_df(90, 102)

In [None]:
algos_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 10), color='green')
plt.title("The # of ML Algorithms", fontsize=15)
plt.show()

In [None]:
# Plot the ML-algorithm counts (algos_df); this chart previously re-plotted the programming-language counts
algos_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 ML Algos", fontsize=15)
plt.ylabel("")
plt.show()

> **CV methods: Q18**

In [None]:
cv_df = get_df(102, 109)

In [None]:
cv_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 10), color='darkorange')
plt.title("The # of CV methods", fontsize=15)
plt.show()

> __Word embeddings: Q19__

In [None]:
embd_df = get_df(109, 115)

In [None]:
embd_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 10), color='darkorange')
plt.title("The # of Word embeddings", fontsize=15)
plt.show()

> __Manufacturing: Q20__

In [None]:
q20 = df.Q20.value_counts().drop("Not_Specified")

In [None]:
q20.sort_values(ascending=True).plot(kind='barh', figsize=(15, 10), color='darkorange')
plt.title("The # of Manufacturing", fontsize=15)
plt.show()

In [None]:
q20.sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 Manufacturing", fontsize=15)
plt.ylabel("")
plt.show()

> **UP Vote <3**

# Egypt Data: EDA

In [None]:
eg_df = df[df.Country == 'Egypt']
print(f"the number of Egyption responses: {len(eg_df)}")

In [None]:
print("the Rank of Egyption users in kaggl is 11")

In [None]:
bar_plot("Age", "The Distributions Of The Egyption Ages", "The Age", "The Count of The Age", df=eg_df)

In [None]:
bar_plot("Gender", "The Freq of The Egyption Genders", df=eg_df, figure_size=(12, 8))

In [None]:
eg_age_df = age_df[age_df.Country == "Egypt"]

In [None]:
plt.figure(figsize=(12, 8))
sns.histplot(x = eg_age_df.Age,zorder=4,linewidth=0.5, hue=eg_age_df.Gender, multiple='dodge',)
plt.title("The Egyption Ages Freq according to the Gender", fontsize=15)
plt.show()

In [None]:
pie_plot("Education","The Percentage of each Eg educational level", df=eg_df)

In [None]:
plt.figure(figsize=(12, 8))
sns.histplot(x = eg_age_df.Education,zorder=4,linewidth=0.5, hue=eg_age_df.Gender, multiple='dodge', palette='spring')
plt.title("The Eg education Freq according to the gender", fontsize=15)
plt.show()

In [None]:
bar_plot("Role", "The Count of Eg's Current Roles", "The Role", "The Count", bar_kind='barh', df=eg_df, figure_size=(15, 10))

In [None]:
bar_plot("Role", "The Count of Top 5 Eg's Current Roles", "The Role", "The Count", bar_kind='barh', df=eg_df, index=5)

In [None]:
pie_plot("code_experience", "The Percentage of each # coding experience years for Egyptions", df=eg_df)

In [None]:
langs_df = get_df(7,20, eg_df)

In [None]:
langs_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 8), color='darkorange')
plt.title("The Freq of each EG's Programming languages", fontsize=15)
plt.xlabel("The language", fontsize=15)
plt.ylabel("The count", fontsize=15)
plt.show()

In [None]:
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', "red"]
langs_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentage of Top 5 EG's regular basis programming languages", fontsize=15)
plt.ylabel("")
plt.show()

In [None]:
recommend_lang = eg_df.recommend_lang.value_counts()
recommend_lang.drop("Not_Specified", inplace=True)

In [None]:
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', "red"]
recommend_lang.sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentage of Top 5 EG's recommended programming languages", fontsize=15)
plt.ylabel("")
plt.show()

In [None]:
ide_df = get_df(21, 30, eg_df)

In [None]:
ide_df.iloc[0,:].sort_values(ascending=False).plot(kind='barh', figsize=(15, 8), color='green')
plt.title("The Freq of EG's regular IDEs", fontsize=15)
plt.xlabel("The IDE", fontsize=15)
plt.ylabel("The count", fontsize=15)
plt.show()

In [None]:
# Plot the Egyptian IDE counts (ide_df); this chart previously re-plotted the programming-language counts
ide_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 EG's regular IDEs", fontsize=15)
plt.show()

In [None]:
note_df = get_df(34, 51, eg_df)

In [None]:
note_df.iloc[0,:].sort_values(ascending=True).plot(kind='barh', figsize=(15, 8), color='darkorange')
plt.title("The Freq of EG's regular Hosted notebooks", fontsize=15)
plt.xlabel("The IDE", fontsize=15)
plt.ylabel("The count", fontsize=15)
plt.show()

In [None]:
pie_plot("Q11", "The Percentages of Top 3 EG's computing platform", df=eg_df, index=3)

In [None]:
hard_df = get_df(52, 58, df=eg_df)

In [None]:
hard_df.iloc[0,:].sort_values(ascending=False)[:5].plot(kind='pie', autopct='%1.1f%%', figsize=(15, 10), shadow=True, colors=colors)
plt.title("The Percentages of Top 5 EG's regular specialized hardware", fontsize=15)
plt.ylabel("")
plt.show()

> __UP Vote <3__

> # PLUS: Modelling

> using sub_df for ML

In [None]:
# First seven columns only. Copy them so the in-place label encoding applied to
# model_df does not write into a slice of df (SettingWithCopyWarning).
model_df = df.iloc[:, :7].copy()

In [None]:
model_df.head(1)

> **Label Encoding module**

In [None]:
from sklearn.preprocessing import LabelEncoder 

In [None]:
# Columns of model_df with object dtype: these are the ones that get label-encoded.
# The dtype is read from model_df itself, the frame that is actually encoded.
dummies_columns = [column for column in model_df.columns if model_df[column].dtype == 'O']
dummies_columns

In [None]:
# fit the columns label
def label_encoding(columns,df):
    """Label-encode the given columns of ``df`` in place.

    Parameters:
        columns: iterable of column names to encode.
        df: frame whose columns are overwritten with their encoded values.

    A fresh ``LabelEncoder`` is fitted for each column, so codes are assigned
    independently per column. Nothing is returned and the fitted encoders are
    not kept.
    """
    for column in columns:
        df[column] = LabelEncoder().fit_transform(df[column])

label_encoding(dummies_columns, model_df)

In [None]:
model_df.head(1)

> **Predict *code_experience* based on Model_df features**

In [None]:
X = model_df.drop(["code_experience", "Duration"], axis=1).values
y = model_df.code_experience.values

> train_test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
  import xgboost as xgb

In [None]:
model = xgb.XGBClassifier(n_estimators=100,max_depth=5, objective = "multi:softmax")

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

> **Work in progress ...**