In [40]:

# Importing Libraries

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
import plotly.offline as po
import plotly.graph_objs as go
po.init_notebook_mode(connected=True)
from plotly import tools

import warnings
warnings.filterwarnings('ignore')

# Loading Data
mulchrep = pd.read_csv('/datasets/multipleChoiceResponses.csv', encoding='ISO-8859-1')
mulchrep.head()

Unnamed: 0,GenderSelect,Country,Age,EmploymentStatus,StudentStatus,LearningDataScience,CodeWriter,CareerSwitcher,CurrentJobTitleSelect,TitleFit,...,JobFactorExperienceLevel,JobFactorDepartment,JobFactorTitle,JobFactorCompanyFunding,JobFactorImpact,JobFactorRemote,JobFactorIndustry,JobFactorLeaderReputation,JobFactorDiversity,JobFactorPublishingOpportunity
0,"Non-binary, genderqueer, or gender non-conforming",,,Employed full-time,,,Yes,,DBA/Database Engineer,Fine,...,,,,,,,,,,
1,Female,United States,30.0,"Not employed, but looking for work",,,,,,,...,,,,,,,,Somewhat important,,
2,Male,Canada,28.0,"Not employed, but looking for work",,,,,,,...,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important
3,Male,United States,56.0,"Independent contractor, freelancer, or self-em...",,,Yes,,Operations Research Practitioner,Poorly,...,,,,,,,,,,
4,Male,Taiwan,38.0,Employed full-time,,,Yes,,Computer Scientist,Fine,...,,,,,,,,,,


**The tech world seems still a man's world.**

In [3]:
# Assigning color to pie slice
colors = ['aqua', 'lightgrey', 'lightgreen', '#D0F9B1', 'khaki', 'grey']

# Renaming a gender 
mulchrep['GenderSelect'] = np.where(mulchrep['GenderSelect'] == 'Non-binary, genderqueer, or gender non-conforming', 'Non-binary',
                                    mulchrep['GenderSelect'])

gender = mulchrep['GenderSelect'].value_counts()
label = gender.index
size = gender.values

# Plotting the Pie Chart to display the Gender Distribution
trace = go.Pie(
    labels=label,
    values=size,
    marker=dict(colors=colors)
)

data = [trace]
layout = go.Layout(title = 'Gender Distribution')

fig = go.Figure(data = data, layout=layout)
po.iplot(fig)

**Top 5 countries**
1. USA - 4197 participants
2. India - 2704 participants
3. Russia - 578 participants
4. United Kingdom - 535  participants
5. China - 471 participants

In [4]:

df = pd.DataFrame(mulchrep.Country.value_counts())
df['country'] = df.index
df.columns = ['number', 'country']
df = df.reset_index().drop('index', axis=1)

data = [dict(
    type = 'choropleth',
    locations = df['country'],
    locationmode = 'country names',
    z = df['number'],
    text = df['country'],
    colorscale = [[
        0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],
        [0.5,"rgb(70, 100, 245)"],[0.6,"rgb(90, 120, 245)"],
        [0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
    autocolorscale = False,
    reversescale = True,
    marker = dict(
        line = dict(
            color = 'rgb(180,180,180)',
            width = 0.5
        )
    ),
    colorbar = dict(
        autotick = False,
        tickprefix = '$',
        title = 'Survey Respondents'
    )
)]

layout = dict(
    title = 'The Nationality of Respondents',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict(data=data, layout=layout)
po.iplot(fig, validate=False)

**Age Distribution in 2017 (left) vs 2018 (right):**

* 18-21: **7.2% - 12.7%**
* 22-24: **14.9% - 21.5%**
* 25-29: **25.9% - 25.8%**
* 30-34: **18.5% - 15.8%**
* 35-39: **12.6% - 9.4%**
* 40-44: **7.7% - 5.7%**
* 44+: **12.9% - 8.4%**

In [9]:
# Subsetting for age greater than 18 but less than 75
mulchrep = mulchrep[(mulchrep['Age'] < 75) & (mulchrep['Age'] > 18)]
# Getting the relative frequency of age 
age = round(mulchrep['Age'].value_counts(normalize=True), 4)

# Plotting a Bar Chart to display the Age Distribution
trace = go.Bar(
    x=age.index,
    y=age.values,
    marker=dict(
        color=age.values,
        colorscale='Reds',
        showscale=True
    )
)

data = [trace]
layout = go.Layout(title='Age Distribution', 
                   yaxis = dict(title = '# of Respondents'))

fig = go.Figure(data=data, layout=layout)
fig['layout']['xaxis'].update(dict(title='Age',
                                   tickfont = dict(size = 12)))
po.iplot(fig)

**We can say that online courses (MOOC) are mainstream training platforms of data science.**

In [13]:
train = mulchrep['FirstTrainingSelect'].value_counts()
label = train.index
size = train.values
colors = ['aqua', 'lightgrey', 'lightgreen', '#D0F9B1', 'khaki', 'grey']

trace = go.Pie(labels=label,
               values=size,
               marker=dict(colors=colors))

data = [trace]
layout=go.Layout(
    title='First Training Platform',
    legend=dict(orientation='h')
)

fig = go.Figure(data=data, layout=layout)
po.iplot(fig)

**Coursera seems the leader of MOOCs thanks to Andrew NG's' [amazing machine learning courses](http://www.coursera.org/instructor/andrewng).**
* **After learning basics of machine learning, people discover the world of Kaggle so although it is the last first learning platform in the above ranking, it is graded as the best learning platform.**
* **Online courses and Stack & Overflow are preferred to textbooks and university courses. The changing face of education in the 21st century!**

In [41]:
course = mulchrep['CoursePlatformSelect'].str.split(',')
course_set = []

for i in course.dropna():
    course_set.extend(i)
courses = round(pd.Series(course_set).value_counts(normalize=True)[:5], 4).\
        sort_values(ascending=False).to_frame()

trace1 = go.Bar(
    x=courses.index,
    y=courses[0],
    name='course',
    marker=dict(
        color=courses[0],
        colorscale='Jet'
    )
)

learning = mulchrep['LearningPlatformSelect'].str.split(',')
learning_set = []
for i in learning.dropna():
    learning_set.extend(i)
learn = round(pd.Series(learning_set).value_counts(normalize=True)[:5], 4).\
        sort_values(ascending=False).to_frame()

trace2 = go.Bar(
    x=learn.index,
    y=learn[0],
    name='platform',
    marker=dict(
        color=learn[0],
        colorscale='Jet'
    )
)

fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Course Platforms', 
                                                             'Learning Platforms'))
fig.append_trace(trace1, 1,1)
fig.append_trace(trace2, 1,2)
fig['layout'].update(
    height=500, 
    width=820,
    title='Where to start & How to continue in DS',
    showlegend=False
)
po.iplot(fig)

**Basic laptop is enough to follow data science trends so hardware requirements is no excuse not to discover the world.**

In [46]:
hardware = mulchrep['HardwarePersonalProjectsSelect'].str.split(',')
hardware_set = []
for i in hardware.dropna():
    hardware_set.extend(i)
    hware = pd.Series(hardware_set).value_counts()[:6]

label = hware.index
size = hware.values

colors = ['#FEBFB3', 'skyblue', '#96D38C', '#D0F9B1', 'tan', 'lightgrey']

trace = go.Pie(
    labels=label,
    values=size,
    marker=dict(colors=colors)
)

data = [trace]
layout=go.Layout(
    title='Hardware Requirements',
    legend=dict(orientation='h')
)

fig = go.Figure(data=data, layout=layout)
po.iplot(fig)

**The most wondering part of the survey is probably salaries. I think that there is no surprise in the ranking. However, before accepting a job offer,  I would recommend you to check [purchasing power parities](http://data.oecd.org/conversion/purchasing-power-parities-ppp.htm).** 

**Note: Although I calculate median salary that is more robust to outliers I dropped rate-adjusted monthly salaries less than 100 dollars more than 500.000 dollars for the following parts.**

In [59]:
mulchrep['CompensationAmount'] = mulchrep['CompensationAmount'].str.replace(',','')
mulchrep['CompensationAmount'] = mulchrep['CompensationAmount'].str.replace('-','')

salary = mulchrep[['CompensationAmount', 'CompensationCurrency', 'Country', 'JobSatisfaction', 
                    'CurrentJobTitleSelect', 'Age', 'GenderSelect']].dropna()

crates = pd.read_csv('/datasets/conversionRates.csv')
crates.drop('Unnamed: 0', axis=1, inplace=True)
salary = salary.merge(crates, left_on='CompensationCurrency', right_on='originCountry', how='left')
salary['Salary'] = pd.to_numeric(salary['CompensationAmount']) * salary['exchangeRate']

us_salary = salary[(salary['Salary'] > 100) & (salary['Salary'] < 500000) & 
                   (salary['Country'] == 'United States')] 
non_us_salary = salary[(salary['Salary'] > 100) & (salary['Salary'] < 500000) & 
                   (~(salary['Country'] == 'United States'))] 
sal_coun = salary.groupby('Country')['Salary'].median().round(-2).sort_values(ascending=False)[:16].\
    to_frame()

trace = go.Bar()

Unnamed: 0_level_0,Salary
Country,Unnamed: 1_level_1
United States,107000.0
Switzerland,104300.0
Australia,93100.0
Norway,87900.0
Denmark,80400.0
Israel,74900.0
Netherlands,74100.0
Germany,71700.0
Canada,70000.0
Ireland,66700.0
