In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
abs_path = r'C:\Users\Shelton\Desktop\Programming\Data-Science\Pandas\Pandas-Cookbook\pandas-workout-data\data\so_2021_survey_results.csv'
rel_path = os.path.relpath(path=abs_path, start=os.curdir)
rel_path

'..\\pandas-workout-data\\data\\so_2021_survey_results.csv'

In [3]:
pd.set_option('display.max_colwidth',100)
columns = ['LanguageHaveWorkedWith', 'LanguageWantToWorkWith', 'Country', 'CompTotal']
df = pd.read_csv(filepath_or_buffer=rel_path, usecols=columns)
df.iloc[0:12]

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift
1,Netherlands,,JavaScript;Python,
2,Russian Federation,,Assembly;C;Python;R;Rust,Julia;Python;Rust
3,Austria,,JavaScript;TypeScript,JavaScript;TypeScript
4,United Kingdom of Great Britain and Northern Ireland,,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL
5,United States of America,,C;C#;C++;HTML/CSS;Java;JavaScript;Node.js;PowerShell;Python;SQL;Swift,C#;C++;Go;HTML/CSS;Java;JavaScript;Node.js;Objective-C;Perl;Python;SQL;Swift
6,United States of America,,HTML/CSS;JavaScript,HTML/CSS;JavaScript;PHP
7,Malaysia,,HTML/CSS;JavaScript;PHP;Ruby;SQL;TypeScript,Ruby
8,India,,HTML/CSS;JavaScript,HTML/CSS;JavaScript
9,Sweden,42000.0,C++;Python,Haskell;Python


### What are the different programming languages that developers currently use?

In [4]:
df['LanguageHaveWorkedWith'].str.split(';').explode().unique()

array(['C++', 'HTML/CSS', 'JavaScript', 'Objective-C', 'PHP', 'Swift',
       'Python', 'Assembly', 'C', 'R', 'Rust', 'TypeScript', 'Bash/Shell',
       'SQL', 'C#', 'Java', 'Node.js', 'PowerShell', 'Ruby', 'Perl',
       'Matlab', 'Kotlin', 'Julia', 'Haskell', 'Delphi', 'Go', 'Scala',
       'Dart', nan, 'VBA', 'Groovy', 'Clojure', 'APL', 'LISP', 'F#',
       'Elixir', 'Erlang', 'Crystal', 'COBOL'], dtype=object)

### What are the 10 programming languages most commonly used today?

In [5]:
df['LanguageHaveWorkedWith'].str.split(';').explode().value_counts().head(10)

LanguageHaveWorkedWith
JavaScript    53587
HTML/CSS      46259
Python        39792
SQL           38835
Java          29162
Node.js       27975
TypeScript    24909
C#            22984
Bash/Shell    22385
C++           20057
Name: count, dtype: int64

### What languages are on both top-10 lists?


In [6]:
have_worked_with = (
    df['LanguageHaveWorkedWith']
    .str.split(";")
    .explode()
    .value_counts()
    .head(10)
    .index
)
have_worked_with

Index(['JavaScript', 'HTML/CSS', 'Python', 'SQL', 'Java', 'Node.js',
       'TypeScript', 'C#', 'Bash/Shell', 'C++'],
      dtype='object', name='LanguageHaveWorkedWith')

In [7]:
want_to_work_with = (
    df['LanguageWantToWorkWith']
    .str.split(";")
    .explode()
    .value_counts()
    .head(10)
    .index
)
want_to_work_with

Index(['JavaScript', 'Python', 'HTML/CSS', 'TypeScript', 'SQL', 'Node.js',
       'C#', 'Java', 'Rust', 'Go'],
      dtype='object', name='LanguageWantToWorkWith')

In [8]:
have_worked_with[have_worked_with.isin(want_to_work_with)]

Index(['JavaScript', 'HTML/CSS', 'Python', 'SQL', 'Java', 'Node.js',
       'TypeScript', 'C#'],
      dtype='object', name='LanguageHaveWorkedWith')

In [9]:
have_worked_with.intersection(want_to_work_with)  # intersection method works on index object and not on Series.

Index(['JavaScript', 'HTML/CSS', 'Python', 'SQL', 'Java', 'Node.js',
       'TypeScript', 'C#'],
      dtype='object')

In [10]:
# top_10_current_use = (
#     df['LanguageHaveWorkedWith']
#     .str.split(';')
#     .explode()
#     .value_counts()
#     .head(10)
#     .to_frame()
#     .reset_index())
# top_10_current_use

In [11]:
# top_10_want_to_use = (
#     df['LanguageWantToWorkWith']
#     .str.split(';')
#     .explode()
#     .value_counts()
#     .head(10)
#     .to_frame()
#     .reset_index())
# top_10_want_to_use

In [12]:
# top_10_current_use.loc[top_10_current_use['LanguageHaveWorkedWith'].isin(top_10_want_to_use['LanguageWantToWorkWith']), 'LanguageHaveWorkedWith']

### What languages in the top 10 have people worked with but don’t want to work with in the future?

In [13]:
# top_10_current_use.loc[~top_10_current_use['LanguageHaveWorkedWith'].isin(top_10_want_to_use['LanguageWantToWorkWith']), 'LanguageHaveWorkedWith']

In [14]:
have_worked_with[~have_worked_with.isin(want_to_work_with)]

Index(['Bash/Shell', 'C++'], dtype='object', name='LanguageHaveWorkedWith')

### What is the most popular (current) language used by people in each country?

In [15]:
df

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift
1,Netherlands,,JavaScript;Python,
2,Russian Federation,,Assembly;C;Python;R;Rust,Julia;Python;Rust
3,Austria,,JavaScript;TypeScript,JavaScript;TypeScript
4,United Kingdom of Great Britain and Northern Ireland,,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL
...,...,...,...,...
83434,United States of America,160500.0,Clojure;Kotlin;SQL,Clojure
83435,Benin,200000.0,,
83436,United States of America,1800.0,Groovy;Java;Python,Java;Python
83437,Canada,90000.0,Bash/Shell;JavaScript;Node.js;Python,Go;Rust


In [16]:
all_languages = (
    df['LanguageHaveWorkedWith']
    .str.split(";")
    .explode()
)
all_languages

0                C++
0           HTML/CSS
0         JavaScript
0        Objective-C
0                PHP
            ...     
83438         Delphi
83438         Elixir
83438       HTML/CSS
83438           Java
83438     JavaScript
Name: LanguageHaveWorkedWith, Length: 443642, dtype: object

In [17]:
df[['Country']] # we use double square brackets around 'Country' to ensure that the result is a data frame rather than a series.

Unnamed: 0,Country
0,Slovakia
1,Netherlands
2,Russian Federation
3,Austria
4,United Kingdom of Great Britain and Northern Ireland
...,...
83434,United States of America
83435,Benin
83436,United States of America
83437,Canada


In [18]:
df[['Country']].join(all_languages) 

Unnamed: 0,Country,LanguageHaveWorkedWith
0,Slovakia,C++
0,Slovakia,HTML/CSS
0,Slovakia,JavaScript
0,Slovakia,Objective-C
0,Slovakia,PHP
...,...,...
83438,Brazil,Delphi
83438,Brazil,Elixir
83438,Brazil,HTML/CSS
83438,Brazil,Java


In [19]:
(
    df[['Country']].join(all_languages)
    .groupby('Country')
    .agg(pd.Series.mode) # we can use the method pd.Series.mode, applying it by passing it to the agg method on our groupby object
) 

Unnamed: 0_level_0,LanguageHaveWorkedWith
Country,Unnamed: 1_level_1
Afghanistan,JavaScript
Albania,JavaScript
Algeria,JavaScript
Andorra,JavaScript
Angola,"[HTML/CSS, JavaScript]"
...,...
"Venezuela, Bolivarian Republic of...",JavaScript
Viet Nam,JavaScript
Yemen,"[C#, HTML/CSS]"
Zambia,HTML/CSS


### What is the mean number of languages used in the last year?

In [20]:
df['LanguageHaveWorkedWith'].str.split(';').str.len().mean()

np.float64(5.373678011583714)

### What is the greatest number of languages people listed as having used in the last year?

In [21]:
df['LanguageHaveWorkedWith'].str.split(';').str.len().max()

38.0

### How many people chose that largest number?

In [22]:
(df['LanguageHaveWorkedWith'].str.split(';').str.len() == 38).sum()

np.int64(32)

In [23]:
df.loc[df['LanguageHaveWorkedWith'].str.split(';').str.len() == 38].shape[0]

32

### How many people in the survey claim salaries of $2 million or more?

In [24]:
(df['CompTotal'] >= 2000000).sum()

np.int64(2369)

In [25]:
df.loc[df['CompTotal'] >= 2_000_000].shape[0]

2369

### Remove rows in which salaries are MORE than $2 million.


In [26]:
df = df.loc[df['CompTotal'] < 2_000_000]
df

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift
9,Sweden,42000.0,C++;Python,Haskell;Python
11,Spain,43000.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;TypeScript,C++;Clojure;JavaScript;Node.js;Rust;SQL;TypeScript
12,Germany,71500.0,C;C++;Java;Perl;Ruby,Rust
16,Turkey,9000.0,C#;HTML/CSS;Java;JavaScript;Node.js,C#;Java;JavaScript;Node.js
...,...,...,...,...
83434,United States of America,160500.0,Clojure;Kotlin;SQL,Clojure
83435,Benin,200000.0,,
83436,United States of America,1800.0,Groovy;Java;Python,Java;Python
83437,Canada,90000.0,Bash/Shell;JavaScript;Node.js;Python,Go;Rust


### Turn the LanguageHaveWorkedWith column into “dummy” columns in df such that each language is its own column.

In [27]:
df['LanguageHaveWorkedWith'].str.split(';').explode().unique() # Looking for all unique languages, the get_dummies() doesn't take account the nan values

array(['C++', 'HTML/CSS', 'JavaScript', 'Objective-C', 'PHP', 'Swift',
       'Python', 'Bash/Shell', 'Node.js', 'SQL', 'TypeScript', 'C',
       'Java', 'Perl', 'Ruby', 'C#', 'Kotlin', 'Delphi', 'Go', 'Scala',
       'Assembly', 'Matlab', 'Rust', 'PowerShell', 'VBA', 'Dart', 'R',
       'Julia', 'Clojure', 'Haskell', 'F#', 'Groovy', nan, 'LISP',
       'Elixir', 'APL', 'Erlang', 'Crystal', 'COBOL'], dtype=object)

In [28]:
df['LanguageHaveWorkedWith'].str.get_dummies(sep=';') # This method is specifically designed for cases where a single cell in a string column contains multiple categories, usually separated by a delimiter

Unnamed: 0,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,PowerShell,Python,R,Ruby,Rust,SQL,Scala,Swift,TypeScript,VBA
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
11,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
12,0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
16,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
83435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83436,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
83437,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [29]:
pd.get_dummies(df['LanguageHaveWorkedWith'].str.split(';').explode()) # using pd.get_dummies() general function

Unnamed: 0,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,PowerShell,Python,R,Ruby,Rust,SQL,Scala,Swift,TypeScript,VBA
0,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83438,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
83438,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
83438,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
83438,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
df = pd.concat([df, df['LanguageHaveWorkedWith'].str.get_dummies(sep=';')], axis=1) # The first parameter od the concat() method is a list
df.shape

(44814, 42)

In [31]:
df

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith,APL,Assembly,Bash/Shell,C,C#,C++,...,PowerShell,Python,R,Ruby,Rust,SQL,Scala,Swift,TypeScript,VBA
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9,Sweden,42000.0,C++;Python,Haskell;Python,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
11,Spain,43000.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;TypeScript,C++;Clojure;JavaScript;Node.js;Rust;SQL;TypeScript,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
12,Germany,71500.0,C;C++;Java;Perl;Ruby,Rust,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
16,Turkey,9000.0,C#;HTML/CSS;Java;JavaScript;Node.js,C#;Java;JavaScript;Node.js,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,United States of America,160500.0,Clojure;Kotlin;SQL,Clojure,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
83435,Benin,200000.0,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83436,United States of America,1800.0,Groovy;Java;Python,Java;Python,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
83437,Canada,90000.0,Bash/Shell;JavaScript;Node.js;Python,Go;Rust,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Determine what combination is best if you want to maximize your salary and have to choose two languages from Python, JavaScript, and Java.

In [32]:
# Python + JavaScript, not Java
df.loc[((df['Python'] == 1) & ((df['JavaScript']) == 1) & (df['Java'] == 0)), 'CompTotal'].mean()

np.float64(126817.99470235605)

In [33]:
# Python + Java, not JavaScript
df.loc[((df['Python'] == 1) & (df['Java'] == 1) & (df['JavaScript'] == 0)), 'CompTotal'].mean()

np.float64(162737.10379596677)

In [34]:
# JavaScript + Java, not Python
df.loc[((df['JavaScript'] == 1) & (df['Java'] == 1) & (df['Python'] == 0)), 'CompTotal'].mean()


np.float64(140867.65981559738)

## Beyond the exercise

### When developers are stuck (as indicated in the column NEWStuck), what are the three things they’re most likely to do?

In [35]:
columns.append('NEWStuck')
columns.append('Gender')
columns.append('YearsCode')
columns.append('YearsCodePro')
columns

['LanguageHaveWorkedWith',
 'LanguageWantToWorkWith',
 'Country',
 'CompTotal',
 'NEWStuck',
 'Gender',
 'YearsCode',
 'YearsCodePro']

In [36]:
df = pd.read_csv(filepath_or_buffer=rel_path, usecols=columns)
df

Unnamed: 0,Country,YearsCode,YearsCodePro,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith,NEWStuck,Gender
0,Slovakia,,,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift,Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Google it,Man
1,Netherlands,7,,,JavaScript;Python,,Visit Stack Overflow;Google it,Man
2,Russian Federation,,,,Assembly;C;Python;R;Rust,Julia;Python;Rust,Visit Stack Overflow;Google it;Watch help / tutorial videos;Do other work and come back later,Man
3,Austria,,,,JavaScript;TypeScript,JavaScript;TypeScript,Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Google it,Man
4,United Kingdom of Great Britain and Northern Ireland,17,10,,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL,Visit Stack Overflow;Go for a walk or other physical activity;Google it;Watch help / tutorial vi...,Man
...,...,...,...,...,...,...,...,...
83434,United States of America,6,5,160500.0,Clojure;Kotlin;SQL,Clojure,Call a coworker or friend;Google it,Man
83435,Benin,4,2,200000.0,,,Call a coworker or friend;Visit Stack Overflow;Google it;Watch help / tutorial videos;Do other w...,Man
83436,United States of America,10,4,1800.0,Groovy;Java;Python,Java;Python,Call a coworker or friend;Visit Stack Overflow;Google it;Watch help / tutorial videos;Do other w...,Man
83437,Canada,5,3,90000.0,Bash/Shell;JavaScript;Node.js;Python,Go;Rust,Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Google i...,Man


In [37]:
df['NEWStuck'].str.split(';').explode().value_counts()

NEWStuck
Google it                                           74491
Visit Stack Overflow                                66410
Do other work and come back later                   39871
Watch help / tutorial videos                        36181
Call a coworker or friend                           32805
Go for a walk or other physical activity            30760
Play games                                          12152
Panic                                                9962
Meditate                                             7785
Visit another developer community (please name):     6577
Other (please specify):                              5812
Name: count, dtype: int64

In [38]:
df['NEWStuck'].str.split(';').explode().value_counts().head(3)


NEWStuck
Google it                            74491
Visit Stack Overflow                 66410
Do other work and come back later    39871
Name: count, dtype: int64

### What proportion of the survey respondents marked their gender as Man? Does that proportion seem similar to your real-life experiences?

In [39]:
df['Gender'].value_counts(normalize=True)

Gender
Man                                                                                   0.909231
Woman                                                                                 0.050069
Prefer not to say                                                                     0.017524
Non-binary, genderqueer, or gender non-conforming                                     0.008385
Or, in your own words:                                                                0.005019
Man;Or, in your own words:                                                            0.003257
Man;Non-binary, genderqueer, or gender non-conforming                                 0.003062
Woman;Non-binary, genderqueer, or gender non-conforming                               0.001786
Man;Woman                                                                             0.000498
Non-binary, genderqueer, or gender non-conforming;Or, in your own words:              0.000255
Man;Woman;Non-binary, genderqueer, or gende

### On ``average``, what proportion of their years coding have been done ``professionally``?

In [40]:
df[['YearsCode', 'YearsCodePro']]

Unnamed: 0,YearsCode,YearsCodePro
0,,
1,7,
2,,
3,,
4,17,10
...,...,...
83434,6,5
83435,4,2
83436,10,4
83437,5,3


In [41]:
df['YearsCode'].unique()

array([nan, '7', '17', '3', '4', '6', '16', '12', '15', '10', '40', '9',
       '26', '14', '39', '20', '8', '19', '5', 'Less than 1 year', '22',
       '2', '1', '34', '21', '13', '25', '24', '30', '31', '18', '38',
       'More than 50 years', '27', '41', '42', '35', '23', '28', '11',
       '37', '44', '43', '36', '33', '45', '29', '50', '46', '32', '47',
       '49', '48'], dtype=object)

In [42]:
df['YearsCodePro'].unique()

array([nan, '10', '4', '5', '6', '2', '30', '9', '18', '12', '21', '1',
       '16', 'Less than 1 year', '15', '3', '35', '7', '8', '17', '14',
       '26', '25', '20', '50', '34', '11', '24', '22', '13', '31', '23',
       '39', '41', '27', '28', '19', '33', 'More than 50 years', '37',
       '29', '32', '43', '40', '38', '45', '42', '46', '36', '44', '47',
       '48', '49'], dtype=object)

In [43]:
df.loc[df['YearsCode'] == 'Less than 1 year', 'YearsCode'] = 0
df.loc[df['YearsCode'] == 'More than 50 years', 'YearsCode'] = 51

In [44]:
df.loc[df['YearsCodePro'] == 'Less than 1 year', 'YearsCodePro'] = 0
df.loc[df['YearsCodePro'] == 'More than 50 years', 'YearsCodePro'] = 51

In [45]:
df[['YearsCode', 'YearsCodePro']].isnull()

Unnamed: 0,YearsCode,YearsCodePro
0,True,True
1,False,True
2,True,True
3,True,True
4,False,False
...,...,...
83434,False,False
83435,False,False
83436,False,False
83437,False,False


In [46]:
df[['YearsCode', 'YearsCodePro']].isnull().sum()

YearsCode        1798
YearsCodePro    22223
dtype: int64

In [47]:
df_not_nan = df[['YearsCode', 'YearsCodePro']].dropna()
df_not_nan

Unnamed: 0,YearsCode,YearsCodePro
4,17,10
8,6,4
9,7,4
10,16,10
11,12,5
...,...,...
83434,6,5
83435,4,2
83436,10,4
83437,5,3


In [48]:
df_not_nan.dtypes

YearsCode       object
YearsCodePro    object
dtype: object

In [49]:
df_not_nan['YearsCode'] = df_not_nan['YearsCode'].astype(np.int16)
df_not_nan['YearsCodePro'] = df_not_nan['YearsCodePro'].astype(np.int16)

In [52]:
df_not_nan['YearsCode'].head(5)

4     17
8      6
9      7
10    16
11    12
Name: YearsCode, dtype: int16

In [53]:
df_not_nan['YearsCodePro'].head(5)

4     10
8      4
9      4
10    10
11     5
Name: YearsCodePro, dtype: int16

In [54]:
# Get rid of rows with 0 yearsCode
df_not_nan = df_not_nan.loc[df_not_nan['YearsCode'] != 0]
df_not_nan

Unnamed: 0,YearsCode,YearsCodePro
4,17,10
8,6,4
9,7,4
10,16,10
11,12,5
...,...,...
83434,6,5
83435,4,2
83436,10,4
83437,5,3


In [56]:
(df_not_nan['YearsCodePro'] / df_not_nan['YearsCode']).mean()

np.float64(0.5923711657118932)