In [1]:
import pandas as pd
import numpy as np

In [2]:
# In the 2016 survey each respondent reports exactly one role (1-1).
# Every later year allows several roles per respondent (1-many), so those
# datasets get their role column 'exploded' into one row per role.
sodf16 = (
    pd.read_csv(
        "survey_results_public2016.csv",
        usecols=['Respondent', 'occupation'],
    )
    .dropna()                                # discard rows missing an id or an occupation
    .assign(Year=2016)                       # tag rows with the survey year
    .rename(columns={"occupation": "Role"})  # same column name as the other years
)
sodf16.head()

Unnamed: 0,Respondent,Role,Year
1,4637,Mobile developer - iOS,2016
3,21378,DevOps,2016
6,31743,Growth hacker,2016
7,51301,Back-end web developer,2016
9,24487,Back-end web developer,2016


In [3]:
# The 2017 survey spreads the role over several columns: a generic developer /
# non-developer type plus dedicated columns for the specific web or mobile role.
# Later cells swap the generic role for the specific one where available.
cols2017 = [
    'Respondent',
    'DeveloperType',
    'WebDeveloperType',
    'MobileDeveloperType',
    'NonDeveloperType',
]
sodf17 = (
    pd.read_csv("survey_results_public2017.csv", usecols=cols2017)
    # keep a respondent as long as at least one of the two generic columns is filled
    .dropna(subset=['DeveloperType', 'NonDeveloperType'], how='all')
)
display(sodf17)

Unnamed: 0,Respondent,DeveloperType,WebDeveloperType,MobileDeveloperType,NonDeveloperType
2,3,Other,,,
3,4,,,,Data scientist
4,5,Mobile developer; Graphics programming; Deskto...,,,
6,7,,,,Data scientist
7,8,Web developer,Full stack Web developer,,
...,...,...,...,...,...
51386,51387,Web developer; Mobile developer; Developer wit...,,,
51387,51388,Web developer; Developer with a statistics or ...,,,
51389,51390,Web developer; Systems administrator,,,
51390,51391,Web developer; Mobile developer,,,


In [4]:
#Really naive way to join DeveloperType and NonDeveloperType together:
# concatenate both columns with ';' (a missing side becomes ''), then remove the
# single ';' that is left dangling at the start or end when one side was missing.
sodf17['NewDeveloperType'] = sodf17['DeveloperType'].str.cat(sodf17['NonDeveloperType'], sep = ";", na_rep = '')
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so '^;' / ';$' would match nothing and the padding
# semicolons would survive into the role list.
sodf17['NewDeveloperTypeStrip'] = sodf17['NewDeveloperType'].str.replace(r'^;|;$', '', regex = True)
display(sodf17)

Unnamed: 0,Respondent,DeveloperType,WebDeveloperType,MobileDeveloperType,NonDeveloperType,NewDeveloperType,NewDeveloperTypeStrip
2,3,Other,,,,Other;,Other
3,4,,,,Data scientist,;Data scientist,Data scientist
4,5,Mobile developer; Graphics programming; Deskto...,,,,Mobile developer; Graphics programming; Deskto...,Mobile developer; Graphics programming; Deskto...
6,7,,,,Data scientist,;Data scientist,Data scientist
7,8,Web developer,Full stack Web developer,,,Web developer;,Web developer
...,...,...,...,...,...,...,...
51386,51387,Web developer; Mobile developer; Developer wit...,,,,Web developer; Mobile developer; Developer wit...,Web developer; Mobile developer; Developer wit...
51387,51388,Web developer; Developer with a statistics or ...,,,,Web developer; Developer with a statistics or ...,Web developer; Developer with a statistics or ...
51389,51390,Web developer; Systems administrator,,,,Web developer; Systems administrator;,Web developer; Systems administrator
51390,51391,Web developer; Mobile developer,,,,Web developer; Mobile developer;,Web developer; Mobile developer


In [5]:
# Turn the ';'-joined role string into one row per (respondent, role).
sodf17['Role'] = sodf17['NewDeveloperTypeStrip'].str.split(';')
sodf17Explode = (
    sodf17
    .drop(columns = ['DeveloperType', 'NonDeveloperType', 'NewDeveloperType', 'NewDeveloperTypeStrip'])
    .explode('Role')
)
# The raw values separate roles with '; ', so every role after the first carries a
# leading space once split on ';'. Strip it here, before the exact-match checks
# against 'Web developer' / 'Mobile developer' that follow; otherwise those rows
# are never upgraded to their specific web/mobile role.
sodf17Explode['Role'] = sodf17Explode['Role'].str.strip()
# A leading/trailing ';' in the joined string splits into an empty role; drop those.
sodf17Explode = sodf17Explode[sodf17Explode['Role'].ne('')]
sodf17Explode.sample(20)

Unnamed: 0,Respondent,WebDeveloperType,MobileDeveloperType,Role
8987,8988,,,Other
15697,15698,Front-end Web developer,,Web developer
4154,4155,,,Desktop applications developer
2260,2261,,,Mobile developer
49514,49515,,,Web developer
41565,41566,,,Desktop applications developer
27798,27799,,,Developer with a statistics or mathematics bac...
17270,17271,,,Data scientist
10451,10452,,,Web developer
2964,2965,,,Graphics programming


In [6]:
sodf17Role = sodf17Explode.copy()
# Replace the generic 'Web developer' label with the respondent's specific web
# role when one was given; rows without a specific role keep 'Web developer'.
isGenericWeb = sodf17Role['Role'].eq('Web developer')
hasWebDetail = sodf17Role['WebDeveloperType'].notna()
sodf17Role['Role'] = sodf17Role['Role'].mask(
    isGenericWeb & hasWebDetail,
    sodf17Role['WebDeveloperType'],
)

In [7]:
# Inspect the rows that carry a specific web developer type.
webDetailRows = sodf17Role['WebDeveloperType'].notna()
display(sodf17Role[webDetailRows])

Unnamed: 0,Respondent,WebDeveloperType,MobileDeveloperType,Role
7,8,Full stack Web developer,,Full stack Web developer
11,12,Back-end Web developer,,Back-end Web developer
13,14,Full stack Web developer,,Full stack Web developer
16,17,Full stack Web developer,,Full stack Web developer
18,19,Full stack Web developer,,Full stack Web developer
...,...,...,...,...
51372,51373,Full stack Web developer,,Full stack Web developer
51374,51375,Back-end Web developer,,Back-end Web developer
51377,51378,Back-end Web developer,,Back-end Web developer
51382,51383,Front-end Web developer,,Front-end Web developer


In [8]:
# Inspect the rows that carry a specific mobile developer type.
sodf17Role[sodf17Role['MobileDeveloperType'].notna()]

Unnamed: 0,Respondent,WebDeveloperType,MobileDeveloperType,Role
57,58,,Mobile developer (Android),Mobile developer
86,87,,Mobile developer (iOS),Mobile developer
108,109,,Mobile developer (Windows Phone),Mobile developer
176,177,,Mobile developer (iOS),Mobile developer
189,190,,Mobile developer (iOS),Mobile developer
...,...,...,...,...
51276,51277,,Mobile developer (iOS); Mobile developer (Andr...,Mobile developer
51330,51331,,Mobile developer (Android),Mobile developer
51343,51344,,Mobile developer (Android),Mobile developer
51349,51350,,Mobile developer (iOS),Mobile developer


In [9]:
# Split the ';'-joined mobile platforms into a list per row.
# Not needed for the final output yet, because every mobile role is later
# standardized to a single label, but kept in case the platform detail is
# useful for later analysis.
mobilePlatforms = sodf17Role['MobileDeveloperType'].str.split(';')
sodf17Role = sodf17Role.assign(MobileDeveloperType = mobilePlatforms)
sodf17Role[sodf17Role['MobileDeveloperType'].notna()]

Unnamed: 0,Respondent,WebDeveloperType,MobileDeveloperType,Role
57,58,,[Mobile developer (Android)],Mobile developer
86,87,,[Mobile developer (iOS)],Mobile developer
108,109,,[Mobile developer (Windows Phone)],Mobile developer
176,177,,[Mobile developer (iOS)],Mobile developer
189,190,,[Mobile developer (iOS)],Mobile developer
...,...,...,...,...
51276,51277,,"[Mobile developer (iOS), Mobile developer (An...",Mobile developer
51330,51331,,[Mobile developer (Android)],Mobile developer
51343,51344,,[Mobile developer (Android)],Mobile developer
51349,51350,,[Mobile developer (iOS)],Mobile developer


In [10]:
# One row per (respondent, role, mobile platform) instead of a list of platforms.
sodf17Role = sodf17Role.explode(column = 'MobileDeveloperType')

In [11]:
# Re-check the web developer rows after exploding the mobile platforms.
webDetailRows = sodf17Role['WebDeveloperType'].notna()
display(sodf17Role[webDetailRows])

Unnamed: 0,Respondent,WebDeveloperType,MobileDeveloperType,Role
7,8,Full stack Web developer,,Full stack Web developer
11,12,Back-end Web developer,,Back-end Web developer
13,14,Full stack Web developer,,Full stack Web developer
16,17,Full stack Web developer,,Full stack Web developer
18,19,Full stack Web developer,,Full stack Web developer
...,...,...,...,...
51372,51373,Full stack Web developer,,Full stack Web developer
51374,51375,Back-end Web developer,,Back-end Web developer
51377,51378,Back-end Web developer,,Back-end Web developer
51382,51383,Front-end Web developer,,Front-end Web developer


In [12]:
# Replace the generic 'Mobile developer' label with the specific mobile platform
# role when one was given; rows without a platform keep 'Mobile developer'.
isGenericMobile = sodf17Role['Role'].eq('Mobile developer')
hasMobileDetail = sodf17Role['MobileDeveloperType'].notna()
sodf17Role['Role'] = sodf17Role['Role'].mask(
    isGenericMobile & hasMobileDetail,
    sodf17Role['MobileDeveloperType'],
)

In [13]:
# Check that rows with a mobile platform now carry it as their role.
sodf17Role[sodf17Role['MobileDeveloperType'].notna()]

Unnamed: 0,Respondent,WebDeveloperType,MobileDeveloperType,Role
57,58,,Mobile developer (Android),Mobile developer (Android)
86,87,,Mobile developer (iOS),Mobile developer (iOS)
108,109,,Mobile developer (Windows Phone),Mobile developer (Windows Phone)
176,177,,Mobile developer (iOS),Mobile developer (iOS)
189,190,,Mobile developer (iOS),Mobile developer (iOS)
...,...,...,...,...
51276,51277,,Mobile developer (Android),Mobile developer (Android)
51330,51331,,Mobile developer (Android),Mobile developer (Android)
51343,51344,,Mobile developer (Android),Mobile developer (Android)
51349,51350,,Mobile developer (iOS),Mobile developer (iOS)


In [14]:
# Reduce the 2017 frame to the shared (Respondent, Role, Year) layout.
sodf17Role = (
    sodf17Role
    .drop(columns = ['WebDeveloperType', 'MobileDeveloperType'])  # helper columns no longer needed
    .assign(Role = lambda frame: frame['Role'].str.strip())       # remove surrounding whitespace
    .drop_duplicates()                                            # one row per respondent/role pair
    .assign(Year = 2017)
)

In [15]:
# Show the finished 2017 respondent/role table.
display(sodf17Role)

Unnamed: 0,Respondent,Role,Year
2,3,Other,2017
3,4,Data scientist,2017
4,5,Mobile developer,2017
4,5,Graphics programming,2017
4,5,Desktop applications developer,2017
...,...,...,...
51389,51390,Web developer,2017
51389,51390,Systems administrator,2017
51390,51391,Web developer,2017
51390,51391,Mobile developer,2017


In [16]:
# 2018: load only the respondent id and the role column.
sodf18 = pd.read_csv(
    "survey_results_public2018.csv",
    usecols = ['Respondent', 'DevType'],
)

In [17]:
# Drop respondents with a missing value and tag the rows with the survey year.
sodf18 = sodf18.dropna().assign(Year = 2018)
sodf18.sample(20)

Unnamed: 0,Respondent,DevType,Year
31706,45018,Back-end developer;Designer;Full-stack develop...,2018
86807,94846,Back-end developer;Database administrator;Desi...,2018
68718,97558,Back-end developer;Desktop or enterprise appli...,2018
5771,8261,Front-end developer;Mobile developer,2018
81270,12102,Back-end developer;Desktop or enterprise appli...,2018
84518,61148,Back-end developer;Database administrator;Desi...,2018
48275,68480,Database administrator;System administrator,2018
27153,38604,Full-stack developer,2018
46749,66317,Engineering manager;Full-stack developer,2018
69285,98363,Data scientist or machine learning specialist;...,2018


In [18]:
# Replace the ';'-joined DevType string with a list of roles per respondent.
sodf18 = (
    sodf18
    .assign(Role = sodf18['DevType'].str.split(';'))
    .drop(columns = 'DevType')
)
sodf18.sample(20)

Unnamed: 0,Respondent,Year,Role
66583,94560,2018,[Full-stack developer]
1003,1410,2018,"[Back-end developer, Front-end developer, Full..."
9122,13017,2018,"[Back-end developer, Database administrator, D..."
69031,97995,2018,"[Back-end developer, Desktop or enterprise app..."
74554,33857,2018,"[Back-end developer, Database administrator, F..."
62563,88815,2018,[Front-end developer]
45559,64619,2018,[Full-stack developer]
66099,93836,2018,"[Back-end developer, Full-stack developer]"
45110,64007,2018,"[Back-end developer, Database administrator, F..."
49851,70725,2018,"[Desktop or enterprise applications developer,..."


In [19]:
# One row per (respondent, role).
sodf18Explode = sodf18.explode(column = 'Role')
sodf18Explode.sample(20)

Unnamed: 0,Respondent,Year,Role
4546,6530,2018,Back-end developer
78845,82593,2018,Full-stack developer
75315,42363,2018,Front-end developer
47712,67665,2018,Data or business analyst
66329,94168,2018,Front-end developer
62558,88810,2018,Desktop or enterprise applications developer
49449,70147,2018,Front-end developer
23768,33810,2018,Full-stack developer
26208,37258,2018,Back-end developer
4926,7067,2018,System administrator


In [20]:
# 2019: load only the respondent id and the role column.
sodf19 = pd.read_csv(
    "survey_results_public2019.csv",
    usecols = ['Respondent', 'DevType'],
)

In [21]:
# Drop respondents with a missing value and tag the rows with the survey year.
sodf19 = sodf19.dropna().assign(Year = 2019)
sodf19.sample(20)

Unnamed: 0,Respondent,DevType,Year
537,539,Data scientist or machine learning specialist;...,2019
19711,19820,"Developer, back-end;Developer, front-end;Devel...",2019
77866,78356,"Developer, full-stack",2019
42098,42363,"Developer, desktop or enterprise applications",2019
77383,77863,"Developer, mobile",2019
51708,52042,"Developer, front-end;Developer, full-stack",2019
65548,65966,"Developer, back-end",2019
84289,84814,"Developer, full-stack;DevOps specialist;Engine...",2019
78206,78696,"Developer, mobile",2019
59234,59611,"Developer, back-end",2019


In [22]:
# Split the ';'-joined DevType string into a list of roles, then expand the
# list so each (respondent, role) pair gets its own row.
sodf19 = (
    sodf19
    .assign(Role = sodf19['DevType'].str.split(';'))
    .drop(columns = 'DevType')
)
sodf19Explode = sodf19.explode(column = 'Role')
sodf19Explode.sample(20)

Unnamed: 0,Respondent,Year,Role
30239,30420,2019,Engineering manager
37756,37989,2019,"Developer, front-end"
10806,10855,2019,"Developer, back-end"
7096,7125,2019,"Developer, back-end"
9536,9577,2019,"Developer, back-end"
27841,28002,2019,"Developer, full-stack"
65233,65650,2019,"Developer, back-end"
10129,10173,2019,Engineering manager
37333,37563,2019,System administrator
54632,54977,2019,"Developer, back-end"


In [23]:
# 2020: load only the respondent id and the role column.
sodf20 = pd.read_csv(
    "survey_results_public2020.csv",
    usecols = ['Respondent', 'DevType'],
)

In [24]:
# Drop respondents with a missing value and tag the rows with the survey year.
sodf20 = sodf20.dropna().assign(Year = 2020)
sodf20.sample(20)

Unnamed: 0,Respondent,DevType,Year
22142,22227,"Developer, back-end;Developer, desktop or ente...",2020
17386,17448,Academic researcher;Data or business analyst;D...,2020
20588,20669,Educator,2020
23478,23568,"Developer, back-end;Developer, front-end",2020
10402,10435,"Developer, back-end;Developer, desktop or ente...",2020
23838,23928,"Database administrator;Developer, back-end;Dev...",2020
47051,48037,"Developer, desktop or enterprise applications;...",2020
34233,34358,"Developer, full-stack",2020
8641,8673,"Designer;Developer, back-end;Developer, deskto...",2020
10736,10770,"Developer, back-end;Developer, full-stack;Educ...",2020


In [25]:
# Split the ';'-joined DevType string into a list of roles, then expand the
# list so each (respondent, role) pair gets its own row.
sodf20 = (
    sodf20
    .assign(Role = sodf20['DevType'].str.split(';'))
    .drop(columns = 'DevType')
)
sodf20Explode = sodf20.explode(column = 'Role')
sodf20Explode.sample(20)

Unnamed: 0,Respondent,Year,Role
17041,17101,2020,Academic researcher
7142,7171,2020,Data scientist or machine learning specialist
3280,3294,2020,Educator
37612,37963,2020,"Developer, embedded applications or devices"
44080,44956,2020,"Developer, full-stack"
27726,27830,2020,"Developer, full-stack"
48091,49093,2020,"Developer, back-end"
43573,44410,2020,"Developer, full-stack"
35536,35682,2020,DevOps specialist
52855,53942,2020,"Developer, full-stack"


In [26]:
# Stack the per-year respondent/role tables into a single frame.
sodfList = [
    sodf16,         # 2016
    sodf17Role,     # 2017
    sodf18Explode,  # 2018
    sodf19Explode,  # 2019
    sodf20Explode,  # 2020
]
sodf = pd.concat(sodfList)
display(sodf)

Unnamed: 0,Respondent,Role,Year
1,4637,Mobile developer - iOS,2016
3,21378,DevOps,2016
6,31743,Growth hacker,2016
7,51301,Back-end web developer,2016
9,24487,Back-end web developer,2016
...,...,...,...
64453,64330,Data or business analyst,2020
64453,64330,"Developer, mobile",2020
64453,64330,Senior executive/VP,2020
64453,64330,System administrator,2020


In [27]:
#STANDARDIZING TIME
# Each survey year words the same role differently. roleAliases maps the
# canonical label (key) to every year-specific spelling that should be folded
# into it (values). No canonical label appears as an alias, so a single
# replace pass gives the same result as applying the rules one after another.
roleAliases = {
    'Educator or academic researcher': ['Educator or academic'],
    'Data or business analyst': ['Analyst', 'Analyst or consultant'],
    'Senior executive/VP': [
        'Executive (VP of Eng., CTO, CIO, etc.)',
        'C-suite executive',
        'C-suite executive (CEO, CTO, etc.)',
    ],
    'Data scientist or machine learning specialist': [
        'Data scientist',
        'Machine learning developer',
        'Machine learning specialist',
    ],
    'Designer or illustrator': ['Designer'],
    'Developer, desktop or enterprise applications': [
        'Enterprise level services developer',
        'Desktop developer',
        'Desktop applications developer',
        'Desktop or enterprise applications developer',
    ],
    'Developer, web (back-end)': [
        'Back-end web developer',
        'Back-end Web developer',
        'Back-end developer',
        'Developer, back-end',
    ],
    'Developer, web (front-end)': [
        'Front-end web developer',
        'Front-end Web developer',
        'Front-end developer',
        'Developer, front-end',
    ],
    'Developer, web (full-stack)': [
        'Full-stack web developer',
        'Full stack Web developer',
        'Full-stack developer',
        'Developer, full-stack',
    ],
    'Developer, web (back-end, front-end, or full-stack)': ['Web developer'],
    'Developer, game or graphics': [
        'Graphics programmer',
        'Graphics programming',
        'Game or graphics developer',
    ],
    'Developer, QA or test': [
        'Quality assurance engineer',
        'QA or test developer',
        'Quality Assurance',
    ],
    'DevOps specialist': ['DevOps'],
    'Developer, embedded applications or devices': [
        'Embedded application developer',
        'Embedded applications/devices developer',
        'Embedded applications or devices developer',
    ],
    'Marketing or sales professional': ['Marketing or sales manager'],
    'Developer, mobile': [
        'Mobile developer',
        'Mobile developer - iOS',
        'Mobile developer - Windows Phone',
        'Mobile developer - Android',
        'Mobile developer (Android)',
        'Mobile developer (Blackberry)',
        'Mobile developer (iOS)',
        'Mobile developer (Windows Phone)',
    ],
    'Other': ['other'],
    'Systems administrator': ['System administrator'],
}

# Invert to alias -> canonical so the whole standardization is one lookup.
roleLookup = {
    alias: canonical
    for canonical, aliases in roleAliases.items()
    for alias in aliases
}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# sodf['Role'] is a chained in-place update: under pandas Copy-on-Write it
# modifies a temporary and leaves sodf untouched.
sodf['Role'] = sodf['Role'].replace(roleLookup)

In [28]:
# Collect each distinct role label once (first occurrence is kept).
sodfRole = sodf['Role'].drop_duplicates()
display(sodfRole)

1                                       Developer, mobile
3                                       DevOps specialist
6                                           Growth hacker
7                               Developer, web (back-end)
10                            Developer, web (full-stack)
16          Developer, desktop or enterprise applications
17                                                Student
28          Data scientist or machine learning specialist
45                                    Engineering manager
50                                        Product manager
52                                                  Other
53                                Designer or illustrator
61            Developer, embedded applications or devices
64                             Developer, web (front-end)
71                                  Systems administrator
72                                    Senior executive/VP
80                               Data or business analyst
132     Develo

In [29]:
# Write the distinct role labels, sorted, without the index column.
sodfRole = sodfRole.sort_values()
sodfRole.to_csv('StackOverflowRolesv2.csv', index = False)

In [31]:
# Remove repeated rows, order by survey year and respondent, and write the
# combined respondent/role table without the index column.
sodf = (
    sodf
    .drop_duplicates()
    .sort_values(by = ['Year', 'Respondent'])
)
sodf.to_csv('StackOverflowRespondentRolesv2.csv', index = False)