# Requirements
pandas >= 0.21 (`DataFrame.isna` and `DataFrame.infer_objects`, both used below, were added in pandas 0.21.0)

In [23]:
import os
import pandas as pd 
import numpy as np
import re
from IPython.display import display
# Notebook display settings: show up to 500 rows and never truncate columns.
pd.options.display.max_rows = 500
pd.set_option('display.max_columns', None)

## Let us write some functions to create an ML pipeline that performs the following tasks:
### 1. Read CSV raw file and create a Pandas DataFrame
### 2. Feature Extraction - Handling missing values, retrieving the best features from the data, normalizing features and addressing class imbalance in the labels. This will enable us to create an ML-ready dataset
### 3. Training a Classifier - Grid search, K-fold cross-validation and the Naive Bayes algorithm to predict the 5-year future plan

In [6]:
# Read CSV raw file
def read_data(path, csv_filename):
    """
    Read a CSV file into a pandas DataFrame, skipping malformed rows.

    Parameters
    ----------
    path : str
        Folder where the CSV file resides. As a side effect the process
        working directory is changed to this folder and is NOT restored.
    csv_filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        Parsed contents of the file. Rows that cannot be parsed (e.g. too
        many fields) are dropped with a warning instead of raising.
    """
    # Local import: only needed for the pandas compatibility check below.
    import inspect

    print("Inside read data\n")
    # set working directory, to easily read the CSV data
    os.chdir(path)
    print("Working directory set to :: " + str(os.getcwd()))

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`. Pick whichever keyword this pandas supports.
    # "warn" mirrors the old error_bad_lines=False / warn_bad_lines=True default.
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        bad_line_option = {"on_bad_lines": "warn"}
    else:
        bad_line_option = {"error_bad_lines": False}

    return pd.read_csv(csv_filename, **bad_line_option)

In [7]:
# Folder holding the survey CSV. Set the SO_SURVEY_DATA_DIR environment variable
# to point somewhere else; the default is the original author's local folder.
DATA_DIR = os.environ.get("SO_SURVEY_DATA_DIR", r"C:\Users\prana\Desktop\stack_overflow_data")

# The read below may emit a pandas DtypeWarning: with the default
# low_memory=True the file is parsed in chunks, so a column can be inferred
# as different types in different chunks.
df = read_data(DATA_DIR, "survey_results_public.csv")
df.head(5)

Inside read data

Working directory set to :: C:\Users\prana\Desktop\stack_overflow_data


  if self.run_code(code, result):


Unnamed: 0,Respondent,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,AssessJob1,AssessJob2,AssessJob3,AssessJob4,AssessJob5,AssessJob6,AssessJob7,AssessJob8,AssessJob9,AssessJob10,AssessBenefits1,AssessBenefits2,AssessBenefits3,AssessBenefits4,AssessBenefits5,AssessBenefits6,AssessBenefits7,AssessBenefits8,AssessBenefits9,AssessBenefits10,AssessBenefits11,JobContactPriorities1,JobContactPriorities2,JobContactPriorities3,JobContactPriorities4,JobContactPriorities5,JobEmailPriorities1,JobEmailPriorities2,JobEmailPriorities3,JobEmailPriorities4,JobEmailPriorities5,JobEmailPriorities6,JobEmailPriorities7,UpdateCV,Currency,Salary,SalaryType,ConvertedSalary,CurrencySymbol,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,TimeAfterBootcamp,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AdsPriorities1,AdsPriorities2,AdsPriorities3,AdsPriorities4,AdsPriorities5,AdsPriorities6,AdsPriorities7,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,Gender,SexualOrientation,EducationParents,RaceEthnicity,Age,Dependents,MilitaryUS,SurveyTooLong,Sur
veyEasy
0,1,Yes,No,Kenya,No,Employed part-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,20 to 99 employees,Full-stack developer,3-5 years,3-5 years,Extremely satisfied,Extremely satisfied,Working as a founder or co-founder of my own c...,"I’m not actively looking, but I am open to new...",Less than a year ago,10.0,7.0,8.0,1.0,2.0,5.0,3.0,4.0,9.0,6.0,,,,,,,,,,,,3.0,1.0,4.0,2.0,5.0,5.0,6.0,7.0,2.0,1.0,4.0,3.0,My job status or other personal status changed,,,Monthly,,KES,Slack,One to three months,"Taught yourself a new language, framework, or ...",The official documentation and/or standards fo...,,To build my professional network,Strongly agree,Strongly agree,Neither Agree nor Disagree,JavaScript;Python;HTML;CSS,JavaScript;Python;HTML;CSS,Redis;SQL Server;MySQL;PostgreSQL;Amazon RDS/A...,Redis;SQL Server;MySQL;PostgreSQL;Amazon RDS/A...,AWS;Azure;Linux;Firebase,AWS;Azure;Linux;Firebase,Django;React,Django;React,Komodo;Vim;Visual Studio Code,Linux-based,1.0,Agile;Scrum,Git,Multiple times per day,Yes,No,,Strongly agree,Strongly agree,Strongly agree,Saw an online advertisement and then researche...,1.0,5.0,4.0,7.0,2.0,6.0,3.0,Artificial intelligence surpassing human intel...,Algorithms making important decisions,The developers or the people creating the AI,I'm excited about the possibilities more than ...,No,"Yes, and publicly",Upper management at the company/organization,Yes,10 (Very Likely),Multiple times per day,Yes,I have never participated in Q&A on Stack Over...,"No, I knew that Stack Overflow had a jobs boar...",Yes,,Yes,Extremely interested,Extremely interested,Extremely interested,Extremely interested,Extremely interested,Between 5:00 - 6:00 AM,9 - 12 hours,1 - 2 hours,Never,Standing desk,3 - 4 times per week,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Black or of African descent,25 - 34 years old,Yes,,The survey was an appropriate length,Very easy
1,3,Yes,Yes,United Kingdom,No,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)","A natural science (ex. biology, chemistry, phy...","10,000 or more employees",Database administrator;DevOps specialist;Full-...,30 or more years,18-20 years,Moderately dissatisfied,Neither satisfied nor dissatisfied,Working in a different or more specialized tec...,I am actively looking for a job,More than 4 years ago,1.0,7.0,10.0,8.0,2.0,5.0,4.0,3.0,6.0,9.0,1.0,5.0,3.0,7.0,10.0,4.0,11.0,9.0,6.0,2.0,8.0,3.0,1.0,5.0,2.0,4.0,1.0,3.0,4.0,5.0,2.0,6.0,7.0,I saw an employer’s advertisement,British pounds sterling (£),51000.0,Yearly,70841.0,GBP,Confluence;Office / productivity suite (Micros...,One to three months,"Taught yourself a new language, framework, or ...",The official documentation and/or standards fo...,,,Agree,Agree,Neither Agree nor Disagree,JavaScript;Python;Bash/Shell,Go;Python,Redis;PostgreSQL;Memcached,PostgreSQL,Linux,Linux,Django,React,IPython / Jupyter;Sublime Text;Vim,Linux-based,2.0,,Git;Subversion,A few times per week,Yes,Yes,The website I was visiting asked me to disable it,Somewhat agree,Neither agree nor disagree,Neither agree nor disagree,,3.0,5.0,1.0,4.0,6.0,7.0,2.0,Increasing automation of jobs,Increasing automation of jobs,The developers or the people creating the AI,I'm excited about the possibilities more than ...,Depends on what it is,Depends on what it is,Upper management at the company/organization,Yes,10 (Very Likely),A few times per month or weekly,Yes,A few times per month or weekly,Yes,"No, I have one but it's out of date",7.0,Yes,A little bit interested,A little bit interested,A little bit interested,A little bit interested,A little bit interested,Between 6:01 - 7:00 AM,5 - 8 hours,30 - 59 minutes,Never,Ergonomic keyboard or mouse,Daily or almost every day,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",White or of European descent,35 - 44 years old,Yes,,The survey was an appropriate length,Somewhat easy
2,4,Yes,Yes,United States,No,Employed full-time,Associate degree,"Computer science, computer engineering, or sof...",20 to 99 employees,Engineering manager;Full-stack developer,24-26 years,6-8 years,Moderately satisfied,Moderately satisfied,Working as a founder or co-founder of my own c...,"I’m not actively looking, but I am open to new...",Less than a year ago,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,5,No,No,United States,No,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",100 to 499 employees,Full-stack developer,18-20 years,12-14 years,Neither satisfied nor dissatisfied,Slightly dissatisfied,Working as a founder or co-founder of my own c...,"I’m not actively looking, but I am open to new...",Less than a year ago,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,A recruiter contacted me,U.S. dollars ($),,,,,,Three to six months,Completed an industry certification program (e...,The official documentation and/or standards fo...,,,Disagree,Disagree,Strongly disagree,C#;JavaScript;SQL;TypeScript;HTML;CSS;Bash/Shell,C#;JavaScript;SQL;TypeScript;HTML;CSS;Bash/Shell,"SQL Server;Microsoft Azure (Tables, CosmosDB, ...","SQL Server;Microsoft Azure (Tables, CosmosDB, ...",Azure,Azure,,Angular;.NET Core;React,Visual Studio;Visual Studio Code,Windows,2.0,Agile;Kanban;Scrum,Git,Multiple times per day,Yes,Yes,The ad-blocking software was causing display i...,Neither agree nor disagree,Somewhat agree,Somewhat agree,Stopped going to a website because of their ad...,,,,,,,,Artificial intelligence surpassing human intel...,Artificial intelligence surpassing human intel...,A governmental or other regulatory body,"I don't care about it, or I haven't thought ab...",No,"Yes, but only within the company",Upper management at the company/organization,Yes,10 (Very Likely),A few times per week,Yes,A few times per month or weekly,Yes,"No, I have one but it's out of date",8.0,Yes,Somewhat interested,Somewhat interested,Somewhat interested,Somewhat interested,Somewhat interested,Between 6:01 - 7:00 AM,9 - 12 hours,Less than 30 minutes,3 - 4 times per week,,I don't typically exercise,Male,Straight or heterosexual,Some college/university study without earning ...,White or of European descent,35 - 44 years old,No,No,The survey was an appropriate length,Somewhat easy
4,7,Yes,No,South Africa,"Yes, part-time",Employed full-time,Some college/university study without earning ...,"Computer science, computer engineering, or sof...","10,000 or more employees",Data or business analyst;Desktop or enterprise...,6-8 years,0-2 years,Slightly satisfied,Moderately satisfied,Working in a different or more specialized tec...,"I’m not actively looking, but I am open to new...",Between 1 and 2 years ago,8.0,5.0,7.0,1.0,2.0,6.0,4.0,3.0,10.0,9.0,1.0,10.0,2.0,4.0,8.0,3.0,11.0,7.0,5.0,9.0,6.0,2.0,1.0,4.0,5.0,3.0,7.0,3.0,6.0,2.0,1.0,4.0,5.0,My job status or other personal status changed,South African rands (R),260000.0,Yearly,21426.0,ZAR,"Office / productivity suite (Microsoft Office,...",Three to six months,Taken a part-time in-person course in programm...,The official documentation and/or standards fo...,,,Strongly agree,Agree,Strongly disagree,C;C++;Java;Matlab;R;SQL;Bash/Shell,Assembly;C;C++;Matlab;SQL;Bash/Shell,SQL Server;PostgreSQL;Oracle;IBM Db2,PostgreSQL;Oracle;IBM Db2,Arduino;Windows Desktop or Server,Arduino;Windows Desktop or Server,,,Notepad++;Visual Studio;Visual Studio Code,Windows,2.0,Evidence-based software engineering;Formal sta...,Zip file back-ups,Weekly or a few times per month,No,,,Somewhat agree,Somewhat agree,Somewhat disagree,Clicked on an online advertisement;Saw an onli...,2.0,3.0,4.0,6.0,1.0,7.0,5.0,Algorithms making important decisions,Algorithms making important decisions,The developers or the people creating the AI,I'm excited about the possibilities more than ...,No,"Yes, but only within the company",Upper management at the company/organization,Yes,10 (Very Likely),Daily or almost daily,Yes,Less than once per month or monthly,"No, I knew that Stack Overflow had a jobs boar...","No, I know what it is but I don't have one",,Yes,Extremely interested,Extremely interested,Extremely interested,Extremely interested,Extremely interested,Before 5:00 AM,Over 12 hours,1 - 2 hours,Never,,3 - 4 times per week,Male,Straight or 
heterosexual,Some college/university study without earning ...,White or of European descent,18 - 24 years old,Yes,,The survey was an appropriate length,Somewhat easy


In [10]:
# Print a concise summary of the DataFrame: index range, number of columns,
# dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Columns: 129 entries, Respondent to SurveyEasy
dtypes: float64(41), int64(1), object(87)
memory usage: 64.5+ MB


In [9]:
# df.shape is a (rows, columns) tuple, so it gives the record count directly.
# df.count() is not a row count: it reports the non-NA values per column.
print("Rows in the data :: {}".format(df.shape[0]))
print("Columns in the data :: {}".format(df.shape[1]))

Rows in the data :: 98855
Columns in the data :: 129


#### Function to display missing values and percentage of missing values from the input data  

In [28]:
def missing_values(df):
    """
    Display a per-column summary of missing data for ``df``.

    The summary has one row per column of ``df`` with:
      * column_name        - name of the column
      * missing_values     - number of NA entries in that column
      * percentage_missing - NA entries as a percentage of the row count

    Rows are ordered from the highest to the lowest percentage. The table is
    rendered with IPython's ``display``; nothing is returned.
    """
    report = (
        df.isna()
        .sum()
        .to_frame()
        .reset_index()
        .rename(columns={'index': 'column_name', 0: 'missing_values'})
    )
    report['percentage_missing'] = 100 * report['missing_values'] / df.shape[0]
    display(report.sort_values(by='percentage_missing', ascending=False))


In [30]:
# Show, for every column of the survey data, how many values are missing and
# what percentage of rows that is (highest percentage first).
missing_values(df)

Unnamed: 0,column_name,missing_values,percentage_missing
60,TimeAfterBootcamp,92203,93.270952
126,MilitaryUS,83074,84.036215
61,HackathonReasons,73164,74.011431
118,ErgonomicDevices,64797,65.547519
81,AdBlockerReasons,61110,61.817814
107,StackOverflowJobsRecommend,60538,61.239189
45,JobEmailPriorities3,52642,53.251732
43,JobEmailPriorities1,52642,53.251732
44,JobEmailPriorities2,52642,53.251732
49,JobEmailPriorities7,52642,53.251732


In [32]:
# Map each column name to its dtype and list the pairs as "name::dtype".
l = df.dtypes.to_dict()
print(type(l))
for column, dtype in l.items():
    print(column, dtype, sep="::")

<class 'dict'>
LastNewJob::object
SexualOrientation::object
AIInteresting::object
PlatformDesireNextYear::object
CareerSatisfaction::object
LanguageWorkedWith::object
LanguageDesireNextYear::object
DatabaseWorkedWith::object
HoursComputer::object
Student::object
FormalEducation::object
AdsPriorities4::float64
Hobby::object
AssessBenefits8::float64
JobEmailPriorities5::float64
AssessBenefits6::float64
StackOverflowDevStory::object
EthicsChoice::object
SurveyEasy::object
JobContactPriorities2::float64
AssessJob4::float64
OperatingSystem::object
JobContactPriorities3::float64
EducationParents::object
SurveyTooLong::object
AdsPriorities3::float64
Age::object
Exercise::object
AssessBenefits5::float64
StackOverflowJobs::object
Employment::object
OpenSource::object
AdBlockerDisable::object
HypotheticalTools3::object
HypotheticalTools1::object
StackOverflowParticipate::object
AssessBenefits4::float64
AssessJob9::float64
HypotheticalTools2::object
Methodology::object
AdsPriorities5::float64
AdB

In [18]:
# Sanity check: show the type of `df`.
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [19]:
# Show the column dtypes after asking pandas to infer better dtypes for object columns.
# NOTE: the recorded output shows this raising AttributeError under pandas 0.19.2,
# so a newer pandas is required for this cell.
df.infer_objects().dtypes

AttributeError: 'DataFrame' object has no attribute 'infer_objects'

In [20]:
# Check which pandas version this kernel is running.
pd.__version__

'0.19.2'

In [1]:
# Re-import pandas and check its version again.
# NOTE(review): the execution count In [1] suggests this ran in a fresh kernel,
# presumably after upgrading pandas — confirm.
import pandas as pd
pd.__version__

'0.23.4'