# Analysis of StackOverflow Survey. Part IV 

In this notebook we build a predictive model for job satisfaction. 

In [110]:
# import necessary packages and libraries
import os
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# to render plots in the notebook
%matplotlib inline

import seaborn as sns
# set a theme for seaborn
sns.set_theme()

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.model_selection import (
    KFold,
    train_test_split,
    StratifiedKFold,
)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    r2_score, 
    mean_squared_error,
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

In [105]:
# import local module containing the necessary functions
import utils_functions as uf

# forces the interpreter to re-import the module
import importlib
importlib.reload(uf);

## State the question
This notebook addresses the third question: what can we tell about the job satisfaction of a data coder, and which factors influence it? We also predict the job satisfaction of a developer who works with big data. 

This is a classification question, we are predicting a satisfaction level for a data developer, which includes: data scientist or machine learning specialist, data or business analyst and data engineer.

## Performance metrics - to review at the end

The following performance measures will be used in this project:
1. Cross validation via StratifiedKFold with 10 folds.
2. Confusion matrix, in particular precision, recall and F1 score.
3. The ROC curve and the related AUC score.

## Gather the data

Load the data and keep the subset that contains the developers who work in data science related fields.

In [47]:
# base directory: the notebook's current working directory
# (kept as a plain string because a later cell concatenates onto it)
mypath = os.getcwd()

# read the survey file into a pandas dataframe; os.path.join builds the
# path with the separator of the current platform
df1 = pd.read_csv(os.path.join(mypath, 'data', 'survey20_updated.csv'))

# check the uploaded data
df1.shape

(64461, 25)

In [48]:
# keep only the respondents whose DevClass is 'data_coder'
df1 = df1.loc[df1['DevClass'] == 'data_coder']

# size of the filtered data
df1.shape

(8726, 25)

In [49]:
# columns used in this analysis (JobSat is the target, the rest are predictors)
list_cols = [
    'MainBranch',
    'ConvertedComp',
    'EdLevel',
    'Employment',
    'JobSat',
    'EdImpt',
    'Learn',
    'Overtime',
    'OpSys',
    'OrgSize',
    'UndergradMajor',
    'WorkWeekHrs',
]

In [50]:
# keep only the listed columns; .copy() makes df1 an independent frame so the
# column assignments in later cells do not operate on a slice of the original
# data (which triggers pandas' SettingWithCopyWarning)
df1 = df1[list_cols].copy()
df1.shape

(8726, 12)

In [51]:
# renumber the rows 0..n-1, discarding the old index
df1 = df1.reset_index(drop=True)

In [52]:
# summarise the column dtypes and non-null counts to see where values are missing
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8726 entries, 0 to 8725
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MainBranch      8699 non-null   object 
 1   ConvertedComp   5810 non-null   float64
 2   EdLevel         8581 non-null   object 
 3   Employment      8726 non-null   object 
 4   JobSat          8726 non-null   int64  
 5   EdImpt          8206 non-null   object 
 6   Learn           7979 non-null   object 
 7   Overtime        7424 non-null   object 
 8   OpSys           8120 non-null   object 
 9   OrgSize         7590 non-null   object 
 10  UndergradMajor  8053 non-null   object 
 11  WorkWeekHrs     6995 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 818.2+ KB


## Data profiling

In [53]:
# run this once to generate a profiling report and save it as html file

#import pandas_profiling
#profile = pandas_profiling.ProfileReport(df1, minimal=False)
#profile.to_file(output_file="data_coders_report.html")

## Data preprocessing 

### Remove duplicates

In [54]:
# remove fully duplicated rows, keeping the first occurrence (the pandas defaults)
df1 = df1.drop_duplicates()
df1.shape

(8553, 12)

### Create bins for the WorkWeekHrs column

In [55]:
# labels for the weekly-hours ranges
cut_labels = ['<10', '10-20', '20-30', '30-40', '40-50', '>50']

# bin edges; the open-ended top edge keeps the edges strictly increasing
# whatever the largest reported value is (using the column maximum fails
# when that maximum is not above 50)
cut_bins = [0, 10, 20, 30, 40, 50, np.inf]

# create a new column which contains the new labels;
# include_lowest=True puts a reported value of exactly 0 into the first bin
# instead of silently turning it into a missing value
df1['WorkWeek_Bins'] = pd.cut(
    df1['WorkWeekHrs'],
    bins=cut_bins,
    labels=cut_labels,
    include_lowest=True,
)

# check for success
df1['WorkWeek_Bins'].value_counts()

30-40    3842
40-50    1836
>50       610
<10       284
20-30     276
10-20     140
Name: WorkWeek_Bins, dtype: int64

In [56]:
# count the missing values in the new column
df1['WorkWeek_Bins'].isna().sum()

1565

In [57]:
# change the type of the newly created column
df1 = df1.astype({'WorkWeek_Bins': 'object'})

In [58]:
# drop the WorkWeekHrs column
df1 = df1.drop(columns='WorkWeekHrs')

### Create bins for the ConvertedComp column

In [59]:
# we could use quantiles, however I prefer custom bins here
cut_labels = ['<10K', '10K-30K', '30K-50K', '50K-100K', '100K-200K', '>200K']

# bin edges; the open-ended top edge keeps the edges strictly increasing
# even if no compensation in the data exceeds 200K
cut_bins = [0, 10000, 30000, 50000, 100000, 200000, np.inf]

# create a new column which contains the new labels;
# include_lowest=True keeps a compensation of exactly 0 in the first bin
# instead of silently turning it into a missing value
df1['Comp_Bins'] = pd.cut(
    df1['ConvertedComp'],
    bins=cut_bins,
    labels=cut_labels,
    include_lowest=True,
)

# change the type of the newly created column
df1['Comp_Bins'] = df1['Comp_Bins'].astype('object')

# drop the ConvertedComp column, now replaced by Comp_Bins
df1.drop(columns = 'ConvertedComp', inplace=True);

## Create features and target

Create a dataframe (X) with the features and a pandas series (y) that contains the labels.

In [64]:
# work on an independent (deep) copy of the pre-processed dataframe
df2 = df1.copy(deep=True)

In [65]:
# labels: the job satisfaction column
y = df2['JobSat']

# predictors: every remaining column
X = df2.drop('JobSat', axis=1)

# check for success
X.info(), len(y)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8553 entries, 0 to 8725
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   MainBranch      8526 non-null   object
 1   EdLevel         8410 non-null   object
 2   Employment      8553 non-null   object
 3   EdImpt          8144 non-null   object
 4   Learn           7818 non-null   object
 5   Overtime        7417 non-null   object
 6   OpSys           7954 non-null   object
 7   OrgSize         7580 non-null   object
 8   UndergradMajor  7905 non-null   object
 9   WorkWeek_Bins   6988 non-null   object
 10  Comp_Bins       5783 non-null   object
dtypes: object(11)
memory usage: 801.8+ KB


(None, 8553)

## Create dummies for the dataframe of predictors X

In [75]:
# create dummies for all the columns in the dataframe; float dtype so that
# the dummy columns can hold NaN
X_dumm = pd.get_dummies(X, dtype=float)

# get_dummies encodes a missing category as a row of zeros, which would leave
# nothing for the imputer to fill in. Put the NaN back into every dummy column
# derived from a feature whose original value was missing.
for col in X.columns:
    missing = X[col].isna()
    dummy_cols = [c for c in X_dumm.columns if c.startswith(f"{col}_")]
    if dummy_cols and missing.any():
        X_dumm.loc[missing, dummy_cols] = np.nan

# check for success
X_dumm.shape

(8553, 69)

## Sample data

We will use $30\%$ of the data for testing:

In [79]:
# hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_dumm,
    y,
    test_size=0.3,
    random_state=42,
)

# sizes of the resulting sets
X_train.shape, len(y_train), X_test.shape, len(y_test)

((5987, 69), 5987, (2566, 69), 2566)

## Impute missing values

Now that we have test and train data, we can impute missing values on the training set, and use the trained imputer to fill in the test dataset. I will use the KNN imputer from sklearn.

In [85]:
# create an instance of the imputer
imputer = KNNImputer(n_neighbors=5)

# fit the imputer on the training set only and fill in its missing values;
# the original index is kept so the rows stay aligned with y_train
X_train_trans = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# fill in the test set with the imputer fitted on the training data
X_test_trans = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# check for success
X_train_trans.isna().any()

MainBranch_I am a developer by profession                                                   False
MainBranch_I am a student who is learning to code                                           False
MainBranch_I am not primarily a developer, but I write code sometimes as part of my work    False
MainBranch_I code primarily as a hobby                                                      False
MainBranch_I used to be a developer by profession, but no longer am                         False
                                                                                            ...  
Comp_Bins_10K-30K                                                                           False
Comp_Bins_30K-50K                                                                           False
Comp_Bins_50K-100K                                                                          False
Comp_Bins_<10K                                                                              False
Comp_Bins_>200K     

## Refactor code


In [106]:
# load the raw survey file again
df = pd.read_csv(f"{mypath}/data/survey20_updated.csv")

# run the helpers from utils_functions: preprocess the raw frame, then build
# the train/test features and labels for the 'JobSat' target
preproc_df = uf.preprocess_data(df)
X_train, y_train, X_test, y_test = uf.process_data(preproc_df, 'JobSat')

## Baseline model

In [111]:
# build a k-nearest-neighbours classifier with default settings and fit it
# on the training data (fit returns the estimator itself)
knn_clf = KNeighborsClassifier().fit(X_train, y_train)

# display the fitted estimator
knn_clf

KNeighborsClassifier()

In [112]:
# predicted satisfaction labels for the test set
ypred = knn_clf.predict(X_test)

In [114]:
# evaluate the baseline predictions on the test set
# (confusion_matrix, classification_report and accuracy_score come from the
# import cell at the top of the notebook)

# rows are the true classes, columns the predicted classes
result = confusion_matrix(y_test, ypred)
print('Confusion Matrix:')
print(result)

# per-class precision, recall and F1 score
result1 = classification_report(y_test, ypred)
print('Classification Report:')
print(result1)

# overall fraction of correct predictions
result2 = accuracy_score(y_test, ypred)
print('Accuracy:', result2)

Confusion Matrix:
[[262   3   1   2   6   5]
 [  5  17  26  11  65  58]
 [ 11  17  62  42 130  94]
 [ 16  20  34  32  98  87]
 [ 21  41  94  58 246 209]
 [ 39  35 103  67 285 264]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.94      0.83       279
           1       0.13      0.09      0.11       182
           2       0.19      0.17      0.18       356
           3       0.15      0.11      0.13       287
           4       0.30      0.37      0.33       669
           5       0.37      0.33      0.35       793

    accuracy                           0.34      2566
   macro avg       0.31      0.34      0.32      2566
weighted avg       0.32      0.34      0.33      2566

Accuracy: 0.3441153546375682


## Several other algorithms 

In [115]:
# stack the train and test parts back together (row-wise) for cross-validation
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

In [120]:
from sklearn import model_selection
from sklearn.linear_model import (LogisticRegression)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import (KNeighborsClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier)
import xgboost as xgb
from sklearn.metrics import classification_report

In [None]:
# Compare several classifiers with 10-fold stratified cross-validation.
# Each entry is (estimator class, constructor arguments). SVC needs
# probability=True because the AUC scorer below calls predict_proba;
# note that this makes the SVC fit noticeably slower.
model_specs = [
    (DecisionTreeClassifier, {}),
    (KNeighborsClassifier, {}),
    (GaussianNB, {}),
    (SVC, {"probability": True}),
    (RandomForestClassifier, {}),
    (xgb.XGBClassifier, {}),
]

# stratified, shuffled folds keep every satisfaction level present in each
# fold, which the multiclass AUC requires; the seed makes the folds repeatable
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for model, params in model_specs:
    cls = model(**params)
    # the target has more than two classes, so the plain "roc_auc" scorer is
    # not applicable; "roc_auc_ovr" averages the one-vs-rest AUC over classes
    s = model_selection.cross_val_score(cls, X, y, scoring="roc_auc_ovr", cv=kfold)
    print(f"{model.__name__:22}  AUC:" f"{s.mean():.3f} STD: {s.std():.2f}")

In [None]:
# report for the baseline KNN predictions, which are stored in `ypred`;
# the class values themselves are used as row labels
print(classification_report(y_test, ypred))

In [None]:
# cross-validate the XGBoost classifier on its own
cls = xgb.XGBClassifier()

# stratified, shuffled folds keep every satisfaction level present in each fold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# the target has more than two classes, so use the one-vs-rest AUC scorer
s = model_selection.cross_val_score(cls, X, y, scoring="roc_auc_ovr", cv=kfold)

# take the name from the estimator itself rather than from a loop variable
# left over from another cell
print(f"{type(cls).__name__:22}  AUC:" f"{s.mean():.3f} STD: {s.std():.2f}")

In [None]:
# Side check: 5-fold cross-validated RMSE of an XGBoost regressor on the
# scikit-learn diabetes dataset.
# NOTE: this cell rebinds X, y, X_train, X_test, y_train, y_test and y_pred,
# so the survey data held in those names is replaced once it runs.
diabetes = load_diabetes()

X = diabetes.data
y = diabetes.target

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# mean squared error of each fold
scores = []

for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # "reg:squarederror" is the current name of the deprecated "reg:linear"
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)

    scores.append(mean_squared_error(y_test, y_pred))

# report the per-fold RMSE together with its mean and spread
rmse_scores = np.sqrt(scores)
print(f"Scores: {rmse_scores}\nMean: {rmse_scores.mean():.3f}\nStd: {rmse_scores.std():.3f}")