# Analysis of StackOverflow Survey. Part IV

In this notebook we address the third question, and we build a model to predict job satisfaction for data coders.

The steps of the process are: all steps with substeps
1. 
2. 

In [276]:
# general packages and libraries
import os
import sys
from collections import defaultdict
import importlib

In [277]:
# data manipulation packages
import numpy as np
import pandas as pd

In [278]:
# data visualizations packages
import matplotlib.pyplot as plt
# to render plots in the notebook
%matplotlib inline

import seaborn as sns
# set a theme for seaborn
sns.set_theme()

In [279]:
# numerical, statistical and machine learning packages and libraries
import xgboost as xgb
from scipy import stats

from sklearn import (
    ensemble,
    tree,
)
from sklearn.base import (
    BaseEstimator, 
    TransformerMixin,
)
from sklearn.pipeline import (
    make_pipeline,
    FeatureUnion, 
    Pipeline,
)
from sklearn.feature_selection import (
    SelectKBest, 
    chi2, 
    mutual_info_classif,
)
from sklearn.impute import (
    KNNImputer,
    SimpleImputer,
)
from sklearn.preprocessing import (
    OneHotEncoder, 
    OrdinalEncoder, 
    LabelEncoder,
    StandardScaler,
    MultiLabelBinarizer,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    KFold,
    cross_val_score,
)

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier)

from sklearn.linear_model import (
    SGDClassifier,
    LogisticRegression,
) 
from sklearn.metrics import (
    classification_report,
    r2_score, 
    mean_squared_error,
    auc,
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.metrics import log_loss, roc_auc_score, precision_score, recall_score, confusion_matrix


In [381]:
# import local modules 
import utils_functions as uf 
#import utils_classes as uc
import local_maps as lm

# forces the interpreter to re-load the module
importlib.reload(uf);

# create a path string
mypath = os.getcwd()

## Formulate the questions

We separate the respondents of the 2020 StackOverflow Developer Survey into data developers
(data scientist or machine learning specialist, data or business analyst, data engineer) and other developers. In what follows we restrict the dataset to the data developers and address the following questions:  
 - What can we tell about the job satisfaction of a data developer? 
 - What factors do influence the job satisfaction? 
 
We build a predictive model for the job satisfaction for data developers. This is a multi-class classification question, where the satisfaction levels are: very dissatisfied, slightly dissatisfied, neither satisfied nor dissatisfied, slightly satisfied, very satisfied.

## Performance metrics - to review at the end

The following performance measures will be used in this project:
1. Cross validation via StratifiedKFold with 10 folds.
2. Confusion matrix, in particular precision, recall and F1 score.
3. The ROC curve and the related AUC score.

# Gather and prepare the data

Upload the data and keep the subset that contains those developers that work in data science related fields. 


## Load the data

In [382]:
# upload the datafile as pandas dataframe
df = pd.read_csv(mypath+'/data/survey20_updated.csv', index_col=[0])
# check for success
df.shape

(64461, 61)

## Remove unnecessary data

In [383]:
# create a copy of the data
df1 = df.copy()

### Keep the developers that work with data

In [384]:
# use the auxiliary column to retain the data developers only
#df1 = df1[df1['DevClass']== 'data_coder']

df1['DevType'] = df1['DevType'].str.split(';')
# transform each element of a list-like to a row, replicating index values
df1 = df1.explode('DevType')

# check the outcome
df1.shape

(172185, 61)

In [385]:
# retain only those rows that contain data coders
df1 = df1.loc[df1.DevType.str.contains('Data ', na=False)]
df1.shape

(11750, 61)

### Retain the developers that are employed

In [386]:
# check the employment types for data coders
df1.Employment.value_counts()

Employed full-time                                      9236
Independent contractor, freelancer, or self-employed    1481
Not employed, but looking for work                       564
Employed part-time                                       469
Name: Employment, dtype: int64

In [387]:
# retain only the employed data developers
df1 = df1[df1['Employment'] != 'Not employed, but looking for work']

# check for success
df1.Employment.value_counts()

Employed full-time                                      9236
Independent contractor, freelancer, or self-employed    1481
Employed part-time                                       469
Name: Employment, dtype: int64

### Retain only the respondents that code professionally

In [388]:
# check the professional status of the employed developers
df1.MainBranch.value_counts()

I am a developer by profession                                                   8207
I am not primarily a developer, but I write code sometimes as part of my work    2275
I am a student who is learning to code                                            296
I used to be a developer by profession, but no longer am                          203
I code primarily as a hobby                                                       163
Name: MainBranch, dtype: int64

In [389]:
# create a list of main branch choices
main_choices = df1.MainBranch.value_counts().index.to_list()
# retain those rows where MainBranch contains the respondents that work professionally with data
df1 = df1[df1.MainBranch.isin(main_choices[:2])]

# check the outcome
df1.MainBranch.value_counts()

I am a developer by profession                                                   8207
I am not primarily a developer, but I write code sometimes as part of my work    2275
Name: MainBranch, dtype: int64

### Drop the rows with missing values in JobSat column

In [390]:
# drop rows with missing JobSat
df1.dropna(subset=['JobSat'], inplace=True)

### Remove irrelevant columns

In [391]:
# list of columns to be removed
cols_del = [
    # personal, demographics  information
    #'Respondent', 
    'MainBranch', 'Employment', 'Hobbyist', 
    'Country','Ethnicity', 'Age',
    'Gender', 'Sexuality', 'Trans', 
    
    # related to ConvertedComp
    'CompFreq', 'CompTotal', 'CurrencyDesc', 'CurrencySymbol',
    
    # questions regarding future activities
    'DatabaseDesireNextYear', 'MiscTechDesireNextYear',
    'CollabToolsDesireNextYear', 'PlatformDesireNextYear',
    'LanguageDesireNextYear', 'WebframeDesireNextYear',
    
    # questions regarding this survey
    'SurveyEase', 'SurveyLength', 'WelcomeChange',
    
    # question regarding participation is StackOverflow
    'SOSites', 'SOComm', 'SOPartFreq',
    'SOVisitFreq', 'SOAccount',

    # columns related to other columns
    'Age1stCode', 'YearsCodePro', 'DevType', 

    # high cardinality, multiple choices columns, add noise 
    'MiscTechWorkedWith', 'DatabaseWorkedWith', #'CollabToolsWorkedWith',
    'WebframeWorkedWith', 'LanguageWorkedWith',

    # questions not relevant to our goal
    #'JobHunt',
    'JobHuntResearch', 'Stuck',
    'PurchaseResearch', 
    #'PurchaseWhat', 
    'Stuck', 'PurpleLink',
    'OffTopic', 'OtherComms',
    'JobFactors', 
    #'JobSeek',

    # auxiliary columns
    'DevClass']

In [392]:
# drop all the columns in the list
df1.drop(columns=cols_del, inplace=True)

# check the output
df1.shape

(10372, 19)

#### Comments on feature selection
Since we are left with 16 features only, I will not perform additional feature selection. After experimenting with a couple of feature selection options, such as mutual_info_classif and SelectKBest, I observed that the model did not performe substantially better after applying feature selection.

## Preprocess data

### Replace JobSat categories with numerical code and drop missing values

In [393]:
# encoding map for job satisfaction
JobSat_dict =  {'Very dissatisfied': 1, 'Slightly dissatisfied': 2,
               'Neither satisfied nor dissatisfied': 3, 
               'Slightly satisfied': 4, 'Very satisfied': 5}

In [394]:
# use the custom labelling 
df1['JobSat'] = df1['JobSat'].map(JobSat_dict)
# check the outcome
df1['JobSat'].value_counts()

5    3582
4    3079
2    1623
3    1242
1     846
Name: JobSat, dtype: int64

### Update YearsCode column entries and dtype

In [395]:
# replace strings with numerical entries
replace_dict = {'Less than 1 year': '0', 'More than 50 years': '51'}
df1.replace(replace_dict, inplace=True)

# change dtype to numeric
df1['YearsCode'] = pd.to_numeric(df1['YearsCode'])

### Pre-process the multi levels columns

In [396]:
# replace the list of entries with sets, missing values with empy set
df1['PlatformWorkedWith'] = df1['PlatformWorkedWith'].str.split(';').apply(lambda x: {} if 
                                                                           x is np.nan else set(x))

In [397]:
df1['CollabToolsWorkedWith'] = df1['CollabToolsWorkedWith'].str.split(';').apply(lambda x: {} if 
                                                                                 x is np.nan else set(x))

### Save a copy of the data to file

In [398]:
# save a copy of the preprocessed dataframe
df1.to_csv(mypath + '/data/survey20_preprocessedex.csv')

In [399]:
# get the pre-processed data
dfp = pd.read_csv(mypath+'/data/survey20_preprocessedex.csv', index_col=[0])

## Refactor code

Rewrite all of the steps for data pre-processing in a single function.

In [400]:
# create a fresh copy of the dataset
dft=df.copy()

# all data cleaning and preprocessing steps
dft = uf.remove_clean_data(dft)

# check the outcome
dft.shape

KeyError: 'CollabToolsWorkedWith'

### Review data types and data distribution in columns

In [369]:
# the list of numerical columns
num_cols = df1.select_dtypes(include='float64').columns.to_list()
print(num_cols)

['ConvertedComp', 'WorkWeekHrs', 'YearsCode']


In [368]:
# the list of discrete columns with many levels 
multi_cols = ['PlatformWorkedWith', 'CollabToolsWorkedWith']

In [370]:
# the list of discrete columns with several levels
cat_cols = df1.select_dtypes(include='object').columns.to_list()
uni_cols = list(set(cat_cols) - set(multi_cols))
print(uni_cols)

['Overtime', 'OpSys', 'DevOps', 'OrgSize', 'UndergradMajor', 'DevOpsImpt', 'EdImpt', 'OnboardGood', 'EdLevel', 'Learn']


In [None]:
# for each categorical column, print possible row values and their counts
def list_answers(df, cat_cols):
    for col in cat_cols:
        print(col)
        print(' ')
        print(df1[col].value_counts())
        print(' ')
# print counts and values
# list_answers(df1, cat_cols)

## Sample data, create features and target datasets

Create a dataframe X of features and a pandas series y that contains the labels.

In [401]:
# create a copy of the pre-processed dataframe
df2 = df1.copy()

In [402]:
# create the predictors dataframe
X = df2.drop(columns = 'JobSat')

# create the labels
y = df2['JobSat']

# check for success
X.shape, len(y)

((10372, 18), 10372)

### Isolate a test set

In [403]:
# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# summarize the data
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (7260, 18) (7260,)
Test (3112, 18) (3112,)


## Encode the discrete variables

### Encode the columns with many levels

After data cleaning and pre-processing the columns with many levels are:
 - multi_cols = ['PlatformWorkedWith']

In [305]:
# create an instance of the encoder
mlb = MultiLabelBinarizer()

# fit the binarizer and encode the selected column
mlb_model = mlb.fit(X_train['PlatformWorkedWith'])
temp_col =  mlb.transform(X_train['PlatformWorkedWith'])

In [306]:
# put the outcome in pandas dataframe form
temp_df = pd.DataFrame(temp_col, columns=mlb.classes_, index=X_train.index)
# check the outcome
temp_df.head(2)

Unnamed: 0_level_0,AWS,Android,Arduino,Docker,Google Cloud Platform,Heroku,IBM Cloud or Watson,Kubernetes,Linux,MacOS,Microsoft Azure,Raspberry Pi,Slack Apps and Integrations,Windows,WordPress,iOS
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
43724,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
32280,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0


In [307]:
# list the three most popular platforms
platform_keep = list(temp_df.sum().sort_values(ascending=False).head(3).index)

In [308]:
# combine the two dataframes and drop the initial column
X_train = pd.concat([X_train, temp_df[platform_keep]], axis=1).drop(columns = ['PlatformWorkedWith'])

# check the outcome
X_train.shape

(7260, 16)

In [309]:
# apply the same transformations to the test set
temp_col_test =  mlb.transform(X_test['PlatformWorkedWith'])
# put the outcome in pandas dataframe form
temp_df_test = pd.DataFrame(temp_col_test, columns=mlb.classes_, index=X_test.index)
# combine the two dataframes and drop the initial column
X_test = pd.concat([X_test, temp_df_test[platform_keep]], axis=1).drop(columns = ['PlatformWorkedWith'])

# check the outcome
X_test.shape

(3112, 16)

#### Comments:

There are several options to choose from when encoded the columns with high cardinality, that originate from multiple answers questions. If we use MultiLabelBinarizer, a column such PlatformWorkedWith will create 16 new columns, which doubles the number of features in the dataframe. In order to address this column explosion, we droped all the new columns but the 3 that correspond to the most popular choices. 

In [None]:
class ParseMultiColumns(BaseEstimator, TransformerMixin):
    """Custom transformer that that changes a list of strings to a set in a column of a dataframe, and assigns the empty set to missing entries.
    """
    #class constructor method 
    def __init__(self, multi_cols=['PlatformWorkedWith']):
            self.multi_cols = multi_cols
            
    # return self nothing else to do here
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for col in self.multi_cols:
            X[col] = X[col].str.split(';').apply(lambda x: {} if x is np.nan else set(x))
        return X


In [None]:
class MultiColumnsEncoder(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer to convert a feature column of a list in 
    to multiple binary feature columns"""
    def __init__(self, feature_names=None):
            self.feature_names = feature_names

    def fit(self, X, y=None):
        self.encoder_dict_ = {}
        
        for col in self.feature_names:
            mlb = MultiLabelBinarizer()
            mlb.fit(X[col])
            self.encoder_dict_[col] = mlb
        return self

    def transform(self, X):
        for col in self.feature_names:
            col_encoded = pd.DataFrame(
                self.encoder_dict_[col].transform(X[col]),
                columns=self.encoder_dict_[col].classes_,
                index=X.index)
            cols_keep = list(col_encoded.sum().sort_values(ascending=False).head(3).index)

            X = pd.concat([X, col_encoded[cols_keep]], axis=1).drop(columns=[col])

        return X


In [None]:
multi_cols

In [None]:
multi_encoder = MultiColumnsEncoder(feature_names=multi_cols)

In [None]:
df_tmp = multi_encoder.fit_transform(X_train[multi_cols])

In [None]:
df_tmp

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    The constructor extracts and returns the pandas dataset 
    with only those columns whose names were passed to it 
    as an argument during its initialization. 
    It contains two methods: fit and transform.
    """
    
    # class constructor 
    def __init__(self, feature_names):
        self._feature_names = feature_names 
    
    # return self nothing else to do here    
    def fit(self, X, y = None):
        return self 
    
    # method that describes what we need this transformer to do
    def transform(self, X, y = None):
        return X[ self._feature_names ] 

In [None]:
fsel = FeatureSelector(multi_cols)

In [None]:
fsel.fit_transform(X_train)

In [None]:
X_train.columns

In [None]:
X_train.columns

## Impute the missing values

In [None]:
# take the numerical columns in the train dataset
X_train_num = X_train[num_cols]
# create an instance of the KNN imputer
num_imputer = KNNImputer(n_neighbors=5)
# fit_transform the imputer on the training set
X_train_num_imp = pd.DataFrame(num_imputer.fit_transform(X_train_num), 
                               columns=X_train_num.columns)
# separate the numerical columns in the test set
X_test_num = X_test[num_cols]
# transform the test set with the imputer that was fit on the training set
X_test_num_imp = pd.DataFrame(num_imputer.transform(X_test_num), columns=X_test_num.columns)

In [None]:
# create an instance of the scaler
scaler = StandardScaler()

# scale the numerical variables, fit and transform on the straining set
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_num_imp), 
                                columns=X_train_num_imp.columns)
# use the scaler fit on training set to transform the test set
X_test_scaled = pd.DataFrame(scaler.transform(X_test_num_imp), columns=X_test_num_imp.columns)
     

In [None]:
# separate all the categorical columns in the training set
X_train_cat = X_train[cat_cols]
# create an instance of the imputer
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
# fit and transform the training data
X_train_cat_imp = pd.DataFrame(cat_imputer.fit_transform(X_train_cat), 
                               columns=X_train_cat.columns)
# separate the categorical columns in the test set
X_test_cat = X_test[cat_cols]
# transform the test data with the imputer fit on the training set
X_test_cat_imp=pd.DataFrame(cat_imputer.transform(X_test_cat), columns=X_test_cat.columns)

## Encode the predictors

In [None]:
# encode the low cardinality columns
def ord_encode_predictors(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = pd.DataFrame(oe.transform(X_train))
    X_test_enc = pd.DataFrame(oe.transform(X_test))
    return X_train_enc, X_test_enc

In [None]:
# the low cardinality encoded features
X_train_uni_enc, X_test_uni_enc = ord_encode_predictors(X_train_cat_imp[uni_cols],
                                                        X_test_cat_imp[uni_cols])

In [None]:
# encode the high cardinality columns
def encode_predictors(X_train, X_test):
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    enc.fit(X_train)
    X_train_enc = pd.DataFrame(enc.transform(X_train))
    X_test_enc = pd.DataFrame(enc.transform(X_test))
    return X_train_enc, X_test_enc

In [None]:
# the high cardinality encoded features
X_train_multi_enc, X_test_multi_enc = encode_predictors(X_train_cat_imp[multi_cols], 
                                                        X_test_cat_imp[multi_cols])

In [None]:
# encode the target variable 
def encode_target(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

In [None]:
# encode the target variable - not necessary
# y_train_enc, y_test_enc = encode_targets(y_train,y_test)

In [None]:
# combine the  X frames 
X_train_cat_enc = pd.concat([X_train_multi_enc, X_train_uni_enc], axis=1)
X_train_prep = pd.concat([X_train_cat_enc, X_train_scaled], axis=1)

X_test_cat_enc = pd.concat([X_test_multi_enc, X_test_uni_enc], axis=1)
X_test_prep = pd.concat([X_test_cat_enc, X_test_scaled], axis=1)


### Create a profiling report

In [None]:
# run this once to generate a profiling report and save it as html file

#import pandas_profiling
#profile = pandas_profiling.ProfileReport(X_train, minimal=False)
#profile.to_file(output_file="data_train_report.html")

In [None]:
uni_cols

## Refactor the code: build processing data pipeline

In [404]:
## refactor code: processing data

# the steps in the categorical pipeline for columns of low cardinality
uni_cat_pipeline = Pipeline( steps = [( 'unicat_selector', FeatureSelector(uni_cols) ),
                                  ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                  ( 'ordinal_encoder', OrdinalEncoder() ) ] )

# the steps in the categorical pipeline for columns of high cardinality
multi_cat_pipeline = Pipeline( steps = [( 'multicat_selector', FeatureSelector(multi_cols) ),
                                  ( 'multi_encoder', MultiColumnsEncoder(multi_cols) ) ] )

# the steps in the numerical pipeline     
num_pipeline = Pipeline( steps = [ ('num_selector', FeatureSelector(num_cols) ),
                                  ('imputer', KNNImputer(n_neighbors=5) ),
                                  ( 'std_scaler', StandardScaler() ) ] )

# combine the numerical and the categorical pipelines
full_pipeline = FeatureUnion( transformer_list = [ ( 'unicat_pipeline', uni_cat_pipeline ), 
                                                  ( 'multicat_pipeline', multi_cat_pipeline ) ,
                                                 ( 'numerical_pipeline', num_pipeline )] )

# Baseline model

In [405]:
# the full pipeline as a step in another pipeline with an estimator as the final step
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  ( 'model', KNeighborsClassifier(n_neighbors=5) ) ] )

# call fit on it just like any other pipeline
full_pipeline_m.fit( X_train, y_train )

# predict with it like any other pipeline
y_pred = full_pipeline_m.predict( X_test ) 

In [406]:
# print evaluation metrics and results

result1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred)  
print('Accuracy: %.3f' %result3)
         

Confusion Matrix:
[[ 51  35  14  87  71]
 [ 36 128  46 153 123]
 [ 24  79  65 105 110]
 [ 56 160  73 382 250]
 [ 72 169  82 292 449]]

Classification Report:
              precision    recall  f1-score   support

           1       0.21      0.20      0.21       258
           2       0.22      0.26      0.24       486
           3       0.23      0.17      0.20       383
           4       0.37      0.41      0.39       921
           5       0.45      0.42      0.43      1064

    accuracy                           0.35      3112
   macro avg       0.30      0.29      0.29      3112
weighted avg       0.35      0.35      0.34      3112

Accuracy: 0.345


In [None]:
full_pipeline_m.classes_

In [None]:
def get_performance(model, X, ground_y):
    """Calculate some importance metrics for model evaluation: roc_auc_ovr, accuracy, precision_macro, recall_macro,
    confusion matrix"""
    ground_y = np.squeeze(ground_y)

    predict_y = model.predict(X)
    predict_y_proba = model.predict_proba(X)

    roc_auc_score_perf = roc_auc_score(ground_y, predict_y_proba, average='macro', multi_class='ovr')  # ROC-AUC
    #logLoss_perf = log_loss(ground_y, predict_y_proba)

    accuracy_perf = (predict_y == ground_y).sum() / len(predict_y)
    precision_score_perf = precision_score(ground_y, predict_y, average='macro')
    recall_score_perf = recall_score(ground_y, predict_y, average='macro')

    # Confusion matrix:
    # print("Confusion matrix [[TN, FP]\n[FN, TP]]:\n", confusion_matrix(ground_y, predict_y))
    conf_m = confusion_matrix(ground_y, predict_y)

    return roc_auc_score_perf, accuracy_perf, precision_score_perf, recall_score_perf, conf_m

In [None]:
roc_auc_score_perf_train, \
accuracy_perf_train, precision_score_perf_train, recall_score_perf_train, \
conf_m_train = get_performance(full_pipeline_m, X_train, y_train)

train_performance = pd.Series([roc_auc_score_perf_trai, \
                               accuracy_perf_train, 
                               precision_score_perf_train, recall_score_perf_train], 
                              index=['roc-auc_macro', 'accuracy',
                                     'precison_macro', 'recall_macro'])


roc_auc_score_perf_test, \
accuracy_perf_test, precision_score_perf_test, recall_score_perf_test, \
conf_m_test = get_performance(full_pipeline_m, X_test, y_test)

test_performance = pd.Series([roc_auc_score_perf_test, \
                               accuracy_perf_test, 
                               precision_score_perf_test, recall_score_perf_test], index=['roc-auc_macro', 'accuracy', 'precison_macro', 'recall_macro'])



performance_check = pd.DataFrame.from_dict({'train': train_performance, 'test': test_performance})
performance_check

In [425]:
# the full pipeline as a step in another pipeline with an estimator as the final step
full_pipeline_xgb = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  ( 'model', xgb.XGBClassifier(objective = 'multi:softmax' )) ] )

# call fit on it just like any other pipeline
full_pipeline_xgb.fit( X_train, y_train )

# predict with it like any other pipeline
y_pred = full_pipeline_xgb.predict( X_test ) 

In [426]:
# print evaluation metrics and results

result1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred)  
print('Accuracy: %.3f' %result3)

Confusion Matrix:
[[ 16  12   2  90 138]
 [  2  34   4 272 174]
 [  2  12  11 198 160]
 [  2  14   4 442 459]
 [  1   7   5 260 791]]

Classification Report:
              precision    recall  f1-score   support

           1       0.70      0.06      0.11       258
           2       0.43      0.07      0.12       486
           3       0.42      0.03      0.05       383
           4       0.35      0.48      0.40       921
           5       0.46      0.74      0.57      1064

    accuracy                           0.42      3112
   macro avg       0.47      0.28      0.25      3112
weighted avg       0.44      0.42      0.35      3112

Accuracy: 0.416


In [424]:
def get_performance(model, X, ground_y):
    """Calculate some importance metrics for model evaluation: roc_auc_ovr, accuracy, precision_macro, recall_macro,
    confusion matrix"""
    ground_y = np.squeeze(ground_y)

    predict_y = model.predict(X)
    predict_y_proba = model.predict_proba(X)

    roc_auc_score_perf = roc_auc_score(ground_y, predict_y_proba, average='macro', multi_class='ovr')  # ROC-AUC
    #logLoss_perf = log_loss(ground_y, predict_y_proba)

    accuracy_perf = (predict_y == ground_y).sum() / len(predict_y)
    precision_score_perf = precision_score(ground_y, predict_y, average='macro')
    recall_score_perf = recall_score(ground_y, predict_y, average='macro')

    # Confusion matrix:
    # print("Confusion matrix [[TN, FP]\n[FN, TP]]:\n", confusion_matrix(ground_y, predict_y))
    conf_m = confusion_matrix(ground_y, predict_y)

    return roc_auc_score_perf, accuracy_perf, precision_score_perf, recall_score_perf, conf_m

In [431]:
roc_auc_score_perf_train,\
accuracy_perf_train, precision_score_perf_train, recall_score_perf_train, \
conf_m_train = get_performance(full_pipeline_rf, X_train, y_train)

train_performance = pd.Series([roc_auc_score_perf_train, \
                               accuracy_perf_train, 
                               precision_score_perf_train, recall_score_perf_train], index=['roc-auc_macro', 'accuracy', 'precison_macro', 'recall_macro'])

train_performance

roc-auc_macro     1.000000
accuracy          0.999862
precison_macro    0.999907
recall_macro      0.999921
dtype: float64

In [433]:
roc_auc_score_perf_test,\
accuracy_perf_test, precision_score_perf_test, recall_score_perf_test, \
conf_m_test = get_performance(full_pipeline_rf, X_test, y_test)

test_performance = pd.Series([roc_auc_score_perf_test, \
                               accuracy_perf_test, 
                               precision_score_perf_test, recall_score_perf_test], index=['roc-auc_macro', 'accuracy', 'precison_macro', 'recall_macro'])

test_performance

roc-auc_macro     0.817235
accuracy          0.587404
precison_macro    0.717367
recall_macro      0.508818
dtype: float64

In [427]:
# the full pipeline as a step in another pipeline with an estimator as the final step
full_pipeline_rf = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  ( 'model', RandomForestClassifier(n_estimators=200, max_depth=None) ) ] )

# call fit on it just like any other pipeline
full_pipeline_rf.fit( X_train, y_train )

# predict with it like any other pipeline
y_pred_rf = full_pipeline_rf.predict( X_test ) 

In [418]:
y_p = full_pipeline_m.predict( X_train ) 

In [429]:
# print evaluation metrics and results

result1 = confusion_matrix(y_test, y_pred_rf)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred_rf)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred_rf)  
print('Accuracy: %.3f' %result3)

Confusion Matrix:
[[ 99  11   2  56  90]
 [  0 189   6 174 117]
 [  1  20 144 120  98]
 [  3  34   8 572 304]
 [  1  23   5 211 824]]

Classification Report:
              precision    recall  f1-score   support

           1       0.95      0.38      0.55       258
           2       0.68      0.39      0.50       486
           3       0.87      0.38      0.53       383
           4       0.50      0.62      0.56       921
           5       0.58      0.77      0.66      1064

    accuracy                           0.59      3112
   macro avg       0.72      0.51      0.56      3112
weighted avg       0.64      0.59      0.58      3112

Accuracy: 0.587


In [408]:
# print evaluation metrics and results

result1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred)  
print('Accuracy: %.3f' %result3)

Confusion Matrix:
[[ 99  11   5  57  86]
 [  1 190   4 176 115]
 [  2  20 143 121  97]
 [  2  37   5 565 312]
 [  1  24   4 213 822]]

Classification Report:
              precision    recall  f1-score   support

           1       0.94      0.38      0.55       258
           2       0.67      0.39      0.49       486
           3       0.89      0.37      0.53       383
           4       0.50      0.61      0.55       921
           5       0.57      0.77      0.66      1064

    accuracy                           0.58      3112
   macro avg       0.72      0.51      0.56      3112
weighted avg       0.64      0.58      0.58      3112

Accuracy: 0.585


In [409]:
from sklearn import model_selection

for model in [DecisionTreeClassifier, KNeighborsClassifier, GaussianNB, SVC, 
              RandomForestClassifier, SGDClassifier]:
    make_pipeline(model())
    classifier = model()
    kfold = model_selection.KFold(n_splits=5)
    classifier.fit(X_train_prep, y_train)
    s = model_selection.cross_val_score(classifier, X_test_prep,y_test, cv=kfold)
    #result2 = classification_report(y_test, y_pred, zero_division=0)
    #s = model_selection.cross_val_score(cls, X, y, cv=kfold)
    print(f"{model.__name__:22}  CV_Mean:" f"{s.mean():.3f} CV_STD: {s.std():.2f}")

NameError: name 'X_train_prep' is not defined

In [None]:
clf_xgb = RandomForestClassifier()
param_dist = {'n_estimators': stats.randint(150, 1000),
              'learning_rate': stats.uniform(0.01, 0.59),
              'subsample': stats.uniform(0.3, 0.6),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.4),
              'min_child_weight': [1, 2, 3, 4]
             }

numFolds = 5
n = X_train_prep.shape[0]
kfold_5 = KFold(n, True, 5)

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = kfold_5,  
                         n_iter = 5, # you want 5 here not 25 if I understand you correctly 
                         scoring = 'roc_auc', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1)

In [None]:
def hyperparameter_tune(base_model, parameters, n_iter, kfold, X=X_train, y=y_train):
    start_time = time.time()
    
    # Arrange data into folds with approx equal proportion of classes within each fold
    k = StratifiedKFold(n_splits=kfold, shuffle=False)
    
    optimal_model = RandomizedSearchCV(base_model,
                            param_distributions=parameters,
                            n_iter=n_iter,
                            cv=k,
                            n_jobs=-1,
                            random_state=42)
    optimal_model.fit(X, y)
    
    stop_time = time.time()

    scores = cross_val_score(optimal_model, X, y, cv=k, scoring="accuracy")
    
    print("Elapsed Time:", time.strftime("%H:%M:%S", time.gmtime(stop_time - start_time)))
    print("====================")
    print("Cross Val Mean: {:.3f}, Cross Val Stdev: {:.3f}".format(scores.mean(), scores.std()))
    print("Best Score: {:.3f}".format(optimal_model.best_score_))
    print("Best Parameters: {}".format(optimal_model.best_params_))
    
    return optimal_model.best_params_, optimal_model.best_score_

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint
import matplotlib.pyplot as plt
import time
import warnings
warnings.simplefilter("ignore", category=PendingDeprecationWarning)
warnings.simplefilter("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=UserWarning)

In [None]:
base_model = RandomForestClassifier(n_jobs=-1,
                                   random_state=42)

lots_of_parameters = {
    "max_depth": [3, 5, 10, None],
    "n_estimators": [100, 200, 300, 400, 500],
    "max_features": randint(1, 3),
    "criterion": ["gini", "entropy"],
    "bootstrap": [True, False],
    "min_samples_leaf": randint(1, 4)
}

parameters = {
    "max_depth": [3, 5, 10, None],
    "n_estimators": [100, 200, 300, 400, 500]
}

best_params, best_score = hyperparameter_tune(base_model, parameters, 10, 5, X_train_prep, y_train)

In [None]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train_prep, y_train)

In [None]:
import shap
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test_prep)

In [None]:
shap.summary_plot(shap_values, X_test_prep, plot_type="bar")

In [None]:
df_f_type = df.dtypes

In [None]:
df_f_type = df_f_type.loc[~df_f_type.index.isin(['JobSat'])]

In [None]:
cols_if_num = df_f_type.apply(lambda x: np.issubdtype(x, np.number))

In [None]:
cols_if_num.sum()

In [None]:
cols_cat = cols_if_num[~cols_if_num].index.tolist()

In [None]:
for col_cat in cols_cat:
        df_sample[col_cat] = df_sample[col_cat].fillna('missing')

In [None]:
cols_num = cols_if_num[cols_if_num].index.tolist()
cols_num

In [None]:
for col_num in cols_num:
        df_sample[col_num] = df_sample[col_num].fillna(df[col_num].mean())

In [None]:
df_sample[cols_num].isnull().sum()

In [None]:
enc = OrdinalEncoder()
df_sample[cols_cat] = enc.fit_transform(df_sample[cols_cat])

In [None]:
enc = OrdinalEncoder()
df_sample.loc[:, 'JobSat'] = enc.fit_transform(df_sample[['JobSat']])

In [None]:
df_sample['JobSat']