## Constants

In [1]:
DF_PATH       = "../data/processed/cleaned_data.pkl"
ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageWorkedWith',    'DatabaseWorkedWith',    'WebframeWorkedWith',    'MiscTechWorkedWith']
LOG_PATH = "../models/temp/baseline/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"


## Import tools

In [2]:
# Load packages
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression


## Load the data

In [3]:
df = pd.read_pickle(DF_PATH)

## Functions

In [36]:
def calculate_quality(ground_truth, prediction, metric_function, sort_values=False):
    quality_scores = {}
    for col in prediction.columns:
        role_pred  = prediction[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores


## Balance classes

In [11]:
roles_df = df.iloc[:, :16].copy()
role_sum = roles_df.sum(axis=0)
role_sum


Academic researcher                               581
Data or business analyst                          669
Data scientist or machine learning specialist     799
Database administrator                            296
DevOps specialist                                 677
Developer, QA or test                             493
Developer, back-end                              5503
Developer, desktop or enterprise applications    1671
Developer, embedded applications or devices       795
Developer, front-end                             2890
Developer, full-stack                            5578
Developer, game or graphics                       342
Developer, mobile                                1859
Engineer, data                                    483
Scientist                                         292
System administrator                              440
dtype: int64

## Resample the data

In [12]:
# Resample roles
samples_per_class = 500
resampled_roles = []

for role_col in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=0)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=0) 
    
    resampled_roles.append(sub_df)

In [21]:
# Construct dfs
roles_df  = pd.concat(resampled_roles)
df = df.loc[roles_df.index]

In [25]:
roles_df.sum(axis=0)

Academic researcher                               797
Data or business analyst                          767
Data scientist or machine learning specialist     871
Database administrator                            588
DevOps specialist                                 705
Developer, QA or test                             590
Developer, back-end                              1770
Developer, desktop or enterprise applications     892
Developer, embedded applications or devices       666
Developer, front-end                              893
Developer, full-stack                            1431
Developer, game or graphics                       560
Developer, mobile                                 800
Engineer, data                                    631
Scientist                                         642
System administrator                              647
dtype: int64

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:, 16:], df.iloc[:, :16], random_state=0)

In [28]:
X_train.shape

(10723, 91)

## Logistic regression

In [29]:
clf = make_pipeline(StandardScaler(), 
                    MultiOutputClassifier(LogisticRegression()))

clf.fit(X_train, Y_train)
predictions =  pd.DataFrame(clf.predict(X_train), columns=Y_train.columns)

In [41]:
# Evaluate on training set
train_scores = {score.__name__: calculate_quality(Y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)


In [42]:
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,89.42,62.47,23.76,34.43
Data or business analyst,93.13,77.91,43.26,55.63
Data scientist or machine learning specialist,93.02,74.47,49.96,59.8
Database administrator,89.89,65.18,42.18,51.22
DevOps specialist,93.96,76.43,41.17,53.52
"Developer, QA or test",90.77,68.87,7.09,12.85
"Developer, back-end",84.62,68.61,33.86,45.34
"Developer, desktop or enterprise applications",90.19,46.11,8.58,14.47
"Developer, embedded applications or devices",92.91,47.3,15.26,23.08
"Developer, front-end",93.63,69.3,41.44,51.87


In [43]:
# Evaluate on test set
predictions =  pd.DataFrame(clf.predict(X_test), columns=Y_test.columns)
test_scores = {score.__name__: calculate_quality(Y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [44]:
mean_test_scores

accuracy_score     91.123750
precision_score    66.711250
recall_score       32.980625
f1_score           42.934375
dtype: float64

In [45]:
test_scores.sort_values("f1_score")


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",90.71,55.56,5.95,10.75
"Developer, desktop or enterprise applications",89.51,41.94,7.12,12.18
"Developer, embedded applications or devices",93.06,51.95,15.94,24.39
"Engineer, data",91.13,68.64,22.44,33.82
System administrator,90.85,67.53,27.3,38.88
"Developer, full-stack",86.24,57.19,29.89,39.26
Academic researcher,89.54,66.12,27.94,39.29
"Developer, back-end",85.12,66.23,32.28,43.4
Database administrator,90.32,57.31,37.86,45.6
Scientist,88.84,69.52,39.57,50.43
