In [1]:
# --- Input data --------------------------------------------------------------
# Pickled DataFrame loaded with pd.read_pickle further down.
DF_PATH = "../data/processed/2_cleaned_data.pkl"

# Column groups. 'DevType' is the group used as the prediction target below;
# the TECH_COLS groups are not referenced by name in the visible cells.
ROLE_COLS = ['DevType']
TECH_COLS = [
    'LanguageWorkedWith',
    'DatabaseWorkedWith',
    'WebframeWorkedWith',
    'MiscTechWorkedWith',
]

# --- Experiment tracking -----------------------------------------------------
EXPERIMENT_NAME = "stackoverflow_single_random_forest_model"

# Directory that receives the pickles below and is uploaded with
# mlflow.log_artifacts for every run.
LOG_PATH = "../models"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from matplotlib import pyplot as plt

In [3]:
# Load the DataFrame stored at DF_PATH (presumably the output of an earlier
# cleaning step -- confirm against the data pipeline).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle(DF_PATH)

### Functions

In [4]:
def calculate_quality(ground_truth, prediction, metric_function, sort_values=False):
    """Score every predicted column against its ground-truth counterpart.

    Parameters
    ----------
    ground_truth : pandas.DataFrame
        True labels; must contain every column present in ``prediction``.
    prediction : pandas.DataFrame
        Predicted labels, one column per target.
    metric_function : callable
        Called as ``metric_function(truth_column, predicted_column)`` and
        expected to return a number (e.g. a scikit-learn metric).
    sort_values : bool, default False
        If True, the returned scores are sorted in ascending order.

    Returns
    -------
    pandas.Series
        One score per column of ``prediction``, expressed as a percentage
        rounded to two decimals and indexed by column name.
    """
    # Read from the ``prediction`` argument itself; relying on a notebook-level
    # variable would silently score whatever that variable currently holds.
    quality_scores = {
        col: round(metric_function(ground_truth[col], prediction[col]) * 100, 2)
        for col in prediction.columns
    }

    # Explicit dtype keeps the result numeric even when there are no columns.
    quality_scores = pd.Series(quality_scores, dtype="float64")
    if sort_values:
        quality_scores = quality_scores.sort_values()

    return quality_scores

### Balance classes


In [5]:
# Number of samples per role before any resampling.
# jops_df is an independent copy of the 'DevType' block of df.
jops_df = df["DevType"].copy()
jops_sum = jops_df.sum(axis=0)
jops_sum

Academic researcher                               581
Data or business analyst                          669
Data scientist or machine learning specialist     799
Database administrator                            296
DevOps specialist                                 677
Developer, QA or test                             493
Developer, back-end                              5503
Developer, desktop or enterprise applications    1671
Developer, embedded applications or devices       795
Developer, front-end                             2890
Developer, full-stack                            5578
Developer, game or graphics                       342
Developer, mobile                                1859
Engineer, data                                    483
Scientist                                         292
System administrator                              440
dtype: int64

In [6]:
# Draw a fixed number of rows for every role column.
samples_per_class = 500
resampled_roles = []

for role_col in jops_df.columns:
    # Rows flagged with this role
    sub_df = jops_df.loc[jops_df[role_col] == 1].copy()

    # Roles with fewer rows than the target are drawn with replacement
    # (upsampling); all others are drawn without replacement (downsampling).
    with_replacement = len(sub_df) < samples_per_class
    sub_df = sub_df.sample(samples_per_class,
                           replace=with_replacement,
                           random_state=0)

    resampled_roles.append(sub_df)

In [7]:
# Stack the per-role samples and select the matching rows of the full frame.
# The stacked index can repeat labels (upsampling draws with replacement), and
# .loc keeps those repeats.
# NOTE(review): df is overwritten here, so running this cell a second time
# selects from the already-resampled frame -- reload df before re-running.
jops_df = pd.concat(resampled_roles, axis=0)
df = df.loc[jops_df.index]

In [8]:
# Per-role totals after resampling
jops_df.sum(axis=0)

Academic researcher                               797
Data or business analyst                          767
Data scientist or machine learning specialist     871
Database administrator                            588
DevOps specialist                                 705
Developer, QA or test                             590
Developer, back-end                              1770
Developer, desktop or enterprise applications     892
Developer, embedded applications or devices       666
Developer, front-end                              893
Developer, full-stack                            1431
Developer, game or graphics                       560
Developer, mobile                                 800
Engineer, data                                    631
Scientist                                         642
System administrator                              647
dtype: int64

### Splitting

In [9]:
# Features are every column group except 'DevType'; targets are the 'DevType' block.
features = df.drop("DevType", axis=1)
targets = df["DevType"]

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(features, targets, random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1),


### Train models

In [10]:
# Initialize client and experiment
client = MlflowClient()
# Make EXPERIMENT_NAME the active experiment; this must run before the lookup below
mlflow.set_experiment(EXPERIMENT_NAME)
# Fetch the experiment by name; its experiment_id is passed to mlflow.start_run later on
exp = client.get_experiment_by_name(EXPERIMENT_NAME)

### Vanilla Forest

In [11]:
# Baseline: standardise the features, then fit a random forest with default settings.
vanilla_steps = [StandardScaler(), RandomForestClassifier(random_state=0)]
rf_clf = make_pipeline(*vanilla_steps)

rf_clf.fit(X_train, Y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=0))])

In [12]:
# Evaluate on train set: one column per metric, one row per target.
predictions = pd.DataFrame(rf_clf.predict(X_train), columns=Y_train.columns)

per_metric = {}
for score in (accuracy_score, precision_score, recall_score, f1_score):
    per_metric[score.__name__] = calculate_quality(Y_train, predictions, score)

train_scores = pd.concat(per_metric, axis=1)



In [13]:
# Evaluate on test set: one column per metric, one row per target.
predictions = pd.DataFrame(rf_clf.predict(X_test), columns=Y_test.columns)

per_metric = {}
for score in (accuracy_score, precision_score, recall_score, f1_score):
    per_metric[score.__name__] = calculate_quality(Y_test, predictions, score)

test_scores = pd.concat(per_metric, axis=1)

# Average of each metric over all targets; these are the values logged to MLflow.
mean_test_scores = test_scores.mean()



In [14]:
# Metric averages across targets, then the per-target table ordered by precision.
print(test_scores.mean())
test_scores.sort_values(by="precision_score")

accuracy_score     94.943750
precision_score    93.345000
recall_score       58.477500
f1_score           71.125625
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",87.6,80.11,40.62,53.9
"Developer, front-end",93.6,80.95,49.51,61.45
"Developer, embedded applications or devices",95.2,88.64,47.56,61.9
"Developer, mobile",95.85,89.57,68.87,77.87
Data scientist or machine learning specialist,95.8,90.13,66.5,76.54
"Developer, back-end",86.2,90.87,42.38,57.8
"Developer, desktop or enterprise applications",92.3,91.03,32.57,47.97
Data or business analyst,95.75,94.95,54.02,68.86
Academic researcher,95.15,96.61,55.07,70.15
DevOps specialist,95.5,96.77,50.85,66.67


### Log

In [15]:
# Data details: where the data came from and which rows/columns were used.
data_details = {
    "data_path": DF_PATH,
    "training_indices": X_train.index.tolist(),
    "test_indices": X_test.index.tolist(),
    # Drop the outer column level so only the inner feature names are stored.
    "features_names": X_train.columns.droplevel(0).tolist(),
    "targets_names": Y_train.columns.tolist(),
}

data_pkl_path = os.path.join(LOG_PATH, LOG_DATA_PKL)
with open(data_pkl_path, "wb") as output_file:
    pickle.dump(data_details, output_file)

In [16]:
# Model: a description (also used as the MLflow run name), the pipeline's
# text representation, and the fitted pipeline itself.
model = {
    "model_description": "Random Forest: with non linearity",
    "model_details": str(rf_clf),
    "model_object": rf_clf,
}

model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)
with open(model_pkl_path, "wb") as output_file:
    pickle.dump(model, output_file)

In [17]:
# Performance details: per-target score tables for the train and test sets.
classes_metrics = {
    "train_scores": train_scores,
    "test_scores": test_scores,
}

metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)
with open(metrics_pkl_path, "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [18]:
# Open a run in the experiment, named after the model description.
run_name = model["model_description"]
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=run_name):
    # Upload the contents of LOG_PATH (the pickles written above) as artifacts
    mlflow.log_artifacts(LOG_PATH)

    # Record each averaged test metric under its own name
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)

### Random Forest with Non-linearity

In [19]:
# Concatenate 40 linear PCA components with 40 RBF-kernel PCA components and
# feed the combined features to the forest.
pca_features = FeatureUnion([
    ('linear_pca', PCA(n_components=40)),
    ('kernel_pca', KernelPCA(n_components=40, kernel='rbf')),
])

rf_clf = make_pipeline(StandardScaler(),
                       pca_features,
                       RandomForestClassifier(random_state=0))

rf_clf.fit(X_train, Y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('featureunion',
                 FeatureUnion(transformer_list=[('linear_pca',
                                                 PCA(n_components=40)),
                                                ('kernel_pca',
                                                 KernelPCA(kernel='rbf',
                                                           n_components=40))])),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=0))])

In [20]:
# Evaluate on train set: one column per metric, one row per target.
predictions = pd.DataFrame(rf_clf.predict(X_train), columns=Y_train.columns)

per_metric = {}
for score in (accuracy_score, precision_score, recall_score, f1_score):
    per_metric[score.__name__] = calculate_quality(Y_train, predictions, score)

train_scores = pd.concat(per_metric, axis=1)



In [21]:
# Evaluate on test set: one column per metric, one row per target.
predictions = pd.DataFrame(rf_clf.predict(X_test), columns=Y_test.columns)

per_metric = {}
for score in (accuracy_score, precision_score, recall_score, f1_score):
    per_metric[score.__name__] = calculate_quality(Y_test, predictions, score)

test_scores = pd.concat(per_metric, axis=1)

# Average of each metric over all targets; these are the values logged to MLflow.
mean_test_scores = test_scores.mean()



In [22]:
# Metric averages across targets, then the per-target table ordered by F1.
print(mean_test_scores)
test_scores.sort_values(by="f1_score")

accuracy_score     94.837500
precision_score    93.743125
recall_score       57.200000
f1_score           70.085625
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, desktop or enterprise applications",92.25,95.65,30.28,45.99
"Developer, full-stack",87.2,80.98,36.97,50.77
"Developer, embedded applications or devices",94.7,86.25,42.07,56.56
"Developer, back-end",85.85,86.88,43.05,57.57
"Developer, front-end",94.05,89.19,48.06,62.46
DevOps specialist,95.35,96.67,49.15,65.17
Data or business analyst,95.6,95.74,51.72,67.16
Academic researcher,94.95,99.07,51.69,67.94
Data scientist or machine learning specialist,95.3,87.33,63.59,73.6
"Developer, QA or test",96.65,100.0,58.64,73.93


### Log

In [23]:
# Data details: where the data came from and which rows/columns were used.
data_details = {
    "data_path": DF_PATH,
    "training_indices": X_train.index.tolist(),
    "test_indices": X_test.index.tolist(),
    # Drop the outer column level so only the inner feature names are stored.
    "features_names": X_train.columns.droplevel(0).tolist(),
    "targets_names": Y_train.columns.tolist(),
}

data_pkl_path = os.path.join(LOG_PATH, LOG_DATA_PKL)
with open(data_pkl_path, "wb") as output_file:
    pickle.dump(data_details, output_file)

In [24]:
# Model: a description (also used as the MLflow run name), the pipeline's
# text representation, and the fitted pipeline itself.
model = {
    "model_description": "Random Forest: with PCAs + RBF kernel",
    "model_details": str(rf_clf),
    "model_object": rf_clf,
}

model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)
with open(model_pkl_path, "wb") as output_file:
    pickle.dump(model, output_file)

In [25]:
# Performance details: per-target score tables for the train and test sets.
classes_metrics = {
    "train_scores": train_scores,
    "test_scores": test_scores,
}

metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)
with open(metrics_pkl_path, "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [26]:
# Open a run in the experiment, named after the model description.
run_name = model["model_description"]
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=run_name):
    # Upload the contents of LOG_PATH (the pickles written above) as artifacts
    mlflow.log_artifacts(LOG_PATH)

    # Record each averaged test metric under its own name
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)
    

### Random Forest with PCAs & Hyperparameter Tuning

In [27]:
# Same architecture as the PCA model, but with the component counts left
# unset so the grid search can choose them.
tunable_features = FeatureUnion([
    ('linear_pca', PCA()),
    ('kernel_pca', KernelPCA(kernel='rbf')),
])

hpt_rf_clf = make_pipeline(StandardScaler(),
                           tunable_features,
                           RandomForestClassifier(random_state=0))

In [28]:
# Search grid. Keys address pipeline parameters as <step>__<sub-step>__<param>;
# the available names can be listed with hpt_rf_clf.get_params().keys().
component_grid = [5, 10, 20, 40, 60]

tuned_parameters = [
    {
        'featureunion__linear_pca__n_components': list(component_grid),
        'featureunion__kernel_pca__n_components': list(component_grid),
        'randomforestclassifier__n_estimators': [100, 500, 1000],
    }
]

In [29]:
# Wrap the pipeline in an exhaustive grid search using all available cores.
# NOTE(review): the recorded run of this cell ended with a joblib
# TerminatedWorkerError / OSError; possibly the workers ran out of resources
# with n_jobs=-1 -- confirm, and consider a smaller n_jobs or grid.
# NOTE(review): hpt_rf_clf is rebound to the search object, so re-running this
# cell without re-creating the pipeline would wrap a search inside a search.
grid_search = GridSearchCV(estimator=hpt_rf_clf,
                           param_grid=tuned_parameters,
                           n_jobs=-1)
hpt_rf_clf = grid_search

hpt_rf_clf.fit(X_train, Y_train)

exception calling callback for <Future at 0x2572bda9550 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Users\pc\anaconda3\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "C:\Users\pc\anaconda3\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "C:\Users\pc\anaconda3\lib\site-packages\joblib\parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "C:\Users\pc\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\pc\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\pc\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "C:\Users\p

OSError: [Errno 22] Invalid argument

In [None]:
# Parameter combination selected by the grid search (available only after a successful fit)
hpt_rf_clf.best_params_

In [None]:
# Evaluate on train set: one column per metric, one row per target.
predictions = pd.DataFrame(hpt_rf_clf.predict(X_train), columns=Y_train.columns)

per_metric = {}
for score in (accuracy_score, precision_score, recall_score, f1_score):
    per_metric[score.__name__] = calculate_quality(Y_train, predictions, score)

train_scores = pd.concat(per_metric, axis=1)

In [None]:
# Evaluate on test set: one column per metric, one row per target.
predictions = pd.DataFrame(hpt_rf_clf.predict(X_test), columns=Y_test.columns)
test_scores = {score.__name__: calculate_quality(Y_test, predictions, score)
               for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores, axis=1)

# Recompute the averages for THIS model. The MLflow logging cell reads
# mean_test_scores, so without this line it would log the previous model's
# averages under the tuned model's run.
mean_test_scores = test_scores.mean()

In [None]:
# Metric averages across targets, then the per-target table ordered by F1.
print(test_scores.mean())
test_scores.sort_values(by="f1_score")

### Log

In [None]:
# Data details: where the data came from and which rows/columns were used.
data_details = {
    "data_path": DF_PATH,
    "training_indices": X_train.index.tolist(),
    "test_indices": X_test.index.tolist(),
    # Drop the outer column level so only the inner feature names are stored.
    "features_names": X_train.columns.droplevel(0).tolist(),
    "targets_names": Y_train.columns.tolist(),
}

data_pkl_path = os.path.join(LOG_PATH, LOG_DATA_PKL)
with open(data_pkl_path, "wb") as output_file:
    pickle.dump(data_details, output_file)

In [None]:
# Model: a description (also used as the MLflow run name), the search object's
# text representation, and the fitted search object itself.
model = {
    "model_description": "Random Forest: with PCAs + RBF kernel + Hyperparamter tuning",
    "model_details": str(hpt_rf_clf),
    "model_object": hpt_rf_clf,
}

model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)
with open(model_pkl_path, "wb") as output_file:
    pickle.dump(model, output_file)

In [None]:
# Performance details: per-target score tables for the train and test sets.
classes_metrics = {
    "train_scores": train_scores,
    "test_scores": test_scores,
}

metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)
with open(metrics_pkl_path, "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [None]:
# Open a run in the experiment, named after the model description.
run_name = model["model_description"]
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=run_name):
    # Upload the contents of LOG_PATH (the pickles written above) as artifacts
    mlflow.log_artifacts(LOG_PATH)

    # Record each averaged test metric under its own name
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)