In [28]:
# general packages and libraries
import sys
import importlib

# data manipulation packages
import numpy as np
import pandas as pd

# data visualizations packages
import matplotlib.pyplot as plt
# to render plots in the notebook
%matplotlib inline

import seaborn as sns
# set a theme for seaborn
sns.set_theme()

# numerical, statistical and machine learning packages and libraries

from sklearn.base import (
    BaseEstimator, 
    TransformerMixin,
)
from sklearn.pipeline import (
    make_pipeline,
    Pipeline,
    FeatureUnion,
)
from sklearn.impute import (
    KNNImputer,
    SimpleImputer,
)
from sklearn.preprocessing import (
    OrdinalEncoder, 
    StandardScaler,
    MultiLabelBinarizer,
)
from sklearn.model_selection import (
    train_test_split,
)

from sklearn.ensemble import (
    RandomForestClassifier,
)
    
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score
)

In [29]:
# Project root directory.  The historical absolute path stays as the default,
# but it can be overridden without editing the notebook by setting the
# PROJ1_DSND_ROOT environment variable before starting the kernel.
import os

# os.path.join(path, '') guarantees a trailing separator, so code that
# concatenates `mypath + 'something'` works with or without one in the override
mypath = os.path.join(
    os.environ.get('PROJ1_DSND_ROOT',
                   '/home/silvia/Documents/udacityND/ml_dsnd/proj1_dsnd/'),
    '',
)

# add src folder to sys.path to use the local modules
# (index 1 keeps the entry at position 0 ahead of the project modules)
sys.path.insert(1, os.path.join(mypath, 'src'))

In [30]:
# import local modules 
import utils_functions as uf 
import utils_classes as uc
import local_maps as lm      

In [4]:
# re-load the local modules after their source files have been edited;
# `importlib` is already imported in the first cell, so it is not imported again
for local_module in (uf, uc, lm):
    # reload re-executes the module in place, so the aliases uf/uc/lm stay valid
    importlib.reload(local_module)

In [31]:
# load the raw survey results into a pandas dataframe,
# using the first CSV column as the index
df = pd.read_csv(
    mypath + '/data/raw/survey20_results_public.csv',
    index_col=[0],
)

In [32]:
# work on an independent (deep) copy so the freshly loaded frame `df` is left as read
df1 = df.copy(deep=True)

In [33]:
# preprocessing chain: each project helper receives the frame returned by the
# previous step (per the original note: change types, columns, remove features)
df_proc = (
    df1
    .pipe(uf.data_prep)
    .pipe(uf.parse_dev_type)
    .pipe(uf.remove_clean_data)
)

In [34]:
# labels: the 'JobSat' column
y = df_proc['JobSat']

# predictors: every column except the label
X = df_proc.drop(columns=['JobSat'])

# hold out 30% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [35]:
# random forest classifier configured with the chosen hyperparameters
# (presumably the result of an earlier tuning run -- TODO confirm);
# random_state pins the forest's randomness so reruns give the same model
RF_clf = RandomForestClassifier(
    n_estimators=1400,
    max_depth=90,
    min_samples_split=5,
    random_state=42,
)

In [36]:
# fit the project's preprocessing pipeline on the training predictors and
# transform them in the same step
X_train_proc = uc.full_pipeline.fit_transform(X_train)

# fit the random forest classifier on the processed training data;
# as the last expression of the cell, the fitted estimator is also displayed
RF_clf.fit(X_train_proc, y_train)

RandomForestClassifier(max_depth=90, min_samples_split=5, n_estimators=1400,
                       random_state=42)

In [37]:
# process the test data with the pipeline's `transform` only (no `fit_transform`),
# i.e. the pipeline is not re-fitted on the test predictors here
X_test_proc = uc.full_pipeline.transform(X_test)

In [38]:
# predict labels for the processed test set with the fitted classifier;
# y_pred is reused below for the confusion matrix and classification report
y_pred = RF_clf.predict(X_test_proc)

In [39]:
# Score the classifier that was already fitted in the training cell.
# `fit` returns the estimator itself, so the previous `RF_clf.fit(...)` arguments
# only retrained the same 1400-tree forest twice more (same data, same
# random_state) before scoring.  Passing `RF_clf` directly evaluates the same
# model without that cost.

# evaluate performance metrics on the train set
perf_train_RF = pd.Series(
    uf.get_perf_metrics(RF_clf, X_train_proc, y_train),
    index=lm.metrics_list,
)

# evaluate performance metrics on the test set
perf_test_RF = pd.Series(
    uf.get_perf_metrics(RF_clf, X_test_proc, y_test),
    index=lm.metrics_list,
)

# combine train and test metrics for the random forest model side by side,
# rounded to three decimals for display
perf_model_RF = pd.DataFrame.from_dict({'train': perf_train_RF,
                                        'test': perf_test_RF}).round(3)

In [40]:
# compute the test-set diagnostics first, then print everything together

# confusion matrix: rows are true labels, columns are predicted labels
result1_RF = confusion_matrix(y_test, y_pred)
# per-class precision / recall / f1 as a formatted text report
result2_RF = classification_report(y_test, y_pred)

print('Performance metrics comparison for RandomForestClassifier:\n', perf_model_RF)
print('\nRandomForestClassifier Confusion Matrix for Test Set:', result1_RF, sep='\n')
print('\nRandomForestClassifier Classification Report for Test Set:', result2_RF, sep='\n')

Performance metrics comparison for RandomForestClassifier:
            train   test
accuracy   0.996  0.628
precision  0.998  0.739
recall     0.993  0.541
f1         0.995  0.587

RandomForestClassifier Confusion Matrix for Test Set:
[[ 98  30   1  42  87]
 [  3 228   7 170  78]
 [  2  23 141 141  76]
 [  3  29   4 628 257]
 [  2  13   2 187 860]]

RandomForestClassifier Classification Report for Test Set:
              precision    recall  f1-score   support

           1       0.91      0.38      0.54       258
           2       0.71      0.47      0.56       486
           3       0.91      0.37      0.52       383
           4       0.54      0.68      0.60       921
           5       0.63      0.81      0.71      1064

    accuracy                           0.63      3112
   macro avg       0.74      0.54      0.59      3112
weighted avg       0.67      0.63      0.62      3112

