In [1]:
import gc
import os

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.pipeline import Pipeline

import utils.paths as plh
import utils.read_utils as hlpread
from src.models.feature_eng.Combine_feature import CombineFeatures
from src.models.feature_eng.CountVectorizerEncoding import CountVectorizerEncoding
from src.models.feature_eng.FreqEncoding import FrequencyEncoding
from src.models.feature_eng.KFoldFreqEncoding import KFoldFrequencyEncoding
from src.models.feature_eng.TE_KFold import KFoldTargetEncoder
from src.models.feature_eng.TFIDFVectorizerEncoding import TFIDFVectorizerEncoding

In [2]:
PROJECT_ROOT = plh.get_project_root()

# Location of the cleaned training set: project root joined with the three
# path components configured in the project YAML.
train_data = os.path.join(
    PROJECT_ROOT,
    *[
        hlpread.read_yaml_key(yaml_key)
        for yaml_key in (
            'data_source.data_folders',
            'data_source.prepared.folder',
            'data_source.prepared.clean_train',
        )
    ],
)
train = hlpread.read_from_parquet(train_data)

# Quick sanity check of what was loaded.
print(train.shape)
print(train.columns)

(30391, 10)
Index(['ACTION', 'RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
       'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY',
       'ROLE_CODE'],
      dtype='object')


In [3]:
# The order of the steps matters: it changes the output of the pipeline.
# Disabled alternatives that were tried between 'combine_feature' and
# 'count_vectorizer_encoding':
#   ('frequency_encoding', FrequencyEncoding(min_group_size = 2))
#   ('tfidf_vectorizer_encoding', TFIDFVectorizerEncoding())
pipeline_steps = [
    ('combine_feature', CombineFeatures()),
    ('count_vectorizer_encoding', CountVectorizerEncoding()),
    ('KFoldTE', KFoldTargetEncoder()),
]

# verbose=True prints the elapsed time of each step while fitting.
feature_engg = Pipeline(steps = pipeline_steps, verbose = True)

X = feature_engg.fit_transform(train)

X.shape

[Pipeline] ... (step 1 of 3) Processing combine_feature, total=   0.3s
[Pipeline]  (step 2 of 3) Processing count_vectorizer_encoding, total= 2.4min
[Pipeline] ........... (step 3 of 3) Processing KFoldTE, total=  46.0s


(30391, 233)

In [None]:
# Export the results of `dvc exp show` to a CSV file (disabled: the code below is wrapped in a string literal)
"""
import csv
import subprocess
 
## call date command ##
p = subprocess.Popen("dvc exp show -A --csv", stdout = subprocess.PIPE, shell = True)
(output, err) = p.communicate()
p_status = p.wait()

with open('Example.csv', 'w', encoding='UTF8', newline='') as file:

    writer = csv.writer(file)

    header = []
    data = []
    for i, line in enumerate(output.splitlines()):

        line = line.decode('ASCII') #Output return is  “byte string“. Note: ‘b‘ character before a string is used to specify the string as a “byte string“
        
        if len(line) == 0:
            continue
        
        if len(header) == 0:
            for word in line.split(','):
                header.append(word)

        else:
            row = []    
            for word in line.split(','):
                row.append(word)

            data.append(row)
    
    writer.writerow(header)
    writer.writerow(data)


#f = pd.DataFrame(data)
#f.drop(range(146,198,1), axis = 1, inplace = True)
#f.columns = header
#f.shape
#

"""

# Evaluate Models #

Evaluate the models on the saved test dataset 

## Decision Tree ##

In [None]:
# Path of the serialized model, taken from the project YAML configuration.
# NOTE(review): every model section in this notebook reads the same
# 'model.trained_model' key, so each section loads whatever model is
# currently configured there — confirm this is the Decision Tree.
trained_model_path = os.path.join(
    PROJECT_ROOT,
    hlpread.read_yaml_key('data_source.data_folders'),
    hlpread.read_yaml_key('model.trained_model'),
)
trained_model = hlpread.read_object(trained_model_path)

In [None]:
# Class probabilities from the loaded model: a 2-D array with one column per
# class label.
# NOTE(review): the whole frame X (including the 'ACTION' target column) is
# passed here, while the cross-validation section trains on X[X.columns[30:]]
# only — confirm which columns the saved model expects.
Y_predictions_by_class = trained_model.predict_proba(X).astype(float)

# Ground-truth labels for the same rows that were just scored. `Y` was
# previously read before ever being assigned, which raises NameError on a
# fresh kernel; take it from the target column of X instead.
Y = X['ACTION'].astype(float)

In [None]:
# Index of the most probable class for every row (argmax over the last axis).
# NOTE(review): this index equals the class label only if the model's classes
# are 0..k-1 — confirm against trained_model.classes_.
Y_pred = np.argmax(Y_predictions_by_class, axis = -1)

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(Y, Y_pred)
print(conf_matrix)

In [None]:
# F1 score of the hard predictions against the true labels.
f1_score(y_true = Y, y_pred = Y_pred)

## Random Forest ##

In [None]:
# Path of the serialized model, taken from the project YAML configuration.
# NOTE(review): this reads the same 'model.trained_model' key as the other
# model sections — confirm the configured file is the Random Forest.
trained_model_path = os.path.join(
    PROJECT_ROOT,
    hlpread.read_yaml_key('data_source.data_folders'),
    hlpread.read_yaml_key('model.trained_model'),
)
trained_model = hlpread.read_object(trained_model_path)
                                    

In [None]:
# Class probabilities from the loaded model: a 2-D array with one column per
# class label.
# NOTE(review): the whole frame X (including the 'ACTION' target column) is
# passed here — confirm which columns the saved model expects.
Y_predictions_by_class = trained_model.predict_proba(X).astype(float)

# Ground-truth labels for the rows that were scored. Derived from X so the
# cell no longer depends on a `Y` left over from another cell.
Y = X['ACTION'].astype(float)

In [None]:
# Index of the most probable class for every row (argmax over the last axis).
# NOTE(review): equals the class label only if the model's classes are
# 0..k-1 — confirm against trained_model.classes_.
Y_pred = np.argmax(Y_predictions_by_class, axis = -1)

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(Y, Y_pred)
print(conf_matrix)

In [None]:
# F1 score of the hard predictions against the true labels.
f1_score(y_true = Y, y_pred = Y_pred)

## Extra Decision Tree(Ensemble) ## 

In [None]:
# Path of the serialized model, taken from the project YAML configuration.
# NOTE(review): this reads the same 'model.trained_model' key as the other
# model sections — confirm the configured file is the Extra Trees ensemble.
trained_model_path = os.path.join(
    PROJECT_ROOT,
    hlpread.read_yaml_key('data_source.data_folders'),
    hlpread.read_yaml_key('model.trained_model'),
)
trained_model = hlpread.read_object(trained_model_path)
                                    

In [None]:
# Class probabilities from the loaded model: a 2-D array with one column per
# class label.
# NOTE(review): the whole frame X (including the 'ACTION' target column) is
# passed here — confirm which columns the saved model expects.
Y_predictions_by_class = trained_model.predict_proba(X).astype(float)

# Ground-truth labels for the rows that were scored. Derived from X so the
# cell no longer depends on a `Y` left over from another cell (it was
# previously read before being assigned).
Y = X['ACTION'].astype(float)

# Hard prediction = index of the most probable class for every row.
Y_pred = Y_predictions_by_class.argmax(-1)

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(Y, Y_pred)

print(conf_matrix)


In [None]:
# F1 score of the hard predictions against the true labels.
f1_score(y_true = Y, y_pred = Y_pred)

## Logistic Regression ##

In [None]:
# Path of the serialized model, taken from the project YAML configuration.
# NOTE(review): this reads the same 'model.trained_model' key as the other
# model sections — confirm the configured file is the Logistic Regression.
trained_model_path = os.path.join(
    PROJECT_ROOT,
    hlpread.read_yaml_key('data_source.data_folders'),
    hlpread.read_yaml_key('model.trained_model'),
)
trained_model = hlpread.read_object(trained_model_path)
                                    

In [None]:
# Class probabilities from the loaded model: a 2-D array with one column per
# class label.
# NOTE(review): the whole frame X (including the 'ACTION' target column) is
# passed here — confirm which columns the saved model expects.
Y_predictions_by_class = trained_model.predict_proba(X).astype(float)

# Ground-truth labels for the rows that were scored. Derived from X so the
# cell no longer depends on a `Y` left over from another cell (it was
# previously read before being assigned).
Y = X['ACTION'].astype(float)

# Hard prediction = index of the most probable class for every row.
Y_pred = Y_predictions_by_class.argmax(-1)

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(Y, Y_pred)

print(conf_matrix)


In [None]:
# F1 score of the hard predictions against the true labels.
f1_score(y_true = Y, y_pred = Y_pred)

# Cross Validation for Decision Tree model #

Let's perform cross-validation using the final parameters of the decision tree model.

In [None]:
# Configuration entry stored under the top-level 'trained_model' YAML key.
# Its 'params' item is applied to the DecisionTreeClassifier in the
# cross-validation loop via set_params.
training_param =  hlpread.read_yaml_key('trained_model')

In [None]:
# Load the cleaned training data (path components come from the project YAML).
import utils.read_utils as hlpread

clean_train_data = os.path.join(
    PROJECT_ROOT,
    *[
        hlpread.read_yaml_key(yaml_key)
        for yaml_key in (
            'data_source.data_folders',
            'data_source.prepared.folder',
            'data_source.prepared.clean_train',
        )
    ],
)
db_train = hlpread.read_from_parquet(clean_train_data)

In [None]:
# Build the feature-engineering pipeline and apply it to the clean data.
from src.models.feature_eng.CountVectorizerEncoding import CountVectorizerEncoding
from src.models.feature_eng.Combine_feature import CombineFeatures
from src.models.feature_eng.TE_KFold import KFoldTargetEncoder
from sklearn.pipeline import Pipeline

# Same three steps, in the same order, as the pipeline defined near the top
# of the notebook (here without verbose timing output).
cv_pipeline_steps = [
    ('combine_feature', CombineFeatures()),
    ('count_vectorizer_encoding', CountVectorizerEncoding()),
    ('KFoldTE', KFoldTargetEncoder()),
]
feature_engg = Pipeline(steps = cv_pipeline_steps)

X = feature_engg.fit_transform(db_train)

In [None]:
# Stratified shuffle-split cross-validation of the decision tree.
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit

cv_results = {}  # fold number -> ROC-AUC on that fold's held-out rows
split_params = hlpread.read_yaml_key('train_test_split')
split_s = StratifiedShuffleSplit(
    n_splits = 4,
    test_size = split_params['test_size'],
    random_state = split_params['random_seed'],
)

# Model inputs are the columns from position 30 onward; the target is 'ACTION'.
# NOTE(review): 30 is a hard-coded offset — presumably it skips the target and
# the non-encoded columns; confirm against the pipeline output.
cv_features = X[X.columns[30:]]
cv_target = X[['ACTION']]

fold = 0
for train_index, test_index in split_s.split(cv_features, X['ACTION']):

    X_train = cv_features.iloc[train_index]
    X_test = cv_features.iloc[test_index]
    Y_train = cv_target.iloc[train_index]
    Y_test = cv_target.iloc[test_index]

    # Fresh tree per fold; the configured parameters are applied on top of the
    # defaults given to the constructor.
    model = DecisionTreeClassifier(criterion = 'gini', random_state = 42).set_params(
        **training_param['params']
    )
    model.fit(X_train, Y_train)

    # Score the held-out rows using the probability in column 1.
    Y_test_pred = model.predict_proba(X_test).astype(float)
    auc_score = roc_auc_score(Y_test.astype(float), Y_test_pred[:, 1])

    cv_results[fold] = auc_score
    fold += 1

In [None]:
# ROC-AUC of each cross-validation fold, keyed by fold number.
cv_results

In [None]:
# Mean ROC-AUC across the cross-validation folds.
fold_scores = list(cv_results.values())
sum(fold_scores) / len(fold_scores)

## Train model ##

In [None]:
# Refit on every row. `model` is the estimator created in the last iteration
# of the cross-validation loop, so it already carries the configured params.
full_features = X[X.columns[30:]]
model.fit(full_features, X['ACTION'])

In [None]:
# Score the same rows the model was just fitted on (in-sample evaluation).
scored_features = X[X.columns[30:]]

# 2-D array holding the probability of each class label for every row.
Y_predictions_by_class = model.predict_proba(scored_features).astype(float)
Y = X['ACTION'].astype(float)

# Hard prediction = index of the most probable class for every row.
Y_pred = np.argmax(Y_predictions_by_class, axis = -1)
conf_matrix = confusion_matrix(Y, Y_pred)

print(conf_matrix)

In [None]:
# NOTE: `auc_score` is only assigned inside the cross-validation loop, so this
# shows the ROC-AUC of the last fold, not a score for the refit model.
auc_score

In [None]:

from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# ROC curve built from the true labels `Y` and the probability in column 1 of
# the refit model's predictions.
fpr, tpr, thresholds = roc_curve(Y, Y_predictions_by_class[:,1])
roc_auc_curve_df = pd.DataFrame({
    'false_positive_rates': fpr,
    'true_positive_rates': tpr,
    'thresholds': thresholds,
})

# AUC of the curve that is actually drawn. The label previously reused
# `auc_score`, which holds the ROC-AUC of the last cross-validation fold and
# therefore did not describe this plot.
plotted_auc = roc_auc_score(Y, Y_predictions_by_class[:,1])

fig, ax = plt.subplots(figsize = (7.5, 7.5))
ax.plot(roc_auc_curve_df['false_positive_rates'], roc_auc_curve_df['true_positive_rates'],  color = 'green', label = 'ROC Curve') #, marker = 'o'

ax.tick_params(axis = 'both', labelcolor = 'green')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
label_str = str.format('ROC-AUC: {0}',  round(plotted_auc, 3))
ax.text(0.5, 0, label_str, fontsize = 6)