In [1]:
import gc
import os
import numpy as np 
import pandas as pd 
import utils.paths as plh
import utils.read_utils as hlpread
from sklearn.pipeline import Pipeline
from src.models.feature_eng.TE_KFold import KFoldTargetEncoder
from src.models.feature_eng.FreqEncoding import FrequencyEncoding
from src.models.feature_eng.Combine_feature import CombineFeatures
from src.models.feature_eng.KFoldFreqEncoding import KFoldFrequencyEncoding
from src.models.feature_eng.CountVectorizerEncoding import CountVectorizerEncoding
from src.models.feature_eng.TFIDFVectorizerEncoding import TFIDFVectorizerEncoding

In [2]:
# Locate and load the prepared training parquet. The folder and file names are
# looked up one at a time through hlpread.read_yaml_key and joined under the
# project root.
PROJECT_ROOT = plh.get_project_root()

train_data = os.path.join(
    PROJECT_ROOT,
    *(
        hlpread.read_yaml_key(config_key)
        for config_key in (
            'data_source.data_folders',
            'data_source.prepared.folder',
            'data_source.prepared.clean_train',
        )
    ),
)
train = hlpread.read_from_parquet(train_data)

# Sanity check of what was loaded: dimensions first, then the column labels.
print(train.shape)

print(train.columns)

# Disabled experiment, kept for reference: drop ROLE_TITLE and MGR_ID, then
# list every remaining column except the ACTION target.
#col_use = [x for x in train.columns if not x in ['ROLE_TITLE', 'MGR_ID']]
#train = train[col_use]
#targetcol = 'ACTION'

#te_col = list(train.columns)
#te_col.remove(targetcol)

(30391, 10)
Index(['ACTION', 'RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
       'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY',
       'ROLE_CODE'],
      dtype='object')


In [47]:
from sklearn.preprocessing import LabelEncoder

# Single-step pipeline: run only CombineFeatures so the columns it adds can be
# inspected on their own.
feature_engg = Pipeline(
    steps=[
        ('combine_feature', CombineFeatures()),
        # Disabled step. NOTE(review): scikit-learn documents LabelEncoder as a
        # target (y) encoder, so it may not be usable as a pipeline step on X.
        #('label_encoder', LabelEncoder()),
    ],
    verbose=True,  # print per-step timing while fitting
)

X = feature_engg.fit_transform(train)

# Column labels of the transformed frame.
X.columns

[Pipeline] ... (step 1 of 1) Processing combine_feature, total=   0.2s


Index(['ACTION', 'RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
       'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY',
       'ROLE_CODE', 'RESOURCE_ROLE_ROLLUP_1', 'RESOURCE_ROLE_ROLLUP_2',
       'RESOURCE_ROLE_DEPTNAME', 'RESOURCE_ROLE_FAMILY_DESC',
       'RESOURCE_ROLE_FAMILY', 'RESOURCE_ROLE_CODE',
       'ROLE_ROLLUP_1_ROLE_ROLLUP_2', 'ROLE_ROLLUP_1_ROLE_DEPTNAME',
       'ROLE_ROLLUP_1_ROLE_FAMILY_DESC', 'ROLE_ROLLUP_1_ROLE_FAMILY',
       'ROLE_ROLLUP_1_ROLE_CODE', 'ROLE_ROLLUP_2_ROLE_DEPTNAME',
       'ROLE_ROLLUP_2_ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2_ROLE_FAMILY',
       'ROLE_ROLLUP_2_ROLE_CODE', 'ROLE_DEPTNAME_ROLE_FAMILY_DESC',
       'ROLE_DEPTNAME_ROLE_FAMILY', 'ROLE_DEPTNAME_ROLE_CODE',
       'ROLE_FAMILY_DESC_ROLE_FAMILY', 'ROLE_FAMILY_DESC_ROLE_CODE',
       'ROLE_FAMILY_ROLE_CODE'],
      dtype='object')

In [42]:
# The order of the pipeline steps matters: it changes the resulting output.
feature_engg = Pipeline(
    steps=[
        ('combine_feature', CombineFeatures()),

        # Alternative encoders, currently switched off:
        #('frequency_encoding', FrequencyEncoding(min_group_size = 2)),
        #('tfidf_vectorizer_encoding', TFIDFVectorizerEncoding()),

        ('count_vectorizer_encoding', CountVectorizerEncoding()),

        ('KFoldTE', KFoldTargetEncoder()),
    ],
    verbose=True,  # print per-step timing while fitting
)

# Note: this rebinds X, replacing any earlier value of the same name.
X = feature_engg.fit_transform(train)

# (rows, columns) of the engineered frame.
X.shape

[Pipeline] ... (step 1 of 3) Processing combine_feature, total=   0.2s
[Pipeline]  (step 2 of 3) Processing count_vectorizer_encoding, total= 2.2min
[Pipeline] ........... (step 3 of 3) Processing KFoldTE, total=  42.2s


(30391, 233)

In [36]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every column of X_2 against the ACTION target with the chi-squared
# statistic and keep the k highest-scoring columns.
# NOTE(review): X_2 is not assigned in any cell visible here, so this cell
# fails on a fresh kernel -- confirm where X_2 is built and add that cell.
# NOTE(review): the recorded feature_names_in_ output lists 29 columns, so with
# k = 29 nothing was filtered out in that run -- confirm the intended k.
chi2_feature = SelectKBest(score_func=chi2, k=29)
best_feature = chi2_feature.fit_transform(X_2, X['ACTION'])


In [37]:
# Per-feature chi-squared scores computed by the fitted selector, one entry per
# input column.
chi2_feature.scores_

array([  179.79681866,   440.72187967,  1500.57693636,   184.74044727,
          73.27398768,  5084.72221475,    58.63225907,   404.12521478,
        1598.98391097,    32.32397894,  1538.48661015,   754.73077548,
         588.22319832,    95.7254337 ,   450.55712416,   797.35428785,
        2218.18303481,   591.18845719,  2027.66683632,  1455.33417203,
        4834.32111978,   185.74463499,  7909.07825373,    54.62173851,
        6012.25826507,   544.87378976, 12109.88162184,  2726.38963297,
         325.85226668])

In [40]:
# Positions of the input features ordered from lowest to highest chi-squared
# score (argsort sorts ascending).
np.argsort(chi2_feature.scores_)

array([ 9, 23,  6,  4, 13,  0,  3, 21, 28,  7,  1, 14, 25, 12, 17, 11, 15,
       19,  2, 10,  8, 18, 16, 27, 20,  5, 24, 22, 26], dtype=int64)

In [39]:
# Column names the selector saw during fit, in the same order as scores_.
chi2_feature.feature_names_in_

array(['RESOURCE_0', 'ROLE_ROLLUP_1_0', 'ROLE_ROLLUP_2_0',
       'ROLE_DEPTNAME_0', 'ROLE_TITLE_0', 'ROLE_FAMILY_DESC_0',
       'ROLE_FAMILY_0', 'ROLE_CODE_0', 'RESOURCE_ROLE_ROLLUP_1_0',
       'RESOURCE_ROLE_ROLLUP_2_0', 'RESOURCE_ROLE_DEPTNAME_0',
       'RESOURCE_ROLE_FAMILY_DESC_0', 'RESOURCE_ROLE_FAMILY_0',
       'RESOURCE_ROLE_CODE_0', 'ROLE_ROLLUP_1_ROLE_ROLLUP_2_0',
       'ROLE_ROLLUP_1_ROLE_DEPTNAME_0',
       'ROLE_ROLLUP_1_ROLE_FAMILY_DESC_0', 'ROLE_ROLLUP_1_ROLE_FAMILY_0',
       'ROLE_ROLLUP_1_ROLE_CODE_0', 'ROLE_ROLLUP_2_ROLE_DEPTNAME_0',
       'ROLE_ROLLUP_2_ROLE_FAMILY_DESC_0', 'ROLE_ROLLUP_2_ROLE_FAMILY_0',
       'ROLE_ROLLUP_2_ROLE_CODE_0', 'ROLE_DEPTNAME_ROLE_FAMILY_DESC_0',
       'ROLE_DEPTNAME_ROLE_FAMILY_0', 'ROLE_DEPTNAME_ROLE_CODE_0',
       'ROLE_FAMILY_DESC_ROLE_FAMILY_0', 'ROLE_FAMILY_DESC_ROLE_CODE_0',
       'ROLE_FAMILY_ROLE_CODE_0'], dtype=object)

In [41]:
# Feature names ranked by chi-squared score, weakest first and strongest last
# (the index array is in ascending score order).
chi2_feature.feature_names_in_[np.argsort(chi2_feature.scores_)]

array(['RESOURCE_ROLE_ROLLUP_2_0', 'ROLE_DEPTNAME_ROLE_FAMILY_DESC_0',
       'ROLE_FAMILY_0', 'ROLE_TITLE_0', 'RESOURCE_ROLE_CODE_0',
       'RESOURCE_0', 'ROLE_DEPTNAME_0', 'ROLE_ROLLUP_2_ROLE_FAMILY_0',
       'ROLE_FAMILY_ROLE_CODE_0', 'ROLE_CODE_0', 'ROLE_ROLLUP_1_0',
       'ROLE_ROLLUP_1_ROLE_ROLLUP_2_0', 'ROLE_DEPTNAME_ROLE_CODE_0',
       'RESOURCE_ROLE_FAMILY_0', 'ROLE_ROLLUP_1_ROLE_FAMILY_0',
       'RESOURCE_ROLE_FAMILY_DESC_0', 'ROLE_ROLLUP_1_ROLE_DEPTNAME_0',
       'ROLE_ROLLUP_2_ROLE_DEPTNAME_0', 'ROLE_ROLLUP_2_0',
       'RESOURCE_ROLE_DEPTNAME_0', 'RESOURCE_ROLE_ROLLUP_1_0',
       'ROLE_ROLLUP_1_ROLE_CODE_0', 'ROLE_ROLLUP_1_ROLE_FAMILY_DESC_0',
       'ROLE_FAMILY_DESC_ROLE_CODE_0', 'ROLE_ROLLUP_2_ROLE_FAMILY_DESC_0',
       'ROLE_FAMILY_DESC_0', 'ROLE_DEPTNAME_ROLE_FAMILY_0',
       'ROLE_ROLLUP_2_ROLE_CODE_0', 'ROLE_FAMILY_DESC_ROLE_FAMILY_0'],
      dtype=object)

In [19]:
# Display the array returned by the selector's fit_transform.
# NOTE(review): this cell's saved execution count (19) is lower than that of the
# cell assigning best_feature (36), so the stored output may come from an
# earlier run -- re-run top to bottom before relying on it.
best_feature

array([[   24,   711, 10830, ...,  1433,  1956,  1345],
       [   87,    84,  2256, ...,   991,  2015,  1598],
       [  152,  1274,  8552, ...,  1366,  1316,  1315],
       ...,
       [   58,  1223,  2026, ...,  1051,  2504,  1180],
       [   96,  2021,  4739, ...,  1338,  1845,  2511],
       [  138,  1806,  5550, ...,   861,  1540,   506]])

## Train model ##

In [None]:
# Fit the estimator on every column from position 30 onward, using ACTION as
# the label.
# NOTE(review): `model` is not created in any cell visible here, so this cell
# fails on a fresh kernel -- add the cell that constructs the estimator.
# NOTE(review): 30 is a positional magic number. In the combine-only run above,
# position 30 is the raw 'ROLE_FAMILY_ROLE_CODE' column; if the full pipeline
# keeps that layout, the slice may start one column too early -- confirm.
model.fit(X[X.columns[30:]], X['ACTION'])

In [None]:
# confusion_matrix was called below without ever being imported; bring it into
# scope here so the cell runs on a fresh kernel.
from sklearn.metrics import confusion_matrix

# Class probabilities as a 2-D array: one row per sample, one column per class
# label. The rows scored are the same ones passed to model.fit, so the numbers
# below describe training data, not held-out data.
Y_predictions_by_class = model.predict_proba(X[X.columns[30:]]).astype(float)
Y = X['ACTION'].astype(float)

# Hard prediction = index of the most probable column.
# NOTE(review): this equals the class label only if the model's classes are
# ordered 0, 1 -- confirm against model.classes_.
Y_pred = Y_predictions_by_class.argmax(-1)
conf_matrix = confusion_matrix(Y, Y_pred)

print(conf_matrix)

In [None]:
# auc_score was displayed here, and used in the ROC plot label, without ever
# being assigned. Compute it from the same inputs the ROC curve uses: the true
# labels Y and the probability column at index 1.
from sklearn.metrics import roc_auc_score

auc_score = roc_auc_score(Y, Y_predictions_by_class[:, 1])
auc_score

In [None]:

from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ROC operating points, computed from the true labels and the probability
# column at index 1, collected into one frame.
fpr, tpr, thresholds = roc_curve(Y, Y_predictions_by_class[:,1])
roc_auc_curve_df = pd.DataFrame(
    {
        'false_positive_rates': fpr,
        'true_positive_rates': tpr,
        'thresholds': thresholds,
    }
)

# Draw the curve on a square canvas.
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.plot(
    roc_auc_curve_df['false_positive_rates'],
    roc_auc_curve_df['true_positive_rates'],
    color='green',
    label='ROC Curve',
)

ax.tick_params(axis='both', labelcolor='green')
ax.set(xlabel='False positive rate', ylabel='True positive rate')

# Annotate the plot with the AUC rounded to three decimals.
label_str = 'ROC-AUC: {0}'.format(round(auc_score, 3))
ax.text(0.5, 0, label_str, fontsize=6)