In [1]:
import gc
import os
import numpy as np 
import pandas as pd 
import utils.paths as plh
import utils.read_utils as hlpread
from src.models.feature_eng.TE_KFold import KFoldTargetEncoder
from src.models.feature_eng.FreqEncoding import FrequencyEncoding
from src.models.feature_eng.Combine_feature import CombineFeatures
from src.models.feature_eng.KFoldFreqEncoding import KFoldFrequencyEncoding
from src.models.feature_eng.CountVectorizerEncoding import CountVectorizerEncoding
from src.models.feature_eng.TFIDFVectorizerEncoding import TFIDFVectorizerEncoding

# Pipeline #

In [2]:
PROJECT_ROOT = plh.get_project_root()

# Assemble the location of the cleaned training parquet from path fragments
# looked up with hlpread.read_yaml_key, then load it.
train_path_parts = [
    hlpread.read_yaml_key('data_source.data_folders'),
    hlpread.read_yaml_key('data_source.prepared.folder'),
    hlpread.read_yaml_key('data_source.prepared.clean_train'),
]
train_data = os.path.join(PROJECT_ROOT, *train_path_parts)
train = hlpread.read_from_parquet(train_data)

# Quick look at what was loaded: dimensions first, then column labels.
print(train.shape)

print(train.columns)

# Disabled experiment (kept for reference): drop 'ROLE_TITLE' and 'MGR_ID',
# treat 'ACTION' as the target, and target-encode every remaining column.

(30391, 10)
Index(['ACTION', 'RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
       'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY',
       'ROLE_CODE'],
      dtype='object')


In [3]:
from sklearn.pipeline import Pipeline

# Reduced pipeline: combine features, then KFoldTargetEncoder with n_fold=5.
# The TF-IDF step is currently disabled:
#   ('tfidf_vectorizer_encoding', TFIDFVectorizerEncoding())
reduced_steps = [
    ('combine_feature', CombineFeatures()),
    ('KFoldTE', KFoldTargetEncoder(n_fold=5)),
]
feature_engg = Pipeline(steps=reduced_steps)

# Fitting is currently disabled in this cell:
# X = feature_engg.fit_transform(train)


In [None]:
from sklearn.pipeline import Pipeline

# Step order matters: every step receives the output of the step before it,
# so reordering changes the resulting features.
full_steps = [
    ('combine_feature', CombineFeatures()),
    ('KFoldTE', KFoldTargetEncoder()),
    ('frequency_encoding', FrequencyEncoding(min_group_size=2)),
    # TF-IDF step currently disabled:
    # ('tfidf_vectorizer_encoding', TFIDFVectorizerEncoding()),
    ('count_vectorizer_encoding', CountVectorizerEncoding()),
]

# verbose=True makes the pipeline report progress as each step is fitted.
feature_engg = Pipeline(steps=full_steps, verbose=True)

X = feature_engg.fit_transform(train)

In [None]:
# Dimensions of the feature frame produced by feature_engg.fit_transform(train).
X.shape

In [None]:
# Count missing values in X (per column, assuming X is a DataFrame) to check
# whether any encoder left gaps.
X.isna().sum()

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics

In [None]:
# Column labels from position 30 onward; the same slice is used as model input below.
# NOTE(review): 30 is a hard-coded offset — presumably it skips columns that
# should not be fed to the model; confirm against the pipeline's output layout.
X.columns[30:]

In [None]:
# Missing-value counts restricted to the columns from position 30 onward
# (the slice that is passed to the model).
X[X.columns[30:]].isna().sum()

In [None]:
# Fit a cross-validated logistic regression (library defaults) on the columns
# from position 30 onward, with train['ACTION'] as the target.
logreg = LogisticRegressionCV()
model_columns = X.columns[30:]
logreg.fit(X[model_columns], train['ACTION'])

In [None]:
# Hard class-label predictions on the same rows the model was fitted on,
# i.e. in-sample predictions rather than a held-out evaluation.
y_pred = logreg.predict(X[X.columns[30:]])

In [None]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it back to `confusion_matrix`
# would shadow the imported function, so any later call to
# confusion_matrix(...) would fail until the import is re-run.
conf_mat = confusion_matrix(train['ACTION'], y_pred)
print(conf_mat)

In [None]:
from sklearn.metrics import classification_report

# Text summary of per-class metrics for the in-sample predictions.
report_text = classification_report(train['ACTION'], y_pred)
print(report_text)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ROC analysis ranks rows by a continuous score. Feeding it the hard labels
# from predict() collapses the curve to a single operating point and gives a
# misleading AUC, so use the model's decision scores instead.
# Assumes 'ACTION' is a binary target, in which case decision_function
# returns one score per row — TODO confirm.
y_score = logreg.decision_function(X[X.columns[30:]])

logit_roc_auc = roc_auc_score(train['ACTION'], y_score)
fpr, tpr, thresholds = roc_curve(train['ACTION'], y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): the figure is written to the current working directory.
plt.savefig('Log_ROC')
plt.show()

# Temp #

In [None]:
from sklearn.pipeline import Pipeline

# Step order matters: every step receives the output of the step before it,
# so reordering changes the resulting features.
temp_steps = [
    ('combine_feature', CombineFeatures()),
    ('KFoldTE', KFoldTargetEncoder()),
    ('frequency_encoding', FrequencyEncoding(min_group_size=2)),
]
feature_engg = Pipeline(steps=temp_steps)

# Same pipeline as the full one but without the count-vectorizer step;
# this rebinds both feature_engg and X.
X = feature_engg.fit_transform(train)

In [None]:
# Dimensions of X after the pipeline without the count-vectorizer step.
X.shape

In [None]:
# Replace X's index with a fresh 0..n-1 range (old index discarded, X mutated in place).
# X is later merged with cnt_vector on the index, and cnt_vector gets the same reset.
# NOTE(review): this aligns rows purely by position — confirm both frames
# hold the same rows in the same order.
X.reset_index(drop = True, inplace = True)

In [None]:
# Build the path of the stored count-vector features from path fragments
# looked up with hlpread.read_yaml_key, then load the parquet file.
cnt_vector_path = os.path.join(
    PROJECT_ROOT,
    hlpread.read_yaml_key('data_source.data_folders'),
    hlpread.read_yaml_key('featurize.count_vector.output.folder'),
    hlpread.read_yaml_key('featurize.count_vector.output.filename'),
)
cnt_vector = hlpread.read_from_parquet(cnt_vector_path)

# Use a fresh 0..n-1 index so the frame can be merged with X on the index.
cnt_vector.reset_index(drop=True, inplace=True)
cnt_vector.shape

In [None]:
# Display a single column of the loaded count-vector frame as a spot check.
cnt_vector.ROLE_FAMILY_ROLE_CODE_RESOURCE_ROLE_ROLLUP_1_svd_cv

In [None]:
# Display the RESOURCE column of X (and its index) as a spot check.
X.RESOURCE

In [None]:
# Inner join on the index: keep only index labels present in both the
# engineered features and the count-vector features.
Y = X.merge(cnt_vector, how='inner', left_index=True, right_index=True)
Y.shape

In [None]:
# Missing-value counts in the merged frame, to check that the join
# did not introduce gaps.
Y.isna().sum()

In [None]:
# Display the same count-vector column after the merge, for comparison with
# the pre-merge values.
Y.ROLE_FAMILY_ROLE_CODE_RESOURCE_ROLE_ROLLUP_1_svd_cv

In [None]:
# Show only the rows where this column is missing.
# Indexing a frame with the boolean frame Y.isna() does not filter rows: it
# masks every non-missing cell to NaN and keeps all rows, so the result is a
# full-length all-NaN column. Filter with a row mask on the column instead.
svd_cv_col = 'ROLE_FAMILY_ROLE_CODE_RESOURCE_ROLE_ROLLUP_1_svd_cv'
Y.loc[Y[svd_cv_col].isna(), svd_cv_col]

In [None]:
# Build the path of the stored TF-IDF features from path fragments looked up
# with hlpread.read_yaml_key, then load the parquet file.
tfidf_vector_path = os.path.join(
    PROJECT_ROOT,
    hlpread.read_yaml_key('data_source.data_folders'),
    hlpread.read_yaml_key('featurize.tfidf.output.folder'),
    hlpread.read_yaml_key('featurize.tfidf.output.filename'),
)
tfidf_vector = hlpread.read_from_parquet(tfidf_vector_path)

# Unlike cnt_vector, the index is not reset here.
tfidf_vector.shape

In [None]:
# Combine the encoder outputs on the index, then left-join them onto the TF-IDF
# features so every TF-IDF row is kept.
# NOTE(review): freq_encoder_X and kte_encoder_X are not defined anywhere in the
# visible notebook — on a fresh kernel this cell raises NameError. Define them
# (or derive them from the pipeline output) before this cell.
X = pd.merge(freq_encoder_X, kte_encoder_X, left_index = True, right_index = True, how = 'inner')
X = pd.merge(tfidf_vector, X, left_index = True, right_index = True, how = 'left')
print(X.shape)

In [None]:
# Target used for model fitting. This rebinds Y, which earlier in the notebook
# held a merged feature frame.
Y = train['ACTION']

# Feature matrix: index-aligned inner join of the frequency-encoder output and
# the TF-IDF features. This overwrites the X built in the previous cell.
# NOTE(review): freq_encoder_X is not defined in the visible notebook — this
# raises NameError on a fresh kernel.
# NOTE(review): an inner join can drop rows, while Y keeps every row of train —
# confirm X and Y stay row-aligned before fitting.
X = pd.merge(freq_encoder_X, tfidf_vector, left_index = True, right_index = True, how = 'inner')
# Disabled: additionally join the K-fold target-encoder output.
#X = pd.merge(kte_encoder_X, X, left_index = True, right_index = True, how = 'inner')
print(X.shape)
X.isna().sum()

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression: Cs=7 candidate regularisation
# strengths, 5 CV folds, at most 100 solver iterations.
# class_weight must be the literal 'balanced' (or a dict / None); the previous
# value 'balance' is not a recognised option.
lr_cv = LogisticRegressionCV(Cs=7,
                             max_iter=100,
                             class_weight='balanced',
                             cv=5)

lr_cv.fit(X, Y)