In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from service.Classifier.DataLoader.DataLoader import DataLoader
from service.Classifier.Enums.priority import Priority
from service.Classifier.DataLoader.P2DataLoader import P2DataLoader
from service.Classifier.DataLoader.P3DataLoader import P3DataLoader
from service.Classifier.DataLoader.P4DataLoader import P4DataLoader
from service.Classifier.Model.XGBRegressionModel import XGBRegressionModel
from service.Classifier.PreProcessing.RegressionModelPreProcessor import RegressionModelPreProcessor
from service.Classifier.DataSplit.TrainTestSplit import TrainTestSplit
from service.Classifier.Enums.dataTypeEnum import DataType

  from pandas import np


In [2]:
def initializeByPriority(priority):
    # load new updated data via Features class
    if priority == Priority.P2.value:
        return P2DataLoader()
    elif priority == Priority.P3.value:
        return P3DataLoader()
    elif priority == Priority.P4.value:
        return P4DataLoader()
    return None

In [3]:
priority = Priority.P2.value
def extract_data(priority):
    dataLoader = initializeByPriority(priority)
    data_df = dataLoader.loadTrainingRawData()
    # split data into train test 4:1
    dataSplit = TrainTestSplit()
    train_df, test_df = dataSplit.split(data_df)

    # generate features
    train_df, feature_names = dataLoader.transformRawDataToFeatures(train_df, DataType.TRAINDATA.value)
    test_df, feature_names = dataLoader.transformRawDataToFeatures(test_df, DataType.VALIDATION.value)
    # pre-processing the data based on model type
    preprocessor = RegressionModelPreProcessor(feature_names)
    train_X, train_y, test_X, test_y = preprocessor.preprocessing(train_df, test_df)
    
    return (train_X, train_y, test_X, test_y, train_df, test_df)

In [4]:
import xgboost as xgb
import numpy as np
import pandas as pd
from service.Classifier.Enums.LabelEnum import LabelEnum
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from service.Classifier.PreProcessing.Utils.dataValidator import DataValidator

def evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df):

    model = xgb.XGBRegressor(max_depth=5, n_estimators=100, learning_rate=0.05).fit(train_X, train_y)
    predictions = np.rint(model.predict(test_X)).astype(np.int64)
    submission = pd.DataFrame({'ID': test_df['keyID'],
                               LabelEnum.CLOSEDDAY.value: predictions, 'Actual': test_y})
    rmse = np.sqrt(mean_squared_error(predictions, test_df[LabelEnum.CLOSEDDAY.value]))
    print("RMSE: %f" % rmse)
    mae = mean_absolute_error(predictions, test_y)
    print("MAE: %f" % mae)
    DataValidator().valid(submission, priority, test_df)
    print(confusion_matrix(submission["truth"], submission["prediction"]))
    print(classification_report(submission["truth"], submission["prediction"]))
    
    return (test_y, predictions, submission["truth"], submission["prediction"])

In [5]:
priority = Priority.P2.value

train_X, train_y, test_X, test_y, train_df, test_df = extract_data(priority)

p2_regression_true, p2_regression_pred, p2_classification_true, p2_classification_pred = evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df)



RMSE: 10.130254
MAE: 7.944882
[[145 205]
 [ 89 196]]
              precision    recall  f1-score   support

       False       0.62      0.41      0.50       350
        True       0.49      0.69      0.57       285

    accuracy                           0.54       635
   macro avg       0.55      0.55      0.53       635
weighted avg       0.56      0.54      0.53       635



In [6]:
priority = Priority.P3.value

train_X, train_y, test_X, test_y, train_df, test_df = extract_data(priority)

p3_regression_true, p3_regression_pred, p3_classification_true, p3_classification_pred = evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df)



RMSE: 20.449484
MAE: 14.329070
[[513  98]
 [175  74]]
              precision    recall  f1-score   support

       False       0.75      0.84      0.79       611
        True       0.43      0.30      0.35       249

    accuracy                           0.68       860
   macro avg       0.59      0.57      0.57       860
weighted avg       0.65      0.68      0.66       860



In [7]:
priority = Priority.P4.value

train_X, train_y, test_X, test_y, train_df, test_df = extract_data(priority)

p4_regression_true, p4_regression_pred, p4_classification_true, p4_classification_pred = evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df)



RMSE: 25.536339
MAE: 17.656904
[[305  26]
 [130  17]]
              precision    recall  f1-score   support

       False       0.70      0.92      0.80       331
        True       0.40      0.12      0.18       147

    accuracy                           0.67       478
   macro avg       0.55      0.52      0.49       478
weighted avg       0.61      0.67      0.61       478



In [10]:
train_df

Unnamed: 0,keyID,priority,chainLength,longTimeHold,lateTriageDays,InitialActivityCount,InitialActionCount,recentActionCount,recentActivityCount,ResolvedDay,ClosedDay
0,COMS-8231,P4,3,0,0,0.933333,0.000000,0.0,0.800000,65,65
1,CANCEL-6080,P4,1,0,0,0.533333,0.000000,0.0,0.200000,68,68
2,BILLING-16551,P4,0,0,0,0.400000,0.133333,0.0,0.066667,49,49
3,CANCEL-6067,P4,0,0,0,0.533333,0.000000,0.0,0.266667,78,78
4,BILLING-16534,P4,1,0,0,0.533333,0.000000,0.0,0.200000,56,56
...,...,...,...,...,...,...,...,...,...,...,...
1907,VOD-3573,P4,0,0,0,0.200000,0.000000,0.0,0.133333,68,68
1908,CANCEL-6268,P4,2,0,0,0.933333,0.000000,0.0,0.466667,80,80
1909,RES-20850,P4,0,0,0,1.000000,0.133333,0.0,0.133333,86,86
1910,BILLING-17524,P4,1,0,0,1.066667,0.200000,0.0,0.266667,86,86


In [8]:
# regression results
regression_true = pd.concat([p2_regression_true, p3_regression_true])
regression_true = pd.concat([regression_true, p4_regression_true])

regression_pred = np.concatenate((p2_regression_pred, p3_regression_pred), axis=0)
regression_pred = np.concatenate((regression_pred, p4_regression_pred), axis=0)

## classification results
classification_true = pd.concat([p2_classification_true, p3_classification_true])
classification_true = pd.concat([classification_true, p4_classification_true])

classification_pred = pd.concat([p2_classification_pred, p3_classification_pred])
classification_pred = pd.concat([classification_pred, p4_classification_pred])

In [9]:
print("---Dynamic Features (including Activity Sequ) results on all data 75%---")
rmse = np.sqrt(mean_squared_error(regression_pred, regression_true))
print("RMSE: %f" % rmse)
mae = mean_absolute_error(regression_pred, regression_true)
print("MAE: %f" % mae)
print(confusion_matrix(classification_true, classification_pred))
print(classification_report(classification_true, classification_pred))

---Dynamic Features (including Activity Sequ) results on all data 75%---
RMSE: 18.824260
MAE: 13.080588
[[963 329]
 [394 287]]
              precision    recall  f1-score   support

       False       0.71      0.75      0.73      1292
        True       0.47      0.42      0.44       681

    accuracy                           0.63      1973
   macro avg       0.59      0.58      0.58      1973
weighted avg       0.63      0.63      0.63      1973

