In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from service.Classifier.DataLoader.DataLoader import DataLoader
from service.Classifier.Enums.priority import Priority
from service.Classifier.DataLoader.P2DataLoader import P2DataLoader
from service.Classifier.DataLoader.P3DataLoader import P3DataLoader
from service.Classifier.DataLoader.P4DataLoader import P4DataLoader
from service.Classifier.Model.XGBRegressionModel import XGBRegressionModel
from service.Classifier.PreProcessing.RegressionModelPreProcessor import RegressionModelPreProcessor
from service.Classifier.DataSplit.TrainTestSplit import TrainTestSplit
from service.Classifier.Enums.dataTypeEnum import DataType

  from pandas import np


In [2]:
def initializeByPriority(priority):
    """Return a new data loader instance for the given priority value.

    ``priority`` is compared against the ``.value`` of ``Priority.P2``,
    ``Priority.P3`` and ``Priority.P4`` in that order; the first match decides
    which loader class is instantiated. ``None`` is returned when nothing matches.
    """
    loader_by_level = (
        (Priority.P2, P2DataLoader),
        (Priority.P3, P3DataLoader),
        (Priority.P4, P4DataLoader),
    )
    for level, loader_cls in loader_by_level:
        if priority == level.value:
            # Instantiate lazily so only the matching loader is ever constructed.
            return loader_cls()
    return None

In [3]:
# Initial module-level priority; the evaluation cells further down reassign
# `priority` before each run, and evaluate_result reads it as a global.
priority = Priority.P2.value
def extract_data(priority):
    """Load, split, featurize and pre-process the data for one priority.

    Steps, in order: obtain the loader from ``initializeByPriority``, load the raw
    training data, split it with ``TrainTestSplit``, turn each part into
    features, and run ``RegressionModelPreProcessor.preprocessing`` on both.

    Returns a 6-tuple ``(train_X, train_y, test_X, test_y, train_df, test_df)``
    where the two frames are the featurized (not raw) train and test parts.
    """
    loader = initializeByPriority(priority)
    raw_df = loader.loadTrainingRawData()

    # No ratio is passed to the splitter here; the original author's note says
    # the split is 4:1 train:test -- TODO confirm inside TrainTestSplit.
    train_df, test_df = TrainTestSplit().split(raw_df)

    # Featurize each part. Only the feature names returned by the validation
    # transform are used below; the ones from the training transform are discarded.
    # NOTE(review): presumably both calls return the same names -- confirm.
    train_df, _ = loader.transformRawDataToFeatures(train_df, DataType.TRAINDATA.value)
    test_df, feature_names = loader.transformRawDataToFeatures(test_df, DataType.VALIDATION.value)

    # Model-specific pre-processing produces the design matrices and targets.
    train_X, train_y, test_X, test_y = RegressionModelPreProcessor(feature_names).preprocessing(
        train_df, test_df
    )

    return train_X, train_y, test_X, test_y, train_df, test_df

In [4]:
import xgboost as xgb
import numpy as np
import pandas as pd
from service.Classifier.Enums.LabelEnum import LabelEnum
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from service.Classifier.PreProcessing.Utils.dataValidator import DataValidator

def evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df, priority=None):
    """Fit an XGBoost regressor, print its metrics, and return the scored values.

    The model is trained on ``train_X``/``train_y``. Predictions for ``test_X``
    are rounded to the nearest integer (``np.rint``) and cast to ``int64``.
    RMSE and MAE are printed, the submission frame is handed to
    ``DataValidator().valid``, and a confusion matrix and classification report
    are printed from the frame's ``"truth"`` and ``"prediction"`` columns.

    Parameters
    ----------
    train_X, train_y
        Training features and target passed to ``XGBRegressor.fit``.
    test_X, test_y
        Test features and target; ``test_y`` is used for MAE and the
        ``'Actual'`` submission column.
    train_df
        Not used by this function; kept so existing callers keep working.
    test_df
        Featurized test frame; supplies ``keyID`` and the closed-day label
        column used for RMSE, and is forwarded to ``DataValidator().valid``.
    priority : optional
        Priority value forwarded to ``DataValidator().valid``. When omitted, the
        module-level ``priority`` variable is used, which is what this function
        read before the parameter existed. Passing it explicitly removes the
        dependency on notebook-global state.

    Returns
    -------
    tuple
        ``(test_y, predictions, submission["truth"], submission["prediction"])``.
    """
    if priority is None:
        # Backward-compatible fallback for callers that still rely on the
        # notebook-level `priority` variable.
        priority = globals()["priority"]

    model = xgb.XGBRegressor(max_depth=5, n_estimators=100, learning_rate=0.05).fit(train_X, train_y)
    predictions = np.rint(model.predict(test_X)).astype(np.int64)
    submission = pd.DataFrame({'ID': test_df['keyID'],
                               LabelEnum.CLOSEDDAY.value: predictions, 'Actual': test_y})

    # scikit-learn metrics take (y_true, y_pred); both metrics are symmetric, so
    # the printed values are unchanged by using the documented order.
    # NOTE(review): RMSE is measured against the label column of test_df while
    # MAE uses test_y -- presumably the same values; confirm.
    rmse = np.sqrt(mean_squared_error(test_df[LabelEnum.CLOSEDDAY.value], predictions))
    print("RMSE: %f" % rmse)
    mae = mean_absolute_error(test_y, predictions)
    print("MAE: %f" % mae)

    # The "truth"/"prediction" columns read below are not created in this
    # function; presumably DataValidator.valid adds them to `submission` in place.
    DataValidator().valid(submission, priority, test_df)
    print(confusion_matrix(submission["truth"], submission["prediction"]))
    print(classification_report(submission["truth"], submission["prediction"]))

    return (test_y, predictions, submission["truth"], submission["prediction"])

In [5]:
# P2 run: build the train/test matrices, then fit and score the regressor.
# `priority` stays a module-level name because evaluate_result reads it as a global.
priority = Priority.P2.value
train_X, train_y, test_X, test_y, train_df, test_df = extract_data(priority)
(
    p2_regression_true,
    p2_regression_pred,
    p2_classification_true,
    p2_classification_pred,
) = evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df)



The project is not in embedding: NATBEMCR
The project is not in embedding: GGRIP
The project is not in embedding: PROBLEM
The project is not in embedding: PX
The project is not in embedding: CSCNPLAT
RMSE: 9.884145
MAE: 8.098160
[[192 187]
 [ 74 199]]
              precision    recall  f1-score   support

       False       0.72      0.51      0.60       379
        True       0.52      0.73      0.60       273

    accuracy                           0.60       652
   macro avg       0.62      0.62      0.60       652
weighted avg       0.64      0.60      0.60       652



In [6]:
# Show the featurized training frame returned by the extract_data call above
# (the notebook renders the cell's last expression).
train_df

Unnamed: 0,keyID,priority,projectCoverage,ResolvedDay,ClosedDay
0,ONBAML-12719,P2,0.049914,12,12
1,RES-20723,P2,0.008016,10,15
2,BILLING-16571,P2,0.038925,13,15
3,CANCEL-6059,P2,0.004836,12,14
4,REGSTRN-12121,P2,0.150463,22,26
...,...,...,...,...,...
2637,BILLING-16640,P2,0.038925,27,27
2638,ONBAML-12769,P2,0.224657,10,10
2639,COMS-7986,P2,0.008488,12,12
2640,BILLING-16593,P2,0.035085,13,13


In [7]:
# P3 run: build the train/test matrices, then fit and score the regressor.
# `priority` stays a module-level name because evaluate_result reads it as a global.
priority = Priority.P3.value
train_X, train_y, test_X, test_y, train_df, test_df = extract_data(priority)
(
    p3_regression_true,
    p3_regression_pred,
    p3_classification_true,
    p3_classification_pred,
) = evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df)



The project is not in embedding: RISKVRTB
The project is not in embedding: MPPMO
The project is not in embedding: NRP
The project is not in embedding: WARROOM
The project is not in embedding: PROBLEM
The project is not in embedding: WEBRES
RMSE: 19.845728
MAE: 13.970362
[[548 108]
 [134 121]]
              precision    recall  f1-score   support

       False       0.80      0.84      0.82       656
        True       0.53      0.47      0.50       255

    accuracy                           0.73       911
   macro avg       0.67      0.65      0.66       911
weighted avg       0.73      0.73      0.73       911



In [8]:
# Show the featurized training frame returned by the extract_data call above
# (the notebook renders the cell's last expression).
train_df

Unnamed: 0,keyID,priority,projectCoverage,ResolvedDay,ClosedDay
0,TRXAPI-1402,P3,0.015738,41,41
1,VOD-3492,P3,0.004128,45,54
2,BILLING-16549,P3,0.035085,41,42
3,BILLING-16793,P3,0.038925,48,48
4,COMS-7977,P3,0.004164,42,47
...,...,...,...,...,...
3449,BILLING-16639,P3,0.038925,47,47
3450,VOD-3539,P3,0.005637,39,39
3451,VOD-3604,P3,0.056915,69,71
3452,CANCEL-6130,P3,0.004836,43,48


In [9]:
# P4 run: build the train/test matrices, then fit and score the regressor.
# `priority` stays a module-level name because evaluate_result reads it as a global.
priority = Priority.P4.value
train_X, train_y, test_X, test_y, train_df, test_df = extract_data(priority)
(
    p4_regression_true,
    p4_regression_pred,
    p4_classification_true,
    p4_classification_pred,
) = evaluate_result(train_X, train_y, test_X, test_y, train_df, test_df)



The project is not in embedding: SDOCATOP
RMSE: 25.392762
MAE: 17.355932
[[288  42]
 [ 87  55]]
              precision    recall  f1-score   support

       False       0.77      0.87      0.82       330
        True       0.57      0.39      0.46       142

    accuracy                           0.73       472
   macro avg       0.67      0.63      0.64       472
weighted avg       0.71      0.73      0.71       472



In [10]:
# Show the featurized training frame returned by the extract_data call above
# (the notebook renders the cell's last expression).
train_df

Unnamed: 0,keyID,priority,projectCoverage,ResolvedDay,ClosedDay
0,CANCEL-6080,P4,0.004131,68,68
1,CANCEL-6072,P4,0.004836,61,61
2,BILLING-16551,P4,0.038925,49,49
3,CANCEL-6067,P4,0.004836,78,78
4,BILLING-16534,P4,0.040816,56,56
...,...,...,...,...,...
1913,VOD-3573,P4,0.056915,68,68
1914,RES-21352,P4,0.008016,50,57
1915,CANCEL-6268,P4,0.004724,80,80
1916,BILLING-17524,P4,0.045840,86,86
