In [3]:
import os
from os import path
import sys
import requests
import pickle
import platform
import csv
import xgboost as xgb
import pathlib
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import pandas as pd

from service.Classifier.DataLoader.DataLoader import DataLoader
from service.Classifier.Enums.priority import Priority
from service.Classifier.DataLoader.P2DataLoader import P2DataLoader
from service.Classifier.DataLoader.P3DataLoader import P3DataLoader
from service.Classifier.DataLoader.P4DataLoader import P4DataLoader
from service.Classifier.Model.XGBRegressionModel import XGBRegressionModel
from service.Classifier.PreProcessing.RegressionModelPreProcessor import RegressionModelPreProcessor
from service.Classifier.FeatureExtraction.RegressionOOSLAPredictionFeature import RegressionOOSLAPredictionFeature
from service.Classifier.Enums.percentageOfSLA import PercentageOfSLA
from service.Classifier.Constants.Constant import mongodbProdUrl, mongodbStagingUrl
from service.Classifier.Indicators.Indicator import Indicator
from service.Classifier.Enums.IndicatorName import IndicatorName
from service.Classifier.DataSplit.TrainTestSplit import TrainTestSplit
from service.Classifier.Enums.dataTypeEnum import DataType

# from hpsklearn import HyperoptEstimator, xgboost_regression
# from hyperopt import tpe
print('Python version:', platform.python_version())
# print('tensorflow version:', tf.__version__)
sys.path.insert(0, os.path.abspath('../'))

Python version: 3.8.5


  from pandas import np


In [4]:
def initializeByPriority(priority):
    # load new updated data via Features class
    if priority == Priority.P2.value:
        return P2DataLoader()
    elif priority == Priority.P3.value:
        return P3DataLoader()
    elif priority == Priority.P4.value:
        return P4DataLoader()
    return None

## Contextual Data Loader

In [5]:
class BehaviorBuilder:
        
    def buildReporterToActivityDict(self, data_df):
        # collect number of activities of each reporter by priority
        reporterToActivity = {}
        for index, row in data_df.iterrows():
            for history in row["histories"]:
                name = history["author"]["_id"]
                if name not in reporterToActivity:
                    reporterToActivity[name] = []
                reporterToActivity[name].append(history["time"])
            # default NaN for empty bodies
            if type(row["bodies"]) != float: 
                for history in row["bodies"]:
                    name = history["authorID"]
                    if name not in reporterToActivity:
                        reporterToActivity[name] = []
                    reporterToActivity[name].append(history["created"])
        return reporterToActivity

    def buildProjectToBugDict(self, data_df):
        # collect bugs of each Jirakey  by priority
        projectToBug = {}
        for index, row in data_df.iterrows():
            name = row["keyID"].split("-")[0]
            if name not in projectToBug:
                projectToBug[name] = []
            projectToBug[name].append(row["created"])
        return projectToBug
    
    def buildProjectToClosedBugDict(self, data_df):
        # collect closed bugs of each Jirakey by priority
        projectToClosedBug = {}
        for index, row in data_df.iterrows():
            if row["status"] != "Closed":
                continue
            name = row["keyID"].split("-")[0]
            if name not in projectToClosedBug:
                projectToClosedBug[name] = []
            projectToClosedBug[name].append(row["created"])
        return projectToClosedBug
    
    def buildProjectToActivityDict(self, data_df):
        # collect activity of each Jirakey by priority
        projectToActivity = {}
        for index, row in data_df.iterrows():
            name = row["keyID"].split("-")[0]
            if name not in projectToActivity:
                projectToActivity[name] = []
            # default NaN for empty bodies
            if type(row["bodies"]) != float: 
                projectToActivity[name].append({
                    "created": row["created"],
                    "count": len(row["histories"]) + len(row["bodies"])
                })
            else:
                projectToActivity[name].append({
                    "created": row["created"],
                    "count": len(row["histories"])
                })
        return projectToActivity

    def buildHistoryDict(self, data_df, dataLoader):
        data_df = dataLoader.concateWithComments(data_df)
        reporterToActivity = self.buildReporterToActivityDict(data_df)
        projectToBug = self.buildProjectToBugDict(data_df)
        projectToClosedBug = self.buildProjectToClosedBugDict(data_df)
        projectToActivity = self.buildProjectToActivityDict(data_df)
        return reporterToActivity, projectToBug, projectToClosedBug, projectToActivity
    
    def extractCommentActivity(self, data_df):

        commentActivity = {}

        for index, row in data_df.iterrows():
            for history in row["histories"]:
                name = history["author"]["_id"]
                if name not in commentActivity:
                    commentActivity[name] = []
                commentActivity[name].append(history["time"])

        return commentActivity
    
    def extractCreatorTickets(self, data_df):
        ticketsOpenByReporter = {}

        for index, row in data_df.iterrows():
            submitter = row["reporter"]
            status = row["status"]

            if submitter not in ticketsOpenByReporter:
                ticketsOpenByReporter[submitter] = []

            timestamp = row["timestamp"]//1000
            dt_object = datetime.fromtimestamp(timestamp)
            date_time = dt_object.strftime("%Y-%m-%dT%H:%M:%S")

            ticketsOpenByReporter[submitter].append((date_time, status))

        return ticketsOpenByReporter

In [6]:
priorities = [Priority.P2.value, Priority.P3.value, Priority.P4.value]
reporterToActivityDict = {}
projectToBugDict = {}
projectToClosedBugDict = {}
projectToActivityDict = {}
for priority in priorities:
    dataLoader = initializeByPriority(priority)
    data_df = dataLoader.loadAllDefectsByPriority(priority)
    reporterToActivity, projectToBug, projectToClosedBug, projectToActivity = BehaviorBuilder().buildHistoryDict(data_df, dataLoader)
    reporterToActivityDict[priority] = reporterToActivity
    projectToBugDict[priority] = projectToBug
    projectToClosedBugDict[priority] = projectToClosedBug
    projectToActivityDict[priority] = projectToActivity
    CreatorTicketsActivity = BehaviorBuilder().extractCreatorTickets(data_df)
    CreatorCommentsActivity = BehaviorBuilder().extractCommentActivity(data_df)



In [7]:
from datetime import date

def daysDiff(d1, d2):
#     print(d1)
    date1 = d1.split('T')[0].split('-')
#     print(date1)
    date2 = d2.split('T')[0].split('-')
#     print(date2)
    date1_data = date(int(date1[0]), int(date1[1]), int(date1[2]))
    date2_data = date(int(date2[0]), int(date2[1]), int(date2[2]))
    delta = date2_data - date1_data
    return delta.days

## Define contextual indicators

This part is used to define each features. After succesffully build a feature class, adding the class into dataloader's feature.indicators. 

### NCommentT

In [8]:
# Number of comments
class NCommentT(Indicator):

    def getIndicatorName(self):
        return "NCommentT"

    def getIndicatorValue(self, ticket, dataType):
        history, bodies, duration = self.getPreProcessedHistoryCommentsDuration(ticket, dataType)

        return len(bodies)

### NActorsT

In [9]:
# number of active actors 
# number of unique users in comments and histories

class NActor(Indicator):

    def getIndicatorName(self):
        return "NActor"

    def getIndicatorValue(self, ticket, dataType):
        history, bodies, duration = self.getPreProcessedHistoryCommentsDuration(ticket, dataType)
        actors = set()
    
        for idx, row in history.iterrows():
            actor = row['author']['_id']
            actors.add(actor)
            
        for idx, row in bodies.iterrows():
            actors.add(row["authorID"])
        
        return len(actors)

### LabelsT

In [10]:
# number of active actors 
# number of unique users in comments and histories
all_infor = pd.read_pickle("/Users/fjirigesi/Desktop/ExtarctedInfo.pickle")

class LabelsT(Indicator):

    def getIndicatorName(self):
        return "LabelsT"

    def getIndicatorValue(self, ticket, dataType):        
        # todo: Should we read this pickle all the time? can we improve here?
        
        ticket_id = ticket['keyID']
#         print(ticket_id)
        
        labels = all_infor.loc[all_infor['issueKey']== ticket_id, 'labels']
#         print(labels)
        labelsSet = set()
        
        for index, label_list in labels.items():
            for label in label_list:
                labelsSet.add(label)

        
        return len(labelsSet)

### meanCommnetSizeT

In [11]:
# average length of comments
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class meanCommentSize(Indicator):

    def getIndicatorName(self):
        return "meanCommentSize"

    def getIndicatorValue(self, ticket, dataType):
        history, bodies, duration = self.getPreProcessedHistoryCommentsDuration(ticket, dataType)
        
        stop_words = set(stopwords.words('english'))
        
        sentence_sizes = []
        
        for idx, row in bodies.iterrows():
            content = row['content']
            word_tokens = word_tokenize(content)
            
            filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
            
            filtered_sentence = []
 
            for w in word_tokens:
                if w not in stop_words:
                    filtered_sentence.append(w)
            
            filtered_sentence_size = len(filtered_sentence)
            
            sentence_sizes.append(filtered_sentence_size)
            
        if len(sentence_sizes) > 0:
            return sum(sentence_sizes)/len(sentence_sizes)
        else:
            return 0
            

### ticketCleanedBodyLen

In [12]:
# Length of the combined title and body with markdown parsed and tags removed.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

class ticketCleanedBodyLen(Indicator):

    def getIndicatorName(self):
        return "ticketCleanedBodyLen"

    def getIndicatorValue(self, ticket, dataType):
        history, bodies, duration = self.getPreProcessedHistoryCommentsDuration(ticket, dataType)
        title = ticket['summary']
        body = ticket['description']
        total = title + body
        total = re.sub(r'^https?:\/\/.*[\r\n]*', '', total, flags=re.MULTILINE) # remove urls
        
        
        stop_words = set(stopwords.words('english'))

        word_tokens = word_tokenize(total)

        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

        filtered_sentence = []

        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
        
        return len(filtered_sentence)

## nTicketsByCreatorOpen

In [13]:
# Number of activity of the reporter by T
class nticketsByCreatorOpen(Indicator):

    def getIndicatorName(self):
        return "nticketsByCreatorOpen"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberofTicketsByCreatorOpen(ticket["reporter"], ticket["created"])
    
    def getNumberofTicketsByCreatorOpen(self, reporter, created):
        
        if reporter not in CreatorTicketsActivity:
            return 0
        
        NumberCreatedTickets = 0
        for dt in CreatorTicketsActivity[reporter]:
            days = daysDiff(dt[0], created)
            if days < 90 and days > 0:
                NumberCreatedTickets += 1
        return NumberCreatedTickets

## nClosedTicketsByCreatorOpen

In [14]:

# Number of activity of the reporter by T
class nClosedticketsByCreatorOpen(Indicator):

    def getIndicatorName(self):
        return "nClosedticketsByCreatorOpen"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberofClosedTicketsByCreatorOpen(ticket["reporter"], ticket["created"])
    
    def getNumberofClosedTicketsByCreatorOpen(self, reporter, created):
        
        if reporter not in CreatorTicketsActivity:
            return 0
        
        NumberClosedTickets = 0
        for dt in CreatorTicketsActivity[reporter]:
            if dt[1] == "Closed":
                days = daysDiff(dt[0], created)
                if days < 90 and days > 0:
                    NumberClosedTickets += 1
        return NumberClosedTickets

### nCommentsByCreator

In [15]:
class nCommentsByCreator(Indicator):

    def getIndicatorName(self):
        return "nCommentsByCreator"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNCommentsByCreator(ticket["reporter"], ticket["created"])
    
    def getNCommentsByCreator(self, reporter, created):
        
        if reporter not in CreatorCommentsActivity:
            return 0
        
        NumberComments = 0
        for dt in CreatorCommentsActivity[reporter]:
            days = daysDiff(dt, created)
            if days < 90 and days > 0:
                NumberComments += 1
        return NumberComments

## Below features are extracted by Bochang

## nCommentByActorsT

In [16]:

from service.Classifier.DateUtils.DateUtils import DateUtil

# Number of activity of the reporter by T
class NCommentByActorsT(Indicator):

    def getIndicatorName(self):
        return "NCommentByActorsT"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberOfActivityCountByReporter(ticket["reporter"], ticket["priority"], ticket["created"])
    
    def getNumberOfActivityCountByReporter(self, reporter, priority, created):
        reporterToActivity = reporterToActivityDict[priority]
        if reporter not in reporterToActivity:
            return 0
        activity = 0
        for dt in reporterToActivity[reporter]:
            days = DateUtil().getTimeDifference(dt, created)
            if days < 20 and days > 0:
                activity += 1
        return activity

## nticketsCreatedInProject

In [17]:
# Number of bugs by the project for the past 3 months
class NticketsCreatedInProject(Indicator):

    def getIndicatorName(self):
        return "NticketsCreatedInProject"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberOfActivityCountByReporter(ticket["keyID"].split("-")[0], ticket["priority"], ticket["created"])
    
    def getNumberOfActivityCountByReporter(self, jiraKey, priority, created):
        projectToBug = projectToBugDict[priority]
        if jiraKey not in projectToBug:
            return 0
        ticketCount = 0
        for dt in projectToBug[jiraKey]:
            days = DateUtil().getTimeDifference(dt, created)
            if days < 90 and days > 0:
                ticketCount += 1
        return ticketCount

## nTicketsCreatedInProjectT

In [18]:
# Number of bugs by the project for the past 2 weeks
class NticketsCreatedInProjectT(Indicator):

    def getIndicatorName(self):
        return "NticketsCreatedInProjectT"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberOfActivityCountByReporter(ticket["keyID"].split("-")[0], ticket["priority"], ticket["created"])
    
    def getNumberOfActivityCountByReporter(self, jiraKey, priority, created):
        projectToBug = projectToBugDict[priority]
        if jiraKey not in projectToBug:
            return 0
        ticketCount = 0
        for dt in projectToBug[jiraKey]:
            days = DateUtil().getTimeDifference(dt, created)
            if days < 14 and days > 0:
                ticketCount += 1
        return ticketCount

## nticketsCreatedInProjectClosed

In [19]:
# Number of closed bugs by the project for the past 3 months
class NticketsCreatedInProjectClosed(Indicator):

    def getIndicatorName(self):
        return "NticketsCreatedInProjectClosed"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberOfActivityCountByReporter(ticket["keyID"].split("-")[0], ticket["priority"], ticket["created"])
    
    def getNumberOfActivityCountByReporter(self, jiraKey, priority, created):
        projectToClosedBug = projectToClosedBugDict[priority]
        if jiraKey not in projectToClosedBug:
            return 0
        ticketCount = 0
        for dt in projectToClosedBug[jiraKey]:
            days = DateUtil().getTimeDifference(dt, created)
            if days < 90 and days > 0:
                ticketCount += 1
        return ticketCount

## nTicketsCreatedInProjectClosedT

In [20]:
# Number of closed bugs by the project for the past 2 weeks
class NticketsCreatedInProjectClosedT(Indicator):

    def getIndicatorName(self):
        return "NticketsCreatedInProjectClosedT"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberOfActivityCountByReporter(ticket["keyID"].split("-")[0], ticket["priority"], ticket["created"])
    
    def getNumberOfActivityCountByReporter(self, jiraKey, priority, created):
        projectToClosedBug = projectToClosedBugDict[priority]
        if jiraKey not in projectToClosedBug:
            return 0
        ticketCount = 0
        for dt in projectToClosedBug[jiraKey]:
            days = DateUtil().getTimeDifference(dt, created)
            if days < 14 and days > 0:
                ticketCount += 1
        return ticketCount

## nActivityProject

In [21]:
# Number of activity by the project for the past 3 months
class NActivityInProject(Indicator):

    def getIndicatorName(self):
        return "NActivityInProject"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberOfActivityCountByReporter(ticket["keyID"].split("-")[0], ticket["priority"], ticket["created"])
    
    def getNumberOfActivityCountByReporter(self, jiraKey, priority, created):
        projectToActivity = projectToActivityDict[priority]
        if jiraKey not in projectToActivity:
            return 0
        ticketCount = 0
        for dt in projectToActivity[jiraKey]:
            days = DateUtil().getTimeDifference(dt["created"], created)
            if days < 90 and days > 0:
                ticketCount += dt["count"]
        return ticketCount

## nActivityProjectT

In [22]:
# Number of activity by the project for the past 2 weeks
class NActivityInProjectT(Indicator):

    def getIndicatorName(self):
        return "NActivityInProjectT"

    def getIndicatorValue(self, ticket, dataType):
        return self.getNumberOfActivityCountByReporter(ticket["keyID"].split("-")[0], ticket["priority"], ticket["created"])
    
    def getNumberOfActivityCountByReporter(self, jiraKey, priority, created):
        projectToActivity = projectToActivityDict[priority]
        if jiraKey not in projectToActivity:
            return 0
        ticketCount = 0
        for dt in projectToActivity[jiraKey]:
            days = DateUtil().getTimeDifference(dt["created"], created)
            if days < 14 and days > 0:
                ticketCount += dt["count"]
        return ticketCount

## Extract features

In [23]:
class ContextualDataLoader(DataLoader):
    
    def __init__(self, priority):
        super().__init__(priority)
        
    def transformToContextualFeatures(self, tickets, dataType):
        feature = RegressionOOSLAPredictionFeature()
#         feature.indicators = [NCommentT(), NActor(), LabelsT(), meanCommentSize(),
#                                ticketCleanedBodyLen(), nticketsByCreatorOpen(),
#                               nClosedticketsByCreatorOpen(), nCommentsByCreator()]
#         feature.indicators = [NCommentByActorsT(), NticketsCreatedInProject(),
#                              NticketsCreatedInProjectT(), NticketsCreatedInProjectClosed(),
#                              NticketsCreatedInProjectClosedT(), NActivityInProject(),
#                              NActivityInProjectT()]
        # All contextual features
        
        feature.indicators = [NCommentT(), NActor(), LabelsT(), meanCommentSize(),
                             ticketCleanedBodyLen(), nticketsByCreatorOpen(),
                             nClosedticketsByCreatorOpen(), nCommentsByCreator(), 
                             NCommentByActorsT(), NticketsCreatedInProject(),
                             NticketsCreatedInProjectT(), NticketsCreatedInProjectClosed(),
                             NticketsCreatedInProjectClosedT(), NActivityInProject(),
                             NActivityInProjectT()]
        
         
        feature.setPercentageOfSLA(self.percentageOfSLA)
        
        df, feature_name = feature.buildFeatureFromData(self.concateWithComments(tickets), dataType)
        # df.to_csv("data-" + self.priority + "(" + str(int(self.percentageOfSLA * 100)) + ").csv")
        return df, feature_name

In [24]:
# def preprocessData(priority_value):


def extract_contextual_feature(priority):
    dataLoader = initializeByPriority(priority)
    data_df = dataLoader.loadTrainingRawData()

    # split data into train test 4:1
    dataSplit = TrainTestSplit()
    train_df_raw, test_df_raw = dataSplit.split(data_df)

    # generate features
    train_df, feature_names = ContextualDataLoader(priority).transformToContextualFeatures(train_df_raw, DataType.TRAINDATA.value)

    test_df, feature_names = ContextualDataLoader(priority).transformToContextualFeatures(test_df_raw, DataType.VALIDATION.value)

    # pre-processing the data based on model type
    preprocessor = RegressionModelPreProcessor(feature_names)
    train_X, train_y, test_X, test_y = preprocessor.preprocessing(train_df, test_df)
    print(train_X.shape)
    print(test_X.shape)
    return (train_X, train_y, test_X, test_y, train_df, test_df)

## XGBoost

In [25]:
import numpy as np
import pandas as pd
from service.Classifier.Enums.LabelEnum import LabelEnum
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from service.Classifier.PreProcessing.Utils.dataValidator import DataValidator
import xgboost as xgb

def evaluate_contextual_result(train_X, train_y, test_X, test_y, train_df, test_df):    

    model = xgb.XGBRegressor(max_depth=5, n_estimators=100, learning_rate=0.05).fit(train_X, train_y)
    predictions = np.rint(model.predict(test_X)).astype(np.int64)
    submission = pd.DataFrame({'ID': test_df['keyID'],
                               LabelEnum.CLOSEDDAY.value: predictions, 'Actual': test_y})
    rmse = np.sqrt(mean_squared_error(predictions, test_df[LabelEnum.CLOSEDDAY.value]))
    print("RMSE: %f" % rmse)
    mae = mean_absolute_error(predictions, test_y)
    regression_true = test_y
    regression_pred = predictions
    print("MAE: %f" % mae)
    DataValidator().valid(submission, priority, test_df)
    classification_true = submission["truth"]
    classification_pred = submission["prediction"]
    print(confusion_matrix(submission["truth"], submission["prediction"]))
    print(classification_report(submission["truth"], submission["prediction"]))
    
    return (regression_true, regression_pred, classification_true, classification_pred)

### Get P1 results

In [27]:
priority = Priority.P2.value
train_X, train_y, test_X, test_y, train_df, test_df = extract_contextual_feature(priority)



(2632, 15)
(662, 15)


#### Extracted features

In [28]:
train_df.head(10)

Unnamed: 0,keyID,priority,NCommentT,NActor,LabelsT,meanCommentSize,ticketCleanedBodyLen,nticketsByCreatorOpen,nClosedticketsByCreatorOpen,nCommentsByCreator,NCommentByActorsT,NticketsCreatedInProject,NticketsCreatedInProjectT,NticketsCreatedInProjectClosed,NticketsCreatedInProjectClosedT,NActivityInProject,NActivityInProjectT,ResolvedDay,ClosedDay
0,ONBAML-12719,P2,3,5,8,19.0,107,102,68,428,18,139,43,136,40,3605,1057,12,12
1,RES-20723,P2,3,7,0,6.0,39,0,0,13,2,1,0,1,0,21,0,10,15
2,BILLING-16571,P2,6,7,0,26.5,217,1,1,0,0,20,10,20,10,508,262,13,15
3,REGSTRN-12121,P2,3,4,1,19.0,37,0,0,0,0,33,1,31,1,782,49,22,26
4,BILLING-16536,P2,8,9,0,28.875,99,13,11,0,0,18,8,18,8,464,232,19,19
5,BILLING-16547,P2,5,6,0,69.8,163,9,9,64,97,16,7,16,7,396,183,14,16
6,ONBAML-12716,P2,10,7,14,45.5,114,1,1,9,389,142,45,139,42,3718,1118,15,15
7,BILLING-16516,P2,5,5,0,70.4,194,0,0,0,0,16,7,16,7,396,183,21,21
8,ONBAML-12670,P2,1,4,14,7.0,82,1,1,9,365,140,42,138,40,3681,1065,14,14
9,BILLING-16557,P2,4,5,0,26.5,92,9,9,64,87,13,4,13,4,304,91,15,17


#### Show results

In [29]:
p2_regression_true, p2_regression_pred, p2_classification_true, p2_classification_pred = evaluate_contextual_result(train_X, train_y, test_X, test_y, train_df, test_df)

RMSE: 9.283956
MAE: 6.790030
[[222 163]
 [102 175]]
              precision    recall  f1-score   support

       False       0.69      0.58      0.63       385
        True       0.52      0.63      0.57       277

    accuracy                           0.60       662
   macro avg       0.60      0.60      0.60       662
weighted avg       0.62      0.60      0.60       662



### Get P3 results

In [25]:
priority = Priority.P3.value
train_X, train_y, test_X, test_y, train_df, test_df = extract_contextual_feature(priority)
print("---Prediction results on P3 data---")
p3_regression_true, p3_regression_pred, p3_classification_true, p3_classification_pred = evaluate_contextual_result(train_X, train_y, test_X, test_y, train_df, test_df)



(2730, 15)
(678, 15)
---Prediction results on P3 data---
RMSE: 20.118821
MAE: 13.864307
[[384  84]
 [108 102]]
              precision    recall  f1-score   support

       False       0.78      0.82      0.80       468
        True       0.55      0.49      0.52       210

    accuracy                           0.72       678
   macro avg       0.66      0.65      0.66       678
weighted avg       0.71      0.72      0.71       678



### get P4 results

In [31]:
priority = Priority.P4.value
train_X, train_y, test_X, test_y, train_df, test_df = extract_contextual_feature(priority)
print("---Prediction results on P4 data---")
p4_regression_true, p4_regression_pred, p4_classification_true, p4_classification_pred = evaluate_contextual_result(train_X, train_y, test_X, test_y, train_df, test_df)



(69, 15)
(19, 15)
---Prediction results on P4 data---
RMSE: 31.697584
MAE: 23.894737
[[7 5]
 [3 4]]
              precision    recall  f1-score   support

       False       0.70      0.58      0.64        12
        True       0.44      0.57      0.50         7

    accuracy                           0.58        19
   macro avg       0.57      0.58      0.57        19
weighted avg       0.61      0.58      0.59        19



### Cancatenate all results

In [27]:
# regression results
regression_true = pd.concat([p2_regression_true, p3_regression_true])
regression_true = pd.concat([regression_true, p4_regression_true])

regression_pred = np.concatenate((p2_regression_pred, p3_regression_pred), axis=0)
regression_pred = np.concatenate((regression_pred, p4_regression_pred), axis=0)

## classification results
classification_true = pd.concat([p2_classification_true, p3_classification_true])
classification_true = pd.concat([classification_true, p4_classification_true])

classification_pred = pd.concat([p2_classification_pred, p3_classification_pred])
classification_pred = pd.concat([classification_pred, p4_classification_pred])

In [28]:
print("---Contextual Feature results on all data---")
rmse = np.sqrt(mean_squared_error(regression_pred, regression_true))
print("RMSE: %f" % rmse)
mae = mean_absolute_error(regression_pred, regression_true)
print("MAE: %f" % mae)
print(confusion_matrix(classification_true, classification_pred))
print(classification_report(classification_true, classification_pred))

---Contextual Feature results on all data---
RMSE: 15.892141
MAE: 10.994845
[[545 226]
 [155 238]]
              precision    recall  f1-score   support

       False       0.78      0.71      0.74       771
        True       0.51      0.61      0.56       393

    accuracy                           0.67      1164
   macro avg       0.65      0.66      0.65      1164
weighted avg       0.69      0.67      0.68      1164



## MLP 

In [29]:
from sklearn.neural_network import MLPRegressor
NNModel = MLPRegressor(random_state=0, max_iter=500, activation ='logistic', hidden_layer_sizes= (14,), solver= 'sgd').fit(train_X, train_y)

predictions = np.rint(NNModel.predict(test_X)).astype(np.int64)
predictions = np.squeeze(predictions)

submission = pd.DataFrame({'ID': test_df['keyID'],
                           LabelEnum.CLOSEDDAY.value: predictions, 'Actual': test_y})
rmse = np.sqrt(mean_squared_error(predictions, test_df[LabelEnum.CLOSEDDAY.value]))
print("RMSE: %f" % rmse)
mae = mean_absolute_error(predictions, test_y)
print("MAE: %f" % mae)
DataValidator().valid(submission, priority, test_df)
print(confusion_matrix(submission["truth"], submission["prediction"]))
print(classification_report(submission["truth"], submission["prediction"]))

RMSE: 39.302248
MAE: 32.666667
[[0 2]
 [0 1]]
              precision    recall  f1-score   support

       False       0.00      0.00      0.00         2
        True       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
