In [1]:
#Import Python Libraries
import pandas as pd
import pickle
import time
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')

#Import Self-written Functions
import os
import sys
src_dir = os.path.join(os.getcwd(), '..', 'src')
sys.path.append(src_dir)

from d00_utils.calculateTimeDifference import calculateTimeDifference #Function to calc time difference
from d01_data.loadCommits import loadCommits #Function to load SVN data
from d02_intermediate.cleanCommitData import cleanCommitData #Function to clean commit data
from d02_intermediate.cleanJiraData import cleanJiraData #Function to clean JIRA data

from d03_processing.createFittedTF_IDF import createFittedTF_IDF #Function to see if a trace is valid
from d03_processing.createCorpusFromDocumentList import createCorpusFromDocumentList #Function to create a corpus
from d03_processing.checkValidityTrace import checkValidityTrace #Function to see if a trace is valid
from d03_processing.calculateTimeDif import calculateTimeDif #Calculate the time difference between 2 dates in seconds
from d03_processing.checkFullnameEqualsEmail import checkFullnameEqualsEmail #Check if fullName is equal to the email
from d03_processing.calculateCosineSimilarity import calculateCosineSimilarity #Calculate the cos similarity
from d03_processing.calculateDocumentStatistics import calculateUniqueWordCount
from d03_processing.calculateDocumentStatistics import calculateTotalWordCount
from d03_processing.calculateDocumentStatistics import calculateOverlapBetweenDocuments

from d04_modelling.summariseClassDistribution import summariseClassDistribution #Visualize the class distribution
from d04_modelling.showModelPerformance import showModelPerformance # Show several performance measures

#Display full value of a column
pd.set_option('display.max_colwidth', None)

#Display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


# 1. Load Raw Data

In [2]:
#Set dataset

datasetDirectory = ""

In [3]:
#Import raw JIRA dataset
rawData_JIRA_mxShop = pd.read_excel('../data/01_raw/JIRA Mendix Engagement export_22_06_2021.xlsx')

#import
rawData_SVN_mxShop = loadCommits('../data/01_raw/MxShop-dump.txt')

# 2. Clean Raw Data
## 2.1 Clean Raw Data - SVN Data
Clean the raw data of the SVN files

In [4]:
from datetime import datetime
import re
import pandas as pd
import string

#nltk for NLP 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import ngrams

#Function to transform natural text into unigram tokens
def preprocessNaturalLanguage(text, porterStemmer, cachedStopWords):
    string_text = str(text)
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_interpunction)
    
    #remove stopwords
    tokens_without_sw = [word for word in tokens if not word in cachedStopWords]
    
    #Stem the tokens
    stemmedToken = list(map(porterStemmer.stem, tokens_without_sw))

    return(stemmedToken)

#Function to transform natural text into n-gram tokens
def preprocessNGrams(text, porterStemmer, cachedStopWords, nGramSize):
    string_text = str(text)
    
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_interpunction)
    
    #Create the ngrams
    ngrams = list(nltk.ngrams(tokens, nGramSize))
    
    #remove all the n-grams containing a stopword
    cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
    #Stem the tokens
    stemmedNGrams = []
    for ngram in cleanNGrams:
        stemmed = list(map(porterStemmer.stem, ngram))
        stemmedNGrams.append(stemmed)
    return(stemmedNGrams)

#Function to transform date into a date object
def preprocessCommitDate(date_string):
    date_time_obj = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')  
    return(date_time_obj)
    
#Remove the found Issue key from the log
def removeIssueKey(log_message):
    issue_keys = re.findall(r"LRN+.[0-9]+|AFM+.[0-9]+|MA+.[0-9]+|AFI+.[0-9]+|EM+.[0-9]+|OE+.[0-9]+|EM+.[0-9]+", log_message)
    log_message_without_key = log_message
    for issue_key in issue_keys:
        log_message_without_key = log_message_without_key.replace(issue_key, "")
    return(log_message_without_key)

def unitNamesLambdaFunc(unitName, stemmer):
    #Lower case
    unitNameLowered = unitName.lower()
    
    #Remove interpunction
    noInterpunction = unitNameLowered.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    noNumbers = ''.join([i for i in noInterpunction if not i.isdigit()])
    
    stemmendUnitName = stemmer.stem(noInterpunction)
    
    
    return(stemmendUnitName)
    

def preprocessUnitNames(unitName, porterStemmer, cachedStopWords):
    if (isinstance(unitName, str)):
        #Split camelCasing
        unitNameSplitList = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', unitName)).split()
        
        porterStemmer = PorterStemmer() #create an object of class PorterStemmer
        
        #Preprocess each split found.
        unitNameLowered = list(map(lambda unitName: unitNamesLambdaFunc(unitName, porterStemmer), 
                                   unitNameSplitList))
        
        #Check for stopwords
        tokensWithoutSW = [word for word in unitNameLowered if not word in cachedStopWords]

        return(tokensWithoutSW)

def preprocessNGramsUnitNames(unitName, porterStemmer, cachedStopWords, nGramSize):
    if (isinstance(unitName, str)):
        #Split camelCasing
        unitNameSplitList = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', unitName)).split()
        
        cleanedUnitNames = []
        for unitNameSplit in unitNameSplitList:
            #Lower case unit names
            lowerCased = unitNameSplit.lower()

            #Remove interpunction
            removedInterpunction = lowerCased.translate(str.maketrans('','',string.punctuation))
            cleanedUnitNames.append(removedInterpunction)
            
        #Transform to string (needed for tokenizer
        unitNameString = ' '.join(cleanedUnitNames)

        #Tokenzize words
        tokenized = word_tokenize(unitNameString)
        
        #Create the ngrams
        ngrams = list(nltk.ngrams(tokenized, nGramSize))
        
        porterStemmer = PorterStemmer() #create an object of class PorterStemmer
        
        #remove all the n-grams containing a stopword
        cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
        #Stem the tokens
        stemmedNGrams = []
        for ngram in cleanNGrams:
            stemmed = list(map(porterStemmer.stem, ngram))
            stemmedNGrams.append(stemmed)
            
        return(stemmedNGrams)

#Method to clean all columns of the provided data
def cleanCommitData(rawCommitData): 
    #create an object of class PorterStemmer
    porterStemmer = PorterStemmer()
    
    #Find all stopwords
    cachedStopWords = stopwords.words("english")
    
    #Remove all revisions without an issue key in the log message
    commit_df = rawCommitData[rawCommitData["related_issue_key"].notna()]

    #Execute cleaning methods on dataset
    cleaned_commit_logs = commit_df['log'].apply(lambda x: removeIssueKey(x))
    processed_commit_logs = cleaned_commit_logs.apply(lambda x: preprocessNaturalLanguage(x, porterStemmer, cachedStopWords))
    processed_commit_logs_2grams = cleaned_commit_logs.apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    processed_commit_logs_3grams = cleaned_commit_logs.apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 3))
    processed_date_times = commit_df['date'].apply(lambda x: preprocessCommitDate(x))
    processed_unit_names = commit_df['impacted_unit_names'].apply(lambda x: preprocessUnitNames(x, porterStemmer, cachedStopWords))
    processed_unit_names_2grams = commit_df['impacted_unit_names'].apply(lambda x: preprocessNGramsUnitNames(x, porterStemmer, cachedStopWords, 2))
    processed_unit_names_3grams = commit_df['impacted_unit_names'].apply(lambda x: preprocessNGramsUnitNames(x, porterStemmer, cachedStopWords, 3))
    

    #Put all data together into a new dataframe
    commit_data = {'Revision': commit_df["revision"],
               'Email' : commit_df["email"],
               'Commit_date': processed_date_times,
               "Issue_key_commit": commit_df["related_issue_key"],
               'Logs': processed_commit_logs, 
               'Logs_2grams': processed_commit_logs_2grams, 
               'Logs_3grams': processed_commit_logs_3grams, 
               'Unit_names': processed_unit_names,
               'Unit_names_2grams': processed_unit_names_2grams,
               'Unit_names_3grams': processed_unit_names_3grams,
               'Commit_natural_text': processed_commit_logs + processed_unit_names,
               'Commit_natural_text_2grams': processed_commit_logs_2grams + processed_unit_names_2grams,
               'Commit_natural_text_3grams': processed_commit_logs_3grams + processed_unit_names_3grams
               }
               
    commit_processed_df = pd.DataFrame(data=commit_data)

    return(commit_processed_df)

In [5]:
#Start timer
startTime = time.time() 

intermediateData_SVN_mxShop = cleanCommitData(rawData_SVN_mxShop)

#Create a temp XLSX file for all intermediate datasets
intermediateData_SVN_mxShop.to_excel(excel_writer = "../data/02_intermediate/intermediateData_SVN_mxShop.xlsx", index = False)

#Create a pickle file for all intermediate datasets
intermediateData_SVN_mxShop.to_pickle(path= "../data/02_intermediate/intermediateData_SVN_mxShop.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished cleaning after " + timeDifference)

Finished cleaning after 0 minutes and 1.5980095863342285 seconds


In [6]:
import re

import string
#nltk for NLP 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag  import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from datetime import datetime
import numpy as np
import pandas as pd
import time
nltk.download('averaged_perceptron_tagger')


#Function to clean the comments
def clean_comments(comment):
    try:
        commentDates = re.findall(r"[0-9]{2} [A-Z][a-z]{2} [0-9]{4} [0-9]{2}:[0-9]{2};[a-zA-Z0-9_]{24};", comment)
        accountIds = re.findall(r"\[~accountid:[a-zA-Z0-9]{24}\]", comment)
               
        
        cleanedComment = comment.replace("nan",'')
        for commentDate in commentDates:
            cleanedComment = cleanedComment.replace(commentDate,'')
        
        for accountId in accountIds: 
            cleanedComment = cleanedComment.replace(accountId,'')
        
        return(cleanedComment)
    except:
        return("")

def preprocess(text, porterStemmer, cachedStopwords):
    string_text = str(text)
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_numbers)
    
    #remove stopwords
    tokens_without_sw = [word for word in tokens if not word in cachedStopwords]
    
    #Stem the tokens
    stemmedToken = list(map(porterStemmer.stem, tokens_without_sw))

    return(stemmedToken)

def preprocessNGrams(text, porterStemmer, cachedStopWords, nGramSize):
    string_text = str(text)
    
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_numbers)
    
    #Create the ngrams
    ngrams = list(nltk.ngrams(tokens, nGramSize))
    
    #remove all the n-grams containing a stopword
    cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
    #Stem the tokens
    stemmedNGrams = []
    for ngram in cleanNGrams:
        stemmed = list(map(porterStemmer.stem, ngram))
        stemmedNGrams.append(stemmed)
    return(stemmedNGrams)

#Function to transform date into a date object
def preprocess_jira_date(date_string):
    if(isinstance(date_string, str)):
        try:
            date_time_obj = datetime.strptime(date_string, '%d %b %Y %H:%M')
        except:
            date_time_obj = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S:%f')
        return(date_time_obj)
    elif(isinstance(date_string, datetime)): 
        return(date_string)
    else:
        return(np.nan)
    
    
def findVerbs(tokenList):
    posTags = pos_tag(tokenList)
    verbAbrList = ['VBP', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS']
    verbList = []
    for posTag in posTags:
        if posTag[1] in verbAbrList:
            verbList.append(posTag[0])
    return(verbList)

#Preprocess all the features and transform to the format needed for further processing.
def preprocessJiraData(cleanDataFrame, preprocessComments, porterStemmer, cachedStopWords, startTime):
    if (preprocessComments == True):
        nOfSteps = '4'
    else:
        nOfSteps = '3'

    #preprocess Summaries
    jira_summaries = cleanDataFrame['Summary'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
    jira_summaries_2grams = cleanDataFrame['Summary'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    jira_summaries_3grams = cleanDataFrame['Summary'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 3))
    
    endTimeCleaningSummaries = time.time() - startTime
    print("1/" + nOfSteps + ") Finished Cleaning Summaries after " + str(endTimeCleaningSummaries) + " sec")

    #preprocess Descriptions
    jira_descriptions = cleanDataFrame['Description'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
    jira_descriptions_2grams = cleanDataFrame['Description'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    jira_descriptions_3grams = cleanDataFrame['Description'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    
    endTimeCleaningDescriptions = time.time() - startTime
    print("2/" + nOfSteps + ") Finished Cleaning Description after " + str(endTimeCleaningDescriptions) + " sec")

    #preprocess Dates
    jira_creation = cleanDataFrame['Created'].apply(lambda x: preprocess_jira_date(x))
    jira_updated = cleanDataFrame['Updated'].apply(lambda x: preprocess_jira_date(x))
    jira_resolved = cleanDataFrame['Resolved'].apply(lambda x: preprocess_jira_date(x))
    endTimeCleaningDates = time.time() - startTime
    print("3/" + nOfSteps + ") Finished Cleaning Dates after " + str(endTimeCleaningDates) + " sec")

    #Comments take too long for a test run.
    if (preprocessComments == True):
        jira_comments = cleanDataFrame['Comments'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
        jira_comments_2grams = cleanDataFrame['Comments'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
        jira_comments_3grams = cleanDataFrame['Comments'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
        endTimeCleaningComments = time.time() - startTime
        print("4/" + nOfSteps + ") Finished Cleaning Comments after " + str(endTimeCleaningComments) + " sec")

         #create JIRA corpus by merging Summary and Description
        jira_data = {'Issue_key_jira': cleanDataFrame['Issue key'], 
             'Assignee': cleanDataFrame['Assignee'],
             'Jira_created_date': jira_creation, 
             'Jira_updated_date': jira_updated, 
             'Jira_resolved_date': jira_resolved, 
             'Summary': jira_summaries, 
             'Summary_2grams': jira_summaries_2grams,
             'Summary_3grams': jira_summaries_3grams, 
             'Description': jira_descriptions,
             'Description_2grams': jira_descriptions_2grams,
             'Description_3grams': jira_descriptions_3grams,
             'Comments': jira_comments,
             'Comments_2grams': jira_comments_2grams,
             'Comments_3grams': jira_comments_3grams,
             'Jira_natural_text': jira_summaries +  jira_descriptions + jira_comments,
             'Jira_natural_text_2grams': jira_summaries_2grams +  jira_descriptions_2grams + jira_comments_2grams,
             'Jira_natural_text_3grams': jira_summaries_3grams +  jira_descriptions_3grams + jira_comments_3grams}
    else:
         #create JIRA corpus by merging Summary and Description
        jira_data = {'Issue_key_jira': cleanDataFrame['Issue key'], 
             'Assignee': cleanDataFrame['Assignee'],
             'Jira_created_date': jira_creation, 
             'Jira_updated_date': jira_updated, 
             'Jira_resolved_date': jira_resolved, 
             'Summary': jira_summaries,
             'Summary_2grams': jira_summaries_2grams,
             'Summary_3grams': jira_summaries_3grams,
             'Description': jira_descriptions,
             'Description_2grams': jira_descriptions_2grams,
             'Description_3grams': jira_descriptions_3grams,
             'Jira_natural_text': jira_summaries +  jira_descriptions,
             'Jira_natural_text_2grams': jira_summaries_2grams +  jira_descriptions_2grams,
             'Jira_natural_text_3grams': jira_summaries_3grams +  jira_descriptions_3grams}

    jira_processed_df = pd.DataFrame(data=jira_data)
    
    #Find verbs
    jira_processed_df['verbs'] = jira_processed_df['Jira_natural_text'].apply(lambda x: findVerbs(x))
    
    return(jira_processed_df)

#Input dataframe and num of_comments, and bool to determine if comments need to be cleaned
def cleanJiraData(dataFrame, cleanComments, commentAmount):
    startTime = time.time()

    #create an object of class PorterStemmer
    porterStemmer = PorterStemmer()
    
    #Find all stopwords
    cachedStopWords = stopwords.words("english")

    if (cleanComments == True):
        #Subset only all comments 
        loc_first_comment = dataFrame.columns.get_loc('Comment') # Variable storing the col location of the 1st comment
    
        dataFrame["Comments"] = dataFrame.iloc[:,loc_first_comment:loc_first_comment+commentAmount].apply(
            lambda x: " ".join(x.astype(str)), axis=1)
    
        #First remove the date and comment string from the comments
        dataFrame["Comments"] = dataFrame["Comments"].apply(lambda x: clean_comments(x))

        #Subset JIRA ID, Summary, Description, comments
        jira_issues_subset = dataFrame[["Issue key", "Assignee", "Summary", "Description", "Comments", "Created", "Resolved", "Updated"]]
        cleanedAndProcessedJiraData = preprocessJiraData(jira_issues_subset, preprocessComments = True, porterStemmer = porterStemmer, cachedStopWords = cachedStopWords, startTime = startTime)
        return(cleanedAndProcessedJiraData)
    else: 
        jira_issues_subset = dataFrame[["Issue key", "Assignee", "Summary", "Description", "Created", "Resolved", "Updated"]]
        cleanedAndProcessedJiraData = preprocessJiraData(jira_issues_subset, preprocessComments = False, porterStemmer = porterStemmer, cachedStopWords = cachedStopWords, startTime = startTime)
        return(cleanedAndProcessedJiraData)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
#Rename key to Issue key
rawData_JIRA_mxShop = rawData_JIRA_mxShop.rename({'Key': 'Issue key'}, axis=1)

#Clean Data sets
intermediateData_JIRA_mxShop = cleanJiraData(dataFrame = rawData_JIRA_mxShop, cleanComments = False, commentAmount = 39)

#Create a temp XLSX file for all intermediate datasets
intermediateData_JIRA_mxShop.to_excel(excel_writer = "../data/02_intermediate/intermediateData_JIRA_mxShop.xlsx", index = False)

#Create a pickle file for all intermediate datasets
intermediateData_JIRA_mxShop.to_pickle(path= "../data/02_intermediate/intermediateData_JIRA_mxShop.pkl")

1/3) Finished Cleaning Summaries after 0.39092397689819336 sec
2/3) Finished Cleaning Description after 3.141378164291382 sec
3/3) Finished Cleaning Dates after 3.146327495574951 sec


## 2.4 Clean Raw Data - Create JIRA Corpora
Create the corpora for JIRA UNIGRAM

In [8]:
def createCorpusFromDocumentList(token_column):
    token_list = token_column.tolist()
    corpus_list = []
    
    for document in token_list:
        #Only join to the string when a list. When it is not a list, then it is np.NaN, thus no changes
        if(isinstance(document, list)):
            #Transform list to a string for SKLEARN to accept the input.
            token_string = ' '.join(document)
        
            #Add string to the corpus list
            corpus_list.append(token_string)
    return(corpus_list)

In [9]:
#Create JIRA corpus for mxShop dataset
intermediateData_JIRA_mxShopCorpusSummary = createCorpusFromDocumentList(intermediateData_JIRA_mxShop.Summary)
intermediateData_JIRA_mxShopCorpusDescription = createCorpusFromDocumentList(intermediateData_JIRA_mxShop.Description)

#Merge all JIRA Corpora into 1 corpus
intermediateData_JIRA_mxShopCorpus = [i+" "+j for i,j in zip(intermediateData_JIRA_mxShopCorpusSummary,
                                                                             intermediateData_JIRA_mxShopCorpusDescription
                                                                            )]

#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_JIRA_mxShopCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_JIRA_mxShopCorpus, f)

Bigram corpora

In [10]:
def createCorpusNGrams(tokenColumn):
    tokenList = tokenColumn.tolist()
    corpusList = []
    
    #Transform to strings
    for document in tokenList:
        if(isinstance(document, list)):
            for ngram in document:
                ngramString = ' '.join(ngram)
                corpusList.append(ngramString)         
    return(corpusList)

In [11]:
#Create JIRA corpus for mxShop dataset
intermediateData_JIRA_mxShopCorpusSummary_2grams = createCorpusNGrams(intermediateData_JIRA_mxShop.Summary_2grams)
intermediateData_JIRA_mxShopCorpusDescription_2grams = createCorpusNGrams(intermediateData_JIRA_mxShop.Description_2grams)

#Merge all JIRA Corpora into 1 corpus
intermediateData_JIRA_mxShopCorpus_2gram = [i+" "+j for i,j in zip(intermediateData_JIRA_mxShopCorpusSummary_2grams,
                                                                             intermediateData_JIRA_mxShopCorpusDescription_2grams
                                                                             )]


#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_JIRA_mxShopCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_JIRA_mxShopCorpus_2gram, f)

## 2.4 Clean Raw Data - Create SVN Corpora
Create the corpora for SVN

In [12]:
intermediateData_SVN_mxShop = pd.read_pickle("../data/02_intermediate/intermediateData_SVN_mxShop.pkl")

In [13]:
#Create corpus for log messages
intermediateData_SVNLogs_mxShopCorpus = createCorpusFromDocumentList(intermediateData_SVN_mxShop.Logs)

#Create corpus for unit names
intermediateData_SVNUnitNames_mxShopCorpus = createCorpusFromDocumentList(intermediateData_SVN_mxShop.Unit_names)

#Create corpus for entire commit (log message + model)
intermediateData_SVN_mxShopCorpus = createCorpusFromDocumentList(intermediateData_SVN_mxShop.Logs + intermediateData_SVN_mxShop.Unit_names)
intermediateData_SVN_mxShopCorpusAll = createCorpusFromDocumentList(intermediateData_SVN_mxShop.Logs + intermediateData_SVN_mxShop.Unit_names)
#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_SVNLogs_mxShopCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNLogs_mxShopCorpus, f)

with open('../data/02_intermediate/intermediateData_SVNUnitNames_mxShopCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNUnitNames_mxShopCorpus, f)

with open('../data/02_intermediate/intermediateData_SVN_mxShopCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVN_mxShopCorpus, f)
    
with open('../data/02_intermediate/intermediateData_SVN_mxShopCorpusAll.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVN_mxShopCorpusAll, f)

bigram corpora

In [14]:
intermediateData_SVNLogs_mxShopCorpus_2gram = createCorpusNGrams(intermediateData_SVN_mxShop.Logs_2grams)
intermediateData_SVNUnitNames_mxShopCorpus_2gram = createCorpusNGrams(intermediateData_SVN_mxShop.Unit_names_2grams)
with open('../data/02_intermediate/intermediateData_SVNLogs_mxShopCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNLogs_mxShopCorpus_2gram, f)
    
    
with open('../data/02_intermediate/intermediateData_SVNUnitNames_mxShopCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNUnitNames_mxShopCorpus_2gram, f)

# 3. Preprocess Data

In [15]:
#Run this code block when you've restarted the kernel, and want to use previously gained results.
intermediateData_JIRA_mxShop = pd.read_pickle("../data/02_intermediate/intermediateData_JIRA_mxShop.pkl")

intermediateData_SVN_mxShop = pd.read_pickle("../data/02_intermediate/intermediateData_SVN_mxShop.pkl")

intermediateData_JIRA_mxShopCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_JIRA_mxShopCorpus.pkl')
intermediateData_JIRA_mxShopCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_JIRA_mxShopCorpus.pkl')
#intermediateData_SVN_mxShopCorpusAll = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_mxShopCorpusAll.pkl')
#intermediateData_SVN_mxShopCorpusModel = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_mxShopCorpusModel.pkl')
intermediateData_SVN_mxShopCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_mxShopCorpus.pkl')

############# Bigrams


############# Trigrams

## 3.0 Preprocess Data - Create cartesian product JIRA x Commits

In [16]:
#Create cartesian products JIRA x Commits
processedData_mxShopCartesian = intermediateData_JIRA_mxShop.merge(intermediateData_SVN_mxShop, how='cross')

processedData_mxShopCartesian = processedData_mxShopCartesian.drop(processedData_mxShopCartesian[processedData_mxShopCartesian.Jira_created_date > processedData_mxShopCartesian.Commit_date].index)

#Create a pickle file for all intermediate datasets
processedData_mxShopCartesian.to_pickle(path= "../data/03_processed/processedData_mxShopCartesian.pkl")


## 3.1 Preprocess Data - Create Labels

In [17]:
#Create new dataFrames for the time features
processedData_mxShopLabels = pd.DataFrame() 


#Create a column, which indicates which traces are valid.
processedData_mxShopLabels["is_valid"] = processedData_mxShopCartesian.apply(lambda x: checkValidityTrace(x.Issue_key_jira, x.Issue_key_commit), axis=1)
print("Finished creating labels for mxShop")

#Save intermediate results
processedData_mxShopLabels.to_pickle(path= "../data/03_processed/processedData_mxShopLabels.pkl")

processedData_mxShopLabels.info()

Finished creating labels for mxShop
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33627 entries, 1432 to 129969
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   is_valid  33627 non-null  bool 
dtypes: bool(1)
memory usage: 295.5 KB


In [18]:
#processedData_mxShopLabels[processedData_mxShopLabels.is_valid == True].count()
processedData_mxShopLabels[processedData_mxShopLabels.is_valid == True].count()

is_valid    86
dtype: int64

## 3.2 Preprocess Data - Create Time-Related Features

In [19]:
#Create new dataFrames for the time features
processedData_mxShopFeaturesTime = pd.DataFrame() 


#Calculate the time features for data Processing Dataset
processedData_mxShopFeaturesTime['Creation_commit_date_dif'] = processedData_mxShopCartesian.apply(lambda x: calculateTimeDif(x.Jira_created_date, x.Commit_date), axis=1)
processedData_mxShopFeaturesTime['Updated_commit_date_dif'] = processedData_mxShopCartesian.apply(lambda x: calculateTimeDif(x.Jira_updated_date, x.Commit_date), axis=1)
processedData_mxShopFeaturesTime['Resolved_commit_date_dif'] = processedData_mxShopCartesian.apply(lambda x: calculateTimeDif(x.Jira_resolved_date, x.Commit_date), axis=1)
print("Finished data Processing")

#Create a pickle file for all intermediate datasets
processedData_mxShopFeaturesTime.to_pickle(path= "../data/03_processed/processedData_mxShopFeaturesTime.pkl")

Finished data Processing


## 3.3 Preprocess Data - Create Stakeholder-Related Features

In [20]:
#Create new dataFrames for the Stakeholder features
processedData_mxShopFeaturesStakeholder = pd.DataFrame() 

processedData_mxShopFeaturesStakeholder['Assignee_is_commiter'] = processedData_mxShopCartesian.apply(lambda x: checkFullnameEqualsEmail(x.Assignee, x.Email), axis=1)
print("Finished mxShop")

#Create a pickle file for all intermediate datasets
processedData_mxShopFeaturesStakeholder.to_pickle(path= "../data/03_processed/processedData_mxShopFeaturesStakeholder.pkl")


Finished mxShop


## 3.4 Preprocess Data - Create Cosine Similarity Features
### 3.4.1 mxShop - Cosine Similarity UniGrams

In [21]:
from scipy import spatial
import pandas as pd

def calc_vector_representation(document, cv, fittedTF_IDF):        
    #Transform document type to a string
    documentString = document
    
    #Calculate the Term Frequency of the document
    inputDocs = [documentString] 

    # count matrix 
    count_vector = cv.transform(inputDocs) 
 
    #tf-idf scores 
    tf_idf_vector = fittedTF_IDF.transform(count_vector)

    feature_names = cv.get_feature_names() 
 
    #get tfidf vector for first document 
    document_vector=tf_idf_vector[0] 
 
    #print the scores 
    
    # place tf-idf values in a pandas data frame 
    df = pd.DataFrame(document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
    df.sort_values(by=["tfidf"],ascending=False)

    return(document_vector.T.todense())

def calculateCosineSimilarity(document1, document2, cv, fittedTF_IDF):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    return(result)

def calculateCosineSimilarityNGrams(document1, document2, cv, fittedTF_IDF):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    return(result)


def calculateCosineSimilarityWithPOSPruning(document1, document2, cv, fittedTF_IDF, verbList):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    verbCounter = 0
    if(isinstance(document2, list)):
        for token in document2:
            if token in verbList:
                verbCounter = verbCounter + 1
    
    if verbCounter > 0:
        result = result * (1 + (0.1 * verbCounter))
    else:
        result = 0
    
    return(result)

In [22]:
#Instantiate the count vectorizer and tfidf for the corpus
from sklearn.feature_extraction.text import CountVectorizer 

######################################################
#                       mxShop              #
######################################################

################# Unigrams ###############
#instantiate CountVectorizer() for SVN
processedData_SVN_mxShopCountVectorizer = CountVectorizer()
processedData_SVN_mxShopCountTF_IDF = createFittedTF_IDF(processedData_SVN_mxShopCountVectorizer, intermediateData_SVN_mxShopCorpus)

processedData_SVNLogs_mxShopCountVectorizer = CountVectorizer()
processedData_SVNLogs_mxShopCountTF_IDF = createFittedTF_IDF(processedData_SVNLogs_mxShopCountVectorizer, intermediateData_SVNLogs_mxShopCorpus)

processedData_SVNUnitNames_mxShopCountVectorizer = CountVectorizer()
processedData_SVNUnitNames_mxShopCountTF_IDF = createFittedTF_IDF(processedData_SVNUnitNames_mxShopCountVectorizer, intermediateData_SVNUnitNames_mxShopCorpus)

#instantiate CountVectorizer() for JIRA - unigram
processedData_JIRA_mxShopCountVectorizer = CountVectorizer()
processedData_JIRA_mxShopCountTF_IDF = createFittedTF_IDF(processedData_JIRA_mxShopCountVectorizer, intermediateData_JIRA_mxShopCorpus)

processedData_JIRASummaries_mxShopCountVectorizer = CountVectorizer()
processedData_JIRASummaries_mxShopCountTF_IDF = createFittedTF_IDF(processedData_JIRASummaries_mxShopCountVectorizer, intermediateData_JIRA_mxShopCorpusSummary)

processedData_JIRADescriptions_mxShopCountVectorizer = CountVectorizer()
processedData_JIRADescriptions_mxShopCountTF_IDF = createFittedTF_IDF(processedData_JIRADescriptions_mxShopCountVectorizer, intermediateData_JIRA_mxShopCorpusDescription)

#processedData_JIRAComments_mxShopCountVectorizer = CountVectorizer()
#processedData_JIRAComments_mxShopCountTF_IDF = createFittedTF_IDF(processedData_JIRAComments_mxShopCountVectorizer, intermediateData_JIRA_mxShopCorpusComments)


################# Bigrams ###############
#instantiate CountVectorizer() for SVN - bigrams
processedData_SVNLogs_mxShopCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_SVNLogs_mxShopCountTF_IDF_2gram = createFittedTF_IDF(processedData_SVNLogs_mxShopCountVectorizer_2gram, intermediateData_SVNLogs_mxShopCorpus_2gram)

processedData_SVNUnitNames_mxShopCountVectorizer_2gram = CountVectorizer()
processedData_SVNUnitNames_mxShopCountTF_IDF_2gram = createFittedTF_IDF(processedData_SVNUnitNames_mxShopCountVectorizer_2gram, intermediateData_SVNUnitNames_mxShopCorpus_2gram)


#instantiate CountVectorizer() for JIRA - biigram
processedData_JIRA_mxShopCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRA_mxShopCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRA_mxShopCountVectorizer_2gram, intermediateData_JIRA_mxShopCorpus_2gram)

processedData_JIRASummaries_mxShopCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRASummaries_mxShopCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRASummaries_mxShopCountVectorizer_2gram, intermediateData_JIRA_mxShopCorpusSummary_2grams)

processedData_JIRADescriptions_mxShopCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRADescriptions_mxShopCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRADescriptions_mxShopCountVectorizer_2gram, intermediateData_JIRA_mxShopCorpusDescription_2grams)

#processedData_JIRAComments_mxShopCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
#processedData_JIRAComments_mxShopCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRAComments_mxShopCountVectorizer_2gram, intermediateData_JIRA_mxShopCorpusComments_2grams)




#### 3.4.1 [VSM unigram] Similarity between JIRA issue and Commit Log - Jira As Query

In [23]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmLogsJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmLogsJiraAsQuery["vsm_logs_jira_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_JIRA_mxShopCountVectorizer, processedData_JIRA_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmLogsJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmLogsJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 3 minutes and 40.32604956626892 seconds


#### 3.4.2 [VSM unigram] Similarity between JIRA issue and Commit Log - Log As Query

In [24]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmLogsLogAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmLogsLogAsQuery["vsm_logs_log_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_SVNLogs_mxShopCountVectorizer, processedData_SVNLogs_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmLogsLogAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmLogsLogAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

  dist = 1.0 - uv / np.sqrt(uu * vv)


Finished creating 'VSM Logs Jira as query' after 1 minutes and 27.145812034606934 seconds


#### 3.4.3 [VSM unigram] Similarity between JIRA issue and Unit Names - JIRA As Query

In [25]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmUnitNamesJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmUnitNamesJiraAsQuery["vsm_unit_names_jira_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_JIRA_mxShopCountVectorizer, processedData_JIRA_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmUnitNamesJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmUnitNamesJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 3 minutes and 43.94310140609741 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and Commit Log - Jira As Query

In [26]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery["vsm_summary_logs_summary_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, processedData_JIRASummaries_mxShopCountVectorizer, processedData_JIRASummaries_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 37.656718492507935 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and Commit Log - Log As Query

In [27]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSummaryLogsLogsAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSummaryLogsLogsAsQuery["vsm_summary_logs_logs_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, processedData_SVNLogs_mxShopCountVectorizer, processedData_SVNLogs_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSummaryLogsLogsAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSummaryLogsLogsAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 1 minutes and 25.61216139793396 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and UnitNames - Summary As Query

In [28]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery["vsm_summary_unitNames_summary_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Unit_names, processedData_JIRASummaries_mxShopCountVectorizer, processedData_JIRASummaries_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 55.66054844856262 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and UnitNames - UnitNames As Query

In [29]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery["vsm_summary_unitNames_unitNames_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Unit_names, processedData_SVNUnitNames_mxShopCountVectorizer, processedData_SVNUnitNames_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 32.42275404930115 seconds


#### 3.4.3 [VSM unigram - verb pruning] Similarity between JIRA issue and Unit Names - JIRA As Query

In [30]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery["vsm_verb_pruning_unit_names_jira_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarityWithPOSPruning(x.Jira_natural_text, x.Unit_names, processedData_JIRA_mxShopCountVectorizer, processedData_JIRA_mxShopCountTF_IDF, x.verbs), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query and verb pruning' after " + timeDifference)

Finished creating 'VSM Logs Jira as query and verb pruning' after 3 minutes and 42.609020471572876 seconds


#### 3.4.4 [VSM unigram] Similarity between JIRA issue and Unit Names  - Unit Names As Query

In [31]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery["vsm_unit_names_log_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_mxShopCountVectorizer, processedData_SVNUnitNames_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 30.3442542552948 seconds


#### 3.4.5 [VSM unigram] Similarity between JIRA description and commit log - Description as query

In [32]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery["vsm_description_description_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, processedData_JIRADescriptions_mxShopCountVectorizer, processedData_JIRADescriptions_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 3 minutes and 38.9091432094574 seconds


#### 3.4.5 [VSM unigram Silarity between JIRA description and commit log - Log as descrintion

In [33]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmDescriptionLogsAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmDescriptionLogsAsQuery["vsm_description_log_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_mxShopCountVectorizer, processedData_SVNUnitNames_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmDescriptionLogsAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmDescriptionLogsAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 31.22728157043457 seconds


#### 3.4.5 [VSM unigram Silarity between JIRA Comment and unitnames - Comment as query

#### 3.4.5 [VSM unigram Silarity between JIRA Comment and unitnames - Comment as query

#### 3.4.5 [VSM unigram Silarity between JIRA Comment and commit log - Comment as description

#### 3.4.5 [VSM unigram Silarity between JIRA description and commit log - Log as description

#### [VSM bigram] Similarity between JIRA comments and Commit Logs - Logs as query

#### 3.4.5 [VSM bigram] Silarity between JIRA Comment and commit log - Comment as query

#### [VSM Unigram] Similarity between Unit Names and Description - Unit Names as query

In [34]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery["vsm_unitnames_description_unitnames_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_mxShopCountVectorizer, processedData_SVNUnitNames_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 32.14757180213928 seconds


#### [VSM Unigram] Similarity between Unit Names and Description - Description as query

In [35]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery["vsm_unitnames_description_description_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_JIRADescriptions_mxShopCountVectorizer, processedData_JIRADescriptions_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 3 minutes and 39.299832344055176 seconds


#### [VSM Unigram] Similarity between Unit Names and Comments - Unit Names as query

#### [VSM Unigram] Similarity between Unit Names and Comments - Comments as query

In [36]:
#### [VSM Unigram] Similarity between SVN (entirely) and JIRA (entirely)- JIRA as query

In [37]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSvnJiraJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSvnJiraJiraAsQuery["vsm_svn_jira_jira_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Commit_natural_text, processedData_JIRA_mxShopCountVectorizer, processedData_JIRA_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSvnJiraJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSvnJiraJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 3 minutes and 42.32733869552612 seconds


In [38]:
#### [VSM Unigram] Similarity between SVN (entirely) and JIRA (entirely) - SVN as query

In [39]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSvnJiraSvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSvnJiraSvnAsQuery["vsm_svn_jira_svn_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Commit_natural_text, processedData_SVN_mxShopCountVectorizer, processedData_SVN_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSvnJiraSvnAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSvnJiraSvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 37.55129528045654 seconds


In [40]:
#### [VSM Unigram] Similarity between SVN (entirely) and Summary - SVN as query

In [41]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSvnSummarySvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSvnSummarySvnAsQuery["vsm_svn_summary_svn_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Summary, processedData_SVN_mxShopCountVectorizer, processedData_SVN_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSvnSummarySvnAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSvnSummarySvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 33.45942139625549 seconds


In [42]:
#### [VSM Unigram] Similarity between SVN (entirely) and Summary - Summary as query

In [43]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSvnSummarySummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSvnSummarySummaryAsQuery["vsm_svn_summary_summary_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Summary, processedData_JIRASummaries_mxShopCountVectorizer, processedData_JIRASummaries_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSvnSummarySummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSvnSummarySummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 38.702887296676636 seconds


In [44]:
#### [VSM Unigram] Similarity between SVN (entirely) and Description - SVN as query

In [45]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery["vsm_svn_description_svn_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Description, processedData_SVN_mxShopCountVectorizer, processedData_SVN_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 37.61386013031006 seconds


In [46]:
#### [VSM Unigram] Similarity between SVN (entirely) and Description - Description as query

In [47]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery["vsm_svn_description_description_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Description, processedData_JIRADescriptions_mxShopCountVectorizer, processedData_JIRADescriptions_mxShopCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 3 minutes and 38.98972415924072 seconds


In [48]:
#### [VSM Unigram] Similarity between SVN (entirely) and Comments - SVN as query

In [49]:
#### [VSM Unigram] Similarity between SVN (entirely) and Comments - Comments as query

#### 3.4.3 [VSM unigram - verb pruning] Similarity between JIRA issue and Unit Names and verb pruning - Unit Names As Query

In [50]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery["vsm_verb_pruning_unit_names_log_as_query"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarityWithPOSPruning(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_mxShopCountVectorizer, processedData_SVNUnitNames_mxShopCountTF_IDF, x.verbs), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 33.43541121482849 seconds


#### 3.4.5 [VSM bigram] Similarity between JIRA issue and Commit Log - Jira As Query

In [51]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmLogsJiraAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmLogsJiraAsQuery_2gram["vsm_logs_jira_as_query_2gram"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_JIRA_mxShopCountVectorizer_2gram, processedData_JIRA_mxShopCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmLogsJiraAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmLogsJiraAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 3 minutes and 57.775928020477295 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Commit Log - Logs As Query

In [52]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmLogsLogAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmLogsLogAsQuery_2gram["vsm_logs_log_as_query_2gram"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_SVNLogs_mxShopCountVectorizer_2gram, processedData_SVNLogs_mxShopCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmLogsLogAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmLogsLogAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 1 minutes and 28.329726219177246 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Unit Names - Jira As Query

In [53]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram["vsm_unit_names_jira_as_query_2gram"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_JIRA_mxShopCountVectorizer_2gram, processedData_JIRA_mxShopCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 3 minutes and 6.739144325256348 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Unit Names - UnitNames As Query

In [54]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram["vsm_unit_names_log_as_query_2gram"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_mxShopCountVectorizer_2gram, processedData_SVNUnitNames_mxShopCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 33.26380109786987 seconds


#### [VSM bigram] Similarity between Logs and Description - Logs as Query

In [55]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmDescriptionLogsAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmDescriptionLogsAsQuery_2gram["vsm_description_log_as_query_2gram"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_mxShopCountVectorizer_2gram, processedData_SVNUnitNames_mxShopCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmDescriptionLogsAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmDescriptionLogsAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 1 minutes and 29.75190806388855 seconds


#### [VSM bigram] Similarity between Logs and Description - Description as Query

In [56]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery_2gram["vsm_description_description_as_query_2gram"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, processedData_JIRADescriptions_mxShopCountVectorizer_2gram, processedData_JIRADescriptions_mxShopCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmDescriptionDescriptionAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Bigrams' after " + timeDifference)

Finished creating 'VSM Bigrams' after 5 minutes and 32.54574370384216 seconds


#### [VSM bigram] Similarity between Logs and Summary - Logs as Query

#### [VSM bigram] Similarity between Logs and Summary - Summary as Query

In [57]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery_2gram["vsm_summary_logs_summary_as_query_2gram"] = processedData_mxShopCartesian.apply(lambda x: calculateCosineSimilarityNGrams(x.Summary, x.Logs, processedData_JIRASummaries_mxShopCountVectorizer_2gram, processedData_JIRASummaries_mxShopCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 45.65132427215576 seconds


## 3.6 Document Statistics

### mxShop

In [58]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_mxShopFeaturesUniqueWordCount = pd.DataFrame() 
processedData_SVN_mxShopFeaturesUniqueWordCount = pd.DataFrame() 
processedData_JIRA_mxShopFeaturesTotalWordCount = pd.DataFrame() 
processedData_SVN_mxShopFeaturesTotalWordCount = pd.DataFrame()

processedData_JIRA_mxShopFeaturesOverlapPercentage = pd.DataFrame()
processedData_SVN_mxShopFeaturesOverlapPercentage = pd.DataFrame()
processedData_UNION_mxShopFeaturesOverlapPercentage = pd.DataFrame()

#Calculate unique terms JIRA for each trace
processedData_JIRA_mxShopFeaturesUniqueWordCount["unique_term_count_jira"] = processedData_mxShopCartesian.apply(lambda x: calculateUniqueWordCount(x.Jira_natural_text), 
                                                            axis=1)
#Calculate unique terms JIRA for each trace
processedData_SVN_mxShopFeaturesUniqueWordCount["unique_term_count_svn"] = processedData_mxShopCartesian.apply(lambda x: calculateUniqueWordCount(x.Commit_natural_text), 
                                                            axis=1)

#Calculate total terms JIRA for each trace
processedData_JIRA_mxShopFeaturesTotalWordCount["total_term_count_jira"] = processedData_mxShopCartesian.apply(lambda x: calculateTotalWordCount(x.Jira_natural_text), 
                                                            axis=1)
#Calculate total terms JIRA for each trace
processedData_SVN_mxShopFeaturesTotalWordCount["total_term_count_svn"] = processedData_mxShopCartesian.apply(lambda x: calculateTotalWordCount(x.Commit_natural_text), 
                                                            axis=1)

processedData_JIRA_mxShopFeaturesOverlapPercentage["overlap_percentage_compared_to_jira"] = processedData_mxShopCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list1'),
                                                            axis=1)
processedData_SVN_mxShopFeaturesOverlapPercentage["overlap_percentage_compared_to_svn"] = processedData_mxShopCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list2'),
                                                            axis=1)
processedData_UNION_mxShopFeaturesOverlapPercentage["overlap_percentage_compared_to_union"] = processedData_mxShopCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'union'),
                                                            axis=1)





#Save results in pickle
processedData_JIRA_mxShopFeaturesUniqueWordCount.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesUniqueWordCount.pkl")
processedData_SVN_mxShopFeaturesUniqueWordCount.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesUniqueWordCount.pkl")
processedData_JIRA_mxShopFeaturesTotalWordCount.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesTotalWordCount.pkl")
processedData_SVN_mxShopFeaturesTotalWordCount.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesTotalWordCount.pkl")

processedData_JIRA_mxShopFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesOverlapPercentage.pkl")
processedData_SVN_mxShopFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesOverlapPercentage.pkl")
processedData_UNION_mxShopFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_UNION_mxShopFeaturesOverlapPercentage.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating document statistics in " + timeDifference)

Finished creating document statistics in 0 minutes and 6.26347541809082 seconds


## 3.7 Query Quality

In [59]:
#Instantiate the count vectorizer and tfidf for the corpus
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from statistics import mean, median, mode, stdev, variance
from math import log, sqrt
import itertools

#Function calculating the IDFs of all query terms. Returns a list containing all IDFs
def calcIDFList(document, cv, tfidf_transformer):
    idfScoreList=[]
    if isinstance(document, list):
        termCount = len(document)
        for term in document:
            try:
                indexOfWord = cv.get_feature_names().index(term)
                idfScore = tfidf_transformer.idf_[indexOfWord]
                idfScoreList.append(idfScore)
            except:
                idfScoreList.append(0)
    else:
        termCount = 0
    return(idfScoreList)


def calcAvgIDF(IDFList):
    termCount = len(IDFList)
    if(termCount != 0):
        avgIdf = sum(IDFList) / termCount
    else:
        avgIdf = 0
    return(avgIdf)

def calcMaxIDF(IDFList): 
    termCount = len(IDFList)
    if(termCount != 0):
        maxIdf = np.amax(IDFList)
    else: 
        maxIdf = 0
    return(maxIdf)

def calcDevIDF(IDFList):
    termCount = len(IDFList)
    if(termCount > 1):
        stdevIdf = stdev(IDFList)
    else: 
        stdevIdf = 0
    return(stdevIdf)

#Function calculating the ICTF of all query terms. Returns a list containing all IDFs
def calcICTFList(document, cv, documentCount):
    ICTFList = []
        #For all terms in query, find how often they occur in the Corpus
    if isinstance(document, list):
        for term in document:
            try:
            #Find out how often the term occurs in the corpus
                termFrequency = (cv.vocabulary_[term])
                
                #Compute the log
                ictF = log(documentCount/termFrequency)
            except:
                ictF = 0
            
            ICTFList.append(ictF)
    return(ICTFList)

def calcAvgICTF(ICTFList, documentCount):
    avgICTF = sum(ICTFList) / documentCount
    return(avgICTF)


def calcMaxICTF(ICTFList): 
    termCount = len(ICTFList)
    if(termCount != 0):
        maxICTF = np.amax(ICTFList)
    else: 
        maxICTF = 0
    return(maxICTF)

def calcDevICTF(ICTFList):
    termCount = len(ICTFList)
    if(termCount > 1):
        stdevICTF = stdev(ICTFList)
    else: 
        stdevICTF = 0
    return(stdevICTF)


def calcEntropyList(query, cv, documentCount, docCollection):
    #entropy(t) = ∑ (d∈Dt)  ( tf(t,d) / tf(t, D) ) * log |D|(tf(t,d) / tf(t, D) )
        
    entropyValueList = []
    #for each term in the query, calculate the entropy of the query
    if isinstance(query, list):
        for queryTerm in query:
            #For each d ∈ D
            
            partialEntropyList = []
            
            for d in docCollection:
                #Check if queryTerm occurs in D (i.e/ d∈Dt)
                if (isinstance(d, list)):
                    if queryTerm in d:
                        try:
                            #Calculate the frequency of the term occurs in the document (i.e tf(t,d))
                            queryTermFrequencyInDocument = d.count(queryTerm)
                            
                            #calculate the frequency the term occurs in the query corpus (i.e tf(t,D))
                            queryTermFrequencyInCorpus = (cv.vocabulary_[queryTerm])
                             
                            # This part of the calculation tf(t,d) / tf(t, D)  * log |D|(tf(t,d) / tf(t, D))
                            partialEntropy1stHalf = queryTermFrequencyInDocument / queryTermFrequencyInCorpus
                            partialEntropy2ndHalf = log((queryTermFrequencyInDocument / queryTermFrequencyInCorpus), documentCount)
                            partialEntropy = partialEntropy1stHalf
                            partialEntropyList.append(partialEntropy)
                        except:
                            partialEntropyList.append(0) #If term not found entropy is 0
            #this part of the calculation ∑ (d∈Dt)
            entropyValueOfQueryTerm = sum(partialEntropyList)
            entropyValueList.append(entropyValueOfQueryTerm)
    
    return(entropyValueList)


def calcAvgEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        #Calculate the average of all the entropies
        avgEntropy = sum(entropyValueList) / len(entropyValueList)
    else:
        avgEntropy = 0
    return(avgEntropy)

    
def calcMedEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        #Calculate the average of all the entropies
        medEntropy = median(entropyValueList)
    else:
        medEntropy = 0
    return(medEntropy)
    
def calcMaxEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        maxEntropy = np.amax(entropyValueList)
    else: 
        maxEntropy = 0
    return(maxEntropy)
    
def calcDevEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount > 1):
        #Calculate the average of all the entropies
        devEntropy = stdev(entropyValueList)
    else:
        devEntropy = 0
    return(devEntropy)

#The percentage of documents in the collection containing at least one of the query terms
def calcQueryScope(query, docCollection): 
    counter = 0
    if isinstance(query, list):
        for document in docCollection:
            #check if query occurs in term. 
            if(isinstance(document, list)):
                for queryTerm in query:
                    if queryTerm in document:
                        counter = counter + 1
                        break
    queryScope = counter / len(docCollection)
    return(queryScope)

#The Kullback-Leiber divergence of the query language model from the collection language model
def calcSCS(query, cv, docCount):
    divergenceList = []
    if isinstance(query, list):
        for queryTerm in query:
            try:
                #frequency of term in query - tf(q, Q)/|Q|
                pqQ = query.count(queryTerm) / len(query)
                
                #frequency of term in documentlist - tf(q, D)/|D|
                pqD = cv.vocabulary_[queryTerm]
                
                divergence = pqQ * log(pqQ / pqD)
                divergenceList.append(divergence)
            except:
                continue
    SCS = sum(divergenceList)
    return(SCS)

#The average of the collection-query similarity (SCQ) over all query terms
def calcSCQList(query, docCollection, cv, fittedTF_IDF, documentCount):
    SCQList = []
    if isinstance(query, list):
        documentString = ' '.join(query)
        
        #Calculate the Term Frequency of the document
        inputDocs = [documentString] 
        
        # count matrix 
        count_vector = cv.transform(inputDocs) 
 
        #tf-idf scores 
        tf_idf_vector = fittedTF_IDF.transform(count_vector)
        
        feature_names = cv.get_feature_names() 
        # place tf-idf values in a pandas data frame 
        df = pd.DataFrame(tf_idf_vector.T.todense(), 
                          index=feature_names, columns=["tfidf"])
    
        
        #Find the tfidf of the term
        for queryTerm in query:    
            try:
                tfidf = df["tfidf"][queryTerm]
                SCQ = (1 + log(tfidf))
                SCQList.append(SCQ)
            except:
                continue
        
    avgSCQ = sum(SCQList) / documentCount
    return(SCQList)

#The average of the collection-query similarity (SCQ) over all query terms
def calcAvgSCQ(SCQList, documentCount):
    avgSCQ = sum(SCQList) / documentCount
    return(avgSCQ)
    
#The average of the collection-query similarity (SCQ) over all query terms
def calcMaxSCQ(SCQList):
    termCount = len(SCQList)
    if(termCount != 0):
        maxSCQ = np.amax(SCQList)
    else:
        maxSCQ = np.NaN
    return(maxSCQ)

#The average of the collection-query similarity (SCQ) over all query terms
def calcSumSCQ(SCQList):
    sumSCQ = sum(SCQList)
    return(sumSCQ)

def createTermPairs(cv):
    terms = list(cv.vocabulary_.keys())
    #Create all possible pair combinations from the terms in the query 
    pairCombinationList = list(itertools.combinations(terms, 2))
    return(pairCombinationList)

#Method to find out how often a term occurs in a document
def findTermFrequencies(cv, docCollection):
    terms = list(cv.vocabulary_.keys())
    termFrequencies = {}
    for term in terms:
        termCounter = 0
        for document in docCollection:
            if isinstance(document, list):
                if term in document: 
                    termCounter = termCounter + 1
        termFrequencies[term] = termCounter
    return(termFrequencies)

#Method to find out how often both terms occur in a document. 
def findTermPairFrequencies(termPairs, docCollection):
    termPairFrequencies = {}
    for termPair in termPairs:
        termPairCount = 0
        for document in docCollection:
            if (isinstance(document, list)):
                if all(i in document for i in termPair):
                    termPairCount = termPairCount + 1
        termPairFrequencies[termPair] = termPairCount
    return(termPairFrequencies)   

def calcPMIList(query, termFrequencies, termPairFrequencies, docCollection):
    if isinstance(query, list):
    #Find the frequencies of the individual terms and the pairs
        pairCombinationList = list(itertools.combinations(query, 2))
        termOccurances = []
        for pair in pairCombinationList:
            try:
                q1Freq = termFrequencies[pair[0]]
            except:
                q1Freq = 0
            try:
                q2Freq = termFrequencies[pair[1]]
            except:
                q2Freq = 0
            try:
                q1q2Freq = termPairFrequencies[pair]
            except:
                q1q2Freq = 0
                    
            termOccurances.append({'q1Freq': q1Freq, 
                                   'q2Freq': q2Freq, 
                                   'q1q2Freq': q1q2Freq})
    
        docCount = len(docCollection)
        pmiList = []
        for term in termOccurances:
            pq1 = term['q1Freq'] / docCount
            pq2 = term['q2Freq'] / docCount
            pq1q2 = term['q1q2Freq'] / docCount

            try:
                pmi = log(pq1q2 /(pq1 * pq2))
            except:
                pmi = np.nan
            pmiList.append(pmi)
        return(pmiList)
    else:
        return(np.nan)

def calcAvgPMI(pmiList):
    if(isinstance(pmiList, list)):
        pairCount = len(pmiList)
        if(pairCount != 0):
            #Calculate the average of all the entropies
            avgPMI= np.nansum(pmiList) / pairCount
        else:
            avgPMI = 0
        return(avgPMI)
    return(np.nan)

def calcMaxPMI(pmiList): 
    if(isinstance(pmiList, list)):
        pairCount = len(pmiList)
        if(pairCount != 0):
            maxPMI = np.nanmax(pmiList)
        else: 
            maxPMI = np.nan
        return(maxPMI)
    return(np.nan)

In [60]:
#Read datasets from disk
processedData_mxShopCartesian = pd.read_pickle(r"../data/03_processed/processedData_mxShopCartesian.pkl")

#instantiate CountVectorizer() for SVN
processedData_SVN_mxShopCountVectorizer = CountVectorizer()
processedData_SVN_mxShopTF_IDF = createFittedTF_IDF(processedData_SVN_mxShopCountVectorizer, intermediateData_SVN_mxShopCorpusAll)

#instantiate CountVectorizer() for JIRA
processedData_JIRA_mxShopCountVectorizer = CountVectorizer()
processedData_JIRA_mxShopTF_IDF = createFittedTF_IDF(processedData_JIRA_mxShopCountVectorizer, intermediateData_JIRA_mxShopCorpus)

#Determine document counts
intermediateData_JIRA_mxShop_documentCount = len(intermediateData_JIRA_mxShop.index)
intermediateData_SVN_mxShop_documentCount = len(intermediateData_SVN_mxShop.index)



#### IDF Scores (SVN as Query)

In [61]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_mxShopFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_mxShopFeaturesIDF["SvnAsQuery_IDF"] = processedData_mxShopCartesian.apply(lambda x: calcIDFList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_mxShopCountVectorizer, 
                                                                                                                processedData_SVN_mxShopTF_IDF),axis=1)

processedData_SVN_mxShopFeaturesIDF["SvnAsQuery_avgIDF"] = processedData_SVN_mxShopFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnAsQuery_IDF), axis=1)
processedData_SVN_mxShopFeaturesIDF["SvnAsQuery_maxIDF"] = processedData_SVN_mxShopFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnAsQuery_IDF), axis=1)
processedData_SVN_mxShopFeaturesIDF["SvnAsQuery_devIDF"] = processedData_SVN_mxShopFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVN_mxShopFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 11 minutes and 33.5739266872406 seconds


#### IDF Scores (SVNLogs as Query)

In [62]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_mxShopFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_mxShopFeaturesIDF["SvnLogsAsQuery_IDF"] = processedData_mxShopCartesian.apply(lambda x: calcIDFList(x.Logs, 
                                                                                                                processedData_SVNLogs_mxShopCountVectorizer, 
                                                                                                                processedData_SVNLogs_mxShopCountTF_IDF),axis=1)

processedData_SVNLogs_mxShopFeaturesIDF["SvnLogsAsQuery_avgIDF"] = processedData_SVNLogs_mxShopFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnLogsAsQuery_IDF), axis=1)
processedData_SVNLogs_mxShopFeaturesIDF["SvnLogsAsQuery_maxIDF"] = processedData_SVNLogs_mxShopFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnLogsAsQuery_IDF), axis=1)
processedData_SVNLogs_mxShopFeaturesIDF["SvnLogsAsQuery_devIDF"] = processedData_SVNLogs_mxShopFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnLogsAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVNLogs_mxShopFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVNLogs_mxShopFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 1 minutes and 34.83880829811096 seconds


#### IDF Scores (SVNUnitNames as Query

In [63]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_mxShopFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_mxShopFeaturesIDF["SvnUnitNamesAsQuery_IDF"] = processedData_mxShopCartesian.apply(lambda x: calcIDFList(x.Logs, 
                                                                                                                processedData_SVNLogs_mxShopCountVectorizer, 
                                                                                                                processedData_SVNLogs_mxShopCountTF_IDF),axis=1)

processedData_SVNUnitNames_mxShopFeaturesIDF["SvnUnitNamesAsQuery_avgIDF"] = processedData_SVNUnitNames_mxShopFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)
processedData_SVNUnitNames_mxShopFeaturesIDF["SvnUnitNamesAsQuery_maxIDF"] = processedData_SVNUnitNames_mxShopFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)
processedData_SVNUnitNames_mxShopFeaturesIDF["SvnUnitNamesAsQuery_devIDF"] = processedData_SVNUnitNames_mxShopFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVNUnitNames_mxShopFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 1 minutes and 34.84552049636841 seconds


##### IDF Scores (JIRA as Query)

In [64]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_mxShopFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_mxShopFeaturesIDF["JiraAsQuery_IDF"] = processedData_mxShopCartesian.apply(lambda x: calcIDFList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_mxShopCountVectorizer, 
                                                                                                                processedData_JIRA_mxShopTF_IDF),axis=1)

processedData_JIRA_mxShopFeaturesIDF["JiraAsQuery_avgIDF"] = processedData_JIRA_mxShopFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraAsQuery_IDF), axis=1)
processedData_JIRA_mxShopFeaturesIDF["JiraAsQuery_maxIDF"] = processedData_JIRA_mxShopFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraAsQuery_IDF), axis=1)
processedData_JIRA_mxShopFeaturesIDF["JiraAsQuery_devIDF"] = processedData_JIRA_mxShopFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRA_mxShopFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 22 minutes and 23.688847303390503 seconds


##### IDF Scores (JIRA Summaries as Query)

In [65]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_mxShopFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_mxShopFeaturesIDF["JiraSummariesAsQuery_IDF"] = processedData_mxShopCartesian.apply(lambda x: calcIDFList(x.Summary, 
                                                                                                                processedData_JIRASummaries_mxShopCountVectorizer, 
                                                                                                                processedData_JIRASummaries_mxShopCountTF_IDF),axis=1)

processedData_JIRASummaries_mxShopFeaturesIDF["JiraSummariesAsQuery_avgIDF"] = processedData_JIRASummaries_mxShopFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraSummariesAsQuery_IDF), axis=1)
processedData_JIRASummaries_mxShopFeaturesIDF["JiraSummariesAsQuery_maxIDF"] = processedData_JIRASummaries_mxShopFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraSummariesAsQuery_IDF), axis=1)
processedData_JIRASummaries_mxShopFeaturesIDF["JiraSummariesAsQuery_devIDF"] = processedData_JIRASummaries_mxShopFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraSummariesAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRASummaries_mxShopFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_mxShopFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 1 minutes and 52.58431792259216 seconds


##### IDF Scores (JIRA Descriptions as Query)

In [66]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_mxShopFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_mxShopFeaturesIDF["JiraDescriptionsAsQuery_IDF"] = processedData_mxShopCartesian.apply(lambda x: calcIDFList(x.Description, 
                                                                                                                processedData_JIRADescriptions_mxShopCountVectorizer, 
                                                                                                                processedData_JIRADescriptions_mxShopCountTF_IDF),axis=1)

processedData_JIRADescriptions_mxShopFeaturesIDF["JiraDescriptionsAsQuery_avgIDF"] = processedData_JIRADescriptions_mxShopFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)
processedData_JIRADescriptions_mxShopFeaturesIDF["JiraDescriptionsAsQuery_maxIDF"] = processedData_JIRADescriptions_mxShopFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)
processedData_JIRADescriptions_mxShopFeaturesIDF["JiraDescriptionsAsQuery_devIDF"] = processedData_JIRADescriptions_mxShopFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRADescriptions_mxShopFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 19 minutes and 49.29271125793457 seconds


##### IDF Scores (JIRA Comments as Query)

#### ICTF Scores (SVN as query)

In [67]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_mxShopFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_mxShopFeaturesICTF["SvnAsQuery_ICTF"] = processedData_mxShopCartesian.apply(lambda x: calcIDFList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount),axis=1)

processedData_SVN_mxShopFeaturesICTF["SvnAsQuery_avgICTF"] = processedData_SVN_mxShopFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnAsQuery_ICTF, intermediateData_SVN_mxShop_documentCount), axis=1)
processedData_SVN_mxShopFeaturesICTF["SvnAsQuery_maxICTF"] = processedData_SVN_mxShopFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnAsQuery_ICTF), axis=1)
processedData_SVN_mxShopFeaturesICTF["SvnAsQuery_devICTF"] = processedData_SVN_mxShopFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVN_mxShopFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesICTF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 5 minutes and 29.143274068832397 seconds


#### ICTF Scores (SVNLogs as query)

In [68]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_mxShopFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_mxShopFeaturesICTF["SvnLogsAsQuery_ICTF"] = processedData_mxShopCartesian.apply(lambda x: calcICTFList(x.Logs, 
                                                                                                                processedData_SVNLogs_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount),axis=1)
##
processedData_SVNLogs_mxShopFeaturesICTF["SvnLogsAsQuery_avgICTF"] = processedData_SVNLogs_mxShopFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnLogsAsQuery_ICTF, intermediateData_SVN_mxShop_documentCount), axis=1)
processedData_SVNLogs_mxShopFeaturesICTF["SvnLogsAsQuery_maxICTF"] = processedData_SVNLogs_mxShopFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnLogsAsQuery_ICTF), axis=1)
processedData_SVNLogs_mxShopFeaturesICTF["SvnLogsAsQuery_devICTF"] = processedData_SVNLogs_mxShopFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnLogsAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVNLogs_mxShopFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVNLogs_mxShopFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 5.362651348114014 seconds


#### ICTF Scores (SVNUnitNames as query)

In [69]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_mxShopFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_mxShopFeaturesICTF["SvnUnitNamesAsQuery_ICTF"] = processedData_mxShopCartesian.apply(lambda x: calcICTFList(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount),axis=1)
##
processedData_SVNUnitNames_mxShopFeaturesICTF["SvnUnitNamesAsQuery_avgICTF"] = processedData_SVNUnitNames_mxShopFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnUnitNamesAsQuery_ICTF, intermediateData_SVN_mxShop_documentCount), axis=1)
processedData_SVNUnitNames_mxShopFeaturesICTF["SvnUnitNamesAsQuery_maxICTF"] = processedData_SVNUnitNames_mxShopFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnUnitNamesAsQuery_ICTF), axis=1)
processedData_SVNUnitNames_mxShopFeaturesICTF["SvnUnitNamesAsQuery_devICTF"] = processedData_SVNUnitNames_mxShopFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnUnitNamesAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVNUnitNames_mxShopFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 10.362074613571167 seconds


#### ICTF Scores (JIRA as query)

In [70]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_mxShopFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_mxShopFeaturesICTF["JiraAsQuery_ICTF"] = processedData_mxShopCartesian.apply(lambda x: calcICTFList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount),axis=1)
##
processedData_JIRA_mxShopFeaturesICTF["JiraAsQuery_avgICTF"] = processedData_JIRA_mxShopFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraAsQuery_ICTF, intermediateData_JIRA_mxShop_documentCount), axis=1)
processedData_JIRA_mxShopFeaturesICTF["JiraAsQuery_maxICTF"] = processedData_JIRA_mxShopFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraAsQuery_ICTF), axis=1)
processedData_JIRA_mxShopFeaturesICTF["JiraAsQuery_devICTF"] = processedData_JIRA_mxShopFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRA_mxShopFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 10.448076725006104 seconds


#### ICTF Scores (JIRA Summaries as query)

In [71]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_mxShopFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_mxShopFeaturesICTF["JiraSummariesAsQuery_ICTF"] = processedData_mxShopCartesian.apply(lambda x: calcICTFList(x.Summary, 
                                                                                                                processedData_JIRASummaries_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount),axis=1)
##
processedData_JIRASummaries_mxShopFeaturesICTF["JiraSummariesAsQuery_avgICTF"] = processedData_JIRASummaries_mxShopFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraSummariesAsQuery_ICTF, intermediateData_JIRA_mxShop_documentCount), axis=1)
processedData_JIRASummaries_mxShopFeaturesICTF["JiraSummariesAsQuery_maxICTF"] = processedData_JIRASummaries_mxShopFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraSummariesAsQuery_ICTF), axis=1)
processedData_JIRASummaries_mxShopFeaturesICTF["JiraSummariesAsQuery_devICTF"] = processedData_JIRASummaries_mxShopFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraSummariesAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRASummaries_mxShopFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_mxShopFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 6.820654630661011 seconds


#### ICTF Scores (JIRA Descriptions as query)

In [72]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_mxShopFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_mxShopFeaturesICTF["JiraDescriptionsAsQuery_ICTF"] = processedData_mxShopCartesian.apply(lambda x: calcICTFList(x.Description, 
                                                                                                                processedData_JIRADescriptions_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount),axis=1)
##
processedData_JIRADescriptions_mxShopFeaturesICTF["JiraDescriptionsAsQuery_avgICTF"] = processedData_JIRADescriptions_mxShopFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraDescriptionsAsQuery_ICTF, intermediateData_JIRA_mxShop_documentCount), axis=1)
processedData_JIRADescriptions_mxShopFeaturesICTF["JiraDescriptionsAsQuery_maxICTF"] = processedData_JIRADescriptions_mxShopFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraDescriptionsAsQuery_ICTF), axis=1)
processedData_JIRADescriptions_mxShopFeaturesICTF["JiraDescriptionsAsQuery_devICTF"] = processedData_JIRADescriptions_mxShopFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraDescriptionsAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRADescriptions_mxShopFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 10.199738025665283 seconds


#### ICTF Scores (JIRA Comments as query)

#### Entropy (SVN as query)

In [73]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_mxShopFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_Entropy"] = processedData_mxShopCartesian.apply(lambda x: calcEntropyList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount,
                                                                                                                intermediateData_SVN_mxShop.Commit_natural_text),axis=1)

processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_avgEntropy"] = processedData_SVN_mxShopFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_medEntropy"] = processedData_SVN_mxShopFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_maxEntropy"] = processedData_SVN_mxShopFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_devEntropy"] = processedData_SVN_mxShopFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnAsQuery_Entropy), axis=1)

#Save results in pickle
processedData_SVN_mxShopFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesEntropy.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 9 minutes and 27.337192058563232 seconds


#### Entropy (SVNLogs as query)

In [74]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_mxShopFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_Entropy"] = processedData_mxShopCartesian.apply(lambda x: calcEntropyList(x.Logs, 
                                                                                                                processedData_SVNLogs_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount,
                                                                                                                intermediateData_SVN_mxShop.Logs),axis=1)
##
processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_avgEntropy"] = processedData_SVNLogs_mxShopFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_medEntropy"] = processedData_SVNLogs_mxShopFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_maxEntropy"] = processedData_SVNLogs_mxShopFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_devEntropy"] = processedData_SVNLogs_mxShopFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnLogsAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_SVNLogs_mxShopFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVNLogs_mxShopFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 14.758890867233276 seconds


#### Entropy (SVNUnitNames as query)

In [75]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_mxShopFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_Entropy"] = processedData_mxShopCartesian.apply(lambda x: calcEntropyList(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount,
                                                                                                                intermediateData_SVN_mxShop.Unit_names),axis=1)
##
processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_avgEntropy"] = processedData_SVNUnitNames_mxShopFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_medEntropy"] = processedData_SVNUnitNames_mxShopFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_maxEntropy"] = processedData_SVNUnitNames_mxShopFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_devEntropy"] = processedData_SVNUnitNames_mxShopFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_SVNUnitNames_mxShopFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 8 minutes and 25.446935653686523 seconds


#### Entropy (JIRA as query)

In [76]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_mxShopFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_Entropy"] = processedData_mxShopCartesian.apply(lambda x: calcEntropyList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount,
                                                                                                                intermediateData_JIRA_mxShop.Jira_natural_text),axis=1)
##
processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_avgEntropy"] = processedData_JIRA_mxShopFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_medEntropy"] = processedData_JIRA_mxShopFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_maxEntropy"] = processedData_JIRA_mxShopFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_devEntropy"] = processedData_JIRA_mxShopFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRA_mxShopFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 13 minutes and 37.50166034698486 seconds


#### Entropy (JIRA Summaries as query)

In [77]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_mxShopFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_Entropy"] = processedData_mxShopCartesian.apply(lambda x: calcEntropyList(x.Summary, 
                                                                                                                processedData_JIRASummaries_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount,
                                                                                                                intermediateData_JIRA_mxShop.Summary),axis=1)
##
processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_avgEntropy"] = processedData_JIRASummaries_mxShopFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_medEntropy"] = processedData_JIRASummaries_mxShopFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_maxEntropy"] = processedData_JIRASummaries_mxShopFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_devEntropy"] = processedData_JIRASummaries_mxShopFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRASummaries_mxShopFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_mxShopFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 28.030139207839966 seconds


#### Entropy (JIRA Descriptions as query)

In [78]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_mxShopFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_Entropy"] = processedData_mxShopCartesian.apply(lambda x: calcEntropyList(x.Description, 
                                                                                                                processedData_JIRADescriptions_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount,
                                                                                                                intermediateData_JIRA_mxShop.Description),axis=1)
##
processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_avgEntropy"] = processedData_JIRADescriptions_mxShopFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_medEntropy"] = processedData_JIRADescriptions_mxShopFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_maxEntropy"] = processedData_JIRADescriptions_mxShopFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_devEntropy"] = processedData_JIRADescriptions_mxShopFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRADescriptions_mxShopFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 10 minutes and 53.47031378746033 seconds


#### Entropy (JIRA Comments as query)

##### Query Scope (SVN as query)

In [79]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_mxShopFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_mxShopFeaturesQueryScope["SvnAsQuery_QueryScope"] = processedData_mxShopCartesian.apply(lambda x: calcQueryScope(x.Commit_natural_text, 
                                                                                                                intermediateData_SVN_mxShop.Commit_natural_text),axis=1)

#Save results in pickle
processedData_SVN_mxShopFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 17.28474998474121 seconds


##### Query Scope (SVNLogs as query)

In [80]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_mxShopFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_mxShopFeaturesQueryScope["SvnLogsAsQuery_QueryScope"] = processedData_mxShopCartesian.apply(lambda x: calcQueryScope(x.Logs, 
                                                                                                                intermediateData_SVN_mxShop.Logs),axis=1)

#Save results in pickle
processedData_SVNLogs_mxShopFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVNLogs_mxShopFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 4.86288595199585 seconds


##### Query Scope (SVNUnitNames as query)

In [81]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_mxShopFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_mxShopFeaturesQueryScope["SvnUnitNamesAsQuery_QueryScope"] = processedData_mxShopCartesian.apply(lambda x: calcQueryScope(x.Unit_names, 
                                                                                                                intermediateData_SVN_mxShop.Unit_names),axis=1)

#Save results in pickle
processedData_SVNUnitNames_mxShopFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 10.458050012588501 seconds


##### Query Scope (JIRA as query)

In [82]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_mxShopFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_mxShopFeaturesQueryScope["JiraAsQuery_QueryScope"] = processedData_mxShopCartesian.apply(lambda x: calcQueryScope(x.Jira_natural_text, 
                                                                                                                intermediateData_JIRA_mxShop.Jira_natural_text),axis=1)

#Save results in pickle
processedData_JIRA_mxShopFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 3 minutes and 40.0184326171875 seconds


##### Query Scope (JIRA Summaries as query)

In [83]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_mxShopFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_mxShopFeaturesQueryScope["JiraSummariesAsQuery_QueryScope"] = processedData_mxShopCartesian.apply(lambda x: calcQueryScope(x.Summary, 
                                                                                                                intermediateData_JIRA_mxShop.Summary),axis=1)

#Save results in pickle
processedData_JIRASummaries_mxShopFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_mxShopFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 13.364269018173218 seconds


##### Query Scope (JIRA Descriptions as query)

In [84]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_mxShopFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_mxShopFeaturesQueryScope["JiraDescriptionsAsQuery_QueryScope"] = processedData_mxShopCartesian.apply(lambda x: calcQueryScope(x.Description, 
                                                                                                                intermediateData_JIRA_mxShop.Description),axis=1)

#Save results in pickle
processedData_JIRADescriptions_mxShopFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 3.145510673522949 seconds


##### Query Scope (JIRA Comments as query)

#### Kullback-Leiber divergence (SVN as query)

In [85]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_mxShopFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_mxShopFeaturesSCS["SvnAsQuery_SCS"] = processedData_mxShopCartesian.apply(lambda x: calcSCS(x.Commit_natural_text, 
                                                                                                                processedData_SVN_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount),axis=1)

#Save results in pickle
processedData_SVN_mxShopFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 26.224824905395508 seconds


#### Kullback-Leiber divergence (SVNLogs as query)

In [86]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_mxShopFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_mxShopFeaturesSCS["SvnLogsAsQuery_SCS"] = processedData_mxShopCartesian.apply(lambda x: calcSCS(x.Logs, 
                                                                                                                processedData_SVNLogs_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount),axis=1)

#Save results in pickle
processedData_SVNLogs_mxShopFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVNLogs_mxShopFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 1.0075480937957764 seconds


#### Kullback-Leiber divergence (SVNUnitNames as query)

In [87]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_mxShopFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_mxShopFeaturesSCS["SvnUnitNamesAsQuery_SCS"] = processedData_mxShopCartesian.apply(lambda x: calcSCS(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_mxShopCountVectorizer, 
                                                                                                                intermediateData_SVN_mxShop_documentCount),axis=1)

#Save results in pickle
processedData_SVNUnitNames_mxShopFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 26.23382830619812 seconds


#### Kullback-Leiber divergence (JIRA as query)

In [88]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_mxShopFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_mxShopFeaturesSCS["JiraAsQuery_SCS"] = processedData_mxShopCartesian.apply(lambda x: calcSCS(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount),axis=1)

#Save results in pickle
processedData_JIRA_mxShopFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 2.8304619789123535 seconds


#### Kullback-Leiber divergence (JIRA Summaries as query)

In [89]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_mxShopFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_mxShopFeaturesSCS["JiraSummariesAsQuery_SCS"] = processedData_mxShopCartesian.apply(lambda x: calcSCS(x.Summary, 
                                                                                                                processedData_JIRASummaries_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount),axis=1)

#Save results in pickle
processedData_JIRASummaries_mxShopFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_mxShopFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 0.7719323635101318 seconds


In [90]:
##### Kullback-Leiber divergence (JIRA Description as query)

In [91]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_mxShopFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_mxShopFeaturesSCS["JiraDescriptionsAsQuery_SCS"] = processedData_mxShopCartesian.apply(lambda x: calcSCS(x.Description, 
                                                                                                                processedData_JIRADescriptions_mxShopCountVectorizer, 
                                                                                                                intermediateData_JIRA_mxShop_documentCount),axis=1)

#Save results in pickle
processedData_JIRADescriptions_mxShopFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 2.652320623397827 seconds


In [92]:
##### Kullback-Leiber divergence (JIRA Comments as query)

#### SCQ (SVN as Query)

In [93]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_mxShopFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_mxShopFeaturesSCQ["SvnAsQuery_SCQ"] = processedData_mxShopCartesian.apply(lambda x: calcSCQList(x.Commit_natural_text, intermediateData_SVN_mxShop.Commit_natural_text,
                                                                                                                                         processedData_SVN_mxShopCountVectorizer,
                                                                                                                                         processedData_SVN_mxShopCountTF_IDF,
                                                                                                                                         intermediateData_SVN_mxShop_documentCount),axis=1)

processedData_SVN_mxShopFeaturesSCQ["SvnAsQuery_avgSCQ"] = processedData_SVN_mxShopFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnAsQuery_SCQ, intermediateData_SVN_mxShop_documentCount), axis=1)
processedData_SVN_mxShopFeaturesSCQ["SvnAsQuery_maxSCQ"] = processedData_SVN_mxShopFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnAsQuery_SCQ), axis=1)
processedData_SVN_mxShopFeaturesSCQ["SvnAsQuery_sumSCQ"] = processedData_SVN_mxShopFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVN_mxShopFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 39.3187518119812 seconds


#### SCQ (SVNLogs as Query)

In [94]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_mxShopFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_mxShopFeaturesSCQ["SvnLogsAsQuery_SCQ"] = processedData_mxShopCartesian.apply(lambda x: calcSCQList(x.Logs, intermediateData_SVN_mxShop.Logs,
                                                                                                                                         processedData_SVNLogs_mxShopCountVectorizer,
                                                                                                                                         processedData_SVNLogs_mxShopCountTF_IDF,
                                                                                                                                         intermediateData_SVN_mxShop_documentCount),axis=1)

processedData_SVNLogs_mxShopFeaturesSCQ["SvnLogsAsQuery_avgSCQ"] = processedData_SVNLogs_mxShopFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnLogsAsQuery_SCQ, intermediateData_SVN_mxShop_documentCount), axis=1)
processedData_SVNLogs_mxShopFeaturesSCQ["SvnLogsAsQuery_maxSCQ"] = processedData_SVNLogs_mxShopFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnLogsAsQuery_SCQ), axis=1)
processedData_SVNLogs_mxShopFeaturesSCQ["SvnLogsAsQuery_sumSCQ"] = processedData_SVNLogs_mxShopFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnLogsAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVNLogs_mxShopFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVNLogs_mxShopFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 35.09811496734619 seconds


#### SCQ (SVNUnitNames as Query)

In [95]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_mxShopFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_mxShopFeaturesSCQ["SvnUnitNamesAsQuery_SCQ"] = processedData_mxShopCartesian.apply(lambda x: calcSCQList(x.Unit_names, intermediateData_SVN_mxShop.Unit_names,
                                                                                                                                         processedData_SVNUnitNames_mxShopCountVectorizer,
                                                                                                                                         processedData_SVNUnitNames_mxShopCountTF_IDF,
                                                                                                                                         intermediateData_SVN_mxShop_documentCount),axis=1)

processedData_SVNUnitNames_mxShopFeaturesSCQ["SvnUnitNamesAsQuery_avgSCQ"] = processedData_SVNUnitNames_mxShopFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnUnitNamesAsQuery_SCQ, intermediateData_SVN_mxShop_documentCount), axis=1)
processedData_SVNUnitNames_mxShopFeaturesSCQ["SvnUnitNamesAsQuery_maxSCQ"] = processedData_SVNUnitNames_mxShopFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnUnitNamesAsQuery_SCQ), axis=1)
processedData_SVNUnitNames_mxShopFeaturesSCQ["SvnUnitNamesAsQuery_sumSCQ"] = processedData_SVNUnitNames_mxShopFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnUnitNamesAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVNUnitNames_mxShopFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 33.80674600601196 seconds


#### SCQ (JIRA as Query)

In [96]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_mxShopFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_mxShopFeaturesSCQ["JiraAsQuery_SCQ"] = processedData_mxShopCartesian.apply(lambda x: calcSCQList(x.Jira_natural_text, intermediateData_JIRA_mxShop.Jira_natural_text,
                                                                                                                                         processedData_JIRA_mxShopCountVectorizer,
                                                                                                                                         processedData_JIRA_mxShopTF_IDF,
                                                                                                                                         intermediateData_JIRA_mxShop_documentCount),axis=1)

processedData_JIRA_mxShopFeaturesSCQ["JiraAsQuery_avgSCQ"] = processedData_JIRA_mxShopFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraAsQuery_SCQ, intermediateData_JIRA_mxShop_documentCount), axis=1)
processedData_JIRA_mxShopFeaturesSCQ["JiraAsQuery_maxSCQ"] = processedData_JIRA_mxShopFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraAsQuery_SCQ), axis=1)
processedData_JIRA_mxShopFeaturesSCQ["JiraAsQuery_sumSCQ"] = processedData_JIRA_mxShopFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRA_mxShopFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 21.878973484039307 seconds


#### SCQ (JIRA Summaries as Query)

In [97]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_mxShopFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_mxShopFeaturesSCQ["JiraSummariesAsQuery_SCQ"] = processedData_mxShopCartesian.apply(lambda x: calcSCQList(x.Summary, intermediateData_JIRA_mxShop.Summary,
                                                                                                                                         processedData_JIRASummaries_mxShopCountVectorizer,
                                                                                                                                         processedData_JIRASummaries_mxShopCountTF_IDF,
                                                                                                                                         intermediateData_JIRA_mxShop_documentCount),axis=1)

processedData_JIRASummaries_mxShopFeaturesSCQ["JiraSummariesAsQuery_avgSCQ"] = processedData_JIRASummaries_mxShopFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraSummariesAsQuery_SCQ, intermediateData_JIRA_mxShop_documentCount), axis=1)
processedData_JIRASummaries_mxShopFeaturesSCQ["JiraSummariesAsQuery_maxSCQ"] = processedData_JIRASummaries_mxShopFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraSummariesAsQuery_SCQ), axis=1)
processedData_JIRASummaries_mxShopFeaturesSCQ["JiraSummariesAsQuery_sumSCQ"] = processedData_JIRASummaries_mxShopFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraSummariesAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRASummaries_mxShopFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_mxShopFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 41.34452772140503 seconds


#### SCQ (JIRA Descriptions as Query)

In [98]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_mxShopFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_mxShopFeaturesSCQ["JiraDescriptionsAsQuery_SCQ"] = processedData_mxShopCartesian.apply(lambda x: calcSCQList(x.Description, intermediateData_JIRA_mxShop.Description,
                                                                                                                                         processedData_JIRADescriptions_mxShopCountVectorizer,
                                                                                                                                         processedData_JIRADescriptions_mxShopCountTF_IDF,
                                                                                                                                         intermediateData_JIRA_mxShop_documentCount),axis=1)

processedData_JIRADescriptions_mxShopFeaturesSCQ["JiraDescriptionsAsQuery_avgSCQ"] = processedData_JIRADescriptions_mxShopFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraDescriptionsAsQuery_SCQ, intermediateData_JIRA_mxShop_documentCount), axis=1)
processedData_JIRADescriptions_mxShopFeaturesSCQ["JiraDescriptionsAsQuery_maxSCQ"] = processedData_JIRADescriptions_mxShopFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraDescriptionsAsQuery_SCQ), axis=1)
processedData_JIRADescriptions_mxShopFeaturesSCQ["JiraDescriptionsAsQuery_sumSCQ"] = processedData_JIRADescriptions_mxShopFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraDescriptionsAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRADescriptions_mxShopFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 17.451756477355957 seconds


#### SCQ (JIRA Comments as Query)

#### PMI (SVN as query)

In [99]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVN_mxShopCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVN_mxShopCountVectorizer, intermediateData_SVN_mxShop.Commit_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_mxShop.Commit_natural_text)

#Create new dataFrame
processedData_SVN_mxShopFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_mxShopFeaturesPMI["SvnAsQuery_PMI"] = processedData_mxShopCartesian.apply(lambda x: calcPMIList(x.Commit_natural_text, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_mxShop.Commit_natural_text),axis=1)

processedData_SVN_mxShopFeaturesPMI["SvnAsQuery_avgPMI"] = processedData_SVN_mxShopFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnAsQuery_PMI), axis=1)
processedData_SVN_mxShopFeaturesPMI["SvnAsQuery_maxPMI"] = processedData_SVN_mxShopFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnAsQuery_PMI), axis=1)



processedData_SVN_mxShopFeaturesPMI.drop('SvnAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVN_mxShopFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVN_mxShopFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 25 minutes and 56.55142426490784 seconds


#### PMI (SVNLogs as query)

In [100]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVNLogs_mxShopCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVNLogs_mxShopCountVectorizer, intermediateData_SVN_mxShop.Logs)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_mxShop.Logs)

#Create new dataFrame
processedData_SVNLogs_mxShopFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_mxShopFeaturesPMI["SvnLogsAsQuery_PMI"] = processedData_mxShopCartesian.apply(lambda x: calcPMIList(x.Logs, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_mxShop.Logs),axis=1)

processedData_SVNLogs_mxShopFeaturesPMI["SvnLogsAsQuery_avgPMI"] = processedData_SVNLogs_mxShopFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnLogsAsQuery_PMI), axis=1)
processedData_SVNLogs_mxShopFeaturesPMI["SvnLogsAsQuery_maxPMI"] = processedData_SVNLogs_mxShopFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnLogsAsQuery_PMI), axis=1)



processedData_SVNLogs_mxShopFeaturesPMI.drop('SvnLogsAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVNLogs_mxShopFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVNLogs_mxShopFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 0 minutes and 17.834798574447632 seconds


#### PMI (SVNUnitNames as query)

In [101]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVNUnitNames_mxShopCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVNUnitNames_mxShopCountVectorizer, intermediateData_SVN_mxShop.Unit_names)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_mxShop.Unit_names)

#Create new dataFrame
processedData_SVNUnitNames_mxShopFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_mxShopFeaturesPMI["SvnUnitNamesAsQuery_PMI"] = processedData_mxShopCartesian.apply(lambda x: calcPMIList(x.Unit_names, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_mxShop.Unit_names),axis=1)

processedData_SVNUnitNames_mxShopFeaturesPMI["SvnUnitNamesAsQuery_avgPMI"] = processedData_SVNUnitNames_mxShopFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnUnitNamesAsQuery_PMI), axis=1)
processedData_SVNUnitNames_mxShopFeaturesPMI["SvnUnitNamesAsQuery_maxPMI"] = processedData_SVNUnitNames_mxShopFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnUnitNamesAsQuery_PMI), axis=1)



processedData_SVNUnitNames_mxShopFeaturesPMI.drop('SvnUnitNamesAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVNUnitNames_mxShopFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 24 minutes and 43.04617142677307 seconds


#### PMI (JIRA as query)

In [102]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRA_mxShopCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRA_mxShopCountVectorizer, intermediateData_JIRA_mxShop.Jira_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_mxShop.Jira_natural_text)

#Create new dataFrame
processedData_JIRA_mxShopFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_mxShopFeaturesPMI["JiraAsQuery_PMI"] = processedData_mxShopCartesian.apply(lambda x: calcPMIList(x.Jira_natural_text, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_mxShop.Jira_natural_text),axis=1)

processedData_JIRA_mxShopFeaturesPMI["JiraAsQuery_avgPMI"] = processedData_JIRA_mxShopFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraAsQuery_PMI), axis=1)
processedData_JIRA_mxShopFeaturesPMI["JiraAsQuery_maxPMI"] = processedData_JIRA_mxShopFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraAsQuery_PMI), axis=1)



processedData_JIRA_mxShopFeaturesPMI.drop('JiraAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRA_mxShopFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRA_mxShopFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 61 minutes and 47.90031933784485 seconds


#### PMI (JIRA Summaries as query)

In [103]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRASummaries_mxShopCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRASummaries_mxShopCountVectorizer, intermediateData_JIRA_mxShop.Summary)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_mxShop.Summary)

#Create new dataFrame
processedData_JIRASummaries_mxShopFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_mxShopFeaturesPMI["JiraSummariesAsQuery_PMI"] = processedData_mxShopCartesian.apply(lambda x: calcPMIList(x.Summary, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_mxShop.Summary),axis=1)

processedData_JIRASummaries_mxShopFeaturesPMI["JiraSummariesAsQuery_avgPMI"] = processedData_JIRASummaries_mxShopFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraSummariesAsQuery_PMI), axis=1)
processedData_JIRASummaries_mxShopFeaturesPMI["JiraSummariesAsQuery_maxPMI"] = processedData_JIRASummaries_mxShopFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraSummariesAsQuery_PMI), axis=1)



processedData_JIRASummaries_mxShopFeaturesPMI.drop('JiraSummariesAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRASummaries_mxShopFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_mxShopFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 2 minutes and 12.151050567626953 seconds


#### PMI (JIRA Descriptions as query)

In [104]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRADescriptions_mxShopCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRADescriptions_mxShopCountVectorizer, intermediateData_JIRA_mxShop.Description)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_mxShop.Description)

#Create new dataFrame
processedData_JIRADescriptions_mxShopFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_mxShopFeaturesPMI["JiraDescriptionsAsQuery_PMI"] = processedData_mxShopCartesian.apply(lambda x: calcPMIList(x.Description, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_mxShop.Description),axis=1)

processedData_JIRADescriptions_mxShopFeaturesPMI["JiraDescriptionsAsQuery_avgPMI"] = processedData_JIRADescriptions_mxShopFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraDescriptionsAsQuery_PMI), axis=1)
processedData_JIRADescriptions_mxShopFeaturesPMI["JiraDescriptionsAsQuery_maxPMI"] = processedData_JIRADescriptions_mxShopFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraDescriptionsAsQuery_PMI), axis=1)



processedData_JIRADescriptions_mxShopFeaturesPMI.drop('JiraDescriptionsAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRADescriptions_mxShopFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 52 minutes and 42.46821045875549 seconds


#### PMI (JIRA Comments as query)

In [2]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

def normalizeData(dataFrame):
    scaler = preprocessing.MinMaxScaler()
    names = dataFrame.columns
    d = scaler.fit_transform(dataFrame)
    scaledDataFrame = pd.DataFrame(d, columns=names)
    return(scaledDataFrame)

# Normalize all data

In [3]:
from sklearn import preprocessing
import numpy as np

################################## Loading #################################
#Load Process-Related Features
processedData_mxShopFeaturesTime = pd.read_pickle(r'../data/03_processed/processedData_mxShopFeaturesTime.pkl')
processedData_mxShopFeaturesStakeholder = pd.read_pickle(r'../data/03_processed/processedData_mxShopFeaturesStakeholder.pkl')

#Load IR-Related Features - unigram
processedData_mxShop_features_VsmLogsJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmLogsJiraAsQuery.pkl')
processedData_mxShop_features_VsmLogsLogAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmLogsLogAsQuery.pkl')
processedData_mxShop_features_VsmUnitNamesJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesJiraAsQuery.pkl')
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery.pkl')

#processedData_mxShop_features_VsmUnitNamesCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesCommentsCommentsAsQuery.pkl')
#processedData_mxShop_features_VsmUnitNamesCommentsUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesCommentsUnitNamesAsQuery.pkl')
processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery.pkl')
processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery.pkl')

#processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery.pkl')
#processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.pkl')
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery.pkl')
processedData_mxShop_features_VsmSummaryLogsLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSummaryLogsLogsAsQuery.pkl')
processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery.pkl')
processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery.pkl')
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmDescriptionDescriptionAsQuery.pkl')
processedData_mxShop_features_VsmDescriptionLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmDescriptionLogsAsQuery.pkl')
#processedData_mxShop_features_VsmCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmCommentsCommentsAsQuery.pkl')
#processedData_mxShop_features_VsmCommentsLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmCommentsLogsAsQuery.pkl')

processedData_mxShop_features_VsmSvnJiraJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnJiraJiraAsQuery.pkl')
processedData_mxShop_features_VsmSvnJiraSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnJiraSvnAsQuery.pkl')
processedData_mxShop_features_VsmSvnSummarySvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnSummarySvnAsQuery.pkl')
processedData_mxShop_features_VsmSvnSummarySummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnSummarySummaryAsQuery.pkl')
processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery.pkl')
processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery.pkl')
#processedData_mxShop_features_VsmSvnCommentsSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnCommentsSvnAsQuery.pkl')
#processedData_mxShop_features_VsmSvnCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmSvnCommentsCommentsAsQuery.pkl')


#Load IR-Related Features - bigram
#processedData_mxShop_features_VsmLogsJiraAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmLogsJiraAsQuery_2gram.pkl')
#processedData_mxShop_features_VsmLogsLogAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmLogsLogAsQuery_2gram.pkl')
#processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram.pkl')
#processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram.pkl')
#processedData_mxShop_features_VsmCommentsLogsAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmCommentsLogsAsQuery_2gram.pkl')
#processedData_mxShop_features_VsmCommentsCommentsAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_mxShop_features_VsmCommentsCommentsAsQuery_2gram.pkl')


#Load Document Statistics Features
processedData_JIRA_mxShopFeaturesUniqueWordCount = pd.read_pickle(r"../data/03_processed/processedData_JIRA_mxShopFeaturesUniqueWordCount.pkl")
processedData_SVN_mxShopFeaturesUniqueWordCount = pd.read_pickle(r"../data/03_processed/processedData_SVN_mxShopFeaturesUniqueWordCount.pkl")
processedData_JIRA_mxShopFeaturesTotalWordCount = pd.read_pickle(r"../data/03_processed/processedData_JIRA_mxShopFeaturesTotalWordCount.pkl")
processedData_SVN_mxShopFeaturesTotalWordCount = pd.read_pickle(r"../data/03_processed/processedData_SVN_mxShopFeaturesTotalWordCount.pkl")
processedData_JIRA_mxShopFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_JIRA_mxShopFeaturesOverlapPercentage.pkl")
processedData_SVN_mxShopFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_SVN_mxShopFeaturesOverlapPercentage.pkl")
processedData_UNION_mxShopFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_UNION_mxShopFeaturesOverlapPercentage.pkl")

#Load Query Quality Features
#processedData_mxShopFeaturesQueryQuality = pd.read_pickle(r'../data/03_processed/processedData_mxShopFeaturesQueryQuality.pkl')
processedData_SVN_mxShopFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVN_mxShopFeaturesIDF.pkl')
processedData_SVNLogs_mxShopFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_mxShopFeaturesIDF.pkl')
processedData_SVNUnitNames_mxShopFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesIDF.pkl')
processedData_JIRA_mxShopFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRA_mxShopFeaturesIDF.pkl')
processedData_JIRASummaries_mxShopFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_mxShopFeaturesIDF.pkl')
processedData_JIRADescriptions_mxShopFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesIDF.pkl')
#processedData_JIRAComments_mxShopFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_mxShopFeaturesIDF.pkl')


processedData_SVN_mxShopFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVN_mxShopFeaturesICTF.pkl')
processedData_SVNLogs_mxShopFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_mxShopFeaturesICTF.pkl')
processedData_SVNUnitNames_mxShopFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesICTF.pkl')
processedData_JIRA_mxShopFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRA_mxShopFeaturesICTF.pkl')
processedData_JIRASummaries_mxShopFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_mxShopFeaturesICTF.pkl')
processedData_JIRADescriptions_mxShopFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesICTF.pkl')
#processedData_JIRAComments_mxShopFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_mxShopFeaturesICTF.pkl')


processedData_SVN_mxShopFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVN_mxShopFeaturesEntropy.pkl')
processedData_SVNLogs_mxShopFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_mxShopFeaturesEntropy.pkl')
processedData_SVNUnitNames_mxShopFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesEntropy.pkl')
processedData_JIRA_mxShopFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRA_mxShopFeaturesEntropy.pkl')
processedData_JIRASummaries_mxShopFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_mxShopFeaturesEntropy.pkl')
processedData_JIRADescriptions_mxShopFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesEntropy.pkl')
#processedData_JIRAComments_mxShopFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_mxShopFeaturesEntropy.pkl')


processedData_SVN_mxShopFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVN_mxShopFeaturesQueryScope.pkl')
processedData_SVNLogs_mxShopFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_mxShopFeaturesQueryScope.pkl')
processedData_SVNUnitNames_mxShopFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesQueryScope.pkl')
processedData_JIRA_mxShopFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRA_mxShopFeaturesQueryScope.pkl')
processedData_JIRASummaries_mxShopFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_mxShopFeaturesQueryScope.pkl')
processedData_JIRADescriptions_mxShopFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesQueryScope.pkl')
#processedData_JIRAComments_mxShopFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_mxShopFeaturesQueryScope.pkl')


processedData_SVN_mxShopFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVN_mxShopFeaturesSCS.pkl')
processedData_SVNLogs_mxShopFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_mxShopFeaturesSCS.pkl')
processedData_SVNUnitNames_mxShopFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesSCS.pkl')
processedData_JIRA_mxShopFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRA_mxShopFeaturesSCS.pkl')
processedData_JIRASummaries_mxShopFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_mxShopFeaturesSCS.pkl')
processedData_JIRADescriptions_mxShopFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesSCS.pkl')
#processedData_JIRAComments_mxShopFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_mxShopFeaturesSCS.pkl')


processedData_SVN_mxShopFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVN_mxShopFeaturesSCQ.pkl')
processedData_SVNLogs_mxShopFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_mxShopFeaturesSCQ.pkl')
processedData_SVNUnitNames_mxShopFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesSCQ.pkl')
processedData_JIRA_mxShopFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRA_mxShopFeaturesSCQ.pkl')
processedData_JIRASummaries_mxShopFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_mxShopFeaturesSCQ.pkl')
processedData_JIRADescriptions_mxShopFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesSCQ.pkl')
#processedData_JIRAComments_mxShopFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_mxShopFeaturesSCQ.pkl')


#processedData_SVN_mxShopFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVN_mxShopFeaturesPMI.pkl')
processedData_SVNLogs_mxShopFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_mxShopFeaturesPMI.pkl')
#processedData_SVNUnitNames_mxShopFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_mxShopFeaturesPMI.pkl')
#processedData_JIRA_mxShopFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRA_mxShopFeaturesPMI.pkl')
processedData_JIRASummaries_mxShopFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_mxShopFeaturesPMI.pkl')
#processedData_JIRADescriptions_mxShopFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_mxShopFeaturesPMI.pkl')
#processedData_JIRAComments_mxShopFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_mxShopFeaturesPMI.pkl')


################################## Drop query array for normalization ###############################################


processedData_SVN_mxShopFeaturesIDF.drop('SvnAsQuery_IDF', axis = 1, inplace=True)
processedData_SVNLogs_mxShopFeaturesIDF.drop('SvnLogsAsQuery_IDF', axis = 1, inplace=True)
processedData_SVNUnitNames_mxShopFeaturesIDF.drop('SvnUnitNamesAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRA_mxShopFeaturesIDF.drop('JiraAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRASummaries_mxShopFeaturesIDF.drop('JiraSummariesAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRADescriptions_mxShopFeaturesIDF.drop('JiraDescriptionsAsQuery_IDF', axis = 1, inplace=True)
#processedData_JIRAComments_mxShopFeaturesIDF.drop('JiraCommentsAsQuery_IDF', axis = 1, inplace=True)

processedData_SVN_mxShopFeaturesICTF.drop('SvnAsQuery_ICTF', axis = 1, inplace=True)
processedData_SVNLogs_mxShopFeaturesICTF.drop('SvnLogsAsQuery_ICTF', axis = 1, inplace=True)
processedData_SVNUnitNames_mxShopFeaturesICTF.drop('SvnUnitNamesAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRA_mxShopFeaturesICTF.drop('JiraAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRASummaries_mxShopFeaturesICTF.drop('JiraSummariesAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRADescriptions_mxShopFeaturesICTF.drop('JiraDescriptionsAsQuery_ICTF', axis = 1, inplace=True)
#processedData_JIRAComments_mxShopFeaturesICTF.drop('JiraCommentsAsQuery_ICTF', axis = 1, inplace=True)

processedData_SVN_mxShopFeaturesEntropy.drop('SvnAsQuery_Entropy', axis = 1, inplace=True)
processedData_SVNLogs_mxShopFeaturesEntropy.drop('SvnLogsAsQuery_Entropy', axis = 1, inplace=True)
processedData_SVNUnitNames_mxShopFeaturesEntropy.drop('SvnUnitNamesAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRA_mxShopFeaturesEntropy.drop('JiraAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRASummaries_mxShopFeaturesEntropy.drop('JiraSummariesAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRADescriptions_mxShopFeaturesEntropy.drop('JiraDescriptionsAsQuery_Entropy', axis = 1, inplace=True)
#processedData_JIRAComments_mxShopFeaturesEntropy.drop('JiraCommentsAsQuery_Entropy', axis = 1, inplace=True)

processedData_SVN_mxShopFeaturesSCQ.drop('SvnAsQuery_SCQ', axis = 1, inplace=True)
processedData_SVNLogs_mxShopFeaturesSCQ.drop('SvnLogsAsQuery_SCQ', axis = 1, inplace=True)
processedData_SVNUnitNames_mxShopFeaturesSCQ.drop('SvnUnitNamesAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRA_mxShopFeaturesSCQ.drop('JiraAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRASummaries_mxShopFeaturesSCQ.drop('JiraSummariesAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRADescriptions_mxShopFeaturesSCQ.drop('JiraDescriptionsAsQuery_SCQ', axis = 1, inplace=True)
#processedData_JIRAComments_mxShopFeaturesSCQ.drop('JiraCommentsAsQuery_SCQ', axis = 1, inplace=True)

################################## Normalizing ################################################

processedData_mxShopFeaturesTime_normalized = normalizeData(processedData_mxShopFeaturesTime)
processedData_mxShopFeaturesStakeholder_normalized = normalizeData(processedData_mxShopFeaturesStakeholder)

#Load IR-Related Features - unigram
processedData_mxShop_features_VsmLogsJiraAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmLogsJiraAsQuery)
processedData_mxShop_features_VsmLogsLogAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmLogsLogAsQuery)
processedData_mxShop_features_VsmUnitNamesJiraAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesJiraAsQuery)
processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery)
#processedData_mxShop_features_VsmUnitNamesCommentsCommentsAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesCommentsCommentsAsQuery)
#processedData_mxShop_features_VsmUnitNamesCommentsUnitNamesAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesCommentsUnitNamesAsQuery)
processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery)
processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery)

#processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery)
#processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery)
processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery)
processedData_mxShop_features_VsmSummaryLogsLogsAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSummaryLogsLogsAsQuery)
processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery)
processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery)
processedData_mxShop_features_VsmDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmDescriptionDescriptionAsQuery)
processedData_mxShop_features_VsmDescriptionLogsAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmDescriptionLogsAsQuery)
#processedData_mxShop_features_VsmCommentsCommentsAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmCommentsCommentsAsQuery)
#processedData_mxShop_features_VsmCommentsLogsAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmCommentsLogsAsQuery)

processedData_mxShop_features_VsmSvnJiraJiraAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnJiraJiraAsQuery)
processedData_mxShop_features_VsmSvnJiraSvnAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnJiraSvnAsQuery)
processedData_mxShop_features_VsmSvnSummarySvnAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnSummarySvnAsQuery)
processedData_mxShop_features_VsmSvnSummarySummaryAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnSummarySummaryAsQuery)
processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery)
processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery)
#processedData_mxShop_features_VsmSvnCommentsSvnAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnCommentsSvnAsQuery)
#processedData_mxShop_features_VsmSvnCommentsCommentsAsQuery_normalized = normalizeData(processedData_mxShop_features_VsmSvnCommentsCommentsAsQuery)



#Load IR-Related Features - bigram
#processedData_mxShop_features_VsmLogsJiraAsQuery_2gram_normalized = normalizeData(processedData_mxShop_features_VsmLogsJiraAsQuery_2gram)
#processedData_mxShop_features_VsmLogsLogAsQuery_2gram_normalized = normalizeData(processedData_mxShop_features_VsmLogsLogAsQuery_2gram)
#processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram)
#processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram_normalized = normalizeData(processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram)
#processedData_mxShop_features_VsmCommentsLogsAsQuery_2gram_normalized = normalizeData(processedData_mxShop_features_VsmCommentsLogsAsQuery_2gram)
#processedData_mxShop_features_VsmCommentsCommentsAsQuery_2gram_normalized = normalizeData(processedData_mxShop_features_VsmCommentsCommentsAsQuery_2gram)


#Load Document Statistics Features
processedData_JIRA_mxShopFeaturesUniqueWordCount_normalized = normalizeData(processedData_JIRA_mxShopFeaturesUniqueWordCount)
processedData_SVN_mxShopFeaturesUniqueWordCount_normalized = normalizeData(processedData_SVN_mxShopFeaturesUniqueWordCount)
processedData_JIRA_mxShopFeaturesTotalWordCount_normalized = normalizeData(processedData_JIRA_mxShopFeaturesTotalWordCount)
processedData_SVN_mxShopFeaturesTotalWordCount_normalized = normalizeData(processedData_SVN_mxShopFeaturesTotalWordCount)
processedData_JIRA_mxShopFeaturesOverlapPercentage_normalized = normalizeData(processedData_JIRA_mxShopFeaturesOverlapPercentage)
processedData_SVN_mxShopFeaturesOverlapPercentage_normalized = normalizeData(processedData_SVN_mxShopFeaturesOverlapPercentage)
processedData_UNION_mxShopFeaturesOverlapPercentage_normalized = normalizeData(processedData_UNION_mxShopFeaturesOverlapPercentage)

#Load Query Quality Features
processedData_SVN_mxShopFeaturesIDF_normalized = normalizeData(processedData_SVN_mxShopFeaturesIDF)
processedData_SVNLogs_mxShopFeaturesIDF_normalized = normalizeData(processedData_SVNLogs_mxShopFeaturesIDF)
processedData_SVNUnitNames_mxShopFeaturesIDF_normalized = normalizeData(processedData_SVNUnitNames_mxShopFeaturesIDF)
processedData_JIRA_mxShopFeaturesIDF_normalized = normalizeData(processedData_JIRA_mxShopFeaturesIDF)
processedData_JIRASummaries_mxShopFeaturesIDF_normalized = normalizeData(processedData_JIRASummaries_mxShopFeaturesIDF)
processedData_JIRADescriptions_mxShopFeaturesIDF_normalized = normalizeData(processedData_JIRADescriptions_mxShopFeaturesIDF)
#processedData_JIRAComments_mxShopFeaturesIDF_normalized = normalizeData(processedData_JIRAComments_mxShopFeaturesIDF)

processedData_SVN_mxShopFeaturesICTF_normalized = normalizeData(processedData_SVN_mxShopFeaturesICTF)
processedData_SVNLogs_mxShopFeaturesICTF_normalized = normalizeData(processedData_SVNLogs_mxShopFeaturesICTF)
processedData_SVNUnitNames_mxShopFeaturesICTF_normalized = normalizeData(processedData_SVNUnitNames_mxShopFeaturesICTF)
processedData_JIRA_mxShopFeaturesICTF_normalized = normalizeData(processedData_JIRA_mxShopFeaturesICTF)
processedData_JIRASummaries_mxShopFeaturesICTF_normalized = normalizeData(processedData_JIRASummaries_mxShopFeaturesICTF)
processedData_JIRADescriptions_mxShopFeaturesICTF_normalized = normalizeData(processedData_JIRADescriptions_mxShopFeaturesICTF)
#processedData_JIRAComments_mxShopFeaturesICTF_normalized = normalizeData(processedData_JIRAComments_mxShopFeaturesICTF)

processedData_SVN_mxShopFeaturesEntropy_normalized = normalizeData(processedData_SVN_mxShopFeaturesEntropy)
processedData_SVNLogs_mxShopFeaturesEntropy_normalized = normalizeData(processedData_SVNLogs_mxShopFeaturesEntropy)
processedData_SVNUnitNames_mxShopFeaturesEntropy_normalized = normalizeData(processedData_SVNUnitNames_mxShopFeaturesEntropy)
processedData_JIRA_mxShopFeaturesEntropy_normalized = normalizeData(processedData_JIRA_mxShopFeaturesEntropy)
processedData_JIRASummaries_mxShopFeaturesEntropy_normalized = normalizeData(processedData_JIRASummaries_mxShopFeaturesEntropy)
processedData_JIRADescriptions_mxShopFeaturesEntropy_normalized = normalizeData(processedData_JIRADescriptions_mxShopFeaturesEntropy)
#processedData_JIRAComments_mxShopFeaturesEntropy_normalized = normalizeData(processedData_JIRAComments_mxShopFeaturesEntropy)

processedData_SVN_mxShopFeaturesQueryScope_normalized = normalizeData(processedData_SVN_mxShopFeaturesQueryScope)
processedData_SVNLogs_mxShopFeaturesQueryScope_normalized = normalizeData(processedData_SVNLogs_mxShopFeaturesQueryScope)
processedData_SVNUnitNames_mxShopFeaturesQueryScope_normalized = normalizeData(processedData_SVNUnitNames_mxShopFeaturesQueryScope)
processedData_JIRA_mxShopFeaturesQueryScope_normalized = normalizeData(processedData_JIRA_mxShopFeaturesQueryScope)
processedData_JIRASummaries_mxShopFeaturesQueryScope_normalized = normalizeData(processedData_JIRASummaries_mxShopFeaturesQueryScope)
processedData_JIRADescriptions_mxShopFeaturesQueryScope_normalized = normalizeData(processedData_JIRADescriptions_mxShopFeaturesQueryScope)
#processedData_JIRAComments_mxShopFeaturesQueryScope_normalized = normalizeData(processedData_JIRAComments_mxShopFeaturesQueryScope)

processedData_SVN_mxShopFeaturesSCS_normalized = normalizeData(processedData_SVN_mxShopFeaturesSCS)
processedData_SVNLogs_mxShopFeaturesSCS_normalized = normalizeData(processedData_SVNLogs_mxShopFeaturesSCS)
processedData_SVNUnitNames_mxShopFeaturesSCS_normalized = normalizeData(processedData_SVNUnitNames_mxShopFeaturesSCS)
processedData_JIRA_mxShopFeaturesSCS_normalized = normalizeData(processedData_JIRA_mxShopFeaturesSCS)
processedData_JIRASummaries_mxShopFeaturesSCS_normalized = normalizeData(processedData_JIRASummaries_mxShopFeaturesSCS)
processedData_JIRADescriptions_mxShopFeaturesSCS_normalized = normalizeData(processedData_JIRADescriptions_mxShopFeaturesSCS)
#processedData_JIRAComments_mxShopFeaturesSCS_normalized = normalizeData(processedData_JIRAComments_mxShopFeaturesSCS)

processedData_SVN_mxShopFeaturesSCQ_normalized = normalizeData(processedData_SVN_mxShopFeaturesSCQ)
processedData_SVNLogs_mxShopFeaturesSCQ_normalized = normalizeData(processedData_SVNLogs_mxShopFeaturesSCQ)
processedData_SVNUnitNames_mxShopFeaturesSCQ_normalized = normalizeData(processedData_SVNUnitNames_mxShopFeaturesSCQ)
processedData_JIRA_mxShopFeaturesSCQ_normalized = normalizeData(processedData_JIRA_mxShopFeaturesSCQ)
processedData_JIRASummaries_mxShopFeaturesSCQ_normalized = normalizeData(processedData_JIRASummaries_mxShopFeaturesSCQ)
processedData_JIRADescriptions_mxShopFeaturesSCQ_normalized = normalizeData(processedData_JIRADescriptions_mxShopFeaturesSCQ)
#processedData_JIRAComments_mxShopFeaturesSCQ_normalized = normalizeData(processedData_JIRAComments_mxShopFeaturesSCQ)

#processedData_SVN_mxShopFeaturesPMI_normalized = normalizeData(processedData_SVN_mxShopFeaturesPMI)
processedData_SVNLogs_mxShopFeaturesPMI_normalized = normalizeData(processedData_SVNLogs_mxShopFeaturesPMI)
#processedData_SVNUnitNames_mxShopFeaturesPMI_normalized = normalizeData(processedData_SVNUnitNames_mxShopFeaturesPMI)
#processedData_JIRA_mxShopFeaturesPMI_normalized = normalizeData(processedData_JIRA_mxShopFeaturesPMI)
processedData_JIRASummaries_mxShopFeaturesPMI_normalized = normalizeData(processedData_JIRASummaries_mxShopFeaturesPMI)
#processedData_JIRADescriptions_mxShopFeaturesPMI_normalized = normalizeData(processedData_JIRADescriptions_mxShopFeaturesPMI)
#processedData_JIRAComments_mxShopFeaturesPMI_normalized = normalizeData(processedData_JIRAComments_mxShopFeaturesPMI)


## 3.8 Preprocess Data - Load and transform feature families needed for training

In [4]:

#Merge features into 1 dataframe
processedData_mxShopFeatures_normalized = pd.concat([processedData_mxShopFeaturesTime_normalized,
                                                  processedData_mxShopFeaturesStakeholder_normalized,
                                                  #IR-based
                                                  processedData_mxShop_features_VsmLogsJiraAsQuery_normalized,
                                                  processedData_mxShop_features_VsmLogsLogAsQuery_normalized,
                                                  processedData_mxShop_features_VsmUnitNamesJiraAsQuery_normalized,
                                                  processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_normalized,
                                                #  processedData_mxShop_features_VsmUnitNamesCommentsCommentsAsQuery_normalized,
                                                #  processedData_mxShop_features_VsmUnitNamesCommentsUnitNamesAsQuery_normalized,
                                                  processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery_normalized,
                                                  processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSummaryLogsLogsAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery_normalized,
                                                  processedData_mxShop_features_VsmDescriptionDescriptionAsQuery_normalized,
                                                  processedData_mxShop_features_VsmDescriptionLogsAsQuery_normalized,
                                                 # processedData_mxShop_features_VsmCommentsCommentsAsQuery_normalized,
                                                #  processedData_mxShop_features_VsmCommentsLogsAsQuery_normalized,
                                                 # processedData_mxShop_features_VsmLogsJiraAsQuery_2gram_normalized,
                                                 # processedData_mxShop_features_VsmLogsLogAsQuery_2gram_normalized,
                                                 # processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram_normalized,
                                                 # processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram_normalized,
                                                  #processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery_normalized,
                                                 # processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSvnJiraJiraAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSvnJiraSvnAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSvnSummarySvnAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSvnSummarySummaryAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery_normalized,
                                                  processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery_normalized,
                                                #  processedData_mxShop_features_VsmSvnCommentsSvnAsQuery_normalized,
                                                #  processedData_mxShop_features_VsmSvnCommentsCommentsAsQuery_normalized,

                                                  
                                                  #Document Statistics
                                                  processedData_JIRA_mxShopFeaturesUniqueWordCount_normalized,
                                                  processedData_SVN_mxShopFeaturesUniqueWordCount_normalized,
                                                  processedData_JIRA_mxShopFeaturesTotalWordCount_normalized,
                                                  processedData_SVN_mxShopFeaturesTotalWordCount_normalized,
                                                  processedData_JIRA_mxShopFeaturesOverlapPercentage_normalized,
                                                  processedData_SVN_mxShopFeaturesOverlapPercentage_normalized,
                                                  processedData_UNION_mxShopFeaturesOverlapPercentage_normalized,
                                                 #Query Quality
                                                  processedData_SVN_mxShopFeaturesIDF_normalized['SvnAsQuery_avgIDF'],
                                                  processedData_SVN_mxShopFeaturesIDF_normalized['SvnAsQuery_maxIDF'],
                                                  processedData_SVN_mxShopFeaturesIDF_normalized['SvnAsQuery_devIDF'],
                                                  processedData_SVNLogs_mxShopFeaturesIDF_normalized['SvnLogsAsQuery_avgIDF'],
                                                  processedData_SVNLogs_mxShopFeaturesIDF_normalized['SvnLogsAsQuery_maxIDF'],
                                                  processedData_SVNLogs_mxShopFeaturesIDF_normalized['SvnLogsAsQuery_devIDF'],
                                                  processedData_SVNUnitNames_mxShopFeaturesIDF_normalized['SvnUnitNamesAsQuery_avgIDF'],
                                                  processedData_SVNUnitNames_mxShopFeaturesIDF_normalized['SvnUnitNamesAsQuery_maxIDF'],
                                                  processedData_SVNUnitNames_mxShopFeaturesIDF_normalized['SvnUnitNamesAsQuery_devIDF'],
                                                  processedData_JIRA_mxShopFeaturesIDF_normalized['JiraAsQuery_avgIDF'],
                                                  processedData_JIRA_mxShopFeaturesIDF_normalized['JiraAsQuery_maxIDF'],
                                                  processedData_JIRA_mxShopFeaturesIDF_normalized['JiraAsQuery_devIDF'],  
                                                  processedData_JIRASummaries_mxShopFeaturesIDF_normalized['JiraSummariesAsQuery_avgIDF'],
                                                  processedData_JIRASummaries_mxShopFeaturesIDF_normalized['JiraSummariesAsQuery_maxIDF'],
                                                  processedData_JIRASummaries_mxShopFeaturesIDF_normalized['JiraSummariesAsQuery_devIDF'],  
                                                  processedData_JIRADescriptions_mxShopFeaturesIDF_normalized['JiraDescriptionsAsQuery_avgIDF'],
                                                  processedData_JIRADescriptions_mxShopFeaturesIDF_normalized['JiraDescriptionsAsQuery_maxIDF'],
                                                  processedData_JIRADescriptions_mxShopFeaturesIDF_normalized['JiraDescriptionsAsQuery_devIDF'],  
                                                #  processedData_JIRAComments_mxShopFeaturesIDF_normalized['JiraCommentsAsQuery_avgIDF'],
                                                #  processedData_JIRAComments_mxShopFeaturesIDF_normalized['JiraCommentsAsQuery_maxIDF'],
                                                #  processedData_JIRAComments_mxShopFeaturesIDF_normalized['JiraCommentsAsQuery_devIDF'],  
                                                  
                                                  processedData_SVN_mxShopFeaturesICTF_normalized["SvnAsQuery_avgICTF"],
                                                  processedData_SVN_mxShopFeaturesICTF_normalized["SvnAsQuery_maxICTF"],
                                                  processedData_SVN_mxShopFeaturesICTF_normalized["SvnAsQuery_devICTF"],
                                                  processedData_SVNLogs_mxShopFeaturesICTF_normalized["SvnLogsAsQuery_avgICTF"],
                                                  processedData_SVNLogs_mxShopFeaturesICTF_normalized["SvnLogsAsQuery_maxICTF"],
                                                  processedData_SVNLogs_mxShopFeaturesICTF_normalized["SvnLogsAsQuery_devICTF"],
                                                  processedData_SVNUnitNames_mxShopFeaturesICTF_normalized["SvnUnitNamesAsQuery_avgICTF"],
                                                  processedData_SVNUnitNames_mxShopFeaturesICTF_normalized["SvnUnitNamesAsQuery_maxICTF"],
                                                  processedData_SVNUnitNames_mxShopFeaturesICTF_normalized["SvnUnitNamesAsQuery_devICTF"],
                                                  processedData_JIRA_mxShopFeaturesICTF_normalized["JiraAsQuery_avgICTF"],
                                                  processedData_JIRA_mxShopFeaturesICTF_normalized["JiraAsQuery_maxICTF"],
                                                  processedData_JIRA_mxShopFeaturesICTF_normalized["JiraAsQuery_devICTF"],
                                                  processedData_JIRASummaries_mxShopFeaturesICTF_normalized["JiraSummariesAsQuery_avgICTF"],
                                                  processedData_JIRASummaries_mxShopFeaturesICTF_normalized["JiraSummariesAsQuery_maxICTF"],
                                                  processedData_JIRASummaries_mxShopFeaturesICTF_normalized["JiraSummariesAsQuery_devICTF"],
                                                  processedData_JIRADescriptions_mxShopFeaturesICTF_normalized["JiraDescriptionsAsQuery_avgICTF"],
                                                  processedData_JIRADescriptions_mxShopFeaturesICTF_normalized["JiraDescriptionsAsQuery_maxICTF"],
                                                  processedData_JIRADescriptions_mxShopFeaturesICTF_normalized["JiraDescriptionsAsQuery_devICTF"],
                                              #    processedData_JIRAComments_mxShopFeaturesICTF_normalized["JiraCommentsAsQuery_avgICTF"],
                                              #    processedData_JIRAComments_mxShopFeaturesICTF_normalized["JiraCommentsAsQuery_maxICTF"],
                                              #    processedData_JIRAComments_mxShopFeaturesICTF_normalized["JiraCommentsAsQuery_devICTF"],
                                                  
                                                  processedData_SVN_mxShopFeaturesEntropy_normalized["SvnAsQuery_avgEntropy"],
                                                  processedData_SVN_mxShopFeaturesEntropy_normalized["SvnAsQuery_medEntropy"],
                                                  processedData_SVN_mxShopFeaturesEntropy_normalized["SvnAsQuery_maxEntropy"],
                                                  processedData_SVN_mxShopFeaturesEntropy_normalized["SvnAsQuery_devEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy_normalized["SvnLogsAsQuery_avgEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy_normalized["SvnLogsAsQuery_medEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy_normalized["SvnLogsAsQuery_maxEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy_normalized["SvnLogsAsQuery_devEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy_normalized["SvnUnitNamesAsQuery_avgEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy_normalized["SvnUnitNamesAsQuery_medEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy_normalized["SvnUnitNamesAsQuery_maxEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy_normalized["SvnUnitNamesAsQuery_devEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy_normalized["JiraAsQuery_avgEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy_normalized["JiraAsQuery_medEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy_normalized["JiraAsQuery_maxEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy_normalized["JiraAsQuery_devEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy_normalized["JiraSummariesAsQuery_avgEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy_normalized["JiraSummariesAsQuery_medEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy_normalized["JiraSummariesAsQuery_maxEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy_normalized["JiraSummariesAsQuery_devEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy_normalized["JiraDescriptionsAsQuery_avgEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy_normalized["JiraDescriptionsAsQuery_medEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy_normalized["JiraDescriptionsAsQuery_maxEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy_normalized["JiraDescriptionsAsQuery_devEntropy"],
                                               #   processedData_JIRAComments_mxShopFeaturesEntropy_normalized["JiraCommentsAsQuery_avgEntropy"],
                                               #   processedData_JIRAComments_mxShopFeaturesEntropy_normalized["JiraCommentsAsQuery_medEntropy"],
                                               #   processedData_JIRAComments_mxShopFeaturesEntropy_normalized["JiraCommentsAsQuery_maxEntropy"],
                                               #   processedData_JIRAComments_mxShopFeaturesEntropy_normalized["JiraCommentsAsQuery_devEntropy"],
                                                  
                                                  processedData_SVN_mxShopFeaturesQueryScope_normalized,
                                                  processedData_SVNLogs_mxShopFeaturesQueryScope_normalized,
                                                  processedData_SVNUnitNames_mxShopFeaturesQueryScope_normalized,
                                                  processedData_JIRA_mxShopFeaturesQueryScope_normalized,
                                                  processedData_JIRASummaries_mxShopFeaturesQueryScope_normalized,
                                                  processedData_JIRADescriptions_mxShopFeaturesQueryScope_normalized,
                                                #  processedData_JIRAComments_mxShopFeaturesQueryScope_normalized,
                                                  
                                                  processedData_SVN_mxShopFeaturesSCS_normalized,
                                                  processedData_SVNLogs_mxShopFeaturesSCS_normalized,
                                                  processedData_SVNUnitNames_mxShopFeaturesSCS_normalized,
                                                  processedData_JIRA_mxShopFeaturesSCS_normalized,
                                                  processedData_JIRASummaries_mxShopFeaturesSCS_normalized,
                                                  processedData_JIRADescriptions_mxShopFeaturesSCS_normalized,
                                                #  processedData_JIRAComments_mxShopFeaturesSCS_normalized,
                                                  
                                                  processedData_SVN_mxShopFeaturesSCQ_normalized["SvnAsQuery_avgSCQ"],
                                                  processedData_SVN_mxShopFeaturesSCQ_normalized["SvnAsQuery_maxSCQ"],
                                                  processedData_SVN_mxShopFeaturesSCQ_normalized["SvnAsQuery_sumSCQ"],
                                                  processedData_SVNLogs_mxShopFeaturesSCQ_normalized["SvnLogsAsQuery_avgSCQ"],
                                                  processedData_SVNLogs_mxShopFeaturesSCQ_normalized["SvnLogsAsQuery_maxSCQ"],
                                                  processedData_SVNLogs_mxShopFeaturesSCQ_normalized["SvnLogsAsQuery_sumSCQ"],
                                                  processedData_SVNUnitNames_mxShopFeaturesSCQ_normalized["SvnUnitNamesAsQuery_avgSCQ"],
                                                  processedData_SVNUnitNames_mxShopFeaturesSCQ_normalized["SvnUnitNamesAsQuery_maxSCQ"],
                                                  processedData_SVNUnitNames_mxShopFeaturesSCQ_normalized["SvnUnitNamesAsQuery_sumSCQ"],
                                                  processedData_JIRA_mxShopFeaturesSCQ_normalized["JiraAsQuery_avgSCQ"],
                                                  processedData_JIRA_mxShopFeaturesSCQ_normalized["JiraAsQuery_maxSCQ"],
                                                  processedData_JIRA_mxShopFeaturesSCQ_normalized["JiraAsQuery_sumSCQ"],
                                                  processedData_JIRASummaries_mxShopFeaturesSCQ_normalized["JiraSummariesAsQuery_avgSCQ"],
                                                  processedData_JIRASummaries_mxShopFeaturesSCQ_normalized["JiraSummariesAsQuery_maxSCQ"],
                                                  processedData_JIRASummaries_mxShopFeaturesSCQ_normalized["JiraSummariesAsQuery_sumSCQ"],
                                                  processedData_JIRADescriptions_mxShopFeaturesSCQ_normalized["JiraDescriptionsAsQuery_avgSCQ"],
                                                  processedData_JIRADescriptions_mxShopFeaturesSCQ_normalized["JiraDescriptionsAsQuery_maxSCQ"],
                                                  processedData_JIRADescriptions_mxShopFeaturesSCQ_normalized["JiraDescriptionsAsQuery_sumSCQ"],
                                                 # processedData_JIRAComments_mxShopFeaturesSCQ_normalized["JiraCommentsAsQuery_avgSCQ"],
                                                #  processedData_JIRAComments_mxShopFeaturesSCQ_normalized["JiraCommentsAsQuery_maxSCQ"],
                                                #  processedData_JIRAComments_mxShopFeaturesSCQ_normalized["JiraCommentsAsQuery_sumSCQ"],
                                                  
                                                 # processedData_SVN_mxShopFeaturesPMI_normalized["SvnAsQuery_avgPMI"],
                                                 # processedData_SVN_mxShopFeaturesPMI_normalized["SvnAsQuery_maxPMI"],
                                                  processedData_SVNLogs_mxShopFeaturesPMI_normalized["SvnLogsAsQuery_avgPMI"],
                                                  processedData_SVNLogs_mxShopFeaturesPMI_normalized["SvnLogsAsQuery_maxPMI"],
                                                 # processedData_SVNUnitNames_mxShopFeaturesPMI_normalized["SvnUnitNamesAsQuery_avgPMI"],
                                                 # processedData_SVNUnitNames_mxShopFeaturesPMI_normalized["SvnUnitNamesAsQuery_maxPMI"],
                                                 # processedData_JIRA_mxShopFeaturesPMI_normalized["JiraAsQuery_avgPMI"],
                                                 # processedData_JIRA_mxShopFeaturesPMI_normalized["JiraAsQuery_maxPMI"],
                                                  processedData_JIRASummaries_mxShopFeaturesPMI_normalized["JiraSummariesAsQuery_avgPMI"],
                                                  processedData_JIRASummaries_mxShopFeaturesPMI_normalized["JiraSummariesAsQuery_maxPMI"],
                                                 # processedData_JIRADescriptions_mxShopFeaturesPMI_normalized["JiraDescriptionsAsQuery_avgPMI"],
                                                 # processedData_JIRADescriptions_mxShopFeaturesPMI_normalized["JiraDescriptionsAsQuery_maxPMI"],
                                                #  processedData_JIRAComments_mxShopFeaturesPMI_normalized["JiraCommentsAsQuery_avgPMI"],
                                               #   processedData_JIRAComments_mxShopFeaturesPMI_normalized["JiraCommentssAsQuery_maxPMI"],                                                  
                                                 ], axis=1)
#Set the NaN to 0
processedData_mxShopFeatures_normalized = processedData_mxShopFeatures_normalized.fillna(0)

#Saving feature names for later use
processedData_mxShopFeatureNames_normalized = list(processedData_mxShopFeatures_normalized.columns)

#Transform pandas data frame into numpy arrays
processedData_mxShopFeatures_normalized = np.array(processedData_mxShopFeatures_normalized)

#Load labels
processedData_mxShopLabels_normalized = pd.read_pickle(r'../data/03_processed/processedData_mxShopLabels.pkl')
processedData_mxShopLabels_normalized = np.array(processedData_mxShopLabels_normalized["is_valid"])


In [5]:

#Merge features into 1 dataframe
processedData_mxShopFeatures = pd.concat([processedData_mxShopFeaturesTime,
                                                  processedData_mxShopFeaturesStakeholder,
                                                  #IR-based
                                                  processedData_mxShop_features_VsmLogsJiraAsQuery,
                                                  processedData_mxShop_features_VsmLogsLogAsQuery,
                                                  processedData_mxShop_features_VsmUnitNamesJiraAsQuery,
                                                  processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery,
                                                 # processedData_mxShop_features_VsmUnitNamesCommentsCommentsAsQuery,
                                                 # processedData_mxShop_features_VsmUnitNamesCommentsUnitNamesAsQuery,
                                                  processedData_mxShop_features_VsmUnitNamesDescriptionDescriptionAsQuery,
                                                  processedData_mxShop_features_VsmUnitNamesDescriptionUnitNamesAsQuery,
                                                  processedData_mxShop_features_VsmSummaryLogsSummaryAsQuery,
                                                  processedData_mxShop_features_VsmSummaryLogsLogsAsQuery,
                                                  processedData_mxShop_features_VsmSummaryUnitNamesSummaryAsQuery,
                                                  processedData_mxShop_features_VsmSummaryUnitNamesUnitNamesAsQuery,
                                                  processedData_mxShop_features_VsmDescriptionDescriptionAsQuery,
                                                  processedData_mxShop_features_VsmDescriptionLogsAsQuery,
                                                 # processedData_mxShop_features_VsmLogsJiraAsQuery_2gram,
                                                 # processedData_mxShop_features_VsmLogsLogAsQuery_2gram,
                                                 # processedData_mxShop_features_VsmUnitNamesJiraAsQuery_2gram,
                                                 # processedData_mxShop_features_VsmUnitNamesUnitNamesAsQuery_2gram,
                                                 # processedData_mxShop_features_VsmVerbPruningUnitNamesJiraAsQuery,
                                                 # processedData_mxShop_features_VsmVerbPruningUnitNamesUnitNamesAsQuery,
                                                  processedData_mxShop_features_VsmSvnJiraJiraAsQuery,
                                                  processedData_mxShop_features_VsmSvnJiraSvnAsQuery,
                                                  processedData_mxShop_features_VsmSvnSummarySvnAsQuery,
                                                  processedData_mxShop_features_VsmSvnSummarySummaryAsQuery,
                                                  processedData_mxShop_features_VsmSvnDescriptionSvnAsQuery,
                                                  processedData_mxShop_features_VsmSvnDescriptionDescriptionAsQuery,
                                                #  processedData_mxShop_features_VsmSvnCommentsSvnAsQuery,
                                                #  processedData_mxShop_features_VsmSvnCommentsCommentsAsQuery,

                                                  
                                                  #Document Statistics
                                                  processedData_JIRA_mxShopFeaturesUniqueWordCount,
                                                  processedData_SVN_mxShopFeaturesUniqueWordCount,
                                                  processedData_JIRA_mxShopFeaturesTotalWordCount,
                                                  processedData_SVN_mxShopFeaturesTotalWordCount,
                                                  processedData_JIRA_mxShopFeaturesOverlapPercentage,
                                                  processedData_SVN_mxShopFeaturesOverlapPercentage,
                                                  processedData_UNION_mxShopFeaturesOverlapPercentage,
                                                 #Query Quality
                                                  processedData_SVN_mxShopFeaturesIDF['SvnAsQuery_avgIDF'],
                                                  processedData_SVN_mxShopFeaturesIDF['SvnAsQuery_maxIDF'],
                                                  processedData_SVN_mxShopFeaturesIDF['SvnAsQuery_devIDF'],
                                                  processedData_SVNLogs_mxShopFeaturesIDF['SvnLogsAsQuery_avgIDF'],
                                                  processedData_SVNLogs_mxShopFeaturesIDF['SvnLogsAsQuery_maxIDF'],
                                                  processedData_SVNLogs_mxShopFeaturesIDF['SvnLogsAsQuery_devIDF'],
                                                  processedData_SVNUnitNames_mxShopFeaturesIDF['SvnUnitNamesAsQuery_avgIDF'],
                                                  processedData_SVNUnitNames_mxShopFeaturesIDF['SvnUnitNamesAsQuery_maxIDF'],
                                                  processedData_SVNUnitNames_mxShopFeaturesIDF['SvnUnitNamesAsQuery_devIDF'],
                                                  processedData_JIRA_mxShopFeaturesIDF['JiraAsQuery_avgIDF'],
                                                  processedData_JIRA_mxShopFeaturesIDF['JiraAsQuery_maxIDF'],
                                                  processedData_JIRA_mxShopFeaturesIDF['JiraAsQuery_devIDF'], 
                                                  processedData_JIRASummaries_mxShopFeaturesIDF['JiraSummariesAsQuery_avgIDF'],
                                                  processedData_JIRASummaries_mxShopFeaturesIDF['JiraSummariesAsQuery_maxIDF'],
                                                  processedData_JIRASummaries_mxShopFeaturesIDF['JiraSummariesAsQuery_devIDF'], 
                                                  processedData_JIRADescriptions_mxShopFeaturesIDF['JiraDescriptionsAsQuery_avgIDF'],
                                                  processedData_JIRADescriptions_mxShopFeaturesIDF['JiraDescriptionsAsQuery_maxIDF'],
                                                  processedData_JIRADescriptions_mxShopFeaturesIDF['JiraDescriptionsAsQuery_devIDF'], 
                                                #  processedData_JIRAComments_mxShopFeaturesIDF['JiraCommentsAsQuery_avgIDF'],
                                               #   processedData_JIRAComments_mxShopFeaturesIDF['JiraCommentsAsQuery_maxIDF'],
                                               #   processedData_JIRAComments_mxShopFeaturesIDF['JiraCommentsAsQuery_devIDF'], 
                                                  
                                                  processedData_SVN_mxShopFeaturesICTF["SvnAsQuery_avgICTF"],
                                                  processedData_SVN_mxShopFeaturesICTF["SvnAsQuery_maxICTF"],
                                                  processedData_SVN_mxShopFeaturesICTF["SvnAsQuery_devICTF"],
                                                  processedData_SVNLogs_mxShopFeaturesICTF["SvnLogsAsQuery_avgICTF"],
                                                  processedData_SVNLogs_mxShopFeaturesICTF["SvnLogsAsQuery_maxICTF"],
                                                  processedData_SVNLogs_mxShopFeaturesICTF["SvnLogsAsQuery_devICTF"],
                                                  processedData_SVNUnitNames_mxShopFeaturesICTF["SvnUnitNamesAsQuery_avgICTF"],
                                                  processedData_SVNUnitNames_mxShopFeaturesICTF["SvnUnitNamesAsQuery_maxICTF"],
                                                  processedData_SVNUnitNames_mxShopFeaturesICTF["SvnUnitNamesAsQuery_devICTF"],
                                                  processedData_JIRA_mxShopFeaturesICTF["JiraAsQuery_avgICTF"],
                                                  processedData_JIRA_mxShopFeaturesICTF["JiraAsQuery_maxICTF"],
                                                  processedData_JIRA_mxShopFeaturesICTF["JiraAsQuery_devICTF"],
                                                  processedData_JIRASummaries_mxShopFeaturesICTF["JiraSummariesAsQuery_avgICTF"],
                                                  processedData_JIRASummaries_mxShopFeaturesICTF["JiraSummariesAsQuery_maxICTF"],
                                                  processedData_JIRASummaries_mxShopFeaturesICTF["JiraSummariesAsQuery_devICTF"],
                                                  processedData_JIRADescriptions_mxShopFeaturesICTF["JiraDescriptionsAsQuery_avgICTF"],
                                                  processedData_JIRADescriptions_mxShopFeaturesICTF["JiraDescriptionsAsQuery_maxICTF"],
                                                  processedData_JIRADescriptions_mxShopFeaturesICTF["JiraDescriptionsAsQuery_devICTF"],
                                                 # processedData_JIRAComments_mxShopFeaturesICTF["JiraCommentsAsQuery_avgICTF"],
                                                #  processedData_JIRAComments_mxShopFeaturesICTF["JiraCommentsAsQuery_maxICTF"],
                                               #   processedData_JIRAComments_mxShopFeaturesICTF["JiraCommentsAsQuery_devICTF"],
                                                  
                                                  processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_avgEntropy"],
                                                  processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_medEntropy"],
                                                  processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_maxEntropy"],
                                                  processedData_SVN_mxShopFeaturesEntropy["SvnAsQuery_devEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_avgEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_medEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_maxEntropy"],
                                                  processedData_SVNLogs_mxShopFeaturesEntropy["SvnLogsAsQuery_devEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_avgEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_medEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_maxEntropy"],
                                                  processedData_SVNUnitNames_mxShopFeaturesEntropy["SvnUnitNamesAsQuery_devEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_avgEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_medEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_maxEntropy"],
                                                  processedData_JIRA_mxShopFeaturesEntropy["JiraAsQuery_devEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_avgEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_medEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_maxEntropy"],
                                                  processedData_JIRASummaries_mxShopFeaturesEntropy["JiraSummariesAsQuery_devEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_avgEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_medEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_maxEntropy"],
                                                  processedData_JIRADescriptions_mxShopFeaturesEntropy["JiraDescriptionsAsQuery_devEntropy"],
                                                #  processedData_JIRAComments_mxShopFeaturesEntropy["JiraCommentsAsQuery_avgEntropy"],
                                                 # processedData_JIRAComments_mxShopFeaturesEntropy["JiraCommentsAsQuery_medEntropy"],
                                                 # processedData_JIRAComments_mxShopFeaturesEntropy["JiraCommentsAsQuery_maxEntropy"],
                                                 # processedData_JIRAComments_mxShopFeaturesEntropy["JiraCommentsAsQuery_devEntropy"],
                                                  
                                                  processedData_SVN_mxShopFeaturesQueryScope,
                                                  processedData_SVNLogs_mxShopFeaturesQueryScope,
                                                  processedData_SVNUnitNames_mxShopFeaturesQueryScope,
                                                  processedData_JIRA_mxShopFeaturesQueryScope,
                                                  processedData_JIRASummaries_mxShopFeaturesQueryScope,
                                                  processedData_JIRADescriptions_mxShopFeaturesQueryScope,
                                                #  processedData_JIRAComments_mxShopFeaturesQueryScope,
                                                  
                                                  processedData_SVN_mxShopFeaturesSCS,
                                                  processedData_SVNLogs_mxShopFeaturesSCS,
                                                  processedData_SVNUnitNames_mxShopFeaturesSCS,
                                                  processedData_JIRA_mxShopFeaturesSCS,
                                                  processedData_JIRASummaries_mxShopFeaturesSCS,
                                                  processedData_JIRADescriptions_mxShopFeaturesSCS,
                                                 # processedData_JIRAComments_mxShopFeaturesSCS,
                                                  
                                                  processedData_SVN_mxShopFeaturesSCQ["SvnAsQuery_avgSCQ"],
                                                  processedData_SVN_mxShopFeaturesSCQ["SvnAsQuery_maxSCQ"],
                                                  processedData_SVN_mxShopFeaturesSCQ["SvnAsQuery_sumSCQ"],
                                                  processedData_SVNLogs_mxShopFeaturesSCQ["SvnLogsAsQuery_avgSCQ"],
                                                  processedData_SVNLogs_mxShopFeaturesSCQ["SvnLogsAsQuery_maxSCQ"],
                                                  processedData_SVNLogs_mxShopFeaturesSCQ["SvnLogsAsQuery_sumSCQ"],
                                                  processedData_SVNUnitNames_mxShopFeaturesSCQ["SvnUnitNamesAsQuery_avgSCQ"],
                                                  processedData_SVNUnitNames_mxShopFeaturesSCQ["SvnUnitNamesAsQuery_maxSCQ"],
                                                  processedData_SVNUnitNames_mxShopFeaturesSCQ["SvnUnitNamesAsQuery_sumSCQ"],
                                                  processedData_JIRA_mxShopFeaturesSCQ["JiraAsQuery_avgSCQ"],
                                                  processedData_JIRA_mxShopFeaturesSCQ["JiraAsQuery_maxSCQ"],
                                                  processedData_JIRA_mxShopFeaturesSCQ["JiraAsQuery_sumSCQ"],
                                                  processedData_JIRASummaries_mxShopFeaturesSCQ["JiraSummariesAsQuery_avgSCQ"],
                                                  processedData_JIRASummaries_mxShopFeaturesSCQ["JiraSummariesAsQuery_maxSCQ"],
                                                  processedData_JIRASummaries_mxShopFeaturesSCQ["JiraSummariesAsQuery_sumSCQ"],
                                                  processedData_JIRADescriptions_mxShopFeaturesSCQ["JiraDescriptionsAsQuery_avgSCQ"],
                                                  processedData_JIRADescriptions_mxShopFeaturesSCQ["JiraDescriptionsAsQuery_maxSCQ"],
                                                  processedData_JIRADescriptions_mxShopFeaturesSCQ["JiraDescriptionsAsQuery_sumSCQ"],
                                                 # processedData_JIRAComments_mxShopFeaturesSCQ["JiraCommentsAsQuery_avgSCQ"],
                                                 # processedData_JIRAComments_mxShopFeaturesSCQ["JiraCommentsAsQuery_maxSCQ"],
                                                 # processedData_JIRAComments_mxShopFeaturesSCQ["JiraCommentsAsQuery_sumSCQ"],
                                                  
                                                  #processedData_SVN_mxShopFeaturesPMI["SvnAsQuery_avgPMI"],
                                                  #processedData_SVN_mxShopFeaturesPMI["SvnAsQuery_maxPMI"],
                                                  processedData_SVNLogs_mxShopFeaturesPMI["SvnLogsAsQuery_avgPMI"],
                                                  processedData_SVNLogs_mxShopFeaturesPMI["SvnLogsAsQuery_maxPMI"],
                                                 # processedData_SVNUnitNames_mxShopFeaturesPMI["SvnUnitNamesAsQuery_avgPMI"],
                                                 # processedData_SVNUnitNames_mxShopFeaturesPMI["SvnUnitNamesAsQuery_maxPMI"],
                                                 # processedData_JIRA_mxShopFeaturesPMI["JiraAsQuery_avgPMI"],
                                                 # processedData_JIRA_mxShopFeaturesPMI["JiraAsQuery_maxPMI"],
                                                  processedData_JIRASummaries_mxShopFeaturesPMI["JiraSummariesAsQuery_avgPMI"],
                                                  processedData_JIRASummaries_mxShopFeaturesPMI["JiraSummariesAsQuery_maxPMI"],
                                                #  processedData_JIRADescriptions_mxShopFeaturesPMI["JiraDescriptionsAsQuery_avgPMI"],
                                                #  processedData_JIRADescriptions_mxShopFeaturesPMI["JiraDescriptionsAsQuery_maxPMI"],
                                                  #processedData_JIRAComments_mxShopFeaturesPMI["JiraCommentsAsQuery_avgPMI"],
                                                  #processedData_JIRAComments_mxShopFeaturesPMI["JiraCommentssAsQuery_maxPMI"],
                                                 ], axis=1)
#Set the NaN to 0
processedData_mxShopFeatures = processedData_mxShopFeatures.fillna(0)

#Saving feature names for later use
processedData_mxShopFeatureNames = list(processedData_mxShopFeatures.columns)

#Transform pandas data frame into numpy arrays
processedData_mxShopFeatures = np.array(processedData_mxShopFeatures)

#Load labels
processedData_mxShopLabels = pd.read_pickle(r'../data/03_processed/processedData_mxShopLabels.pkl')
processedData_mxShopLabels = np.array(processedData_mxShopLabels["is_valid"])


# 4. Modeling - Normalization
First select which data set to train:


In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve



def showModelPerformance(trainedModel, testFeatures, testLabels):
    # Use the forest's predict method on the test data
    predictionLabels = trainedModel.predict(testFeatures)
    
    accuracyValue = accuracy_score(testLabels.astype(bool), predictionLabels)
    precisionValue = precision_score(testLabels.astype(bool), predictionLabels, average='binary')
    f1Value = f1_score(testLabels.astype(bool), predictionLabels)
    f2Value = fbeta_score(testLabels.astype(bool), predictionLabels, beta=2.0)
    f05Value = fbeta_score(testLabels.astype(bool), predictionLabels, beta=0.5)
    recallValue = recall_score(testLabels.astype(bool), predictionLabels)
    averagePrecisionValue = average_precision_score(testLabels.astype(bool), predictionLabels)
          
    performanceData = {'Accuracy':  [accuracyValue],
                       'Precision': [precisionValue],
                       'Recall': [recallValue],
                       'F1': [f1Value],
                       'F2': [f2Value],
                       'F0.5': [f05Value],
                       'Average Precision': [averagePrecisionValue]
                      }
    performanceDf = pd.DataFrame(performanceData)
    return(performanceDf)

In [7]:
features_normalized = processedData_mxShopFeatures_normalized
labels_normalized = processedData_mxShopLabels_normalized

## 4.1 Rebalancing Strategy - None

### 4.1.1 Random Forests

In [111]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

none_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    none_randomforest_normalized_performance_df = pd.concat([none_randomforest_normalized_performance_df, new_performance_df])
    
none_randomforest_normalized_performance_df.to_csv("../data/05_model_output/none_randomforest_normalized_performance_df.csv")



### 4.1.2 XGBoost

In [112]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


none_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    none_xgboost_normalized_performance_df = pd.concat([none_xgboost_normalized_performance_df, new_performance_df])
    

none_xgboost_normalized_performance_df.to_csv("../data/05_model_output/none_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 42.650 seconds
Cross-validation score: 0.8341419037698655
Test score: 0.7692307692307693
Best Hyperparameters: {}
0.012228564
0.019170862
0.026953243
0.009952978
0.008801372
0.009699399
0.0011389386
0.0033750439
0.008673984
0.0
0.010198673
0.004564193
0.0
0.0
0.0075401184
0.0
0.03717987
0.0
0.008521208
0.0
0.0
0.028367655
0.008758058
0.0002223952
0.0
0.0
0.014965369
0.011723049
0.005344078
0.0004998682
0.005712581
0.0
0.0023494223
0.0004588325
0.008093835
0.0
0.0
0.0
0.008597786
0.02668507
0.0005602176
0.006347892
0.006112002
0.013015379
0.043249637
0.022014841
0.0045908173
0.0
0.0
0.0
0.0042753513
0.0066141896
0.00401093
0.004970064
0.0
0.0
0.005062527
0.0029188928
0.0025466953
0.028163878
0.09012046
0.01524282
0.0
0.00025091766
0.00066783297
0.003341364
0.011569835
0.0
0.003471371
0.009439608
0.0026332904
0.0
0.011789735
0.024387343
0.010277577
0.0
0.012582472
0.017902827
0.011028543
0.0
0.021434665
0.024097007
0.02349047
0.015579472
0.015984721
0.04



Elapsed time to compute best fit: 45.387 seconds
Cross-validation score: 0.6612160263042377
Test score: 0.7971014492753623
Best Hyperparameters: {}
0.010550992
0.0173457
0.02191594
0.004862453
0.016443752
0.012961326
0.007224841
0.011604335
0.0027283991
0.0
0.0023683377
0.008392187
0.0
0.0
0.015398836
0.0
0.015069008
0.005809098
0.007952512
0.0
0.009508084
0.023192804
0.027669823
0.0
0.0
0.0
0.010608269
0.012945175
0.0060088974
0.0025891683
0.0025414177
0.0087549
0.012710208
0.0029325094
0.0054501165
0.0
0.0
0.0
0.01907633
0.0303574
0.00089866424
0.0039161933
0.001426032
0.006237233
0.018492829
0.020592999
0.0034507266
0.0
0.0
0.0
0.0031988586
0.005475687
0.004523924
0.0057225255
0.0
0.0
0.0
0.0045942077
0.0040495563
0.050616927
0.07503493
0.013206641
0.0018101815
0.0013848457
0.0016782044
0.007608237
0.0017118109
0.0
0.014196197
0.0067736125
0.0035676332
0.049769666
0.008421653
0.016839167
0.0066485303
0.0
0.006237633
0.013944476
0.009118189
6.735275e-05
0.013905747
0.020069517
0.0068



Elapsed time to compute best fit: 43.257 seconds
Cross-validation score: 0.8325026050562985
Test score: 0.7547169811320755
Best Hyperparameters: {}
0.009003392
0.016323853
0.024962414
0.011971093
0.015089925
0.0074241445
0.017063564
0.010385249
0.011783623
0.008028326
0.015591399
0.0071042366
0.0
0.0
0.016379677
0.0
0.020037804
0.003239592
0.014929795
0.009378338
0.0064311214
0.040816657
0.0
0.0
0.0
0.0
0.0012841895
0.0
0.002648632
0.006355933
0.0032127358
0.006869059
0.019532163
0.0034083473
0.009486755
0.0
0.0
0.0
0.009301976
0.010609784
0.0019660196
0.0135606835
0.0016884361
0.009281832
0.014489251
0.024719514
0.00452188
0.0
0.0
0.0
0.005434637
0.009982665
0.005863155
0.004996587
0.0
0.0
0.0025524346
0.0039953575
0.011105046
0.027961679
0.07089165
0.016341092
0.0038222698
0.0
0.005022746
0.009196888
0.002209331
0.0
0.00717081
0.0064737955
0.009934136
0.057269473
0.004330831
0.0
0.004278482
0.0
0.0
0.0043033357
0.0065777204
0.0014887408
0.000599727
0.022587683
0.011626467
0.0
0.00835



Elapsed time to compute best fit: 45.188 seconds
Cross-validation score: 0.8158310449887919
Test score: 0.7317073170731707
Best Hyperparameters: {}
0.011125234
0.014020649
0.021344291
0.028206564
0.010332156
0.011456816
0.0006217239
0.0007767027
0.00029315075
0.002472238
0.03324229
0.0074401814
0.0
0.0
0.02674804
0.0
0.017620798
0.0008208221
0.0038854687
0.009448831
0.006486795
0.0
0.0023135038
0.0
0.0
0.0
0.019179007
0.00091538305
0.0052986997
0.019312927
0.0066827037
0.008246149
0.013883646
0.004502901
0.0040976787
0.0
0.0
0.0
0.016130958
0.029689172
0.020051928
0.007833417
0.0017124253
0.0031412335
0.025344223
0.020580277
0.0029055516
0.0
0.0
0.0
0.0032357113
0.0008191733
0.0053392826
0.003549958
0.0
0.028123558
0.0033660969
0.0
0.0048371004
0.028351527
0.06496283
0.0010045345
0.0014837548
0.0
0.0
0.01670592
0.0
0.0
0.0030248102
0.007733674
0.0002318904
0.0
0.01059629
0.031201057
0.0
0.0
0.014804638
0.017345892
0.009885111
0.0
0.028211419
0.014907254
0.008714273
0.012845662
0.008704



Elapsed time to compute best fit: 43.030 seconds
Cross-validation score: 0.7300437083045779
Test score: 0.8695652173913044
Best Hyperparameters: {}
0.01211472
0.020381404
0.028940106
0.0056606177
0.021113252
0.009666185
0.0059688324
0.0
0.009934168
0.015234734
0.008811954
0.009269865
0.0
0.0
0.012179034
0.0
0.014711999
0.0
0.011347707
0.0046165762
0.0
0.022449214
0.005391961
0.0
0.009136768
0.0
0.00814556
0.001679176
0.005820093
0.01911259
0.008730111
0.013854803
0.017293664
0.0036870616
0.0067263218
0.0
0.0
0.0
0.0037581238
0.01698125
0.005030531
0.015014482
0.0027007503
0.006972624
0.020497067
0.009625497
0.0011635287
0.0
0.0
0.0
0.010485126
0.0028258504
0.006792065
0.006968472
0.0
0.00033873526
0.0
0.002791823
0.009080712
0.044162326
0.0753794
0.005452938
0.0018711446
0.0
0.0
0.011766604
0.0
0.0
0.0030229478
0.011023517
0.0027372679
0.0
0.007822015
0.0
0.019110825
0.0
0.018225536
0.009215665
0.01130613
0.0039001927
0.0
0.032177947
0.024507789
0.004950484
0.0020159003
0.041644372
0.0



Elapsed time to compute best fit: 39.075 seconds
Cross-validation score: 0.7746226212676348
Test score: 0.7017543859649122
Best Hyperparameters: {}
0.008658035
0.018874666
0.022550307
0.0053999093
0.007876027
0.008297283
0.0049379035
0.0
0.0
0.020243235
0.0069214604
0.0059805787
0.012938502
0.0
0.027508482
0.0
0.0077244593
0.0
0.0054288823
0.007848156
0.012457435
0.008030687
0.0063105184
0.0
0.0
0.0
0.009578541
0.004047825
0.0043755593
0.0
0.010638586
0.011036479
0.004679921
0.001522556
0.005950683
0.0
0.0
0.0
0.0029458217
0.04804856
0.00899598
0.0054859845
0.03058131
0.0025272055
0.006541555
0.013585571
0.002032358
0.0
0.0
0.0
0.004295366
0.0048517366
0.0032115413
0.0062693125
0.0
0.0065036034
0.0045397053
0.013854858
0.0060548
0.028422177
0.060668845
0.0113762235
0.0041955416
0.0
0.001384743
0.005461734
0.0018103047
0.0
0.013425813
0.0055713393
0.019162683
0.0
0.009088417
0.034285404
0.012421733
0.0
0.01723191
0.010811589
0.0058481484
0.0006623114
0.003345061
0.012396528
0.011024162




Elapsed time to compute best fit: 41.816 seconds
Cross-validation score: 0.8040103915943584
Test score: 0.5737704918032787
Best Hyperparameters: {}
0.01478439
0.01868653
0.029357512
0.008204864
0.013506077
0.010734365
0.012415424
0.0
0.016003076
0.0
0.02318282
0.0066181235
0.0
0.0
0.012591824
0.0
0.01839143
0.0
0.0
0.010774457
0.0
0.028666995
0.00249246
0.0
0.0
0.0
0.0
0.006266033
0.00065287505
0.005111158
0.032010708
0.013673333
0.0030348473
0.0
0.005645075
0.0
0.0
0.0
0.007758796
0.033075005
0.0056986837
0.0042673727
0.0046503735
0.008238447
0.023696678
0.0009812128
0.0010039848
0.0
0.0
0.0
0.00377359
0.0056610648
0.0062753526
0.009016009
0.0061939587
0.0
0.009697615
0.005401705
0.008751186
0.029093126
0.080260746
0.01935077
0.0059155393
0.0
0.005017015
0.018384093
0.0
0.0
0.0
0.007557544
0.0014234277
0.016585479
0.0077911997
0.0
0.0
0.0
0.0
0.011156594
0.008172295
0.0
0.010127972
0.02601666
0.0067719147
0.027590321
0.0074146492
0.08043464
0.0021787644
0.0
0.0018290557
0.0
0.00988232



Elapsed time to compute best fit: 42.047 seconds
Cross-validation score: 0.7322131688891858
Test score: 0.7017543859649122
Best Hyperparameters: {}
0.012295647
0.017284079
0.020103116
0.003314327
0.015173593
0.0060303397
0.005296063
0.0
0.0
0.0
0.0048977765
0.008510091
0.021906054
0.0
0.013350504
0.0
0.006140752
0.0050420146
0.0063554524
0.040676005
0.0070619616
0.04844212
0.0015201742
0.01193672
0.031302847
0.0
0.009207697
0.014782674
0.012813505
0.007133014
0.0049680034
0.0125612635
0.007899394
0.0031191206
0.007606575
0.0
0.0
0.0
0.0029081905
0.031740688
0.0032231186
0.02756071
0.019297747
0.00713392
0.022624781
0.0142835695
0.0038181802
0.0
0.0
0.0
0.0033282344
0.0025985984
0.0044923113
0.0
0.0
0.02161863
0.0076349624
0.0
0.009699884
0.024016073
0.09189797
0.0062682386
0.0057602692
0.0003296506
0.001553378
0.005920414
0.0069744545
0.0
0.0051399968
0.004411718
0.009554505
0.01062572
0.008209712
0.0
0.0
0.0
0.0
0.0048375484
0.007606753
0.00085542514
0.0062654824
0.020075213
0.0091394



Elapsed time to compute best fit: 42.666 seconds
Cross-validation score: 0.7777552225584999
Test score: 0.8461538461538461
Best Hyperparameters: {}
0.010352809
0.01514303
0.031843614
0.0068449783
0.03106093
0.015779687
0.012176016
0.003864885
0.004865995
0.0
0.03599157
0.0061678113
0.0
0.0
0.016144749
0.0
0.009102401
0.0017549427
0.0035374246
0.010135564
0.0040404233
0.033175632
0.032010473
0.006697269
0.0
0.0
0.010673931
0.0
0.0016206389
0.0025234108
0.011670655
0.0063330424
0.013017224
0.0038407503
0.0051505845
0.0
0.0
0.0
0.0029368203
0.031308025
0.023632368
0.027164597
0.0102034025
0.013328801
0.018205652
0.007311018
0.0051753377
0.0
0.0
0.0
0.0031890257
0.0007878121
0.004141328
0.009998143
0.0
0.0
0.0137623865
0.0018643958
0.0010638549
0.029625399
0.06580569
0.00878427
0.003960582
0.0025548239
0.0
0.006879912
0.0061763753
0.0
0.002631038
0.007613806
0.022369757
0.008992025
0.0069477474
0.0
0.0051034526
0.0
0.03134311
0.0045742085
0.0139627885
0.0
0.0048550703
0.027391195
0.0089805



Elapsed time to compute best fit: 47.092 seconds
Cross-validation score: 0.7324036797499557
Test score: 0.8695652173913044
Best Hyperparameters: {}
0.010493156
0.01643008
0.0262185
0.013597927
0.012505095
0.005320818
0.0070627136
0.0011268076
0.008754879
0.0
0.014640837
0.0062982477
0.0
0.0
0.013498715
0.0
0.07123045
0.005720919
0.0
0.011442257
0.0048508374
0.020327352
0.006021073
0.0055451198
0.025023691
0.0003001139
0.009697591
0.0017745695
0.00113559
0.007788213
0.0002719309
0.006430195
0.0100052515
0.019256271
0.0037769487
0.0
0.0
0.0
0.008792576
0.02876882
0.0011243348
0.01328817
0.006070504
0.0075423038
0.018354971
0.006896194
0.00230652
0.0
0.0
0.0
0.0033214833
0.005109654
0.0050105653
0.0
0.0
0.0033826379
0.007262656
0.00021050128
0.0067146644
0.037421316
0.05018918
0.012130837
0.0030963733
0.03838454
0.00040441568
0.013808992
0.00058049907
0.0
0.03897402
0.005193815
0.008948113
0.02390014
0.0089642545
0.0
0.0
0.0
0.016090019
0.003914511
0.0058863303
0.00661938
0.005997262
0.01



Elapsed time to compute best fit: 43.207 seconds
Cross-validation score: 0.6628741027869987
Test score: 0.8904109589041096
Best Hyperparameters: {}
0.00951845
0.022327114
0.030138854
0.008453278
0.013251146
0.007850234
0.0
0.003785334
0.0
0.0
0.010387564
0.005045023
0.0
0.0
0.018672725
0.0
0.005443323
0.007097171
0.0
0.0076004695
0.0033145521
0.013404222
0.041243713
0.008693137
0.007844322
0.0
0.0
0.0
0.0069509004
0.002142909
0.0014847108
0.01520574
0.024528364
0.00083804264
0.009792321
0.0
0.0
0.0
0.0044960734
0.03251848
0.012065133
0.012198351
0.0
0.0058570304
0.013564979
0.026586974
0.0034465496
0.0
0.0
0.0
0.004395222
0.0019275832
0.006086932
0.0111108245
0.0
0.011095805
0.019503932
0.014476624
0.00781818
0.027131626
0.09141183
0.011426582
0.010988382
0.0020313172
0.0057068556
0.012594849
0.002848728
0.0
0.00053141586
0.0070068613
0.0019328373
0.0
0.0075451587
0.0
0.0068566375
0.0
0.025279587
0.007616758
0.008085877
0.0
0.0027213925
0.020151712
0.010271828
0.017715085
0.0015545319




Elapsed time to compute best fit: 41.820 seconds
Cross-validation score: 0.7225603807651493
Test score: 0.7377049180327868
Best Hyperparameters: {}
0.010526815
0.017015513
0.019705158
0.0
0.007802283
0.006611669
0.006338298
0.005565669
0.006678721
0.011450321
0.006061277
0.010400152
0.000959272
0.0
0.010168946
0.0
0.026656348
0.0014218271
0.0
0.0007600006
0.0
0.01607628
0.010574974
0.020040868
0.002651426
0.0006135309
0.024603978
0.027537514
0.0074701346
0.0044549084
0.0036926402
0.009437351
0.013405123
0.0047180406
0.0041139834
0.0
0.0
0.0
0.016706575
0.028168479
0.0004937653
0.011150004
0.0017511366
0.005360199
0.021382648
0.0022690028
0.0031399867
0.0
0.0
0.0
0.006242591
0.017083304
0.0036201486
0.005210423
0.0
0.0
0.0019953959
0.007926776
0.011433073
0.031193621
0.06587001
0.012662407
0.0021447262
0.0044105817
0.006134775
0.018164974
0.0020673082
0.0
0.0043431385
0.0076190536
0.00058356446
0.00363826
0.005863908
0.0
0.0016838483
0.0
0.023328358
0.0157647
0.008696952
0.005488141
0.0



Elapsed time to compute best fit: 43.553 seconds
Cross-validation score: 0.7640104666194713
Test score: 0.6122448979591837
Best Hyperparameters: {}
0.00855232
0.014194466
0.028739242
0.0005345571
0.01894187
0.0053196587
0.0
0.0
0.0066252793
0.020435456
0.0065246415
0.0039344644
0.0
0.0
0.030813836
0.0
0.013339701
0.0032883259
0.0038670774
0.0036989988
0.0
0.017667074
0.0037226274
0.009275878
0.016409703
0.0
0.007282186
0.0
0.009808915
0.008447783
0.017050179
0.0095872795
0.0084166555
0.0062340703
0.0026377325
0.0
0.0
0.0
0.0115352
0.02196796
0.0011125724
0.018623572
0.003935426
0.014860428
0.0012278163
0.014191483
0.0025873224
0.0
0.0
0.0
0.006453018
0.0
0.003685999
0.0
0.0
0.0059073945
0.01707485
0.004220167
0.00966819
0.03623565
0.11946038
0.005138703
0.006607486
0.038580004
0.0044893166
0.001869883
0.0
0.0021535573
0.004278402
0.011369959
0.0053940546
0.014451214
0.0074725836
0.00021274373
0.0047148885
0.0
0.0
0.0
0.0123447385
0.0
0.0
0.016212828
0.0037283266
0.0023965607
0.01102631



Elapsed time to compute best fit: 40.626 seconds
Cross-validation score: 0.7950156700359696
Test score: 0.5660377358490566
Best Hyperparameters: {}
0.011405657
0.014063366
0.023746984
0.0
0.011542176
0.004026883
0.015857972
0.0037674122
0.0
0.0
0.0071047544
0.008557441
0.0
0.0
0.016151013
0.0
0.08232335
0.0033881913
0.011295832
0.0
0.0012937438
0.019820748
0.008741267
0.0059407502
0.0
0.0
0.0
0.0027784428
0.027814608
0.0052388036
0.0
0.014714971
0.011037122
0.0027589067
0.0035327876
0.0
0.0
0.0
0.016305713
0.0031015016
0.0065376125
0.041012913
0.0069761416
0.021838494
0.015289097
0.003166955
0.0064001945
0.0
0.0
0.0
0.00444138
0.005200808
0.005712921
0.0
0.0
0.0
0.0017344016
0.0
0.010245814
0.020946356
0.0824195
0.008062438
0.0
0.0
0.0030269164
0.012837352
0.0010044551
0.0
0.0
0.0082576135
0.0
0.0
0.018266438
0.0019030939
0.010058427
0.0
0.018649422
0.009122668
0.010501803
0.0038198712
0.016943239
0.019951764
0.00782381
0.019931871
0.0023706052
0.11169518
0.0023608562
0.0
0.011040601
0



Elapsed time to compute best fit: 41.105 seconds
Cross-validation score: 0.8116773030370137
Test score: 0.7971014492753623
Best Hyperparameters: {}
0.011667388
0.01535947
0.028241863
0.0074495305
0.013140726
0.006434247
0.008713411
0.0017307419
0.0027609048
0.0
0.01922621
0.007531581
0.0
0.0
0.010493385
0.0
0.017268334
0.0022023094
0.007157785
0.0
0.0
0.023218049
0.0022134269
0.014118781
0.010435532
0.0
0.0
0.0055991076
0.0077226893
0.0045445203
0.0033471445
0.0
0.0027117934
0.0067736492
0.0011606396
0.0
0.0
0.0
0.046729103
0.019279847
0.011298131
0.020757223
0.00011655003
0.006585732
0.013585857
0.022687446
0.005329133
0.0
0.0
0.0
0.0033527517
0.0006277848
0.005024328
0.0056089764
0.0
0.0
0.011726756
0.0012067604
0.0022250107
0.0189547
0.077033445
0.012438064
0.00020025049
0.033059105
0.0
0.01043499
0.0073436713
0.0
0.0073627234
0.0074374154
0.017755982
0.010166383
0.009087982
0.0
0.013194274
0.0
0.016086195
0.005846717
0.009570783
0.0008787535
0.0
0.023279544
0.016838446
0.0012215547



Elapsed time to compute best fit: 43.404 seconds
Cross-validation score: 0.7200654911033592
Test score: 0.7142857142857142
Best Hyperparameters: {}
0.008231029
0.013994952
0.029149478
0.017309818
0.022205494
0.0041412283
0.0
0.0
0.010747785
0.0
0.015371211
0.007902787
0.0
0.0
0.0059217177
0.0
0.027227473
0.0
0.003046976
0.0
0.0053060423
0.0272896
0.0009879341
0.0005932183
0.017575417
0.0
0.0
0.0036145365
0.0046985694
0.00058735395
0.0
0.011725652
0.011021469
0.0068467516
0.0019229448
0.0
0.0
0.0
0.015512116
0.013309163
0.0006990512
0.008216397
0.013545046
0.00965277
0.011136163
0.02894397
0.0069016237
0.0
0.0
0.0
0.005358437
0.003469083
0.0057936534
0.0
0.0
0.007495089
0.008540378
0.0021918488
0.0060166074
0.057656407
0.08020988
0.009428467
0.018320223
0.0012032302
0.0018410595
0.014878135
0.0038609093
0.0
0.013432366
0.010658835
0.0073628644
0.0009701897
0.013407317
0.0
0.0
0.0
0.0
0.017112788
0.004753174
0.0
0.018950528
0.028457442
0.009192324
0.0030791902
0.0117311105
0.031043021
0.



Elapsed time to compute best fit: 40.369 seconds
Cross-validation score: 0.7616028148913658
Test score: 0.7246376811594203
Best Hyperparameters: {}
0.010188787
0.019308975
0.02074275
0.005383826
0.011049906
0.0079421075
0.0057716104
0.009187258
0.0
0.0036538003
0.0052281874
0.008378265
0.0
0.0
0.022128511
0.0
0.023944039
0.0056043887
0.0
0.0
0.0
0.017789308
0.003316974
0.0
0.07260446
0.0
0.023351131
0.017844325
0.01077062
0.0043616076
0.0
0.013945079
0.010766294
0.0027819357
0.009047371
0.0
0.0
0.0
0.0020742915
0.010887655
0.0009944862
0.0078032715
0.008506378
0.0042598397
0.021147653
0.01886369
0.012081091
0.0
0.0
0.0
0.0037900785
0.0022078012
0.0038973298
0.008132299
0.0
0.0026004245
0.009197089
0.0028900139
0.007845872
0.032997426
0.07588517
0.0095575405
0.011085402
0.0
0.0027714288
0.015111132
0.019808384
0.0
0.009452142
0.008010106
0.010266198
0.0
0.008493598
0.0
0.0
0.0
0.008860958
0.0005953388
0.007346977
0.0012461462
0.0005424795
0.021263056
0.013909333
0.0
0.0057444973
0.05998



Elapsed time to compute best fit: 46.254 seconds
Cross-validation score: 0.7178396747674078
Test score: 0.8163265306122449
Best Hyperparameters: {}
0.010282041
0.013645323
0.027785389
0.006916955
0.008852898
0.0050829337
0.009790757
0.002360281
0.0022646494
0.0
0.018598381
0.004617891
0.004496858
0.0
0.0062821405
0.0
0.03611595
0.0032953315
0.0006661577
0.0004598297
0.0076642367
0.022575857
0.0041774414
0.0
0.05945244
0.0
0.050213728
0.006631964
0.0077623324
0.005690782
0.0
0.0076585542
0.013495715
0.0034554417
0.0039604073
0.0
0.0
0.0
0.00575048
0.013835553
0.009999033
0.0035445346
0.018413614
0.003997594
0.010107297
0.008003791
0.0033293029
0.0
0.0
0.0
0.004108774
0.0061571645
0.0032646712
0.0
0.0
0.0
0.0016205937
0.005701529
0.008235395
0.032655925
0.060001265
0.0056816097
0.0029990776
0.0
0.0010977583
0.00951222
0.0050010583
0.0
0.002872228
0.008024441
0.024437921
0.0
0.008604139
0.03398419
0.0074760253
0.0
0.01832272
0.017403154
0.01050955
0.0
0.0070215985
0.013643368
0.010033792




Elapsed time to compute best fit: 40.043 seconds
Cross-validation score: 0.7394045129595435
Test score: 0.9016393442622951
Best Hyperparameters: {}
0.010890281
0.0138263805
0.031682342
0.009159568
0.0099332975
0.0055988086
0.009017366
0.0011845347
0.002800417
0.0
0.0067572426
0.008843881
0.0014116523
0.0
0.026725277
0.0
0.0020092372
0.009481329
0.0
0.0037850807
0.009589272
0.021432482
0.005970914
0.00092937605
0.0
0.0
0.015763048
0.0039000139
0.004323757
0.0068079513
0.0
0.0007856752
0.0036746222
0.00057676533
0.0024227998
0.0
0.0
0.0
0.0021403704
0.03813694
0.0032245528
0.0143087795
0.02088472
0.011132885
0.033647723
0.010548675
0.017486159
0.0
0.0
0.0
0.0043266
0.0012406628
0.005349192
0.007385051
0.0
0.0
0.00017548437
0.0
0.012450811
0.03260148
0.07265551
0.009409074
0.0017043807
0.04140486
0.005047284
0.0101485485
0.0024529092
0.0
0.00806446
0.010754073
0.0034052934
0.034860216
0.02008487
0.0
0.0014433543
0.0
0.010543939
0.006894876
0.0052808137
0.0
0.0027130407
0.014507931
0.01375



Elapsed time to compute best fit: 41.814 seconds
Cross-validation score: 0.7165735165402989
Test score: 0.821917808219178
Best Hyperparameters: {}
0.011525464
0.016505359
0.029539801
0.010914857
0.0134520605
0.006581482
0.0
0.0
0.0022368724
0.0
0.04736298
0.010250757
0.0
0.0
0.0046098433
0.0
0.019234274
0.0031700002
0.0
0.0
0.0053899963
0.024412228
0.0381236
0.0073613115
0.0
0.0016665083
0.042787585
0.0
0.0034283076
0.0047733583
0.004365758
0.0042943745
0.006360003
0.007301825
0.009581855
0.0
0.0
0.0
0.0044936663
0.022038335
0.011778563
0.0055341497
0.002108967
0.006995315
0.014851738
0.014363623
0.002080394
0.0
0.0
0.0
0.0042799977
0.007486862
0.004876633
0.009831582
0.00440937
0.011147792
0.006753389
0.0
0.0095828585
0.024382014
0.08242265
0.013860434
0.010314092
0.0
0.012873463
0.0047459407
0.006250647
0.0
0.008079801
0.011396819
0.0024373573
0.0
0.0073082508
0.0
0.011710518
0.0
0.023879664
0.007888774
0.0101528
0.0
0.0006827653
0.019252058
0.00930165
0.028805338
0.010233661
0.02589



Elapsed time to compute best fit: 45.676 seconds
Cross-validation score: 0.7174612352917513
Test score: 0.684931506849315
Best Hyperparameters: {}
0.00972527
0.01724705
0.027116364
0.009509548
0.033597402
0.010903743
0.0
0.0
0.004181042
0.0018914804
0.01768783
0.0057351333
0.0
0.0
0.024921842
0.0
0.02573236
0.0
0.0
0.010206796
0.0048376173
0.01634466
0.0021912686
0.0
0.0
0.0
0.013152027
0.0132146645
0.00744459
0.005851276
0.007977873
0.00753957
0.012271887
0.018810103
0.014320602
0.0
0.0
0.0
0.007372533
0.03927541
0.020012662
0.00429841
0.0005411241
0.008201815
0.00287985
0.028287102
0.0047161104
0.0
0.0
0.0
0.0060057966
0.0
0.005330769
0.005426533
0.0
0.0
0.0010623919
0.0052896743
0.0046653845
0.025985846
0.06278131
0.010224945
0.0021654156
0.0
0.0006749647
0.009852226
0.008944731
0.0
0.014058684
0.0079189055
0.0125801135
0.0
0.0062389914
0.014838468
0.0064568045
0.0
0.019596634
0.010632863
0.011317075
0.0
0.008113672
0.018980077
0.011792161
0.005696738
0.0068129343
0.058704592
0.0125



Elapsed time to compute best fit: 42.155 seconds
Cross-validation score: 0.737544088308095
Test score: 0.8490566037735848
Best Hyperparameters: {}
0.008668821
0.02087013
0.021564208
0.002922771
0.017330607
0.007243484
0.0073613278
0.012245089
0.0
0.0
0.008730785
0.009682357
0.0
0.0
0.01640282
0.0
0.015248054
0.0065579466
0.001102582
0.0023403405
0.0049631973
0.02442596
0.0
0.0
0.011087919
0.0
0.074304655
0.0003718432
0.0029323308
0.006771672
0.0
0.005610142
0.006313995
0.024847407
0.005344283
0.0
0.0
0.0
0.006429371
0.011163792
0.0055301846
0.04334684
0.002408832
0.008933199
0.00466743
0.014683328
0.007961077
0.0
0.0
0.0
0.004950038
0.006018103
0.0041947584
0.005391531
0.0
0.0
0.0026926622
0.0
0.008112574
0.032254834
0.08518776
0.0075004073
0.006649779
0.0
0.0006661088
0.003928166
0.005018614
0.008196445
0.011959688
0.008471976
0.0
0.0019911053
0.0067380182
0.0
0.011728461
0.0
0.017837254
0.013263703
0.00822121
0.0
0.006210012
0.02525795
0.015503418
0.004996366
0.010354973
0.0011019232



Elapsed time to compute best fit: 39.466 seconds
Cross-validation score: 0.7498156721084307
Test score: 0.7377049180327868
Best Hyperparameters: {}
0.013716867
0.01914639
0.027174376
0.008160815
0.012768126
0.0066038463
0.0
0.0
0.006556865
0.004347281
0.010355622
0.0065778936
0.0
0.0
0.018911198
0.0
0.016598703
0.0
0.0
0.0008844395
0.0
0.014184306
0.023855614
0.006979996
0.007452211
0.0
0.025010863
0.00031416919
0.0
0.00595235
0.005957407
0.0079528745
0.006942796
0.003939406
0.006044642
0.0
0.0
0.0
0.014346926
0.016806303
0.0
0.0072915633
0.0011234729
0.0067787506
0.01857387
0.026448108
0.0023631863
0.0
0.0
0.0
0.0047268597
0.0013322727
0.005210915
0.0
0.011157118
0.010072606
0.013528702
0.0026677737
0.00819518
0.036933906
0.094382554
0.012723631
0.0034547448
0.035992358
0.00078773894
0.0069957734
0.0
0.0
0.0042353934
0.011185387
0.016928213
0.0
0.009624821
0.0
0.013181687
0.0
0.0
0.009081892
0.014366397
0.0069106864
0.011940045
0.017562967
0.017798511
0.0
0.011762829
0.035203923
0.000



Elapsed time to compute best fit: 44.444 seconds
Cross-validation score: 0.7397484939486858
Test score: 0.8461538461538461
Best Hyperparameters: {}
0.0138464775
0.01735678
0.019312726
0.0054448796
0.010738143
0.0058412473
0.013762767
0.0012855078
0.0052357665
0.012158364
0.016381828
0.010768266
0.002221802
0.0
0.014022471
0.0
0.042927664
0.002434122
0.005746772
0.0
0.0064102598
0.014766381
0.005430142
0.0
0.011425574
0.0
0.024125231
0.0070492113
0.015672324
0.006370467
0.012429546
0.0075981864
0.010471273
0.0013563212
0.0043546706
0.0
0.0
0.0
0.0017645111
0.009742621
0.0002720808
0.0030648536
0.0010968706
0.0037804395
0.008604673
0.008005961
0.0048286
0.0
0.0
0.0
0.006586982
0.0040650633
0.0057196505
0.0067463582
0.0
0.002098144
0.004119302
0.0017176128
0.007921887
0.036170978
0.07122196
0.017298853
0.0016356213
0.0
0.0023491639
0.008978797
0.014306854
0.0
0.0
0.008739441
0.0061311265
0.0
0.0030379149
0.01026303
0.0057383534
0.0
0.0
0.010820275
0.007957981
0.0
0.012690229
0.018969283
0



Elapsed time to compute best fit: 41.998 seconds
Cross-validation score: 0.6873399229123744
Test score: 0.7792207792207791
Best Hyperparameters: {}
0.010152374
0.014418126
0.022942903
0.0024533414
0.006077762
0.006394593
0.0025547545
0.0062252153
0.0070330394
0.00684351
0.033243608
0.010376754
0.0
0.0
0.013218881
0.0
0.04612215
0.00036934577
0.005017195
0.0022651867
0.009232837
0.040610768
0.0056543713
0.0
0.0002970271
0.0
0.0
0.0038542424
0.0073525785
0.005699562
0.0035736247
0.018140193
0.010590976
0.00072639994
0.007922509
0.0
0.0
0.0
0.0029152452
0.017875407
0.014996083
0.0042909114
0.0046361648
0.008217602
0.010890139
0.0444866
0.0030684432
0.0
0.0
0.0
0.004408951
0.0032539032
0.0065076128
0.0033940512
0.0
0.016991131
0.015637938
0.0037690087
0.008873723
0.02808262
0.06866316
0.008078716
0.0
0.0003664474
0.0033660803
0.0006017065
0.0030469897
0.009813603
0.009436972
0.007092061
0.013996103
0.0026529361
0.003792302
0.027612753
0.009547023
0.0
0.022942694
0.0073154336
0.0082842335
0

### 4.1.3 LightGBM

In [113]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

none_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    none_lightgbm_performance_normalized_df = pd.concat([none_lightgbm_performance_normalized_df, new_performance_df])
    

none_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/none_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 9.393 seconds
Cross-validation score: 0.4274034802852089
Test score: 0.4597701149425287
Best Hyperparameters: {}
19358.357289907974
1066.2318802728212
2184.771223795349
40.01562132022809
426.92896697705146
741.0727856098783
8.01636028289795
3.8094589305110276
5.099667012691498
2.5833496497943997
214.36890794348437
380.84321096003987
1.2827881556004286
0.0
9983.625742211472
0.0
164.51931641448755
3.8278982397168875
200.4046754837036
552.0135145179229
3.794600009918213
120.66367342043668
120.61710654280614
1.4109914042055607
0.0
0.001443049986846745
32155.086503187194
97.46106833347585
5619.926797780325
0.4152690292103216
0.6134156305342913
15.658136096753879
2.2163449563086033
44.542726763058454
13.873340241378173
0.0
0.0
0.0
1.7353170873248018
2.930881917476654
20.867895041243173
12.083834967343137
20.129794757580385
35.60310989143909
4.534453357919119
6.627409934997559
11.605404952948447
0.0
0.0
0.0
170.47663804411422
3.160489805508405
90.486291836918



Elapsed time to compute best fit: 8.902 seconds
Cross-validation score: 0.44168629125460235
Test score: 0.14563106796116507
Best Hyperparameters: {}
1624675.333161028
443880.22678431496
9611090.491554156
163.6456836797297
15570579.767932462
115964.31279408559
534967.27306946
422470.16086165607
6071.674644794315
1866.6841568350792
65437.670234784484
312.08788610994816
140886.13452571072
0.0
403860.4970818646
0.0
104372.3896687217
211577.38133520633
1433196.0757862031
72837.51339511946
1271069.0012381747
5151.054026000202
13444.6218470335
826475.7958046794
3090.4388434253633
0.47677698731422424
928493.5338031594
186137.8140154481
44476623.451229095
5665.598494514823
391.90432051569223
5301477.909718357
77977.00731274486
599.2633097171783
32597.62021144852
0.0
0.0
0.0
388978.0307604894
1091.657251894474
29335702.77143255
33189.28283891082
339114.35349524766
692892.8622858897
10021406.481711693
729124.2462360859
4968.986891634762
0.0
0.0
0.0
364695.7872771807
173614.91091805696
744915.8422



Elapsed time to compute best fit: 8.918 seconds
Cross-validation score: 0.3713469332383494
Test score: 0.6790123456790124
Best Hyperparameters: {}
846.1380249728536
564.392340430175
2256.4752879076404
9.749629248399287
286.9750401172787
30.731885193730704
178.1461882563308
11.860803141491488
2688.741007342702
476.6775438282639
156.85181436908897
44.26317691989243
2.245469693094492
0.0
21.8620600446593
0.0
140.51786149293184
130.26027902262285
5.238400959875435
0.47736615198664367
83.38187533710152
3977.5384906493127
156.2503947801888
0.007512860000133514
4.144392183050513
2.7633399963378906
41.38427887193393
25.101091539254412
31.721047923900187
11.132158570922911
0.23510629683732986
29.4998172580963
85.51444586622529
19.31604258576408
10.310268574161455
0.0
0.0
0.0
20.538711223751307
20.96621331642382
378.2404938992113
424.30812303349376
7.4466082556173205
10.120338844135404
7.115143787115812
38.466278571635485
1006.1891080574133
0.0
0.0
0.0
166.10994857246988
4.117580220103264
166.19



Elapsed time to compute best fit: 8.543 seconds
Cross-validation score: 0.3439410710520768
Test score: 0.6790123456790124
Best Hyperparameters: {}
572.7212364249281
365.60214860219276
2049.7217963217186
44.94326278756489
116.44616323942319
174.79346244514454
12.33129469351843
0.052339598536491394
0.5017719864845276
1.0189741291105747
175.16164994883002
58.08203977253288
34.78605270385742
0.0
5221.04226002068
0.0
20.357485836255364
0.6904198117554188
56.52241675090045
153.0259866497945
130.17468550172634
67.66056502365973
1.8878579323645681
0.1486469954252243
630.9520263671875
1.1205799579620361
26.849157568183728
17.80919063091278
91.44250765675679
6.0672093607718125
1.2588532192166895
80.50266885827295
76.53573717125983
3.9892877250604215
111.14531147369416
0.0
0.0
0.0
14.328068338538287
4.087452799547464
3889.8978839686606
7.687266547000036
65.21454518497922
32.553193401312456
4.239805219229311
31.517480541020632
1.785987614246551
0.0
0.0
0.0
65.89288897439837
1519.4444653121955
52.6



Elapsed time to compute best fit: 8.499 seconds
Cross-validation score: 0.3756819508106273
Test score: 0.42168674698795183
Best Hyperparameters: {}
755.3154228683802
1171.5589019214065
3262.598945241247
6.705395885393955
154.3651340662036
138.54547351918882
3.574272698024288
24.80320409394335
0.726603867020458
58.31697403267026
6.200636005494744
17.510283426381648
0.7378199640661478
0.0
7.556820560595952
0.0
262.355666576419
41.4297627042979
10.918670177459717
17.496859277947806
3.188592902617529
616.3160370341502
3.4542074613273144
0.9110519858077168
18.062412244733423
0.0
4.8648428820306435
14.301953974878415
88.02491233637556
21.30353321658913
958.2453613316175
20.365468418458477
35.35650494339325
3.191602465463802
12.278374251374872
0.0
0.0
0.0
61.059026741888374
7.92475725967779
113.78380701498827
9.027688281494193
0.7885969890048727
22.405536454636604
3.8329618785646744
19.390323529718444
9.966981063538697
0.0
0.0
0.0
140.90981780274888
22.066709247301333
51.32534156940294
2.2608



Elapsed time to compute best fit: 8.577 seconds
Cross-validation score: 0.48523677112386787
Test score: 0.5194805194805195
Best Hyperparameters: {}
1451.1460869324312
474.08988046192525
4005.6761889901954
23.633033884640536
255.0916033817436
7.900474810565356
5.047500017221092
26.907060947894934
151.19670602784026
405.7045157866087
0.9858902166924963
125.9893016214387
0.27971210330724716
0.0
3.055078644887544
0.0
3.584781335456589
0.5846588831627741
16.067778396731228
54.79072507924866
13.368068280215084
919.5115721902112
0.6331770253394603
0.0
2.4084397897822782
0.0
28.23743147497139
0.1073454499160107
13.5150825381279
0.9627705038292333
3.065482974052429
6.070265631729853
32.11966222252037
0.18384731630794704
9.097337442959542
0.0
0.0
0.0
11.725394961395068
0.9571680582594126
27.049642659829352
0.3872163529449608
0.30120994412573054
1.987796978734132
76.29878928525945
0.1185825327411294
9.198721152946401
0.0
0.0
0.0
37.6371734109307
10.193233714366215
64.39914735247473
2.941488442549



Elapsed time to compute best fit: 10.614 seconds
Cross-validation score: 0.5065561939553874
Test score: 0.6707317073170732
Best Hyperparameters: {}
453.75089447264327
3273.9776202343637
383.1418073629029
18.4560173896607
17.77427224908024
30.89120625378564
2.1929604592733085
0.26956230495125055
4.019649982452393
0.0369562990963459
1.0974634978920221
27.291631730622612
4.965538619086146
0.0
3526.463613433414
0.0
1020.469688810641
252.08312825486064
2.921050483593717
117.36399841308594
18.82330067222938
115.42128622718155
2257.501432900317
1.9392000436782837
5.129896845668554
0.0
6.013675359310582
2.367846821434796
65.53227516124025
4.166163232177496
5.714223675429821
14.075169774820097
243.77299807127565
0.33316961815580726
20.80512679228559
0.0
0.0
0.0
95.4989230944775
3.3060333942994475
190.32987975445576
1.8503624626901
62.33843687851913
47.870081779081374
160.23107288428582
197.5247651655227
32.352908950764686
0.0
0.0
0.0
33.323927672579885
0.5093458835035563
49.48392750264611
0.050



Elapsed time to compute best fit: 9.125 seconds
Cross-validation score: 0.3058374302800299
Test score: 0.9036144578313253
Best Hyperparameters: {}
684.2665081939194
336.9944161361782
1502.456330084009
21.392213438637555
38.489125105319545
43.23614929849282
7.955463164718822
1.1292599439620972
0.04086937056854367
0.039349519880488515
38.49693765491247
5.4048382714390755
25.703976311022416
0.0
26.522017368115485
0.0
368.92783681536093
0.5897441280540079
0.04588843043893576
14.187179954256862
26.681380953639746
738.4117554561235
114.23065155977383
2.033623605268076
1.2528975987806916
0.0
9.268417287268676
4.574380177538842
1.5334235316840932
49.43039265507832
6.926701440475881
4.577456890139729
7.812843198189512
2.8018867168575525
15.023761907592416
0.0
0.0
0.0
8.004560563713312
1.74886354804039
3.330017787637189
62.888441065559164
1.2504014279693365
22.591512547340244
15.06548113306053
0.7813190221786499
14.365186981391162
0.0
0.0
0.0
52.619224570924416
5.8614235408604145
72.477342620142



Elapsed time to compute best fit: 8.758 seconds
Cross-validation score: 0.4524954952523193
Test score: 0.5813953488372092
Best Hyperparameters: {}
924.2420122237354
4625.258023472154
571.706187255244
13.88278710704617
4325.167364738445
373.2132919210999
51.89241814240813
0.0
1.7605080008506775
0.0
171.9610707880929
1049.1260751225054
26.433951377868652
0.0
3382.27506125113
0.0
3.0071131485456135
26.007340368349105
2.6974534080945887
15.968827903270721
14.505802345462143
209.00845339894295
0.34532878268510103
0.20879259705543518
0.004004220012575388
0.0
116.826928444847
17.792076801299118
289.21597976237535
58.38204234292789
1.8759175785817206
29.034294940531254
912.6619125240759
7.66302859427924
157.23467475170037
0.0
0.0
0.0
12.996798814507201
14.191017281613313
32.984334692667176
10.869381909724325
2.1297760256566107
50.42056097229943
131.32041147863492
5.700129570788704
12.123118739342317
0.0
0.0
0.0
931.4420001937542
0.8662289269268513
78.01712770567974
307.9168680049479
0.0
6.3052



Elapsed time to compute best fit: 8.780 seconds
Cross-validation score: 0.4444764956903128
Test score: 0.29166666666666663
Best Hyperparameters: {}
2900178.4269496948
17767494.44973468
1461971.5517889932
1610.5780941098928
4250.284656427801
332102.4421657771
2034.7503758519888
4029.8662990778685
590327.2554594129
5693.14735981822
164742.05922937393
500.22701854258776
602.392041772604
0.0
45291.783168785274
0.0
115102.875307329
58534.162990517914
4279.983585298061
318.96275770664215
359290.84495574236
1039443.7407529354
200104.4159618616
37115.685474038124
752233.2707974613
12715.90485431254
113916.68089237809
1480045.0249959826
11510.221686169505
105048.44281828403
1541.1792960762978
281374.5935536325
642190.15523839
27905.4843814373
399450.66315199435
0.0
0.0
0.0
2274.68102286011
6332.539360880852
78606.43558035791
929057.1981867105
71586.41663369536
192950.06047199667
81611.24946567416
7341.678629986942
285511.19293889403
0.0
0.0
0.0
5154193.158563919
531097.8832993358
306971.8838627



Elapsed time to compute best fit: 8.670 seconds
Cross-validation score: 0.4895807654465149
Test score: 0.7142857142857143
Best Hyperparameters: {}
3235.6562954735186
8125.520552871865
1818.5847306548385
19.84989339549793
41.72784303750086
27.483687849715352
145.40120101813227
1.100159777328372
8.150197790004313
19.320598040736513
40.47051831171848
81.55164908338338
0.01766970008611679
0.0
12.448929007630795
0.0
316.3659714530222
12.976998791098595
17.760339736938477
50.28778957843315
3.892024554195814
151.3748836054001
47.591350212926045
8.96347339823842
62.75661134254187
0.02356809936463833
1.2600956400856376
128.5601910952828
8.02798531355802
48.13592708110809
9.859880454139784
381.58602902293205
23.210887884139083
0.8801901841070503
17.360647469293326
0.0
0.0
0.0
69.49163759639487
9.208242940250784
22.727754453982925
85.80584883433767
12.167092656716704
6.758194142370485
3.502652413677424
2.4861131595098414
50.276572031667456
0.0
0.0
0.0
160.4058175771867
43.52314708183985
1302.4388



Elapsed time to compute best fit: 8.993 seconds
Cross-validation score: 0.3857331020294771
Test score: 0.6111111111111112
Best Hyperparameters: {}
31040.505790536925
11785.155476269283
245.55976164340973
32.89726905571297
3041.8236269650515
929.1060735270148
36.71136356052011
1030.816976159811
1572.2914000749588
3.4601455722004175
8.063764145364985
17.909323198720813
222.75634113699198
0.0
834.1693525710143
0.0
72363.36099457182
340.31780739175156
61.903753846883774
9.440853697480634
1566.5328149910783
4404.892629531678
2279.4695579484105
222.2530059814453
0.47460717847570777
55692.83158111572
103.32223140820861
4.476118119433522
1717.945420175558
21.66838059667498
12.655381111428142
11.872321950271726
31.055328879505396
20.541186735092197
27.873351158574224
0.0
0.0
0.0
283.56065892885636
15.722672488540411
38.82503578206524
37197.457317100256
3.390371387824416
49.66578319930704
26.652732582762837
4.90942195430398
19.032889974303544
0.0
0.0
0.0
575.5561179760844
3.1072097588330507
45.1



Elapsed time to compute best fit: 8.273 seconds
Cross-validation score: 0.35374671964745497
Test score: 0.5194805194805195
Best Hyperparameters: {}
895.9999798284948
528.766225308136
5522.526934866895
37.26043741183821
11.930454634480297
36.45041148294695
3.2516867062076926
1.2581947724102065
0.024584469851106405
206.516478813719
47.928672729525715
22.168308568710927
4.053864414803684
0.0
0.07595139695331454
0.0
72.87407064437866
45.674732636748786
180.37403145992357
20.71125030517578
3.0718627483583987
1102.8101916055894
100.25636896223295
0.0
17.000253590755165
0.0
1.1439750310964882
0.24175995914265513
20.8539181293163
10.138975008856505
0.23782403208315372
10.020754350000276
209.89076098260387
21.31341452759807
20.064161959029068
0.0
0.0
0.0
3.5673701244668337
18.490720494208198
2.150783502622499
2.7271306932499826
6.151269021676853
93.51533746218777
255.46722477218282
0.041621800512075424
2.7025336503818096
0.0
0.0
0.0
44.30243811871736
130.78602319431957
20.707515335259153
20.467



Elapsed time to compute best fit: 8.873 seconds
Cross-validation score: 0.39354515837948034
Test score: 0.3703703703703704
Best Hyperparameters: {}
14105982.458549805
337519.1254254244
4847856.956440773
35.48788568004966
212368.26122635603
624308.0284835435
1770.8233267441392
219559.3334083557
3400.81151586771
61.494590640068054
1045.3108433783054
2258.255370646715
753.7194003462791
0.0
7594.42300209403
0.0
1625553.0404793322
1226846.9382504523
338.18321265280247
775783.230849266
80300.56682503223
77992.28895898163
14168.21606875956
5995.686225384474
171272.27315482497
331.1992273032665
5046781.871184319
297871.66399490833
49052.16701622307
1210642.4792374596
5042.922462403774
687467.0139428079
6988.0624009519815
645660.5587330759
96481.0243203938
0.0
0.0
0.0
11618233.290851008
57258.690423965454
9092196.020943962
283815.49410274625
144606.89886799455
94906.54754783958
53263.50995284319
932.7019958496094
862017.6178025156
0.0
0.0
0.0
2534354.334701862
5022914.474261783
11192.1975601352



Elapsed time to compute best fit: 8.575 seconds
Cross-validation score: 0.4139327915100015
Test score: 0.220125786163522
Best Hyperparameters: {}
185603267.02109087
651551.2253352441
462140.3558697775
177176.35724949092
8322.60516441986
2843509.030653432
6294.955882456154
6316.734221410006
819630.0956128389
36.233584344387054
1962577.2828539908
149548.96730009466
18420.005647301674
0.0
11267.904762011021
0.0
3662.30286443606
61673.73219373822
472.8465398028493
201.52183342725039
7977.233492668718
127.76993750408292
24587.187049165368
1733.1390898525715
21924.563330173492
38.27399826049805
988051.4763478786
8961.808311998844
4206.054371267557
249432.2999989502
201.77440440654755
64556.378609761596
966938.4505990297
11828.85967001319
2194182.614275111
0.0
0.0
0.0
181800.7852461487
29.862447518855333
6894.608890846372
5063024.889171436
25723.971176862717
29049.835700575262
9260.076185878366
649.3854040205479
624548.7712826729
0.0
0.0
0.0
852515.2242609486
6002681.363031551
46724470.362515



Elapsed time to compute best fit: 9.149 seconds
Cross-validation score: 0.39187811800877576
Test score: 0.5487804878048781
Best Hyperparameters: {}
2134.453344375128
9486.062721958093
392.92964290175587
10.817170931492
158.75035654986277
59.087984281766694
63.828439072705805
44.595921482658014
682.9640650749207
33.47442571120337
144.9535878810566
135.9884923766367
62.101491656620055
0.0
184.7201963365078
0.0
319.11852513439953
79.7595592327416
292.45789925806525
124.42759504308924
165.7043453590013
8461.11877691932
4.266419264022261
0.00965133961290121
365.92733228206635
0.0016194699564948678
355.3366325062234
2917.527576504275
1.4291372944135219
8.429414874874055
0.0021378500387072563
10.436158266849816
28.20212321780855
55.232631638878956
26.505370556144044
0.0
0.0
0.0
7.51124687329866
13.784032776020467
1344.6676663709804
18.208740429894533
0.026624280028045177
5.511084296973422
100.43757660314441
1.646129951812327
258.46425273432396
0.0
0.0
0.0
64.83520762861008
1.7914788217749447




Elapsed time to compute best fit: 8.988 seconds
Cross-validation score: 0.4813325730386353
Test score: 0.49999999999999994
Best Hyperparameters: {}
147.01836062538496
3899.033251684713
639.16144945407
6.225263393716887
5.2143679773435
45.82161884353263
3.440382554894313
0.17791107948869467
34.49221949186176
4.342022001743317
12.195237343898043
49.64574346737936
0.19461939856410027
0.0
1831.2852150778053
0.0
12.360877687402535
0.6449542531045154
1.040670461487025
0.026677200570702553
0.3002954199910164
3.7677807079744525
3120.0404670910066
0.06683240085840225
0.0747976831626147
0.0
29.067180208512582
5.3541277593467385
21.844322552671656
1.8443546798080206
9.710740069276653
2.837667643732857
55.01971672020393
6.330629211575975
24.624064726172946
0.0
0.0
0.0
12.49790983920684
15.841155737638474
11.823258816148154
4.601631111232564
10.06020637328038
26.124796127784066
9.745027757599019
3.356135935432614
8.821380025008693
0.0
0.0
0.0
48.20599532854976
4.16903391899541
77.43101591178493
2.4



Elapsed time to compute best fit: 8.862 seconds
Cross-validation score: 0.4475965862884744
Test score: 0.6470588235294119
Best Hyperparameters: {}
1027.9070783619586
3872.069871279502
6075.888558750041
26.14097791397944
993.9665954267839
111.37025085417554
18.312059081159532
0.058099535293877125
226.70352288731374
0.21669116197153926
80.24261890084017
25.29976020939648
1.22380529390648
0.0
5.1015221499837935
0.0
1079.853389975382
33.800564562901855
321.0701664456283
14.682510375976562
5.548839092254639
921.8135953751625
36.73339259807835
1.8074244138551876
0.006638769991695881
0.0090002998476848
21.425247369450517
144.43070115242153
6.3387064163107425
108.2290805359203
0.1744117183261551
58.95640118204756
182.0766547130188
4.0846100294729695
69.69866009462203
0.0
0.0
0.0
86.17235773976427
21.764344409108162
3356.163372035371
0.7891578925773501
4.625841837259941
9.718102671322413
224.88606747006997
46.85763427661732
12.579436446618274
0.0
0.0
0.0
2538.5190121975775
1.4914121681358665
58



Elapsed time to compute best fit: 9.130 seconds
Cross-validation score: 0.4066561233743896
Test score: 0.7142857142857143
Best Hyperparameters: {}
497.8233084775309
125.13177638337947
2163.7239046691684
6.777554863132536
58.09686915553175
58.39941246248782
8.153759391279891
7.202554464805871
0.0
343.2014076411724
2.0601869830861688
22.216406332328916
9.359263839200139
0.0
137.41560539545026
0.0
235.46407641540281
49.208593174116686
79.74551323265769
0.003557560034096241
197.6783811391797
854.5224811807275
163.02317934599705
3.543405693024397
151.23670620052144
0.0014590299688279629
7.755926320096478
9.295071442378685
459.41280197142623
5.019051385228522
6.51782895589713
3.7298400928266346
95.32634841099207
2.3140727448044345
32.88509578362573
0.0
0.0
0.0
52.88092863583006
6.260505647631362
4.682803524658084
0.6274601102340966
3.283461461775005
8.893001911696047
35.739245629054494
7.348714475752786
8.690501095727086
0.0
0.0
0.0
15.786201651324518
12.316374071873724
71.51898400799837
0.6



Elapsed time to compute best fit: 10.067 seconds
Cross-validation score: 0.47924082473418333
Test score: 0.6707317073170732
Best Hyperparameters: {}
469.4535660077818
4222.807704459177
576.3025247352198
9.372171313036233
14.862963788211346
28.101907127420418
0.22284296434372663
10.052466616034508
43.23640060424805
0.21733838645741343
383.6568797889631
1.3864119849167764
12.346569425426424
0.0
488.4398972140625
0.0
12.304819200187922
0.016703189467079937
13.739535958040506
0.07932113984134048
0.9107085019350052
58.129743778379634
2804.527044169605
0.0
0.3006550073623657
0.2285660058259964
10.372266709338874
1.828159002121538
3.1175306601217017
4.718780644237995
13.403055309318006
97.62089100398589
285.3461492541246
10.007376952562481
15.862998275319114
0.0
0.0
0.0
5.152005904354155
3.6764461013954133
21.972910373006016
1.3019863161025569
73.16824383870699
27.366619946900755
12.68507551215589
0.0008463769918307662
9.756152757909149
0.0
0.0
0.0
19.35807737626601
1.281122470740229
69.94869



Elapsed time to compute best fit: 9.132 seconds
Cross-validation score: 0.392435181328547
Test score: 0.46666666666666656
Best Hyperparameters: {}
753.2531728976454
1326.4690980219998
2457.9497245778693
35.584883007686585
76.353748014546
63.09868562326301
3.021861999470275
0.45099003589712083
29.62953335610655
17.69547222170513
165.4267426802544
54.53271780931391
3.7187430411577225
0.0
6036.435647641425
0.0
38.392551671131514
64.18063758313656
23.3761911066249
1947.8439170001075
119.86550481675658
301.1636536461865
0.3521686904132366
0.042581929825246334
4.6716291308403015
0.0
2.939469392877072
132.2227270903968
27.35009159008041
0.23283361364156008
0.19870400428771973
29.074343874363695
565.3408607900849
1.19190426095156
9.16527042912848
0.0
0.0
0.0
109.06974583669155
0.6540256175212562
65.38873482769122
457.93301308713853
1.5427758671576157
6.770582900580492
153.61694890634573
0.049070101231336594
74.64962183129683
0.0
0.0
0.0
171.5502770021867
3.2429161196505447
469.89086374239395
0



Elapsed time to compute best fit: 8.683 seconds
Cross-validation score: 0.42111975533126245
Test score: 0.5128205128205128
Best Hyperparameters: {}
276.0349899454896
3196.5937418274116
747.3212952771573
75.58464227349032
158.47900454181945
114.98932474153116
168.2444329727441
0.9438859969377518
10.722267284989357
23.69444441248197
399.83391722256783
282.67780989117455
33.31344613083638
0.0
1348.1247972629499
0.0
92.77492614183575
11.810742821689928
29.896173669258133
2.383596721221693
44.034966297098435
109.33243425190449
2351.2836418121005
0.015725539764389396
0.016080200672149658
0.0
3.594968358054757
8.064570527058095
9.2006033831276
0.8695529517135583
10.622540522832423
26.36588060320355
19.18320109578781
35.13111356156878
35.05680914002005
0.0
0.0
0.0
98.84646232961677
0.3927512955851853
5.7061686692759395
22.904047266580164
4.1708123796852306
23.66704964870587
21.19003203464672
0.24395511578768492
4.571806931344327
0.0
0.0
0.0
69.36012980510714
7.7218184705707245
108.163554202183



Elapsed time to compute best fit: 8.574 seconds
Cross-validation score: 0.45852333725706107
Test score: 0.3076923076923077
Best Hyperparameters: {}
3321.6234407760203
791.2603412680328
14070.72145092301
230.71816372172907
2332.364963606
192.13403091114014
170.1259242463857
69.01255751773715
73.928785353899
20.309267356060445
3832.600660048425
2740.818578599021
127.45075577311218
0.0
92789.25502318144
0.0
333.5755908600986
129.31627123057842
60.69210374355316
0.6134232133626938
1.8806846365332603
2.3773264456540346
46.02016140893102
98.85432753432542
17.474599838256836
9.822679102420807
40701.28861001041
232.14564761146903
43.92376667819917
470915.173019981
3.2528384253382683
46.48856668174267
26.76118437666446
24.257747836411
3686422.0709873773
0.0
0.0
0.0
425.539523165673
261.2358227642253
405.544557065703
190668.45985237416
1037.712847309187
263056.99751964584
237332.16245634668
1.7768077086657286
416.7150203473866
0.0
0.0
0.0
26076.564593391493
6248.332430148497
71571.90688864607
57



Elapsed time to compute best fit: 8.627 seconds
Cross-validation score: 0.4478288680909648
Test score: 0.3684210526315789
Best Hyperparameters: {}
42608.64693221124
19871.500897537568
39747.19051216252
22.814906931715086
47.24540195777081
9285.405965894694
79.30057513527572
153.19563517952338
173.48018587450497
23.368799209594727
2.341177335008979
34.89628139999695
531.3934638297651
0.0
118.92466987110674
0.0
555.5493074059486
111.12943044304848
177.8392009846866
110.7118849735707
23104.556702136993
4680.006585597992
3.564275039359927
3.848471999168396
226.48095677420497
0.0
267.1515848846175
15857.381247468758
2802.598868639907
479.1498081609607
32.71949118375778
484.34852242609486
361.6873117014766
56.59015233628452
129.13165074400604
0.0
0.0
0.0
47.72566350409761
29.33363946992904
3039.275676297955
315.18442866392434
790.3854960487224
99.70451168669388
729.7474466974381
13.1040014680475
422.726488979999
0.0
0.0
0.0
45.902719784295186
6.616619568318129
195.24891638197005
10.597626618



Elapsed time to compute best fit: 8.508 seconds
Cross-validation score: 0.3909834088180989
Test score: 0.4878048780487805
Best Hyperparameters: {}
19277.793200466054
3801.2480355016887
2433.4257569704205
19.83459572866559
505.3685979240108
141.08163641527062
51.75737887993455
14.018725043162704
34.67283998336643
288.9496304700151
16.08793698064983
210.42194707936142
88.70225470584009
0.0
285.84425510372967
0.0
6.645598469302058
0.10446214023977518
228.96500447485596
48.200023628771305
876.1628263154998
81698.11150639204
2.5377910137176514
0.49748913245275617
0.11270214919932187
4.920228717848659
23.979755487293005
2.166584622580558
78.89363231509924
60.528057099319994
4.163093408569694
1048.2944479162106
80.77135817264207
174.3090056705987
100.79082374786958
0.0
0.0
0.0
362.8763755109394
13.803159393370152
36.77075656980742
66.38718267553486
41.70991993928328
31.234377443324775
28.815507846185938
0.14347320050001144
8938.360324102454
0.0
0.0
0.0
122.6077156161191
7.650285602307122
116.

## 4.2 Rebalancing Strategy - SMOTE

### 4.2.1 Random Forest

In [114]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

smote_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [['smote', SMOTE()],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    smote_randomforest_normalized_performance_df = pd.concat([smote_randomforest_normalized_performance_df, new_performance_df])
    
smote_randomforest_normalized_performance_df.to_csv("../data/05_model_output/smote_randomforest_normalized_performance_df.csv")



### 4.2.3 XGBoost

In [115]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


smote_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    smote_xgboost_normalized_performance_df = pd.concat([smote_xgboost_normalized_performance_df, new_performance_df])
    

smote_xgboost_normalized_performance_df.to_csv("../data/05_model_output/smote_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 171.558 seconds
Cross-validation score: 0.6829939371865903
Test score: 0.5376344086021505
Best Hyperparameters: {}
0.0012632151
0.0026860863
0.21351153
0.0012933743
0.007498932
0.00335979
0.00010213029
0.004531991
0.0014391512
0.00747503
0.011523111
0.017502245
0.0012311807
0.0
0.0015607664
0.0
0.0016828056
0.0
4.013467e-05
0.000101640566
0.0020916879
0.0013208446
0.0021349944
0.007486115
0.00040294733
0.00018696218
0.0007930987
0.0
0.0006565838
0.0006227448
0.00024765483
0.006991868
0.0027575304
0.0016260133
0.0015202677
0.0
0.0
0.0
0.0038348327
0.041022137
0.002458049
0.0013259711
0.0008312207
0.008222012
0.009213648
0.01026367
0.006267072
0.0
0.0
0.0
0.0004430507
0.00019737265
0.0025648407
0.0
0.0
0.004833041
0.027205873
0.00024191258
0.0002800438
0.020501468
0.020448305
0.008618679
0.00023236763
0.0010046936
0.0013657542
0.000931295
0.00027058867
0.00037398798
0.0
0.0015910923
0.0029364163
0.000106747684
0.11063352
0.0011928573
0.13221534
0.0
0.004



Elapsed time to compute best fit: 163.937 seconds
Cross-validation score: 0.7104039208559904
Test score: 0.7971014492753623
Best Hyperparameters: {}
0.011157926
0.018490335
0.23179235
0.0034348876
0.06509443
0.019928433
0.0071628606
0.00068076106
0.00021643934
0.001432767
0.02063085
0.019780718
0.0
0.0
5.4429256e-05
0.0
0.026073558
0.00086721056
0.0037270454
1.7057542e-05
0.00075876224
0.0011308693
0.0028333154
0.025954852
0.0010750202
0.0
5.189227e-05
0.0001555719
0.00037689132
0.024593482
7.105544e-05
0.0027398155
0.0034097338
0.00088419946
0.0075677135
0.0
0.0
0.0
0.0020656083
0.053948827
0.0064349035
0.0017347098
0.006558943
0.003327324
0.0026384199
0.0015947993
0.0013087732
0.0
0.0
0.0
0.00045524107
0.0013996896
0.00049331615
0.00015463591
0.00028194537
0.0
0.008477716
0.0053675105
5.325359e-06
0.0070933793
0.0064643165
0.0034371482
0.023890574
0.00010020094
0.0012474457
0.0030462341
0.0019970548
0.0009029932
0.0009380542
0.0015332452
0.0008579856
0.00022214474
0.05820913
0.0
0.00



Elapsed time to compute best fit: 164.983 seconds
Cross-validation score: 0.7319978729259757
Test score: 0.7058823529411765
Best Hyperparameters: {}
0.013643945
0.056364343
0.18806452
0.002695596
0.022772176
0.0010303314
0.0027918406
0.0
0.0
0.00046790775
0.011737842
0.013948659
0.0
0.0
0.01556035
0.0
0.006728285
0.00036595573
0.003748533
0.020487474
0.000432512
0.0004712439
0.008857213
0.0018746811
8.744141e-05
0.00010626595
0.0
4.827974e-05
0.0016064642
0.0032466636
0.0002276529
0.0021767218
0.001894756
0.0028141045
0.006525116
0.0
0.0
0.0
0.0009319895
0.024223464
0.00700873
0.0018410715
0.000524911
0.002811042
0.0035219775
0.0010058342
0.0056953114
0.0
0.0
0.0
0.0006869894
0.0014407763
0.00049388054
0.00040687594
0.0
0.0014976287
0.0005483194
0.00068824826
0.0012497628
0.009788587
0.012182653
0.013659434
0.00017614367
0.0
0.00058492244
0.00024028569
0.0016558355
9.516592e-05
0.0010919005
0.0019504846
0.041084964
0.00080708996
0.110581435
0.00039871447
0.0028017329
0.0
0.0
0.00718452



Elapsed time to compute best fit: 176.324 seconds
Cross-validation score: 0.6866785051642966
Test score: 0.6470588235294119
Best Hyperparameters: {}
0.0012242434
0.0023637041
0.22454922
0.0012913293
0.009094653
6.542515e-05
0.00038629211
0.0
0.0
0.0
0.020965708
0.014263156
0.0065247705
0.0
0.0025746508
0.0
0.0027175555
0.00026975834
0.0
0.10680144
0.00020388757
0.00024300255
0.0029247862
0.0022796048
0.0013164446
0.0
0.0014023208
7.2951384e-06
0.000115828334
0.00071612955
0.00032489435
0.0051432815
0.0011280503
0.0033024033
0.004086172
0.0
0.0
0.0
0.0031142237
0.037643
0.0122552095
0.00043878713
0.0007902184
0.0061298404
0.0010689938
0.0005381937
0.012153267
0.0
0.0
0.0
0.0007199805
0.003161383
0.0005291564
0.0015848647
0.001530304
0.0
0.0031361016
0.003295708
0.0005874213
0.007532723
0.0057336255
0.018625734
0.0006024569
0.00025402717
0.0016183935
6.692757e-05
3.135281e-05
0.008510174
0.0043304274
0.0041932943
0.03220889
0.002300228
0.14015442
0.00040868006
0.00014549751
0.0
0.0
0.006



Elapsed time to compute best fit: 159.139 seconds
Cross-validation score: 0.7749413365039138
Test score: 0.5504587155963303
Best Hyperparameters: {}
0.0076431218
0.013276364
0.24518722
0.0011157411
0.025268937
0.016097285
0.001108968
0.0010442869
0.00045875815
0.0006145394
0.007881338
6.1844104e-05
0.00034549646
0.0
0.00054999033
0.0
0.0077029597
7.3904805e-05
0.0007603867
0.0005686502
0.00027749187
0.00066386716
0.0013971005
0.0549582
0.0093496125
0.006899382
1.111576e-06
0.00057205267
0.0101761585
0.0047519417
0.0001322216
0.0017131792
0.004590817
0.0009108182
0.0026635753
0.0
0.0
0.0
0.00048201001
0.029686889
0.0039493022
0.0013336382
0.001957217
0.0027322175
0.0006574952
0.0027558645
0.0044718264
0.0
0.0
0.0
0.0012745573
0.0023921323
0.00052368216
0.00019923034
0.0
0.0008844494
0.0010707182
0.0014396872
0.011629831
0.043095995
0.0025567552
0.0013429524
0.0003562689
3.4937322e-05
0.00011738548
4.4780412e-05
5.163489e-05
0.0028096412
0.003146378
0.003951558
0.0018402413
0.0016601196




Elapsed time to compute best fit: 155.094 seconds
Cross-validation score: 0.757444760593059
Test score: 0.5194805194805195
Best Hyperparameters: {}
0.010835961
0.02880401
0.30166465
0.0020007326
0.0053446274
0.00010896732
0.00080234534
0.00033407044
0.001700724
0.0048831687
0.0052155894
0.003544738
0.00012992424
0.0
0.0060554002
0.0
0.0007538506
0.0
0.0
0.0
0.0006663445
0.00026133825
0.0033744038
0.000483404
0.0027564168
0.00018436549
0.0006951678
0.0013222661
0.00017376283
0.00042938447
0.00025521175
0.0018267578
0.0081132585
0.0030591397
0.006986633
0.0
0.0
0.0
0.013227204
0.008277743
0.0044483184
0.00094815256
2.8988641e-05
0.0006449918
0.002052152
0.00049940357
0.0032371068
0.0
0.0
0.0
0.0020530603
0.0003177395
0.00031457533
0.0009927896
0.00091646
0.00012675957
0.0017990462
0.012829889
0.0001460889
0.0075041372
0.0127702365
0.0013566755
0.0
0.00019806465
0.003039703
0.00084240665
0.00021801311
0.0
0.0017652597
0.00040467
0.0057541598
7.399129e-05
0.08914538
0.002915232
0.000259308



Elapsed time to compute best fit: 167.312 seconds
Cross-validation score: 0.6993083276393233
Test score: 0.8235294117647058
Best Hyperparameters: {}
0.019683499
0.05057385
0.19740346
0.0024159977
0.02636961
0.007867991
0.0014258219
0.0
0.0004232656
0.000929463
0.013201684
0.0015186119
0.00029958895
0.0
0.0004366235
0.0
0.00037213095
0.0007255813
0.0017714631
0.00021087182
0.0005190813
0.0016997702
0.005457236
0.0014185514
0.0003448831
0.0
0.0006518351
0.0036708647
0.00011862275
0.0005106709
0.0015590453
0.00075797125
0.0012656687
0.0010796797
0.0006879573
0.0
0.0
0.0
0.0024967557
0.061568983
0.013187214
0.002545549
0.0014139983
0.0006548275
0.004677866
0.0028808848
0.022116464
0.0
0.0
0.0
0.00039481817
0.0011062543
0.00024353612
0.0004410275
0.0
0.00023854012
0.028988
0.0015585073
0.0038288385
0.008007306
0.009291061
0.0052045053
1.0571625e-05
0.0
0.0015309873
0.00012879811
0.00020152376
0.0
0.0034938988
0.0005984684
0.0017720795
0.000111414876
0.10406253
0.0
0.00095021055
0.0
0.0
0.00



Elapsed time to compute best fit: 156.193 seconds
Cross-validation score: 0.704498667870425
Test score: 0.5797101449275363
Best Hyperparameters: {}
0.014265009
0.0636156
0.1754567
0.0009700424
0.06128678
0.0024415222
0.00040481248
0.0019889562
0.0032605669
0.000986417
0.019814277
0.00046665376
0.0
0.0
0.0017832505
0.0
0.023532275
0.0
0.06177144
0.002726712
0.003538386
0.00021279667
0.013409089
0.00051283755
0.0002981815
0.00040840427
0.056902587
0.0013328496
0.00022943839
0.0034502964
0.00024737464
0.00076772633
0.0008789106
0.0023519595
0.0014617381
0.0
0.0
0.0
0.0039254855
0.04279853
0.0024876085
0.003240601
0.0013341403
0.0087447185
0.00057550496
0.0011929794
0.0013335947
0.0
0.0
0.0
0.0015369206
0.0007190431
0.00026685485
0.0002254673
0.00030115808
0.00021709102
0.013228998
0.002631764
0.0044496018
0.003804747
0.0086597605
0.0036330034
0.00039219548
0.00013249945
0.0004910174
0.00015926616
8.509884e-05
0.0
0.0
0.049966965
0.006328994
0.0
0.024941716
0.00050222914
0.0009979985
0.0
0



Elapsed time to compute best fit: 169.962 seconds
Cross-validation score: 0.7236825070552095
Test score: 0.7792207792207791
Best Hyperparameters: {}
0.007400464
0.04062427
0.19127932
0.0015128795
0.026946472
0.008015093
0.0
0.000895075
0.00024749618
0.0009899542
0.0059941728
0.0037194903
4.6926125e-05
0.0
0.0001899323
0.0
0.020370543
0.0001215678
0.0012286203
4.3529162e-05
0.0003760278
0.0012074453
0.005993031
0.010250601
0.0025781146
0.0031761075
0.0003722101
0.0006593478
0.0007576594
0.0004419008
0.00042351676
0.00064687734
0.0013354085
0.0009044342
0.0009556451
0.0
0.0
0.0
0.0031253959
0.009179933
0.004337828
0.0032944183
0.0013707726
0.011967231
0.0017780676
0.0021732117
0.0065140054
0.0
0.0
0.0
0.0006505035
0.0012797972
0.00032670575
0.00056366494
0.0003491224
0.0
0.006816286
0.0012646724
0.00027298622
0.012160964
0.011202006
0.007810002
0.0018394565
0.0007155893
0.00091937213
0.0008269143
0.0009221454
0.0
8.244995e-05
0.0036478355
0.02516005
8.739159e-05
0.107261986
0.007907916
0



Elapsed time to compute best fit: 159.532 seconds
Cross-validation score: 0.7007469700316825
Test score: 0.7407407407407408
Best Hyperparameters: {}
0.009003203
0.12403091
0.17472206
0.0019273749
0.021348007
0.026618682
0.0013796622
0.00014614227
0.001928089
0.0005856636
0.005457681
0.032078978
0.005585317
0.0
0.0052368515
0.0
0.012157777
0.0
0.0
0.00036479544
0.00026604475
2.7858863e-05
0.0042191553
0.0
0.016746338
0.0
0.0009726098
0.0024164876
0.0015447786
0.0012249292
0.00039948843
0.0033288298
0.0008756016
0.0024560222
0.0012355773
0.0
0.0
0.0
0.009468633
0.05948573
0.0016963558
0.0005101266
0.00884617
0.005667561
0.0012881735
0.00047595118
0.0032026616
0.0
0.0
0.0
0.0012849269
0.004761287
0.0002015388
0.0008226762
6.449994e-05
0.0
0.004471002
0.0023920042
0.0003161318
0.01021012
0.0064510023
0.008859868
6.7476496e-05
0.0
0.005653501
0.0007112382
0.004086054
0.0
0.00096829626
0.0006976933
0.0020383492
0.0033606046
0.094668694
0.0
0.00029077055
0.0
0.00010809041
0.0070253657
0.00313



Elapsed time to compute best fit: 155.318 seconds
Cross-validation score: 0.7588577372994588
Test score: 0.8163265306122449
Best Hyperparameters: {}
0.025761154
0.06840255
0.19122203
0.002219155
0.017485125
0.0016800048
0.00018707603
0.0009755887
0.00013881498
7.4029704e-05
0.0047929143
0.006828228
0.0
0.0
0.004109274
0.0
0.023478268
4.528798e-05
0.0
0.0015937894
0.010251414
0.0
0.0022472695
0.0
0.00012956669
0.000120293626
0.000797911
0.0
0.00042011033
0.004681367
0.00031589757
0.006863706
0.0020376462
0.0028625226
0.00015970254
0.0
0.0
0.0
0.0034760719
0.08475206
0.0073166303
0.0011994711
0.014633176
0.0033326403
0.0012842464
0.0023886552
0.0037115002
0.0
0.0
0.0
0.0007799804
0.0002572608
0.00057947484
0.0
0.00012582965
0.00032150294
0.0036866004
0.0023340667
0.0005101495
0.0073110666
0.008662514
0.017869696
0.0009884614
0.0
0.0009756503
0.0
0.0003301575
0.0
0.00097490405
0.0007026926
0.0015668743
0.004620589
0.10742164
0.000862825
0.079071164
0.0
0.0020179774
0.032840896
0.004743578



Elapsed time to compute best fit: 168.669 seconds
Cross-validation score: 0.6403155294164374
Test score: 0.8024691358024691
Best Hyperparameters: {}
0.0071936487
0.046541546
0.17216165
0.004049442
0.03281278
0.008651014
0.0005264871
0.0027955296
0.00041594097
0.0013481907
0.005575743
0.037586655
0.0
0.0
0.0006313011
0.0
0.0010196121
0.0
0.00047964256
0.003174096
0.0003928839
0.0034980143
0.0007123968
0.043482505
0.0006337634
0.0
0.00058147626
0.0028357464
0.00055111485
0.021943925
0.00012011396
0.0012415429
0.0043769274
0.0019622247
0.0047239456
0.0
0.0
0.0
0.0028010532
0.023750313
0.0051334407
0.00271329
0.0018584495
0.003868472
0.0047108703
0.0012567389
0.019593911
0.0
0.0
0.0
0.0005342586
0.0016218907
0.0017898475
0.0005781889
0.00024078619
0.005661436
0.015867377
0.0030725335
0.00030853605
0.009388215
0.011032406
0.011617587
3.241789e-05
2.071075e-05
0.0015032272
0.00035612675
0.0003623233
0.00033657407
0.0
0.0051180413
0.00066067505
0.00224535
0.08686642
0.00045275543
0.0005507802



Elapsed time to compute best fit: 167.181 seconds
Cross-validation score: 0.6520119225552163
Test score: 0.8235294117647058
Best Hyperparameters: {}
0.014329912
0.041512415
0.17464706
0.0025166392
0.056102566
0.008750484
0.00031870374
0.00018051217
0.00011909289
6.9602385e-05
0.01220383
0.00014300435
0.0
0.0
0.0014385036
0.0
0.037247214
0.00024435917
0.021176193
0.004872392
0.00024503484
0.0013952781
0.0043904437
0.006614105
0.0030553255
0.0
0.00027446472
0.005656125
0.00038622651
0.0005740764
0.00026019086
0.001752649
9.028015e-05
0.0002567769
0.001067532
0.0
0.0
0.0
0.0047304663
0.012529137
0.0034196703
0.0006373951
0.0002358745
0.006769032
0.0040482096
0.0020911833
0.0020401583
0.0
0.0
0.0
0.0010566091
0.0033627807
0.00013458822
0.0011537141
0.0
0.0012361063
0.014636746
0.0031928127
0.0012719642
0.008562111
0.019056823
0.003740825
0.0007614889
0.00022473036
0.00069155084
0.0048600812
0.0
0.0
1.8470697e-05
0.0027741126
0.013063083
0.0
0.073830314
0.012712156
0.00035437456
0.0
0.0
0.0



Elapsed time to compute best fit: 158.190 seconds
Cross-validation score: 0.7280162735767258
Test score: 0.7246376811594203
Best Hyperparameters: {}
0.0076630563
0.0685367
0.21445386
0.0021206697
0.023925964
0.0013446863
0.00095773774
0.00040201802
0.0002586174
0.0002632246
0.015504688
0.0016548508
0.0
0.0
0.0021313324
0.0
0.006330862
0.0005481647
0.013755022
0.0149203045
0.000117449665
0.011132345
0.004460703
0.0011807774
0.0025608751
0.0
0.0039487802
0.00011787983
0.00030764626
0.0026614077
0.0001272844
0.0057008187
0.0006695096
0.0020154733
0.0012334541
0.0
0.0
0.0
0.009218753
0.0644906
0.0038863404
0.0021812685
0.00085588085
0.0043194075
0.0053514736
0.0037500504
0.0114572
0.0
0.0
0.0
0.000968309
0.0035457541
0.00025548838
0.00019371498
0.00011299785
0.0
0.0007983639
0.0034795986
0.0068157897
0.0072273775
0.017913194
0.0007695191
0.00011522344
4.3907985e-05
0.002260025
0.00010966837
0.0
0.0036446443
0.0001952761
0.0006035433
0.0019439667
0.011341613
0.09658513
0.0
0.0005458664
0.0




Elapsed time to compute best fit: 171.777 seconds
Cross-validation score: 0.6536110230882096
Test score: 0.8441558441558442
Best Hyperparameters: {}
0.005496952
0.051603023
0.186569
0.0029048696
0.019498099
0.0012527531
0.0013183415
0.001724542
9.102521e-05
0.0
0.004577682
0.0019597865
0.0
0.0
0.010389037
0.0
0.0019636832
0.0010854981
0.0007564573
0.0003657381
0.000616804
0.00021269952
0.00067710364
0.002658773
0.00048739213
0.0013682189
0.00023843512
0.0
0.00027921458
0.016565826
2.6742484e-05
0.001998264
0.0039252336
0.0029186266
0.009573412
0.0
0.0
0.0
0.016085584
0.034758244
0.0055349274
0.013029811
4.953232e-05
0.0046667494
0.0003478169
0.0002113924
0.027984262
0.0
0.0
0.0
0.00058717176
0.002091402
0.00072992704
0.0033857431
0.00024629265
0.0
0.0029319688
0.0018476065
0.005396772
0.014084321
0.008702814
0.0026158511
0.00021420525
0.0003226066
0.0018930666
0.00030601805
0.001783257
7.248777e-05
0.00061563955
0.0034377684
0.019459207
0.0009495911
0.11046316
0.006570854
0.020421373
0



Elapsed time to compute best fit: 172.024 seconds
Cross-validation score: 0.635041285886612
Test score: 0.7534246575342465
Best Hyperparameters: {}
0.0076004756
0.02183623
0.24537812
0.003025986
0.032852117
0.0020132896
0.0
0.0
0.00040433285
0.0005369068
0.013952306
0.000914774
0.0010037349
0.0
0.00061583257
0.0
0.017531708
0.0001643285
0.0003043938
5.1927076e-05
0.00038538268
0.0020144063
0.0014133794
0.00093227345
0.01364812
0.0011170362
0.015252543
0.0009302929
9.529187e-05
0.00027505646
0.00052285194
0.005475295
0.0011002717
0.005106135
0.0010004045
0.0
0.0
0.0
0.007466073
0.066572316
0.008034795
0.00092904666
0.004979811
0.0034560072
0.0039022628
0.0014992167
0.02245624
0.0
0.0
0.0
0.002502523
0.003230266
0.0003304958
0.00031744246
0.0
0.0005795165
0.014548042
0.00022942602
0.0011968834
0.0041763885
0.008944746
0.0026769203
0.0009330793
1.33239955e-05
0.00513178
6.42579e-05
0.0023480563
3.7922287e-05
0.00340843
0.003061215
0.022962783
0.0005753398
0.11486493
0.0009968955
0.0
0.0
0



Elapsed time to compute best fit: 164.723 seconds
Cross-validation score: 0.707932968802534
Test score: 0.7647058823529411
Best Hyperparameters: {}
0.017227268
0.057499975
0.2353024
0.0019451395
0.02062976
0.00017976937
0.0017607097
0.00034413015
0.0
0.00015658782
0.0065315384
0.0002459912
4.1937154e-05
0.0
0.00018636567
0.0
0.011169391
0.0
0.0019408903
0.00029683564
0.00024686247
8.66436e-05
0.0017597945
0.00034265104
0.0029953488
0.00013052704
0.0012594503
0.0
0.0010241357
0.0011537517
0.000138464
0.0014301174
0.0010353129
0.0019648985
0.0034276657
0.0
0.0
0.0
0.011781971
0.02993098
0.0021499966
0.0031721701
0.0016901675
0.009318773
0.0004155154
0.00041830735
0.013399434
0.0
0.0
0.0
0.0027449604
0.0040865033
0.0008848533
0.00022601552
0.00028568902
0.0007132266
0.0015993975
0.012461253
0.006455642
0.013500776
0.011044244
0.002993649
0.00020797076
0.0
0.0015964278
0.0077232756
1.2480213e-05
0.0002988969
0.004241675
0.0008145412
0.0014223508
0.0004514005
0.089896284
0.0073104245
0.0005



Elapsed time to compute best fit: 169.438 seconds
Cross-validation score: 0.7154026669277413
Test score: 0.5913978494623656
Best Hyperparameters: {}
0.0020819798
0.0051672496
0.19393241
0.0006944343
0.07200536
0.0005147148
0.00014270387
0.0038859348
0.0010459548
0.0074672773
0.011753056
0.0075186035
0.0005658043
0.0
0.0008452559
0.0
0.017407294
0.00020919897
0.006577905
0.0039402125
0.0009767201
0.0
0.00055103964
0.0026185554
2.4840429e-05
0.0
0.006267213
2.3788245e-05
0.0012916556
0.01148581
0.00011607157
0.00049359
0.017365584
0.0031752023
0.008702925
0.0
0.0
0.0
0.0037965772
0.035933234
0.004735696
0.0012990871
0.0029715197
0.0025977902
0.0052547036
0.00054237025
0.0034492535
0.0
0.0
0.0
0.0017693274
0.0011387512
0.0002979264
0.00023459579
0.00026670806
0.00095179136
0.0060798205
0.0
0.0046673724
0.019215608
0.00751978
0.004860203
0.0006340451
0.0
0.00022449066
0.00069523026
0.00044272566
0.010665816
0.0018779804
0.004994734
0.00027243703
0.0042351894
0.099397324
0.0021305655
0.0294



Elapsed time to compute best fit: 161.457 seconds
Cross-validation score: 0.7815140402830804
Test score: 0.5670103092783505
Best Hyperparameters: {}
0.015563577
0.050042022
0.18665105
0.0005994955
0.0044075334
0.00025487368
0.000116375464
0.00087923394
3.38367e-05
0.0008452688
0.005776503
0.045858677
6.210409e-05
0.0
0.0024835358
0.0
0.001676617
0.00020717668
0.011521974
8.7606815e-05
0.00040522526
0.007203845
0.001355207
0.0011227046
0.00012190735
0.0
0.0003094742
0.0014227224
0.0010307218
0.020277664
0.00031039183
0.0011779402
0.0017087845
0.0033784567
0.010568133
0.0
0.0
0.0
0.00040673724
0.0021416852
0.0059227524
0.0009346655
0.0049011884
0.002251216
0.0017075704
0.00014634439
0.002141389
0.0
0.0
0.0
0.00050573575
0.00075835706
0.0052279984
0.0
0.00077820127
0.00023338856
0.017866995
0.011622583
0.00025881405
0.006744593
0.014640015
0.0017735444
0.000779618
0.00020968809
0.00076408364
0.0005124972
5.0629224e-05
8.552784e-05
0.003171303
0.00021596828
0.0013795947
0.0002783361
0.1091



Elapsed time to compute best fit: 166.394 seconds
Cross-validation score: 0.678799391351986
Test score: 0.7303370786516854
Best Hyperparameters: {}
0.0032109222
0.01645048
0.21264467
0.0012176207
0.029980741
0.0039243475
0.0
8.111239e-05
0.00020174882
0.0007770683
0.016590424
2.0100279e-05
0.00017939639
0.0
0.0031137571
0.0
0.009154167
0.009859879
0.023292031
0.0006326627
0.0020319715
0.0
0.0049601477
0.0
0.0009068572
0.0
0.00073819724
0.0
0.00050050544
0.00042241937
0.00015673427
0.011633765
0.0007569268
0.0034921565
0.00091143994
0.0
0.0
0.0
0.0011947844
0.051361658
0.0034643696
0.0012861742
0.01413576
0.007159135
0.0006953813
0.007608806
0.0064125583
0.0
0.0
0.0
0.0013495742
0.0008647872
0.00014571319
0.0005805389
0.0
0.00036125715
0.01281196
0.0019154638
0.0002802361
0.008230609
0.010097191
0.0066917962
0.07238583
0.0027361684
0.001292316
0.00011401802
0.00014803764
0.010553323
0.00022282376
0.00033571466
0.0026362357
0.0001671049
0.08508154
0.0015467294
0.04438241
0.0
0.002708421




Elapsed time to compute best fit: 162.851 seconds
Cross-validation score: 0.6858975246225865
Test score: 0.660377358490566
Best Hyperparameters: {}
0.02587617
0.010686289
0.21258968
0.0025830157
0.024364058
0.00017712779
0.0008512601
6.8026704e-05
0.0004094102
0.0036051727
0.01610727
0.01572614
0.0007086565
0.0
0.0025029043
0.0
0.019498972
0.0
0.04694494
0.0
0.00041350344
0.0015185208
0.0006325481
0.0
0.0013159133
2.4265139e-05
0.0007336869
0.000844777
6.0057377e-05
0.0021764613
0.00014073437
0.0014039498
0.0035651524
0.004819386
0.0004610732
0.0
0.0
0.0
0.010841316
0.06749079
0.008427755
0.0024382435
0.0083076535
0.003981981
0.0013221055
0.0023619141
0.0030487475
0.0
0.0
0.0
0.00084947573
0.0012107745
0.00016138009
0.003676087
0.0
0.008555282
0.012096112
0.008997931
0.0022936324
0.0061293654
0.013474395
0.0037644666
0.028851585
0.0013438606
0.00061783526
0.0025114086
0.0
0.0064788167
0.010482696
0.0002281713
0.0030174942
0.000876721
0.09060545
0.003630519
0.00040537407
0.0
0.000108615



Elapsed time to compute best fit: 167.058 seconds
Cross-validation score: 0.6911051875947513
Test score: 0.8024691358024691
Best Hyperparameters: {}
0.011273596
0.07972467
0.16552445
0.00091042294
0.02019884
0.003739257
9.56433e-05
0.0
8.2910956e-05
0.0012963898
0.0065156007
0.016627943
0.0
0.0
0.0013737101
0.0
0.01437448
3.1033625e-05
0.019317362
0.0051233717
0.00016128027
0.005309683
0.002014728
0.013635592
0.002205663
0.0
0.0023589807
0.00035987052
0.0
0.0005614964
0.00021550154
0.00068963476
0.00060929835
0.0016468371
0.0025613518
0.0
0.0
0.0
0.0013961045
0.029884111
0.009396741
0.00076847314
5.0517585e-05
0.0035666882
0.0005149878
0.0016525022
0.0018818564
0.0
0.0
0.0
0.0003153422
0.0003303336
0.0003894319
0.0006100826
0.0
0.00049541635
0.00206448
0.00784878
0.00019915607
0.0070555042
0.012938573
0.0074089393
0.0002421473
0.021758236
0.0019389229
0.00021550228
0.00019404799
0.0
0.001620094
0.0007100578
0.00025776686
0.004405137
0.111938566
0.001580103
0.0012118007
0.0
0.0004974944



Elapsed time to compute best fit: 163.099 seconds
Cross-validation score: 0.7551913957944814
Test score: 0.6140350877192984
Best Hyperparameters: {}
0.0012736616
0.030655708
0.23460752
0.0033281115
0.05489099
0.0011241701
0.00023502142
0.0053272867
0.013568881
0.00035858364
0.014437988
0.0017138554
2.0660102e-05
0.0
0.008970211
0.0
0.021149898
0.0
0.0006850739
0.00095144066
0.00014371575
0.0007331372
0.002214817
0.0
0.0007771407
0.0
0.0118025085
0.0
0.0022905306
0.00011651815
0.0001555091
0.0020275062
0.0024998784
0.0002371874
0.0011976726
0.0
0.0
0.0
0.006824778
0.06927701
0.00363976
0.00060233555
5.7419864e-05
0.009982263
0.0061983056
0.0005790529
0.00213898
0.0
0.0
0.0
0.0013842096
0.0012778132
0.0002521595
0.0
0.0048772194
0.00030372335
0.018930513
0.008417008
0.0030604925
0.006544716
0.011472313
0.0054002153
8.3391e-05
0.0
0.00079546444
0.00015041298
3.3190227e-06
0.0
0.008116613
0.0034007956
0.0026973581
0.0
0.07839715
0.0017456233
0.00043703103
0.0
0.00036229563
0.000103194034
0



Elapsed time to compute best fit: 163.143 seconds
Cross-validation score: 0.722054441664499
Test score: 0.617283950617284
Best Hyperparameters: {}
0.0021561563
0.0027155906
0.20299774
0.0016341092
0.019170225
0.014062781
0.0005106709
0.003066655
0.0020524536
0.0056485673
0.010396501
0.11823026
0.0
0.0
5.083599e-05
0.0
0.019254342
0.0
0.002404436
0.0
0.00013757969
0.0002892722
0.0023119603
0.0004913092
0.0
0.0
0.00017052719
0.0
0.0005057025
0.00019072683
7.9250625e-05
0.0025963348
0.00047904233
0.0054240646
0.00027238214
0.0
0.0
0.0
0.0042728307
0.07163693
0.00086305354
0.00063152827
0.0065078293
0.010311868
0.0012028794
0.0006687393
0.014729775
0.0
0.0
0.0
0.002004142
0.0022141947
0.00016405487
0.0017277941
0.00066343386
0.0035814105
0.010903325
0.0009007043
0.0017297048
0.0055373297
0.009186581
0.0063913665
0.052782215
3.7304137e-05
0.002314012
0.00045369574
0.00018997933
0.0
0.0037544975
0.00016337584
0.0017331574
0.0003592109
0.09673753
0.0
0.044519924
0.0
0.0037740788
0.0012858768




Elapsed time to compute best fit: 169.442 seconds
Cross-validation score: 0.7160621641874678
Test score: 0.6521739130434784
Best Hyperparameters: {}
0.0027283854
0.0051106773
0.2560087
0.0025422964
0.00918829
0.008106589
0.0004271708
0.00040267644
0.0002803911
0.0043414696
0.006138473
0.03181493
0.0001651565
0.0
0.0005513756
0.0
0.00021099126
0.0002629465
0.0
0.0009651582
0.0018273795
0.00049252884
0.0026113843
0.002351661
0.002480201
0.0
0.00021885087
0.021009369
0.0001170642
0.04774177
0.00044006846
0.0024375345
0.0029857159
0.004326265
0.008808946
0.0
0.0
0.0
0.013445112
0.015091332
0.00538034
0.00318756
0.00068194565
0.004009177
0.0005924244
0.0029945804
0.0045628077
0.0
0.0
0.0
0.0029086412
0.000523289
0.00070938055
0.0010845595
0.00035057915
0.0
0.0025457102
0.0011537847
0.0026525347
0.012016008
0.018726096
0.012221257
0.00035416207
1.31515535e-05
0.0010845575
0.0012234041
5.180735e-05
0.0
0.00025092612
0.0033480267
0.0046966407
0.0012272391
0.09585945
0.00050834066
0.022750508
5

### 4.2.4 LightGBM

In [116]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

smote_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    smote_lightgbm_performance_normalized_df = pd.concat([smote_lightgbm_performance_normalized_df, new_performance_df])
    

smote_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/smote_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 17.977 seconds
Cross-validation score: 0.6478313899269782
Test score: 0.625
Best Hyperparameters: {}
576.4752591475844
349.404090821743
252579.0217320323
601.2854906022549
5342.544023141265
101.9560319930315
43.65105962753296
20.24705122411251
31.152516797184944
60.09578540921211
7461.215315878391
5.700392097234726
38.33645099401474
0.0
584.9842336475849
0.0
1402.4229783117771
1.8639030456542969
238.01068305969238
0.0
14.596330717206001
16.0382179915905
437.33563685417175
668.9706163406372
290.0269349217415
2.2240580320358276
1041.8391308784485
133.50049023330212
0.0
355.36679531633854
0.0
830.8654995337129
391.0983234345913
42.68209584057331
186.05083779990673
0.0
0.0
0.0
757.9659666419029
23645.122568786144
415.7213468849659
91.05732667446136
181.71677693724632
593.5867144614458
90.2975746691227
31.793057054281235
4690.253316551447
0.0
0.0
0.0
762.5631597489119
98.25919863581657
250.01959343254566
11.65743713080883
3.36798894405365
3.5871429443359375



Elapsed time to compute best fit: 18.353 seconds
Cross-validation score: 0.6395820511919583
Test score: 0.7738095238095238
Best Hyperparameters: {}
3503.3836790695786
25257.74428524077
218935.23399932683
943.8711154460907
2374.3121305406094
42.60192368924618
0.37687599658966064
4.156311094760895
2.2442910075187683
8.337868988513947
1665.5939106345177
2901.6986568272114
32.32501682639122
0.0
328.2240951061249
0.0
697.0571072399616
2.064389944076538
123.98169922828674
192.24351200461388
1.2183020263910294
3.0310999900102615
225.56849424540997
0.0
63.44833183288574
0.0
138.0767457485199
56.92337127029896
1.3231160044670105
37.75169096887112
5.127279102802277
65.87256099283695
747.5848411023617
106.50099329650402
185.74198929965496
0.0
0.0
0.0
1341.2316480875015
2709.126498669386
92.37466591596603
132.06278079748154
9.403199672698975
492.0797907114029
17.507320128381252
90.50483739376068
2639.3007326200604
0.0
0.0
0.0
776.4948011040688
1055.364976093173
97.1090797483921
9.554615244269371
1



Elapsed time to compute best fit: 18.413 seconds
Cross-validation score: 0.688354547877007
Test score: 0.49382716049382713
Best Hyperparameters: {}
420.23783012479544
68930.7995047681
209297.98262968287
617.3385882712901
2872.436333730817
236.1015997827053
150.58089517056942
1257.1802129745483
733.2431507110596
11.927540063858032
2055.828304708004
14.7431001663208
6.338703654706478
0.0
26.130438186228275
0.0
67.47609455883503
0.0
0.0
0.0
0.0
55.080501556396484
747.8048178851604
30.780250072479248
511.7727565765381
4.776468962430954
64.20629933476448
56.73638787120581
1.2010664194822311
21.541627883911133
4.523070938885212
399.7696149498224
129.55877164006233
45.06243039667606
56.45728686451912
0.0
0.0
0.0
1286.6535835713148
434.837385751307
134.1966567710042
37.540977761149406
82.49221393465996
777.9463832452893
861.8416229784489
304.14842611551285
4242.504178233445
0.0
0.0
0.0
495.847625374794
3326.7605529874563
92.18969846516848
3.7258939892053604
0.0
0.0
337.7682174295187
620.068217



Elapsed time to compute best fit: 17.708 seconds
Cross-validation score: 0.6192751929013539
Test score: 0.5421686746987953
Best Hyperparameters: {}
1473.828272253275
681.1381671875715
251851.38852401078
1113.3301742747426
3099.3312845528126
143.13421808183193
0.19399599730968475
10.885136008262634
0.801367998123169
2.4878959506750107
661.7657155543566
33.09458176791668
23.33043944835663
0.0
6.571510076522827
0.0
182.96895956993103
0.09897319972515106
0.559101015329361
0.2676370143890381
24.149303406476974
2.1864499747753143
280.3301686644554
0.7205270081758499
5.3362399488687515
14.260839700698853
34.67191034555435
15.352289289236069
1.1443599462509155
41.71886044740677
9.834512084722519
500.3788673877716
90.48726896941662
950.3781797885895
277.9112190231681
0.0
0.0
0.0
17.026750087738037
28963.8771302104
299.05430722236633
113.85877095162868
17.046829342842102
356.53315183520317
2.3281079828739166
129.4595597088337
3891.7453852668405
0.0
0.0
0.0
155.65989197045565
1814.130115777254
78



Elapsed time to compute best fit: 17.869 seconds
Cross-validation score: 0.7253931755706603
Test score: 0.4819277108433735
Best Hyperparameters: {}
12067.558545924723
14418.701659455895
240361.54768187553
560.7416771166027
522.5942799597979
971.1418785899878
4.0590440183877945
0.6188300102949142
2.065700627863407
2.3251900672912598
1612.6842490583658
1925.8912301957607
6.4581298828125
0.0
82.7106616050005
0.0
9.487277075648308
2.0941898822784424
257.6215780079365
20.33523279428482
0.2229573056101799
4.007456064224243
173.04597778618336
7.149620056152344
44.00489068031311
0.0
286.7402131855488
7.074027016758919
130.73995184898376
5.739513523876667
23.815289437770844
617.1150010153651
143.77915190160275
575.1333554536104
941.7072785943747
0.0
0.0
0.0
96.71337553858757
4983.018971309066
985.3902162909508
113.06239636987448
53.66259089112282
22.16927633434534
700.7261248826981
1.3712062686681747
3190.9870450124145
0.0
0.0
0.0
652.3177565857768
595.5455489382148
174.46608579158783
16.557559



Elapsed time to compute best fit: 17.435 seconds
Cross-validation score: 0.6378626366317367
Test score: 0.6097560975609756
Best Hyperparameters: {}
4878.2675434798
50789.70801113546
194312.50130966306
1151.1376697719097
3945.674787223339
754.7233621180058
37.006347209215164
104.93205258250237
73.63026869297028
36.0893270522356
3060.543601155281
431.9964965879917
66.78973507881165
0.0
1132.141613587737
0.0
765.896692276001
0.0
0.0
10.260002106428146
2.3872060328722
2.7813360393047333
192.08928360044956
29.09501051902771
19.254622280597687
0.0
0.2471970021724701
0.0
0.19053499400615692
94.15251070261002
167.74579095840454
398.2251587584615
1066.0729636698961
478.15104925632477
1176.6882704943419
0.0
0.0
0.0
217.84045600891113
12961.035878151655
845.545717895031
125.45133632421494
55.969984248280525
2809.7757381796837
69.00154159963131
23.197062969207764
1869.7130879759789
0.0
0.0
0.0
37.09028919041157
1320.4075767993927
388.91823092103004
2.6866399347782135
2.043910026550293
56.760661303



Elapsed time to compute best fit: 18.073 seconds
Cross-validation score: 0.6887332328508798
Test score: 0.5813953488372092
Best Hyperparameters: {}
9641.661763608456
8915.601171292365
234321.00942850858
689.3119438290596
3186.4379846453667
204.04837465286255
15.456091612577438
27.44659924507141
2.2063871175050735
9.597830832004547
1399.5137174129486
1.3330740183591843
58.0473964959383
0.0
78.82053545117378
0.0
225.0755060017109
0.0
11.49316681921482
0.0
2.9557100534439087
23.24349895119667
1044.0935441702604
381.0093570947647
83.23694199323654
3.8646320551633835
29.50133255124092
2.3250259906053543
1.0896999835968018
673.1623801290989
0.4754129946231842
41.892365515232086
387.1411154419184
273.7555994167924
501.38993473351
0.0
0.0
0.0
2063.4002088606358
13737.736178547144
564.334350168705
8.532391183078289
102.6489518135786
815.4709562957287
37.96275568008423
39.05953513085842
3208.96655228734
0.0
0.0
0.0
344.1909644752741
115.55041306465864
806.9745433479548
47.71994125843048
0.0
0.49



Elapsed time to compute best fit: 19.247 seconds
Cross-validation score: 0.6831106029635442
Test score: 0.5555555555555556
Best Hyperparameters: {}
3451.6802864670753
19449.702880509198
228233.01398558915
463.28496462106705
4900.82343237102
50.05701553821564
2.314816951751709
27.599117398262024
34.46066749095917
189.69152255356312
1991.0446843206882
23.789648592472076
16.900959730148315
0.0
174.01079440116882
0.0
1214.8862934112549
0.0
3.072568342089653
24.68231299519539
2.6554540246725082
1.3860979974269867
94.47012101113796
314.41650223731995
13.207486405968666
0.0
54.929901123046875
116.68492349982262
0.12484200298786163
475.2835760861635
1.107723981142044
31.764226853847504
1072.1397577375174
1023.9931215643883
1722.247280806303
0.0
0.0
0.0
103.94186900556087
9846.783102989197
637.8104243278503
80.70727328956127
173.39579105377197
1195.6472957357764
0.34589600563049316
1.748804971575737
687.1104683578014
0.0
0.0
0.0
435.3718399554491
415.2934074178338
195.37370561808348
107.4076123



Elapsed time to compute best fit: 18.045 seconds
Cross-validation score: 0.6633287955868601
Test score: 0.6626506024096386
Best Hyperparameters: {}
2980.6993339955807
12276.402185335755
232808.39892058074
807.5997948274016
5635.380112826824
560.1336581408978
2.760930061340332
46.51848649978638
9.182279706001282
0.9464969784021378
1751.3796660006046
586.3483016937971
0.0
0.0
57.86039161682129
0.0
482.62408995628357
0.0
5.243092089891434
1.4224349856376648
3.7542919665575027
8.438218042254448
241.5416143834591
263.2409973144531
196.22867718338966
30.413799285888672
5.858369797468185
637.4035810232162
0.0
2011.839421480894
2.772001028060913
345.5090760588646
226.00262799859047
877.1573034226894
3934.6964554339647
0.0
0.0
0.0
835.5627666711807
2015.5971185415983
646.0026807039976
6.644865900278091
28.007638245821
1566.9786123335361
1336.3391485065222
13.192569017410278
2177.3969959765673
0.0
0.0
0.0
788.2892444133759
1421.2531101107597
294.8760190978646
44.635469645261765
1.603398993611335



Elapsed time to compute best fit: 18.615 seconds
Cross-validation score: 0.5530852973867679
Test score: 0.8620689655172415
Best Hyperparameters: {}
6798.558663204312
12936.428010836244
224926.9740922451
477.83089013397694
3657.62834918499
63.024099349975586
130.69394099712372
118.11543709039688
140.17698007822037
9.597193986177444
1014.9952217638493
343.35188269615173
27.206069946289062
0.0
3.1743910908699036
0.0
1181.0795148313046
6.532089054584503
97.20221608877182
52.66110038757324
8.126150965690613
4.218022882938385
82.88495230674744
7.361268162727356
353.3509665131569
21.608920574188232
285.22139543294907
4.462147012352943
4.933020114898682
274.66133269667625
292.7307337075472
552.7448235303164
202.2937426865101
210.80026631057262
891.5293359607458
0.0
0.0
0.0
1137.134358227253
15568.437703251839
137.22695633769035
194.35771790146828
97.41605082154274
699.7849950790405
47.28383922576904
9.476456850767136
606.0806111395359
0.0
0.0
0.0
301.97285006940365
1005.5784030407667
546.83550



Elapsed time to compute best fit: 19.043 seconds
Cross-validation score: 0.6085831555785917
Test score: 0.6896551724137931
Best Hyperparameters: {}
2984.822212189436
47081.310090348125
184858.37793247402
1434.4105114787817
2745.5791583657265
235.88749885559082
21.424006208777428
1.1376410126686096
0.6114659905433655
15.017168015241623
2808.1811570227146
404.0474805831909
20.51078987121582
0.0
4.790399044752121
0.0
274.871489033103
0.0
4.7194148898124695
6.0501598715782166
15.214598998427391
10.783550083637238
518.3696303963661
1.3875800371170044
349.09407609701157
0.0
4.442186057567596
31.132856786251068
1.5924160182476044
143.505438670516
4.6739639937877655
464.6362189203501
970.2972297370434
697.6677759885788
2320.3754616975784
0.0
0.0
0.0
1572.6600537002087
12701.517331928015
762.0280020385981
73.41354481875896
54.74424394965172
1264.140475153923
543.1227328777313
24.451710522174835
3725.498514354229
0.0
0.0
0.0
450.6755714863539
754.6612172424793
430.6484344601631
24.37791502475738



Elapsed time to compute best fit: 19.079 seconds
Cross-validation score: 0.6376372022163097
Test score: 0.7058823529411765
Best Hyperparameters: {}
4061.9866207391024
18519.34640520811
209396.94324208796
710.570310972631
4688.249624282122
0.0
9.916061043739319
2.6633999347686768
7.304684937000275
0.0
4564.786121085286
1240.6741273924708
7.051608219742775
0.0
0.0
0.0
2018.0395778417587
4.044310092926025
91.2682991027832
0.34671899676322937
0.0
1.5938260108232498
225.4403556585312
0.5846280008554459
15.566411033272743
0.0
238.44696587324142
0.0
4.463850021362305
217.56950767338276
10.012145906686783
44.742054745554924
2215.8803570866585
455.75501368939877
2311.1644409894943
0.0
0.0
0.0
387.0115033984184
23408.608678124845
1656.0354181826115
133.9187355041504
157.77169781923294
182.08470702171326
157.81255087256432
62.10242947936058
3748.731559202075
0.0
0.0
0.0
660.153126090765
693.1768585145473
360.09594006836414
13.15904588997364
10.411832869052887
5.728659018874168
1072.772583246231
2



Elapsed time to compute best fit: 19.219 seconds
Cross-validation score: 0.65208632097177
Test score: 0.6395348837209303
Best Hyperparameters: {}
7597.624492175877
18113.16142399609
234124.38143837452
1573.0765276625752
2773.7478093504906
1140.3564783036709
16.96931004524231
1.3811399936676025
13.215018048882484
17.144359782338142
1907.899138316512
21.90009744465351
13.976680159568787
0.0
8.089642882347107
0.0
362.55539183318615
0.0
0.0
0.22630099952220917
2.563541978597641
17.6796013712883
268.9133608341217
0.0
12.20294277369976
17.947200775146484
125.33506554365158
0.23785099387168884
13.33136485517025
780.6789400875568
11.730950385332108
104.35696370899677
68.47733472287655
505.4049491882324
1105.1386847794056
0.0
0.0
0.0
1750.1659967303276
11950.503212548792
1830.258882433176
142.01828277111053
51.14943131804466
1309.6817324012518
315.00050354003906
16.96616780757904
3635.90821069479
0.0
0.0
0.0
229.03450395166874
866.5378602445126
210.8642332404852
90.06189227104187
1.996146023273



Elapsed time to compute best fit: 18.014 seconds
Cross-validation score: 0.6845188198129375
Test score: 0.6470588235294119
Best Hyperparameters: {}
7849.492887958884
23320.546225838363
232078.53786106408
378.9214857965708
169.1990047544241
1384.5383361577988
10.76405020058155
1.2685830295085907
0.6954689919948578
42.038078151643276
6993.758774146438
795.0971154868603
35.75299954414368
0.0
36.07670494914055
0.0
3.521850064396858
0.0
0.2708350121974945
0.0
10.350571528077126
16.640419125556946
258.7017941623926
0.27870090305805206
48.77349853515625
0.0
2.9285110533237457
0.0
0.5017083883285522
7.316232420504093
74.63779376447201
276.9197671711445
92.37103708088398
195.1220097243786
939.1474779844284
0.0
0.0
0.0
104.52453534305096
15928.431829690933
182.43109334260225
15.565491527318954
104.48385713249445
845.9117229878902
3.864468038082123
2.2553629875183105
2046.589763469994
0.0
0.0
0.0
530.8847876787186
91.21790805459023
507.9314827620983
18.565069139003754
24.918399810791016
0.0
4931.



Elapsed time to compute best fit: 17.674 seconds
Cross-validation score: 0.6396730465967286
Test score: 0.7647058823529411
Best Hyperparameters: {}
2973.9765182584524
5865.7318314909935
240384.22211059928
795.243518859148
3728.0067954584956
10.8339102268219
4.555308923125267
2.255905032157898
0.0
8.944408878684044
2215.3689906597137
392.04330241680145
40.58694925904274
0.0
35.1196524053812
0.0
704.7034591734409
0.0
291.45123732089996
13.338471174240112
14.81948509812355
14.957987681031227
268.9718592315912
0.0
80.93169784545898
0.0
1.1082780063152313
0.4164170026779175
1.2758129835128784
329.7782623767853
5.802391946315765
455.48895312845707
1386.6710687428713
440.9992128536105
1525.2193825691938
0.0
0.0
0.0
92.82149969041348
11476.279289782047
178.61151406168938
22.83196896314621
225.2126908302307
1410.5945746302605
19.59037621319294
11.763776034116745
661.3295210599899
0.0
0.0
0.0
850.163166642189
2560.571218699217
178.6119288727641
20.618491664528847
0.0
0.0
13.346306771039963
74.39



Elapsed time to compute best fit: 17.844 seconds
Cross-validation score: 0.5928410840175545
Test score: 0.7558139534883721
Best Hyperparameters: {}
5800.857585340738
9238.187021553516
231651.77839341015
936.3296315819025
3029.3118290007114
867.1396980285645
88.57701027393341
13.434020280838013
27.095771580934525
15.558680534362793
2504.3992764651775
3.673191010951996
5.053540229797363
0.0
184.02698677778244
0.0
255.4980598539114
0.0
2.0579869747161865
5.896470069885254
6.871365994215012
12.431290924549103
307.23889222741127
4.166500091552734
299.84273570775986
0.0
297.2518945634365
34.180150389671326
0.0
56.81205013394356
48.88127210736275
244.36709094047546
308.7352253496647
81.5211159735918
120.06271056830883
0.0
0.0
0.0
2107.4373453855515
14494.923840142787
922.4627189934254
179.4684484153986
12.500948198139668
895.0732183903456
483.63955676555634
15.162582665681839
3983.413219191134
0.0
0.0
0.0
620.4189659059048
1541.9972968399525
226.9802074059844
23.012547552585602
52.41806733608



Elapsed time to compute best fit: 17.142 seconds
Cross-validation score: 0.6521460393751415
Test score: 0.6547619047619049
Best Hyperparameters: {}
9728.11877144128
763.7625909149647
231501.88915950805
135.68837651610374
5538.557370126247
671.9739391207695
12.781822085380554
0.0
8.084871977567673
0.0
7601.056592985988
217.06739127635956
15.722692549228668
0.0
5.248457983136177
0.0
1189.009752213955
3.5299201011657715
1.1023900508880615
12.401214063167572
10.631472915410995
1.475163996219635
693.0161541700363
2.9693950414657593
108.80129514634609
2.663450002670288
1038.6910164952278
8.132694959640503
37.930108070373535
311.47323593497276
12.093460038304329
24.8053620159626
259.17743679881096
576.1891873925924
48.317910715937614
0.0
0.0
0.0
353.11752888560295
24157.278924897313
515.158300369978
46.90066985785961
696.3268654644489
820.4691066741943
38.09682980179787
101.33230063319206
994.6260248422623
0.0
0.0
0.0
179.00536526739597
443.22400533407927
270.657180711627
0.9508939832448959
5



Elapsed time to compute best fit: 17.907 seconds
Cross-validation score: 0.6724160007905471
Test score: 0.5882352941176471
Best Hyperparameters: {}
290.7738082408905
264.09494295716286
253567.20570394397
652.4697199985385
3920.1537416875362
33.09628975391388
5.327095851302147
19.005035862326622
6.8656060844659805
17.80404394865036
2273.1496170163155
7.1358218640089035
6.991171061992645
0.0
21.290029764175415
0.0
625.6137169897556
0.0
224.5239963531494
0.216622993350029
4.948156982660294
0.9816910326480865
493.6916908621788
0.0
440.377911940217
5.785140037536621
72.5794405117631
0.0
2.0167360305786133
842.3559204041958
9.61269986629486
208.3765394538641
194.8522961139679
323.2363557368517
1388.7658199220896
0.0
0.0
0.0
129.22140255570412
9382.277806803584
1551.4776741564274
69.37010577321053
21.609009981155396
1214.905923217535
20.467133432626724
69.94922791421413
835.5608465969563
0.0
0.0
0.0
390.1491229683161
584.0626858025789
382.1066918820143
2.9015400409698486
5.292979031801224
116



Elapsed time to compute best fit: 16.668 seconds
Cross-validation score: 0.6726938094585153
Test score: 0.5063291139240507
Best Hyperparameters: {}
995.8638919889927
362.52709808945656
269222.8003476709
186.53897784650326
291.62707959115505
0.0
18.561469670385122
0.34353798627853394
0.25164860114455223
4.476709868758917
1318.930282264948
16.59743505716324
0.0
0.0
1621.5962519049644
0.0
51.266749799251556
0.0
6.351093173027039
13.88934288918972
0.8018019795417786
0.0
75.4852414317429
0.0
48.367500364780426
0.0
2.1777522563934326
11.549374133348465
178.70292649418116
29.138326421380043
2.4959626272320747
310.32545955479145
36.72752845287323
29.93389756977558
49.369203221052885
0.0
0.0
0.0
24.625099167227745
12984.460433587432
599.0115852504969
590.7388212680817
7.02886925637722
569.3739607930183
6.342744097113609
182.65372652560472
29.255273178219795
0.0
0.0
0.0
92.72639556229115
55.083057668060064
323.4953371323645
44.73622041940689
0.0
0.0
2318.0787369161844
32.20448708534241
5.0541700



Elapsed time to compute best fit: 17.739 seconds
Cross-validation score: 0.68995552446098
Test score: 0.625
Best Hyperparameters: {}
6466.707689389586
6448.5433270558715
260155.8182780519
1733.2973283827305
1561.9718163013458
689.137759655714
18.391621589660645
3.8680700063705444
13.686930179595947
25.87350082397461
9199.426960080862
1.72052301466465
23.665067747235298
0.0
3.1846750676631927
0.0
239.39275455474854
5.620397090911865
1309.5931315720081
9.148770332336426
1.2545010149478912
0.18108099699020386
274.7044016420841
564.1047673225403
116.77194841951132
142.27519989013672
123.88191047310829
0.11870700120925903
0.0
1602.6786649376154
25.291952252388
376.39801290631294
1072.578192859888
25.834716148674488
158.5962182432413
0.0
0.0
0.0
208.7031498849392
13973.451681017876
1917.0300973206758
129.07487973570824
359.1376465857029
106.33880862593651
35.415586322546005
253.7920379191637
1072.253947660327
0.0
0.0
0.0
64.59448440372944
480.76848877966404
411.07449589669704
19.028831273317



Elapsed time to compute best fit: 18.308 seconds
Cross-validation score: 0.6292940763528999
Test score: 0.7142857142857143
Best Hyperparameters: {}
3986.960341259837
605.7750698328018
246407.98561049253
815.3436652719975
4205.408344551921
366.2257665991783
19.68023383617401
0.3033199906349182
58.36735221743584
5.472932904958725
1145.2486546337605
533.6078626215458
19.384380042552948
0.0
26.389060348272324
0.0
2256.4133464396
0.637723982334137
4.942048981785774
10.866399765014648
5.596528947353363
15.1982936039567
436.3086839914322
0.0
254.56235295534134
0.0
15.937394067645073
7.501029968261719
8.43207898736
75.38851265609264
3.1846229881048203
97.10793468356133
110.19784171879292
84.7266436740756
204.4611862450838
0.0
0.0
0.0
2355.6339279562235
2505.5977599322796
1070.4020766019821
391.43697591125965
39.84373760223389
947.3286219388247
15.969521075487137
6.728290095925331
762.2983133047819
0.0
0.0
0.0
658.0653745010495
231.20939873158932
54.64525181055069
3.4223000705242157
2.738456010



Elapsed time to compute best fit: 19.933 seconds
Cross-validation score: 0.5990323325771618
Test score: 0.7058823529411765
Best Hyperparameters: {}
2723.7441781908274
1997.0981187224388
240846.01938150823
660.0751802176237
4667.97611707449
119.63236305117607
3.765314966440201
0.0
9.827849864959717
7.811324775218964
1981.5542500168085
3.1047699451446533
7.273750066757202
0.0
12.091919898986816
0.0
3061.1974135935307
0.0
54.95715045928955
28.119709610939026
15.459605678915977
1.3542929887771606
60.242566630244255
2.6485700607299805
824.318623483181
0.0
31.236615918576717
0.5902910232543945
2.5474620163440704
629.9723878502846
4.356787979602814
342.32799039781094
167.57607707381248
565.4498279094696
3198.730767726898
0.0
0.0
0.0
972.0439769178629
2219.5816828906536
316.9888079762459
49.532949179410934
42.121148988604546
818.9872137606144
291.4986999705434
22.384510159492493
795.341828122735
0.0
0.0
0.0
71.3181364685297
97.69515503942966
355.0828393250704
92.02627162635326
11.7777068912982



Elapsed time to compute best fit: 17.994 seconds
Cross-validation score: 0.6423705578117341
Test score: 0.6024096385542168
Best Hyperparameters: {}
7179.992543071508
36875.86498206854
193370.99642893672
651.1344505250454
308.5513878464699
155.82622380554676
10.19450467824936
1.5247299671173096
3.942646026611328
8.262437909841537
806.9327305704355
2186.6623001098633
51.322571098804474
0.0
63.47774004936218
0.0
1105.7741385698318
0.0
6.317478120326996
24.294690132141113
4.262905985116959
1.0211910009384155
465.2539227902889
1031.1513195037842
446.7632945328951
0.1960110068321228
0.6579350233078003
4.807008147239685
79.71903306245804
30.545719504356384
10.268955007195473
185.96816062927246
165.48831106722355
378.6664607822895
2172.5108749121428
0.0
0.0
0.0
212.8104309439659
3962.1859543025494
595.8967678397894
42.881034195423126
84.12201949954033
91.69645695388317
2019.6263518333435
36.91380366683006
1261.4876589626074
0.0
0.0
0.0
449.6745734810829
274.88184474408627
377.4342928081751
5.3



Elapsed time to compute best fit: 17.885 seconds
Cross-validation score: 0.6917637070151302
Test score: 0.6395348837209303
Best Hyperparameters: {}
5713.95674226433
3133.4694408476353
248181.76645696908
4.616946026682854
567.7445231303573
222.34051448106766
11.594776913523674
0.0
2.1363430619239807
2.6026789844036102
678.9966306090355
1316.3215721845627
61.85423490405083
0.0
1.0692399740219116
0.0
22.62193337082863
0.0
5.941150069236755
46.53378963470459
5.256779909133911
5.575314104557037
948.4079856276512
289.741218149662
59.43620252609253
0.15587900578975677
306.04451209306717
13.960901722311974
5.1600319147109985
20907.592126607895
1.8913020193576813
905.8489652052522
467.66102592647076
233.25987847149372
658.5193004235625
0.0
0.0
0.0
41.09185168147087
1744.0880200713873
1031.9120168536901
126.91499301791191
1931.8325265347958
663.3246593102813
102.94255368411541
63.555943205952644
372.21808184683323
0.0
0.0
0.0
60.98399709165096
236.9207830056548
572.6731231287122
24.0033455640077



Elapsed time to compute best fit: 19.352 seconds
Cross-validation score: 0.6439775910364146
Test score: 0.632183908045977
Best Hyperparameters: {}
191.4433431327343
779.5137974172831
265588.2552344054
555.5857027620077
2228.1786911785603
24.229057759046555
44.69968068599701
14.261270821094513
0.0
3.3328680247068405
1200.6535830497742
539.8217641711235
14.813854992389679
0.0
0.7448050081729889
0.0
145.96732425689697
0.8730419874191284
0.0
0.0
49.929907381534576
41.40989154577255
25.373789735138416
34.53770065307617
25.354084491729736
0.6072270274162292
0.0
11.637292861938477
79.59820201992989
219.1112174987793
13.36189717054367
124.46961294859648
81.99526534974575
370.92667447030544
787.0242674350739
0.0
0.0
0.0
3.39437997341156
23534.951027333736
50.634501442313194
1.2720540165901184
17.271074935793877
1187.4620273262262
84.83252184838057
330.01006242632866
3382.820452094078
0.0
0.0
0.0
372.24238991737366
2451.0891337394714
502.29489786177874
2.3915200233459473
3.504727065563202
75.532

## 4.3 Rebalancing Strategy - UNDER

### 4.3.1 Random Forest

In [117]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

under_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    under_randomforest_normalized_performance_df = pd.concat([under_randomforest_normalized_performance_df, new_performance_df])
    
under_randomforest_normalized_performance_df.to_csv("../data/05_model_output/under_randomforest_normalized_performance_df.csv")



### 4.3.2 XGBoost

In [118]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


under_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    under_xgboost_normalized_performance_df = pd.concat([under_xgboost_normalized_performance_df, new_performance_df])
    

under_xgboost_normalized_performance_df.to_csv("../data/05_model_output/under_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 0.921 seconds
Cross-validation score: 0.03455691585972301
Test score: 0.048268029528676884
Best Hyperparameters: {}
0.010113684
0.123468354
0.20093855
0.0
0.050189547
0.03453695
0.0
0.0
0.0
0.0
0.013386338
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.001593966
0.0
0.0
0.0
0.0
0.0
0.0016631448
0.0
0.0
0.004652655
0.008409891
0.0020591686
0.007193151
0.0
0.0
0.0
0.010577968
0.004054774
0.012053889
0.0014198223
0.0
0.0
0.00091137714
0.0
0.0
0.0
0.0
0.0
0.010474033
0.0
0.011965466
0.0
0.0
0.0
0.000819364
0.0042776517
0.002436117
0.0
0.0060554747
0.0
0.0
0.0
0.0
0.013076326
0.0
0.0
0.0
0.023976296
0.0052745827
0.0
0.037094157
0.0
0.0
0.0
0.0
0.0
0.015011517
0.0009485195
0.0
0.03605568
0.005183847
0.056807935
0.0
0.0
0.0128970435
0.0
0.048128888
0.0
0.0
0.0
0.019051304
0.035193123
0.0
0.0
0.0036511896
0.0
0.02129116
0.01931501
0.0
0.0
0.0
0.0
0.0
0.0044164914
0.0
0.0
0.014707712
0.0
0.0019803047
0.026348619
0.0
0.027387226
0.011123979
0.0
0.0
0.0
0.0
0.0
0.



Elapsed time to compute best fit: 1.008 seconds
Cross-validation score: 0.031248999381987226
Test score: 0.03641329085116068
Best Hyperparameters: {}
0.018650385
0.05636085
0.14039704
0.0
0.026776439
0.0
0.0
0.0
0.0
0.0
0.019540604
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0053458335
0.0
0.0
0.0
0.010789671
0.004347833
0.0019012826
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.000786781
0.0
0.00084152975
0.0
0.0
0.0
0.0046170643
0.0016732365
0.011367303
0.0053930334
0.0
0.0
0.0006721798
0.0
0.0
0.0
0.0
0.0
0.0042619784
0.0
0.0
0.0
0.0
0.042610426
0.008329201
0.0
0.0
0.04858333
0.016058115
0.014019063
0.005707343
0.0
0.00789113
0.0
0.0
0.0
0.0
0.018900715
0.008704388
0.006837135
0.036328364
0.0
0.0
0.0
0.0
0.0
0.0219877
0.0
0.0
0.044945028
0.08681081
0.06132379
0.015908835
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0087699685
0.06850738
0.0
0.0
0.00072963536
0.0
0.0
0.014987423
0.0
0.0
0.0
0.0
0.0016644662
0.0
0.0
0.0
0.008025099
0.0
0.0009936651
0.006996765
0.0
0.030851807
0.09726942
0.0
0.0
0.0
0.0
0.0
0.0010618218
0.0
0.001



Elapsed time to compute best fit: 0.925 seconds
Cross-validation score: 0.04106393689765801
Test score: 0.03227293683725219
Best Hyperparameters: {}
0.031783957
0.09892721
0.12259985
0.0
0.010102823
0.0
0.0
0.0
0.0
0.0
0.0093682865
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4.6316192e-05
0.0
0.0
0.04823696
0.0
0.0
0.0
0.0077588432
0.0
0.0016458008
0.0
0.0
0.021821965
0.0
0.0
0.0
0.0
0.0
0.0
0.0029983777
0.0024545882
0.0
0.0
0.0016200441
0.008245991
0.0
0.0073145134
0.0
0.0
0.0
0.0
0.005446623
0.005532546
0.0
0.0
0.0
0.0010992979
0.0
0.0
0.0042672837
0.05541971
0.012536426
0.0
0.0
0.004694337
0.022813404
0.0051061865
0.0
0.0
0.026486658
0.006383143
0.005979995
0.0056407265
0.0
0.008542726
0.0
0.0
0.034846175
0.0064912797
0.0
0.0038219104
0.0026055383
0.00092621194
0.016744688
0.024087062
0.011464283
0.009931853
0.0
0.024092909
0.0
0.0
0.0
0.049380686
0.055611137
0.0
0.0
0.00580525
0.0
0.0
0.04023685
0.0
0.0
0.0013541976
0.0
0.0
0.0017031217
0.0
0.0
0.0
0.0
0.0
0.0061267354
0.0
0.013659037
0.09



Elapsed time to compute best fit: 0.928 seconds
Cross-validation score: 0.03720487237070981
Test score: 0.0400427122263748
Best Hyperparameters: {}
0.040994246
0.09192758
0.18586272
0.0
0.027623104
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.024762921
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.009753732
0.0
0.019247252
0.0
0.0
0.0
0.0
0.0020693932
0.04789312
0.0
0.0
0.014225559
0.0016400969
0.0
0.0
0.0
0.0
0.0
0.027402392
0.0
0.010082781
0.0
0.0
0.0
0.0
0.0
0.0
0.06704727
0.025654834
0.016059885
0.0008670223
0.0
0.0020284557
0.0
0.0
0.0
0.0
0.005939731
0.0
0.0
0.039091147
0.0
0.0
0.0
0.0
0.0
0.0011556719
0.0
0.0
0.07798153
0.0
0.0022077118
0.0
0.0
0.0
0.0
0.00116535
0.0
0.0
0.0
0.0019465012
0.028933195
0.0
0.0
0.0
0.0
0.007529366
0.028382452
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0070657614
0.0
0.023314375
0.08715011
0.0
0.0
0.043406144
0.0
0.0
0.027503692
0.0010248242
0.0010600772
   Accuracy  Precision    Recall       F1        F2      F0.5  \
0  0.932947 



Elapsed time to compute best fit: 0.909 seconds
Cross-validation score: 0.026328359253823975
Test score: 0.029531192321889995
Best Hyperparameters: {}
0.019203419
0.07234054
0.20607582
0.0
0.015228464
0.036562204
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0015292118
0.02398205
0.0
0.0
0.0
0.0017667373
0.0
0.0
0.0
0.0017966513
0.0031572755
0.0063939826
0.009073237
0.0
0.030371523
0.012665468
0.0
0.01674854
0.0
0.0
0.0
0.0
0.0
0.00086882577
0.0
0.0
0.0
0.0
0.0
0.002721401
0.054132897
0.025573462
0.010517832
0.007948234
0.0
0.0
0.033301212
0.0
0.0
0.0
0.041690774
0.0
0.0
0.07257521
0.0
0.0
0.0
0.0
0.002458609
0.0034797932
0.0
0.0
0.0443959
0.0
0.014818606
0.041790936
0.01957495
0.032493696
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.004344424
0.0062993513
0.0
0.0
0.0018547791
0.008059873
0.0
0.0
0.0011665995
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0018501737
0.0
0.043450493
0.0338212
0.0
0.0
0.0
0.0
0.0
0.0010644724
0.0015070419
0.031344175
   Accuracy  Prec



Elapsed time to compute best fit: 0.939 seconds
Cross-validation score: 0.03436524809852567
Test score: 0.04370179948586118
Best Hyperparameters: {}
0.020258859
0.140302
0.14340952
0.0012211746
0.047671817
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.007168114
0.0
0.0
0.0
0.0
0.004918837
0.0
0.0
0.0
0.0
0.029189965
0.0
0.0
0.0
0.0010124908
0.012452071
0.0
0.0027638022
0.0
0.006874717
0.0052290047
0.0
0.0
0.0
0.0
0.0
0.0011530077
0.0
0.0
0.0
0.0
0.0
0.040598553
0.0
0.02541508
0.009952903
0.024655987
0.01742081
0.004969689
0.0
0.0
0.0
0.009686288
0.0
0.0
0.009946019
0.001457225
0.0
0.06377785
0.0
0.0
0.0
0.0
0.005955403
0.0016708224
0.0
0.0
0.009138937
0.0856154
0.0009602589
0.057113536
0.0
0.018336354
0.0
0.0
0.0
0.0
0.0
0.0013146917
0.035420194
0.0
0.0
0.008394696
0.0
0.01287139
0.07731463
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.018579168
0.0
0.0
0.029655917
0.0
0.0
0.0032930584
0.0
0.0028597359
   Accuracy  Precision  Recall        F1



Elapsed time to compute best fit: 0.909 seconds
Cross-validation score: 0.03241672526076686
Test score: 0.02685624012638231
Best Hyperparameters: {}
0.030448634
0.08343362
0.22091469
0.0
0.0
0.090013795
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0016271943
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0014312271
0.0026931146
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.021343704
0.0
0.003489177
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.01927754
0.0
0.0
0.0
0.0
0.0
0.0
0.01325452
0.012171663
0.0751231
0.050761256
0.0
0.0
0.0
0.0
0.0019013152
0.0
0.0
0.0
0.0
0.0
0.0
0.03515096
0.0
0.0
0.0
0.0
0.0
0.020969972
0.0
0.006842885
0.026010992
0.0
0.010187449
0.0
0.0
0.023239324
0.0
0.10190247
0.0
0.0
0.0
0.0
0.001092297
0.0
0.0
0.00010761967
0.0
0.0055563133
0.049041234
0.013376338
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0009102301
0.0
0.0
0.0
0.0
0.0
0.051819734
0.0
0.0
0.0
0.0
0.0
0.01880281
0.0
0.00710485
   Accuracy  Precision  Recall        F1        F2      F0.5  \
0  0.885519   0.021601     1.0  0.042289  0.09941



Elapsed time to compute best fit: 0.835 seconds
Cross-validation score: 0.034220631657277414
Test score: 0.0377945753668297
Best Hyperparameters: {}
0.054712683
0.07510974
0.14855482
0.0
0.019540733
0.0
0.0
0.0
0.0
0.0
0.0
0.039999917
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.014221773
0.009586858
0.00068255037
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0007910476
0.0
0.0
0.0
0.0
0.0
0.0
0.0024256213
0.0016429718
0.0013545009
0.0
0.022754543
0.0
0.0
0.0
0.0
0.0
0.0
0.00635679
0.007076657
0.005115173
0.0
0.0
0.0
0.0
0.0
0.006380835
0.028568117
0.008153484
0.0
0.027772633
0.004180806
0.003985353
0.0
0.0022357341
0.0
0.0035470652
0.012760085
0.0
0.0
0.06303265
0.0
0.0
0.0
0.0
0.0
0.0034120998
0.00030874487
0.014423531
0.09728251
0.0
0.0
0.009290318
0.0
0.0072942665
0.0
0.0
0.0
0.041442517
0.0
0.0
0.0672804
0.0
0.0
0.0
0.0
0.0009221349
0.020848403
0.02083142
0.0
0.0
0.0
0.022538146
0.0
0.0
0.0
0.0
0.0
0.040993415
0.0019829592
0.0
0.00014965417
0.072344735
0.0
0.0
0.0
0.0
0.0
0.0
0.00063451065
0.0074770



Elapsed time to compute best fit: 0.794 seconds
Cross-validation score: 0.03257430637336342
Test score: 0.0399002493765586
Best Hyperparameters: {}
0.07458471
0.15729734
0.12144646
0.0
0.05687356
0.0
0.0
0.0
0.0
0.0
0.0428156
0.037337463
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.007929249
0.0076444717
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.00073951425
0.0036854316
0.0
0.0
0.0
0.0
0.015494811
0.0037656133
0.015193104
0.0
0.0
0.024387514
0.013765689
0.002971627
0.0042088125
0.0
0.0
0.0
0.0
0.00089548336
0.0
0.035485342
0.0
0.0
0.0
0.036566976
0.0039631026
0.0
0.031996503
0.013446283
0.01117681
0.0
0.0013332771
0.0
0.0
0.0
0.0
0.039406348
0.0
0.0
0.042863574
0.0
0.0
0.0
0.0
0.030765845
0.0008820702
0.0
0.0
0.0038960055
0.0
0.0061646616
0.0
0.0
0.017897388
0.0
0.0
0.0
0.004296072
0.0
0.0010952316
0.0040111043
0.0
0.0056386045
0.0006913554
0.0
0.01874476
0.006555072
0.005145597
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.024920067
0.001080257
0.0
0.0
0.045183804
0.0
0.0
0.00067156315
0.0
0.00088184816



Elapsed time to compute best fit: 0.880 seconds
Cross-validation score: 0.036449818126144576
Test score: 0.029973772948669913
Best Hyperparameters: {}
0.006372625
0.17845409
0.14247668
0.0
0.029814357
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0029013227
0.0
0.0005639962
0.0
0.0
0.0
0.0
0.01100261
0.010600677
0.0
0.0
0.028213074
0.0
0.0
0.010553638
0.0
0.0
0.0
0.004030652
0.0005734932
0.01715428
0.0
0.0
0.0
0.0
0.0
0.0030323183
0.012558359
0.019968849
0.014440685
0.0
0.0
0.00059363205
0.0
0.0
0.0
0.04063438
0.0
0.0
0.0
0.03873264
0.0
0.0
0.0
0.0
0.04266392
0.008634854
0.00037131
0.010972541
0.1428155
0.0039214655
0.0023690911
0.030843394
0.0
0.00038807918
0.0
0.029320851
0.0
0.0
0.0
0.0
0.003963881
0.0024361403
0.0
0.0
0.0
0.0
0.04097138
0.010731207
0.0
0.0
0.0
0.0005259831
0.0
0.0
0.0
0.0
0.0
0.042268325
0.0015859723
0.0
0.0
0.0
0.0
0.0
0.00926288
0.0
0.0
0.04011368
0.0031671566
0.0
   Accuracy  Precision    Recall    



Elapsed time to compute best fit: 0.997 seconds
Cross-validation score: 0.03635594195740142
Test score: 0.047801147227533466
Best Hyperparameters: {}
0.0058084126
0.14062722
0.102080286
0.0
0.045176182
0.022570126
0.0
0.0
0.0
0.0
0.04579528
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.013578932
0.0
0.00067588926
0.0
0.0076223505
0.0
0.0
0.0
0.015192117
0.0
0.0036568844
0.0283127
0.016220195
0.0
0.0
0.0
0.028105762
0.004403217
0.0040599364
0.0021792345
0.050361067
0.028947191
0.0009938682
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.01885818
0.00782714
0.0023491613
0.012738954
0.010072811
0.019820778
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0022645413
0.0
0.0
0.05723359
0.0
0.0
0.0
0.0
0.0
0.022077877
0.009025399
0.0
0.05242192
0.0
0.0069090887
0.028369518
0.049737968
0.0020795392
0.0
0.00079182716
0.0
0.0
0.0
0.0016875505
0.028253183
0.0
0.0
0.006301622
0.0
0.0010056163
0.0
0.024650255
0.0
0.0
0.0
0.0
0.0012736912
0.0
0.0
0.0055191563
0.0
0.0
0.0038739815
0.0
0.0018871451
0.04752014
0.0
0.0
0.0



Elapsed time to compute best fit: 0.868 seconds
Cross-validation score: 0.03384177494496314
Test score: 0.023419203747072598
Best Hyperparameters: {}
0.035527762
0.009830187
0.21579453
0.0
0.020947907
0.0
0.0
0.0
0.0
0.0
0.0
0.047769457
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.01667473
0.011022609
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0020335114
0.008130358
0.0019794563
0.0
0.0
0.0
0.0
0.0048449645
0.0031791392
0.0015596998
0.0
0.0078011625
0.0026177117
0.006082931
0.0
0.0
0.0
0.0
0.00753084
0.0
0.0
0.0
0.0
0.0
0.0364774
0.111636706
0.012145001
0.01495438
0.034533225
0.0
0.0029515328
0.0
0.0
0.038572926
0.0
0.0
0.0
0.014532471
0.0018742968
0.0
0.07172977
0.0
0.0
0.0
0.0
0.0066701416
0.004181377
0.0
0.004456436
0.0983553
0.009393004
0.034984745
0.039658364
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.004111585
0.0
0.004921539
0.013236287
0.0
0.0
0.005862119
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.007090754
0.0
0.0
0.0049068485
0.011167404
0.0
0.0
0.0026681125
0.0
0.0
0.0
0.0
0.005601315
   Accuracy



Elapsed time to compute best fit: 0.832 seconds
Cross-validation score: 0.03790443969447333
Test score: 0.033128253667770946
Best Hyperparameters: {}
0.0081846295
0.023863493
0.27671385
0.0
0.061214913
0.060975537
0.0
0.0
0.0
0.0
0.054807734
0.031076092
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.009886012
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0006759165
0.0
0.0
0.0
0.0
0.0
0.02077439
0.008721921
0.00195884
0.0
0.0
0.0147109125
0.028340692
0.0
0.0
0.0
0.0
0.0
0.003851901
0.0
0.0008958013
0.0
0.0
0.0
0.0
0.0
0.005014126
0.010635877
0.03539885
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.057139754
0.01299735
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.03292339
0.0065180995
0.0
0.034935493
0.0
0.0
0.0
0.015366417
0.0
0.0
0.0
0.0
0.08060863
0.0
0.0
0.0
0.0
0.0012725192
0.026948415
0.0
0.0
0.0
0.0
0.0
0.0037895825
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0007656552
0.04919857
0.0
0.0
0.0042035044
0.0
0.005260829
0.009614551
0.0
0.00075569545
   Accuracy  Precision    Recall        F1        F2      F0.5  \
0  0.9237



Elapsed time to compute best fit: 0.946 seconds
Cross-validation score: 0.03285390334323565
Test score: 0.04794134235758602
Best Hyperparameters: {}
0.010554244
0.007951047
0.16434196
0.034672517
0.020973593
0.0057739206
0.0
0.0
0.0
0.0
0.034008253
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.015230998
0.0
0.0
0.0
0.0
0.0
0.0
0.01004179
0.0023944408
0.012559038
0.0
0.0010631117
0.0
0.0
0.0
0.0
0.014459949
0.024443457
0.027264308
0.0038987373
0.0
0.004883415
0.0031939368
0.0
0.0068828193
0.0
0.0
0.0
0.0
0.0
0.0024136307
0.0
0.0
0.0
0.011666941
0.044609085
0.0
0.04844381
0.011069432
0.0
0.0010467432
0.0
0.0
0.0
0.0
0.0
0.0
0.011032993
0.0
0.0
0.037821792
0.0
0.0
0.0
0.0
0.0042755613
0.0045359777
0.0
0.0
0.027783297
0.062166687
0.0
0.0020705725
0.06550814
0.014541737
0.0
0.0
0.0
0.0030147908
0.0
0.011425226
0.015908703
0.0
0.0010744559
0.0016277998
0.0
0.005371671
0.030519621
0.0027725718
0.0
0.00087456644
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0013798795
0.000981386
0.0
0.089800335
0.06342487
0.



Elapsed time to compute best fit: 0.792 seconds
Cross-validation score: 0.043950121070894584
Test score: 0.03215434083601286
Best Hyperparameters: {}
0.020451644
0.0014264315
0.2695808
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.023140658
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.006649352
0.00827652
0.0
0.0032792569
0.0
0.0
0.0
0.0
0.0
0.035470184
0.010962616
0.0
0.012360418
0.0
0.0012049947
0.006300903
0.0
0.0
0.0
0.0060612913
0.0
0.0028772077
0.0
0.0
0.0
0.0
0.0
0.010064371
0.0
0.0
0.015001943
0.0007566982
0.0
0.0
0.028995654
0.0
0.0
0.0
0.039832007
0.0
0.0
0.106964104
0.0
0.0
0.0
0.0
0.0
0.03888043
0.0
0.0029290144
0.07159328
0.030423433
0.0
0.016407704
0.0
0.028093098
0.0016831774
0.0
0.0076632723
0.0
0.0
0.0
0.0
0.0011307913
0.0
0.0066868146
0.0
0.0048721097
0.080086686
0.0
0.0
0.0
0.0
0.0044413013
0.0
0.0
0.0
0.0
0.0
0.0
0.026454592
0.0
0.0
0.06098534
0.0
0.0
0.0
0.0
0.0
0.0
0.0070164306
0.0009954028
   Accuracy  Precision    Recall        F1        F2 



Elapsed time to compute best fit: 0.837 seconds
Cross-validation score: 0.03589223523682572
Test score: 0.06056018168054504
Best Hyperparameters: {}
0.016914278
0.040592976
0.21021067
0.06878554
0.01272014
0.03440975
0.0
0.0
0.0
0.0
0.0
0.03926455
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0022733486
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.011929296
0.013423145
0.0
0.0
0.0
0.0
0.0
0.00438655
0.006451232
0.0030926303
0.0
0.0
0.008233376
0.0
0.0070158346
0.0
0.0
0.0
0.0
0.0
0.007603
0.0
0.0
0.0
0.0
0.007996104
0.004358335
0.0
0.120924786
0.01670627
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.034422707
0.0013003679
0.0
0.053603794
0.0
0.0
0.0
0.0
0.0054465104
0.013050785
0.0
0.0
0.025346616
0.0
0.024027226
0.0
0.0
0.014296887
0.0
0.0
0.0
0.0
0.0
0.0
0.02735124
0.0009640379
0.0
0.011583225
0.0
0.0
0.056999702
0.024115793
0.0
0.0
0.0
0.0
0.005407888
0.0
0.0
0.0
0.0
0.016035786
0.0
0.0
0.0
0.0
0.0
0.0
0.032120105
0.0
0.0042967796
0.0
0.0010120412
0.011326697
   Accuracy  Precision    Recall        F1        



Elapsed time to compute best fit: 0.807 seconds
Cross-validation score: 0.031050486239989294
Test score: 0.03214744963566223
Best Hyperparameters: {}
0.004983787
0.022754656
0.20270292
0.0037825964
0.016003273
0.0
0.0
0.0
0.0
0.0
0.034245197
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.007947283
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.004131847
0.0
0.0
0.0
0.0
0.0
0.01693137
0.0053597777
0.03422813
0.019453045
0.0
0.0
0.0018426948
0.04994214
0.00363024
0.0
0.0
0.0
0.0031839106
0.0
0.0047706463
0.0
0.0
0.0
0.0
0.0
0.0062870714
0.04524639
0.0227952
0.010803505
0.0
0.009492864
0.0010131383
0.0
0.0
0.0
0.0
0.01957017
0.0
0.0
0.088916585
0.0
0.0
0.0
0.0
0.0056473897
0.006886964
0.0
0.0
0.032335185
0.024886472
0.0
0.008793592
0.07503527
0.031583745
0.0
0.0
0.0
0.006841273
0.0
0.0012478479
0.038127463
0.0009491922
0.0
0.0
0.0
0.0038555535
0.03143457
0.0
0.0
0.0
0.0
0.0017151211
0.0
0.0
0.0
0.0
0.0
0.0008061621
0.009581902
0.0
0.022683473
0.014941623
0.0
0.0
0.018016312
0.0
0.0
0.00961838
0.0
0



Elapsed time to compute best fit: 0.904 seconds
Cross-validation score: 0.03703638295582286
Test score: 0.03314635390107088
Best Hyperparameters: {}
0.006693675
0.018667236
0.24213012
0.0
0.0167524
0.004192552
0.012787921
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0096232025
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0036785705
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0038794626
0.04898439
0.013725669
0.016162692
0.0
0.016216254
0.007078935
0.0
0.026843676
0.0
0.0
0.0
0.010240815
0.0074418406
0.011166583
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.018081026
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.015236711
0.010379002
0.0
0.03086512
0.0
0.0
0.0
0.0
0.021199094
0.0
0.0
0.0
0.017383981
0.0
0.0
0.019414492
0.0
0.0
0.0
0.0
0.0
0.0024718838
0.0
0.036991477
0.049722433
0.011151613
0.0
0.0
0.0
0.00216629
0.022735732
0.014520085
0.0
0.0
0.0
0.03362192
0.0
0.0
0.0
0.0
0.0
0.0
0.013649572
0.0
0.0
0.16025473
0.0
0.0
0.014104396
0.0
0.024914429
0.0
0.0
0.0048700795
   Accuracy  Precision    Recall       F1        F2      



Elapsed time to compute best fit: 0.829 seconds
Cross-validation score: 0.03710910535579037
Test score: 0.0513347022587269
Best Hyperparameters: {}
0.00812819
0.08130984
0.22919947
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.051891513
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0015125215
0.0
0.0
0.0
0.0
0.0
0.0
0.0047431025
0.0
0.024190528
0.0
0.0066732345
0.0056468174
0.0
0.0
0.0
0.0
0.0012315863
0.009973586
0.0018330461
0.0
0.004860075
0.00303453
0.0
0.0
0.0
0.0
0.0
0.0031839793
0.0
0.0
0.0
0.0
0.0
0.0
0.0024971145
0.0017754892
0.011559558
0.041738816
0.019847753
0.0
0.0
0.008687099
0.0
0.0
0.0
0.0
0.016705317
0.0
0.0
0.0065425304
0.0
0.02202634
0.0
0.0
0.021807186
0.018972535
0.0
0.0
0.021848528
0.019230574
0.03622588
0.0
0.0
0.020769484
0.0
0.0
0.0054858
0.0
0.0
0.023729948
0.012712197
0.0
0.0
0.013194274
0.0
0.0023247595
0.025253184
0.0
0.0
0.008442931
0.0
0.0
0.00077630143
0.0
0.0
0.0
0.0
0.03293496
0.027300252
0.0
0.0
0.08226525
0.0
0.02023414
0.030472236
0.0
0.004790653
0.0015236246
0.0
0



Elapsed time to compute best fit: 0.829 seconds
Cross-validation score: 0.03702585670356223
Test score: 0.03209536909674461
Best Hyperparameters: {}
0.019174049
0.018286938
0.19731353
0.0
0.06774238
0.03451894
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.016429089
0.019188603
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.033560447
0.0
0.014950909
0.0
0.0
0.0
0.0
0.0
0.004581848
0.0010526197
0.008290957
0.0
0.015863098
0.0
0.005251371
0.0
0.0
0.0
0.0056230756
0.0
0.003406941
0.0
0.0
0.0
0.0
0.0
0.020194164
0.0
0.043817777
0.0
0.0077010733
0.0
0.0
0.0
0.020631101
0.0
0.0
0.040012654
0.0
0.0
0.0116669405
0.0
0.0
0.0
0.0
0.0015454774
0.001005392
0.007138011
0.0034677319
0.025115974
0.002056785
0.0020224578
0.015062216
0.07585567
0.0
0.0024951159
0.0
0.0
0.0
0.0
0.0008037217
0.0019176956
0.0
0.008049612
0.002964056
0.0
0.0010963269
0.02766709
0.008166385
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.018645214
0.0
0.0
0.026281282
0.0
0.014547578
0.09097766
0.0
0.0
0.0501614
0.0
0.0028127914
0.0
0.0



Elapsed time to compute best fit: 0.846 seconds
Cross-validation score: 0.037757779465726705
Test score: 0.028612303290414882
Best Hyperparameters: {}
0.039058764
0.094920054
0.20947118
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.030293243
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0040683444
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0075142826
0.002484065
0.0
0.0
0.024450837
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0009361959
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.01546305
0.0017954139
0.0
0.0
0.0010682697
0.038796384
0.0
0.0
0.0
0.0
0.015736016
0.0
0.16960199
0.0
0.0
0.0
0.0
0.0
0.005128047
0.0
0.0
0.12674928
0.11573134
0.0
0.0009943105
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.03241425
0.0
0.0
0.0047696745
0.0
0.0
0.001369393
0.0
0.0
0.001811342
0.0
0.0014700698
0.0
0.0
0.0
0.0
0.0
0.0
0.022653306
0.0
0.0
0.03125095
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
   Accuracy  Precision    Recall        F1        F2      F0.5  \
0  0.923729   0.023077  0.705882  0.044693  0.102041  0.028612   

   Average Prec



Elapsed time to compute best fit: 0.920 seconds
Cross-validation score: 0.0378420814315365
Test score: 0.03522780648191639
Best Hyperparameters: {}
0.01693299
0.09787842
0.14767426
0.015130424
0.02993128
0.012230157
0.0
0.0
0.0
0.0
0.05481103
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.017671315
0.0
0.0
0.0
0.0
0.0
0.0
0.010610288
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0125564635
0.0
0.0017061487
0.0
0.0007542836
0.0019185147
0.020442476
0.0
0.0065245116
0.0
0.0
0.0
0.0
0.0019754944
0.0
0.0
0.0
0.0
0.0033543305
0.0
0.008382015
0.007399627
0.064363144
0.0
0.0
0.0
0.0
0.020293754
0.0
0.0
0.008693727
0.025698213
0.0
0.0
0.050751813
0.0
0.0
0.0
0.0
0.0019856733
0.09477508
0.0051968377
0.0
0.10936511
0.029149693
0.0
0.0
0.0
0.0
0.0
0.011243315
0.0
0.001173504
0.0
0.0051462804
0.048459437
0.0
0.002881404
0.0017248163
0.0
0.002597262
0.0011106764
0.0
0.0
0.00079817296
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.02495098
0.0
0.0
0.0
0.0
0.0
0.0
0.002196423
0.0
0.00784795
0.0021064086
0.0044590086
0.0051472127




Elapsed time to compute best fit: 0.832 seconds
Cross-validation score: 0.03314934365535828
Test score: 0.02942330325617889
Best Hyperparameters: {}
0.011920216
0.056185484
0.12721969
0.0
0.027159207
0.0
0.0
0.0
0.0
0.0
0.010961958
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.010927818
0.0
0.02881404
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0005678039
0.0
0.0048378534
0.0
0.0
0.0
0.0
0.0033887068
0.028397491
0.0
0.0034653833
0.0
0.0
0.0
0.0033595834
0.0
0.0
0.0
0.04805152
0.0
0.023164142
0.0
0.0
0.0
0.04637321
0.0
0.0017276438
0.004070244
0.00970301
0.03050032
0.0027205418
0.0
0.0
0.0020987294
0.0
0.0
0.0
0.0067990455
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0019077503
0.0
0.0
0.072043896
0.04553603
0.02283307
0.1392074
0.0
0.0
0.0
0.00052019535
0.0
0.0
0.0
0.01637666
0.06823957
0.0
0.0
0.001005506
0.0
0.0099794725
0.04185818
0.0021206173
0.0
0.0
0.0
0.0
0.0008540501
0.0
0.0
0.0
0.0
0.0
0.009362388
0.0
0.024488244
0.01734276
0.0
0.0
0.01761735
0.0
0.0075333416
0.0053843306
0.0033755077
0.0
   Accuracy 



Elapsed time to compute best fit: 0.781 seconds
Cross-validation score: 0.02971604049723296
Test score: 0.0481637567730283
Best Hyperparameters: {}
0.017400429
0.07083218
0.10292871
0.0
0.016120486
0.017361661
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0025290395
0.0
0.014241966
0.0011832116
0.0
0.0
0.0
0.0
0.0
0.0012739889
0.005078382
0.0022495778
0.0
0.0
0.014229211
0.0018566455
0.0665495
0.0092598265
0.0
0.0
0.0
0.0048709013
0.0
0.0037514989
0.0
0.0
0.0
0.012914444
0.024458356
0.004404659
0.0
0.051243786
0.020018004
0.0
0.0
0.013692202
0.0039287996
0.0
0.0
0.0
0.019350965
0.0
0.0
0.097921915
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.00043325435
0.037963744
0.036215436
0.0066579096
0.15265502
0.0
0.017266007
0.0
0.0
0.0
0.0
0.0
0.0024857447
0.02451796
0.0
0.0
0.0
0.0
0.003348453
0.0086029675
0.0558331
0.0
0.0006081983
0.0
0.0002016212
0.0013976505
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.046500254
0.0
0.0
0.0004950276
0.0
0.0
0.0
0.0039618565
0.0012053



### 4.2.3 LightGBM

In [119]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

under_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    under_lightgbm_performance_normalized_df = pd.concat([under_lightgbm_performance_normalized_df, new_performance_df])
    

under_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/under_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 0.603 seconds
Cross-validation score: 0.1262628457153605
Test score: 0.10101010101010101
Best Hyperparameters: {}
45.87680268460463
27.696697015237987
591.4083031718037
0.0
21.901780073414557
0.2655539959669113
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4.7233099937438965
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.663905993103981
1.8708415155415423
0.0003689379955176264
1.1187649965286255
0.0017373723861275892
0.0
0.0
0.0
0.52558608725667
3.644360065460205
5.281849958002567
0.0
0.0
0.42168377394864365
0.32085898518562317
0.0
0.08760210126638412
0.0
0.0
0.0
1.7935304548882414
0.9681735062049484
0.46890743280528113
0.015413600020110607
0.49738629907369614
0.5341508910059929
0.27674201130867004
0.4777210056781769
16.97296901792288
0.5658980011940002
5.83089017868042
8.833200675435364
0.09616569607169367
0.0
0.0033250401029363275
0.08423029631376266
0.0
0.0
0.0
12.86983759339546
4.061187716230052
0.0
69.1249105554889
2.684878036379814
0.55526697635



Elapsed time to compute best fit: 0.886 seconds
Cross-validation score: 0.13797894200469005
Test score: 0.14625228519195613
Best Hyperparameters: {}
60.2022842541337
112.31428849697113
327.0820177234709
0.0
0.0312460009008646
8.246598847210407
0.0
0.0
0.0
0.0
14.516815096139908
123.01596142351627
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2092989981174469
0.0
1.2241100072860718
0.0
0.0
0.0
0.0
2.643896073102951
0.0
0.0006269959849305451
1.2432489693164825
1.945099949836731
15.307964657433331
0.0
0.0
0.0
0.6653209924697876
0.12744900584220886
1.602929949760437
0.11998199671506882
3.228879919668543
2.181592047214508
0.0
0.023764999583363533
4.580543011426926
0.0
0.0
0.0
9.764178141951561
6.2806117832660675
1.464537389576435
0.0
0.0
0.0
15.356647923588753
0.0
2.5547723434865475
0.8629360049962997
7.693733975291252
7.120999172329903
3.116735391318798
0.0
0.0
0.12606599926948547
3.6456914097070694
1.4818600416183472
0.0
22.87423411011696
1.8892597034573555
0.0
3.5777480541655677
2.1520059406



Elapsed time to compute best fit: 0.579 seconds
Cross-validation score: 0.1532572595478155
Test score: 0.11661807580174928
Best Hyperparameters: {}
11.254777240945259
37.30117763951421
474.95844689202977
0.0
53.98939214646816
0.0
0.0
0.0
0.0
0.0
0.06502830237150192
4.558024048805237
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.8281020075082779
0.0
0.17129549756646156
0.0
0.0
0.0
0.0
0.0
0.0
0.16982600092887878
0.006112630013376474
3.521739959716797
0.0
0.0
0.0
0.0
0.2846600115299225
0.0
0.04737810045480728
0.0
0.4366375186236837
0.13814150169491768
5.142588049173355
0.0
0.0
0.0
0.0
1.9774802644533338
0.0
8.932599902153015
0.0
0.0
0.9841510057449341
0.03281040117144585
0.0
9.441980179399252
21.939432053346536
15.238230994138576
75.93606147666833
0.0
0.0
0.0
1.722993016242981
0.06376039981842041
0.0
0.24595530703663826
4.78835915774107
2.0246299925474887e-07
0.0
32.23861662298441
0.0
0.0
0.0
1.1918649673461914
0.5509830117225647
17.111624442966104
0.0
0.0
0.42006244882941246
21.10463026



Elapsed time to compute best fit: 0.640 seconds
Cross-validation score: 0.11738082397079945
Test score: 0.08342602892102337
Best Hyperparameters: {}
138.30616364534944
44.32346222969818
316.4324968419969
0.0
10.417821805924177
0.0
0.0
0.0
0.0
0.0
15.314889065921307
0.0
0.0
0.0
0.0
0.0
0.004554320126771927
0.0
0.0
0.0
0.0
0.0
0.6153600066900253
0.0
0.12673569656908512
0.0
0.0
0.0
0.0
0.16614654627669267
11.60664963722229
6.274254906202259e-05
0.14840970933437347
0.0
0.094268798828125
0.0
0.0
0.0
0.6031959196552634
10.137463107705116
8.423077217354148
0.9384703943505883
0.0
8.192598778754473
1.7902726233005524
2.5585537776350975
0.21834300458431244
0.0
0.0
0.0
0.8933314979076385
3.3414096599444747
4.906170028523775
0.00015025900211185217
0.0
0.0
0.0
0.0
0.17618799954652786
0.3728373944759369
2.3741300106048584
0.9893217787321191
2.342146941460669
0.0
0.5548142734915018
0.0
1.0637090168893337
0.0
0.0
27.78524232842028
0.7747748047113419
0.0
6.839507557451725
0.0
0.0
4.850820118917909e-07




Elapsed time to compute best fit: 0.618 seconds
Cross-validation score: 0.1320782728156664
Test score: 0.15151515151515152
Best Hyperparameters: {}
80.14740645499182
54.7301139831543
336.99412326147285
0.0
0.15651600062847137
58.33514181151986
0.0
0.0
0.0
0.0
0.1941089928150177
2.6992330625653267
0.0
0.0
0.0
0.0
0.026228800415992737
0.0
0.0
0.0
0.0
0.0
0.0
0.16176660358905792
0.0005829940200783312
0.0
0.0
0.0016625479911454022
0.0
3.4181169979274273
0.0
0.1869650036096573
0.2748290002346039
0.47091300785541534
1.0032099485397339
0.0
0.0
0.0
1.0235600471496582
0.45727960020303726
15.118736699223518
0.09796214965172112
0.11353100091218948
1.896314123792763
0.0
0.12099500000476837
3.582209149375558
0.0
0.0
0.0
5.270320844108937
0.6860709134489298
0.6461892947554588
0.0
0.0
0.0
0.0
0.0
19.641094787046313
0.993947493154792
25.659269731491804
24.32033384476199
0.3082420080900192
0.5339769087731838
0.02011149935424328
0.4852920174598694
0.0
0.1762229949235916
0.10673700273036957
55.9835837915



Elapsed time to compute best fit: 0.607 seconds
Cross-validation score: 0.14337295547735124
Test score: 0.1002673796791444
Best Hyperparameters: {}
51.49573880279786
12.611666791141033
573.0915158121352
0.0
15.239053528755903
0.2895990014076233
0.0
0.0
0.0
0.0
1.3638403676450253
0.0
0.0
0.0
0.0
0.0
2.217629909515381
0.0
0.0
0.0
0.0
0.0
0.1413400024175644
5.6743998527526855
0.0
0.0
0.0
0.0
0.0
4.253435015678406
0.9916009902954102
1.689974894747138
0.0
0.0
0.44555258098989725
0.0
0.0
0.0
0.45436976896598935
25.138550251722336
0.0
0.0
0.0
0.0
5.3591628251597285
4.554119944572449
0.05163128930144012
0.0
0.0
0.0
12.02219787798822
1.0550731047987938
2.5543003976345062
0.0
0.0
0.0
0.0
2.496769905090332
13.321354631334543
12.132609543390572
16.45319453626871
0.1536179929971695
0.30344029515981674
0.0
0.0
0.0
1.132471490651369
0.0
0.0
0.00010888394243124822
0.0
0.0
17.57424855977297
0.0
0.0
0.0
0.0
0.0
22.644957475364208
0.0
0.0
55.955788674764335
38.32955102622509
27.28927217423916
1.365528009



Elapsed time to compute best fit: 0.575 seconds
Cross-validation score: 0.12744137701521424
Test score: 0.13260530421216848
Best Hyperparameters: {}
8.625366307795048
50.32768201036379
452.17959332771716
0.0
0.661546990275383
0.0
0.0
0.0
0.0
0.0
27.798785485327244
0.0
0.0
0.0
0.0
0.0
0.0
0.0057235402055084705
0.0
0.0
0.0
0.0
0.12296800315380096
3.8396850526332855
0.11304300278425217
0.0
0.0
0.0
0.0
0.0
2.6529419273138046
4.203107509762049
1.6964399814605713
0.0
1.1203257738961838
0.0
0.0
0.0
4.7306536212563515
6.279356881976128
9.249370098114014
4.901390224695206
0.0
22.453616119921207
1.2846443794405786
39.18758852779865
0.5339300110936165
0.0
0.0
0.0
4.612348495982587
0.2868669927120209
2.917920518666506
0.0
0.0
0.0
0.29333698749542236
0.5687779784202576
5.273839831352234
0.0
0.0
0.27156199514865875
1.1355600357055664
0.691190592944622
2.9726110696792603
0.03660010173916817
0.0
0.0
1.4913599491119385
0.5239984914660454
0.0
0.0
6.329877868294716
0.0
0.25547099113464355
0.0
0.0
0.0
2.3



Elapsed time to compute best fit: 0.593 seconds
Cross-validation score: 0.1590311715124747
Test score: 0.16455696202531644
Best Hyperparameters: {}
29.186997252829315
63.06084254011512
474.6085260361433
0.0
1.7970577515661716
7.381717771291733
0.0
0.0
0.0
0.0
20.974228590726852
0.0
0.0
0.0
0.0
0.0
0.009161969646811485
0.00926239974796772
0.0
0.0
0.013597399927675724
0.0
0.0
0.0
0.0028435999993234873
0.0
0.0
0.0
0.0
0.0
0.12538300454616547
0.415526308119297
0.009543710388243198
0.0
4.636940509080887
0.0
0.0
0.0
0.23893199861049652
0.6458585038781166
0.24813849478960037
5.080305544659495
4.381927132606506
15.713113714009523
0.16095190902706236
1.0267499685287476
0.34661581367254257
0.0
0.0
0.0
1.6280160420574248
0.0013627000153064728
3.5829027965664864
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0014813600573688745
14.965959791094065
0.0
0.0
0.01611959934234619
9.974269084166735
0.4901590049266815
0.0
0.9451422989368439
1.397773797739319
0.034819820895791054
0.0
13.700750153977424
1.1608200111368205e-



Elapsed time to compute best fit: 0.698 seconds
Cross-validation score: 0.12087320224246309
Test score: 0.14571948998178508
Best Hyperparameters: {}
38.02428161818534
54.173230725261874
309.3220109678805
0.0
10.061052992939949
0.21631499379873276
0.0
0.0
0.0
0.0
9.991669923067093
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
15.742861229926348
0.342238113284111
0.0
0.0
0.0
0.0
1.2089631855487823
1.493248999118805
0.03228349983692169
0.324957013130188
0.0
0.05721979960799217
0.0
0.0
0.0
1.3139091690573537
11.38276782631874
0.10001300275325775
8.86424994468689
0.0
1.752704086531594
0.6880579963326454
0.0
0.0
0.0
0.0
0.0
26.877137849573046
0.36690670996904373
0.32155200839042664
0.0
0.0
0.0
0.0
0.0
11.019453957676888
0.4697299897670746
0.06850890070199966
0.0
0.0
0.0
0.0
0.1584479957818985
0.0
0.0
0.0
5.811954123200849
8.878807157278061
0.18528099358081818
15.786883369088173
0.0
0.9804746136069298
0.0
0.0
0.07950740307569504
9.615268236317206
0.0
1.712108999490738
213.34408812783659
59.



Elapsed time to compute best fit: 0.485 seconds
Cross-validation score: 0.14300743109512312
Test score: 0.14893617021276598
Best Hyperparameters: {}
9.813976442441344
55.41325993835926
408.3920138720423
0.0
105.81003464758396
2.476112674921751
8.827689725876553e-07
0.0
0.0
0.0
7.784933984279633
0.0
0.0
0.0
0.0
0.0
0.6758390069007874
0.0
0.0
0.0
0.0
0.0
0.1926400065422058
20.29915016144514
0.0
0.0
0.0
0.0
0.0
0.0
0.7083895941032097
1.8857799842953682
0.1437000036239624
0.0
1.8549759685993195
0.0
0.0
0.0
0.2814571037888527
6.407742083072662
0.8620719909667969
0.03453340008854866
0.0
0.3235729932785034
1.9405814595520496
2.1226919889450073
4.181334771215916
0.0
0.0
0.0
0.0
0.0
1.5753863433739639
0.0
0.0
0.0
0.013824470341205597
0.0
6.562255114868094
8.361650109291077
115.26996260334272
11.188116028904915
0.0
0.0
0.0
1.7985399170756864
0.0
0.0
0.0
8.699192222207785
16.04915764182806
0.0
15.15095317363739
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14.942623720038682
0.0
2.7956900596618652
0.0
0.0
0.01



Elapsed time to compute best fit: 0.682 seconds
Cross-validation score: 0.14296727719717753
Test score: 0.15384615384615385
Best Hyperparameters: {}
67.59193042293191
26.03261810578988
413.29025556892157
0.0
0.10414600372314453
27.648979030549526
0.12945719808340073
0.0
0.0
0.0
0.0
8.026148080825806
0.0
0.0
0.0
0.0
1.5015589892864227
0.0
0.0
0.0
0.0
0.0
0.11489400267601013
0.0
0.07262030243873596
0.0
0.0
0.0
0.24762199819087982
14.837538540363312
0.47069400548934937
0.0
0.3507480025291443
0.0
5.679194942116737
0.0
0.0
0.0
16.997290374989007
2.2427444644272327
6.417404860258102
1.2553220242261887
12.446478843688965
43.94398028496653
0.0
0.0
0.08702679723501205
0.0
0.0
0.0
0.46711599081754684
1.6082899570465088
2.6283064633607864
0.0
0.0
0.0
0.0
1.8130980283021927
3.3624789118766785
29.025045323753147
11.785977143626951
8.364498503447976
4.67011022567749
0.0
0.15680700540542603
0.0013892799615859985
0.0
0.0
0.000657594995573163
6.055026143621944
2.208135463297367
0.0
47.38511257991195
0.



Elapsed time to compute best fit: 0.709 seconds
Cross-validation score: 0.12612659240315616
Test score: 0.1616161616161616
Best Hyperparameters: {}
67.75733671337366
183.9467814899981
213.86831994354725
0.0
35.414065033968654
0.0
0.0
0.0
0.0
0.0
0.0
0.9877541065216064
0.0
0.0
0.0
0.0
0.0
0.4074159860610962
0.0
0.0
0.0
0.0
0.575478308009906
0.0
1.0066819787025452
0.0
0.0
9.074219997273758e-05
0.0
5.7845839858055115
0.0
1.0473423918355707
0.886838332996831
1.20449498295784
0.0
0.0
0.0
0.0
3.4786256700754166
0.0
8.468196486588567
2.6164054173968907
7.720797881484032
2.404472127556801
9.877540774643421
3.843839943408966
2.201469207249829
0.0
0.0
0.0
5.397420018911362
0.8446270227432251
4.684766054153442
3.276029929111246e-06
0.0
0.0
0.5167050063610077
0.0
1.7591939941048622
0.3128570020198822
1.0097470059990883
0.3552176970988512
0.0
0.0
0.8346909880638123
0.07361805019900203
0.0
0.0
1.884663999080658
1.22527814517494
5.209383159415665
0.003481850028038025
3.027909994125366
0.0
0.375984013



Elapsed time to compute best fit: 0.667 seconds
Cross-validation score: 0.12749409199631226
Test score: 0.12345679012345678
Best Hyperparameters: {}
77.59546052012593
29.37426271662116
364.61254773316796
0.0
5.377021744847298
0.8946869969367981
0.0
0.0
0.0
0.0
17.886775568127632
0.05253089964389801
0.0
0.0
0.0
0.0
4.727630606192207
0.014183400198817253
0.0
0.0
0.0
0.0
2.9352680146694183
5.020538218785077
0.37913399934768677
0.41539400815963745
0.0
0.0
0.0
0.9707088023424149
0.0
0.12637099623680115
2.3768087029457092
0.0
1.6938073746860027
0.0
0.0
0.0
1.4647489786148071
15.822689771652222
17.973222012504266
0.10234219022095203
0.077400803565979
0.2074439972639084
1.7840460808329226
0.0
7.583410024642944
0.0
0.0
0.0
0.5792472955768062
1.5915115904062986
1.3301396071910858
0.0
0.3861199915409088
0.0
0.12304859980940819
0.005377789959311485
5.468610640207771
1.53310988843441
31.81009316528798
4.423111745039932
0.0
0.0
2.8726693131029606
1.4366870783269405
0.0
0.0
2.060001015663147
0.105207



Elapsed time to compute best fit: 0.686 seconds
Cross-validation score: 0.1546108258920199
Test score: 0.140597539543058
Best Hyperparameters: {}
50.50861268397421
31.944300496950746
462.7942570740124
0.0
28.721566402586177
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.23555199801921844
0.0
0.0
0.0
0.0
0.0
0.8936580922454596
0.0
0.042569998651742935
0.0
0.0
0.0660029846476391
0.0
2.684308501891792
0.0
0.6795520037412643
0.9580685530090705
2.3275360465049744
0.006677330005913973
0.0
0.0
0.0
0.03715577069669962
5.153563886880875
20.369356282055378
2.1050400733947754
0.07095810025930405
51.62554206699133
1.1701451428234577
0.0
0.31962141459916893
0.0
0.0
0.0
0.0
0.2599569857120514
0.029869599267840385
0.0
0.0
0.0
0.17781400121748447
1.2329620197415352
0.107984799426049
1.1041750013828278
64.84125366061926
26.299805849790573
0.042413339600898325
0.0
0.0690993785392493
0.004733869805932045
0.1873006192035973
0.0
0.0
0.8797178044915199
0.006964460015296936
0.0
38.550634041428566
0.0
0.0
0.0




Elapsed time to compute best fit: 0.583 seconds
Cross-validation score: 0.12333413456521931
Test score: 0.21067415730337083
Best Hyperparameters: {}
106.3895863071084
110.83940239623189
309.67940112550605
0.0
25.228037476539612
13.499692112207413
0.0
0.0
0.0
0.0
4.201132565736771
0.0
0.0
0.0
0.15801929868757725
0.0
0.011147200129926205
0.0
0.0
0.0
0.05330066740862094
0.0
0.0
16.154664158821106
0.0
0.0
0.0
0.4166826009750366
0.0
0.0
1.13332998752594
0.21961839497089386
0.00025245299912057817
0.0
14.073380499146879
0.0
0.0
0.0
0.004485673998715356
8.718401983380318
1.6014660000801086
0.0
0.0
0.0
1.0738343349366914
0.0
8.637010097503662
0.0
0.0
0.0
1.0388279184699059
0.0
3.0668680667877197
0.0
0.0
0.0
0.0
0.1475464031100273
12.498712748289108
8.485550194978714
1.9563715942203999
1.8516159392893314
0.10425999760627747
0.19279499351978302
0.3178130090236664
0.016629600897431374
0.006847150041721761
0.0
2.4226660571002867e-05
0.490094393491745
0.0
1.0555399656295776
4.148298889398575
3.35113



Elapsed time to compute best fit: 0.611 seconds
Cross-validation score: 0.13332145521576988
Test score: 0.14492753623188406
Best Hyperparameters: {}
37.78326557390392
130.09357638287474
211.75041602905526
0.0
55.947435118258
0.0
0.0
0.0
0.0
0.0
10.616308256983757
0.0
0.0
0.0
0.0
0.0
0.0
0.00011440909656812437
0.0
0.0
0.0
0.0
0.04712120071053505
0.7339774891734123
0.05694720149040222
0.0
0.0
0.0
0.0
0.0
0.4322569966316223
0.0
2.642262026667595
0.0
0.46418800950050354
0.0
0.0
0.0
0.9307680130004883
0.0
0.1505902532512664
2.890803962945938
0.0028623400721699
0.0390515998005867
7.522967868018895
0.0
5.604805909097195
0.0
0.0
0.0
12.833673436194658
0.0
1.040384978055954
0.0
0.8677579760551453
1.155667006969452
0.1974640041589737
0.0
0.34162420273423777
0.09974409639835358
67.69354767491495
8.597565211355686
0.0
0.0
0.00016051100101321936
1.0436500310897827
5.0346499979496
0.0
0.29402899742126465
10.798848550650291
2.194677025079727
0.0
3.2911899089813232
8.007689757505432e-05
0.407668195664



Elapsed time to compute best fit: 0.684 seconds
Cross-validation score: 0.16655375724340898
Test score: 0.15695067264573992
Best Hyperparameters: {}
105.17476784987139
49.09275349415839
346.6826409474015
0.0
0.00014474100316874683
0.0
0.0
0.0
0.0
0.0
25.60406517237425
0.0
0.0
0.0
0.0
0.0
0.0
0.2224929928779602
0.0
0.0
0.2058980017900467
0.0
8.896959968751617e-08
0.0
0.0
0.018217099830508232
0.0
0.0
0.0
0.0
0.0
0.12538300454616547
0.17336103883280884
0.0
1.1737617086619139
0.0
0.0
0.0
0.21922789647697982
29.234296016395092
0.5605644760653377
0.8577790185809135
0.0
0.1790158972144127
0.7691495966402577
0.0
0.08258580043911934
0.0
0.0
0.0
0.9697449508657883
9.319893348030746
0.18068803945789114
0.0
4.94991002142342e-06
0.0
0.0
0.0
3.2219788804650307
0.372994726523757
56.93125820579007
0.0
0.0
0.021031100302934647
0.49760500621050596
0.0
1.2552769556641579
0.0007437149761244655
0.0
9.793165501207113
1.935787983238697
0.0
68.89506544027608
0.0
0.0
0.0
0.0
0.2351589947938919
13.0796056041817



Elapsed time to compute best fit: 0.687 seconds
Cross-validation score: 0.12974124911937196
Test score: 0.1478743068391867
Best Hyperparameters: {}
2.562988275894895
95.69436764506541
370.9396662810468
0.0
4.3049123883247375
3.2464499473571777
0.0
0.0
0.0
0.0
9.851481914520264
19.72385197877884
0.0
0.0
0.0
0.0
0.0
0.22680599987506866
0.0
0.0
0.0
0.0
0.08493291982449591
9.045098721981049
0.704816997051239
0.0
0.0
0.0
0.0
0.0
2.66243939101696
0.3385699987411499
1.9215282196637418
0.06810689717531204
0.0
0.0
0.0
0.0
1.7150870114564896
0.0007367110229097307
4.443568423084798e-05
0.21346500515937805
1.4032954540098217
12.509071066975594
1.9433000087738037
0.0
17.179923379588217
0.0
0.0
0.0
5.0770087242126465
0.29240700602531433
0.14560050144791603
0.0
0.0
0.0
2.692720045160968e-06
0.0
9.366915591061115
0.0
28.895883286371827
3.319804001948796
0.0
0.0
7.842491090297699
0.6455209851264954
0.0
0.02761089988052845
0.001233931372553343
7.808264546096325
0.006781109783332795
0.0
64.48696680925786



Elapsed time to compute best fit: 0.570 seconds
Cross-validation score: 0.13757159891933518
Test score: 0.11527377521613832
Best Hyperparameters: {}
96.19351218950669
41.56638089284678
196.33193416599534
0.0
14.036217708140612
15.778956022113562
0.0
0.0
0.0
0.0
0.0
0.7432639896869659
0.0
0.0
0.0
0.0
0.7187840193510056
0.0
0.0
0.0
0.0
0.0
0.0
0.12452799826860428
6.084032982587814
0.0
0.0
0.5201259851455688
0.1416970044374466
2.151504673063755
0.0
0.0
0.0
0.0
2.60394916716308
0.0
0.0
0.0
0.473706990480423
0.08860480040311813
9.524701327303774
0.13498200476169586
0.0
8.606669556594625
3.695648677647114
0.4271453022956848
2.7051896173506975
0.0
0.0
0.0
13.67771004140377
0.9711340069770813
0.5044705960899591
0.0
0.04432540014386177
0.04096069931983948
0.15304699540138245
0.16135600209236145
1.713281411677599
30.402675472199917
74.08200988546014
0.3824336929246783
0.005335030145943165
0.0
0.1990325003862381
0.04435350000858307
0.0
2.1257200241088867
0.0004137640062253922
2.072476677596569
5.



Elapsed time to compute best fit: 0.608 seconds
Cross-validation score: 0.14287556934545584
Test score: 0.12195121951219513
Best Hyperparameters: {}
124.07717320710402
14.001270676031709
465.1629289891571
0.0
5.066310055553913
0.0
0.0
0.0
0.0
0.0
0.0
0.40131930261850357
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003936806024285033
1.2264370461423368e-06
0.0
0.0
0.0
5.388125061988831
0.0
0.11223360151052475
8.791749717888251
0.0024518799036741257
0.9308063834905624
0.0
0.0
0.0
1.073627807199955
7.431128600612283
0.7972077067242935
0.36074599623680115
0.0
2.0085766911506653
5.2966723665595055
1.097080155443308
0.0
0.0
0.0
0.0
0.8191232047975063
0.0
0.006538313233761528
0.0
0.0
0.0
1.793814033135277e-05
0.0
5.301512196660042
20.060856819152832
1.726249024271965
0.04076340049505234
0.0
0.0
0.0
0.0
0.0
0.0
0.12811900675296783
13.18748359405754
5.10521683296065
0.0
20.994652361705278
0.0
0.0
0.0
0.0006861460278742015
0.0
46.208914985880256
0.0
0.791243708692491
153.76233863539656
0.02



Elapsed time to compute best fit: 0.685 seconds
Cross-validation score: 0.16445430701803104
Test score: 0.16853932584269662
Best Hyperparameters: {}
109.72465916723013
66.26797891035676
347.08699195086956
0.0
36.401642970740795
1.7788430079817772
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4.234655894339085
0.0
0.0
0.0
0.0
0.0
0.11343361995318446
0.0
0.17277899384498596
0.0
0.0
2.0656966120004654
0.0
6.59745741635561
0.0033678701147437096
4.342380190130825
0.0010279488376454538
0.08000549953430891
0.2841362953186035
0.0
0.0
0.0
0.0
2.547120548784733
5.6166051030159
7.547357390634716
0.0
5.860238311376975
0.01150050014257431
0.0
0.28812869638204575
0.0
0.0
0.0
0.6317674964666367
0.27520819939672947
0.8265205684583634
0.0
0.0
0.0
0.0
0.0
1.4458439946174622
1.8965100049972534
10.009300142526627
21.401755712926388
0.0
1.2748210430145264
0.0
0.06454210169613361
0.0
0.0
0.0
12.20739314505306
2.029118090867996
0.0
48.53899657540023
0.0
0.0
0.0
0.0
0.30932718981057405
0.13058491563424468
0.0
2.338



Elapsed time to compute best fit: 0.598 seconds
Cross-validation score: 0.13093446276451884
Test score: 0.125
Best Hyperparameters: {}
102.51321867108345
3.6256649047136307
336.82845663279295
0.0
22.7692369222641
12.01667670160532
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.11817718017846346
0.0
16.336519718170166
0.0
0.0
0.0
0.0
0.0
2.3808999061584473
0.0
0.0
0.0
0.0
0.0
0.0
6.859063848853111
0.30204400420188904
0.694132000207901
0.44170089811086655
0.7449840009212494
3.0525556169450283
0.0
0.0
0.0
7.657981817610562
1.007447987794876
8.60608695447445
1.2071120142936707
2.5921730995178223
11.181856772047468
6.881891191005707
0.0
4.687692940235138
0.0
0.0
0.0
5.818684503436089
0.8243965804576874
15.231763921678066
0.0
0.0
0.0
0.0
0.7708849906921387
5.7033480405807495
6.770950924335921
3.0568140000104904
43.41487139463425
3.881462603167165
0.0
0.0
0.0
2.5482699871063232
0.0
0.0
24.96964715514332
0.0
0.0
35.56349380966276
0.0
0.0
0.0
0.0
1.3057159706950188
10.125938713550568
0.0
0.9663069844245911




Elapsed time to compute best fit: 0.679 seconds
Cross-validation score: 0.11581321919087065
Test score: 0.10506798516687268
Best Hyperparameters: {}
196.7805026602
107.79518648330122
173.22136111883447
0.0
43.4031583853066
0.8774430155754089
0.0
0.0
0.0
0.0
1.1887500286102295
0.3878840059041977
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.6507949829101562
0.0
0.37404200434684753
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.06453999876976013
0.9863535836338997
0.0
0.0
0.0
0.0
0.04671729914844036
24.236120970221236
1.1686732177222439
0.02104640007019043
0.0
2.136355035007
0.12258599698543549
0.0
16.832434222102165
0.0
0.0
0.0
0.15291672959574498
0.3986699879169464
0.2649877965450287
0.0
0.0
1.4893300533294678
1.0758399963378906
1.615884229540825
18.24934872984886
0.0
0.9983709305524826
0.0
1.6230454742908478
0.0
0.07602609694004059
0.0
0.0
0.0
0.0
12.025507885031402
0.3093259930610657
0.0
27.612312335520983
0.0
0.0
0.0
1.6302299499511719
0.3300776109099388
0.0
0.0
4.779970169067383
153.2406440768391
24.31



Elapsed time to compute best fit: 0.595 seconds
Cross-validation score: 0.11763339937651658
Test score: 0.09976525821596244
Best Hyperparameters: {}
24.979594435542822
22.439168095588684
384.71174261905253
0.0
31.776607055217028
0.0
0.0
0.0
0.0
0.0
27.171646274626255
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.1036180257797241
0.0
0.0
0.35282761231064796
0.0
5.242671690881252
0.6882150173187256
0.0
2.9248104244470596
0.0
3.8721050024032593
0.0
0.0
0.0
0.9468389749526978
0.0
14.446747153997421
0.14911500178277493
0.13267700374126434
1.1004399955272675
1.8701519966125488
0.0
1.6905950158834457
0.0
0.0
0.0
1.1567299962043762
0.6191179752349854
3.5183781050145626
0.0
0.0
0.0
0.299112007021904
0.9757040292024612
0.01091460045427084
3.5696948170661926
12.840765532106161
31.44010016322136
2.991987258195877
0.2677119970321655
1.4410750269889832
5.033703178167343
0.0
0.0
6.840891105122864
1.2902356907725334
6.757889062166214
0.0
33.68007004261017
0.0016880299663171172
0.0
0.0
0.0
0.0




Elapsed time to compute best fit: 0.575 seconds
Cross-validation score: 0.13539468397246746
Test score: 0.1103448275862069
Best Hyperparameters: {}
129.5038578318541
101.68767499299065
334.06956475976784
0.0
6.487968280911446
84.93026852607727
0.0
0.0
0.0
0.0
0.8833156079053879
0.0
0.0
0.0
0.0
0.0
0.12156900018453598
0.0
0.0
0.0
0.0
0.0
0.0
0.19199900329113007
1.5782400453190348e-07
0.5466139912605286
0.0
0.1586415022611618
0.0
0.5065900087356567
0.7558259963989258
0.4652668982744217
0.0063941597371197645
0.0
8.298118957318366
0.0
0.0
0.0
1.9642201807857873
1.6091449856758118
5.84266996383667
0.0020952799823135138
1.2254879921674728
8.599105545617931
38.46922513842583
0.12923899292945862
0.5518519878387451
0.0
0.0
0.0
0.5390816163271666
0.0
3.8258337676525116
0.0
0.13441899418830872
0.0
0.0
0.0
0.2454013004899025
7.247030998580158
7.600531794130802
48.90164127899334
0.20189900696277618
0.0
0.561777587980032
0.0
3.717200893908739
0.0
0.0010859599569812417
5.914460307856643
0.0
0.0
0.0
0

## 4.4 Rebalancing Strategy - 5050

### 4.4.1 Random Forest

In [120]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

fiftyfifty_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    fiftyfifty_randomforest_normalized_performance_df = pd.concat([fiftyfifty_randomforest_normalized_performance_df, new_performance_df])
    
fiftyfifty_randomforest_normalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_randomforest_normalized_performance_df.csv")



### 4.4.2 XGBoost

In [121]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


fiftyfifty_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    fiftyfifty_xgboost_normalized_performance_df = pd.concat([fiftyfifty_xgboost_normalized_performance_df, new_performance_df])
    

fiftyfifty_xgboost_normalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_xgboost_normalized_performance_df.csv")



Elapsed time to compute best fit: 76.897 seconds
Cross-validation score: 0.6291675374307735
Test score: 0.5797101449275363
Best Hyperparameters: {}
0.015303044
0.012151187
0.2891131
0.002303017
0.01885558
0.00043004125
0.000705069
0.0
0.0003200941
0.0
0.005451348
0.0005822181
0.0
0.0
0.0034037982
0.0
0.00040007944
0.0
0.0021616411
0.00019101227
0.00031065714
0.0022620454
0.009622223
0.0064894063
0.0003197715
0.0001555845
0.008179886
0.0013154412
0.00033465013
0.0066124396
0.0001843362
0.0021646854
0.010958387
0.0046745054
0.009189691
0.0
0.0
0.0
0.016271986
0.010476453
0.015831139
0.0032263002
0.0007454184
0.0064514256
0.0009642835
0.0068265
0.001973449
0.0
0.0
0.0
0.0021376857
0.009411111
0.00060177286
0.0
0.0
0.0
0.0014087681
0.005569022
0.00064835744
0.016545068
0.012551411
0.010613537
0.0
0.00015767684
0.00198873
7.4199e-05
0.00038309942
0.0
0.00079135207
0.0036196623
0.001796854
0.0044112955
0.07723891
0.0
0.0007493126
0.00028464117
0.0031153292
0.00613324
0.00021315536
0.00018175



Elapsed time to compute best fit: 77.191 seconds
Cross-validation score: 0.5828990992114816
Test score: 0.5639097744360901
Best Hyperparameters: {}
0.03291727
0.0048477645
0.22311349
0.002673046
0.031858236
0.0026941216
0.0023768041
0.001621543
0.0014919678
0.0
0.009841782
0.021988874
0.0005439931
0.0
0.0017112792
0.0
0.007040169
0.0
0.0005341136
0.011564299
4.7311696e-05
0.0002991011
0.0009973061
0.00016786161
0.0017019657
0.011350952
0.0
0.00014321506
0.0
0.0031212084
5.4283755e-05
0.0070397602
0.0012134345
0.0019854202
0.0002938223
0.0
0.0
0.0
0.006997006
0.03135011
0.017643211
0.002899837
0.00017677463
0.010378898
0.000568873
0.0013023544
0.0015251026
0.0
0.0
0.0
0.000962329
0.0023922655
0.00022960053
0.00033399535
0.0
0.002048645
0.022085395
0.005335993
0.001969537
0.013703114
0.0075950595
0.012082638
0.0019531159
2.7534932e-06
0.0005062081
0.0013115495
0.0002001765
0.0
0.003981773
0.0004883398
0.0015879397
0.0073344
0.11354843
0.0007503953
0.038435604
0.0
0.0032000523
0.003155304



Elapsed time to compute best fit: 70.937 seconds
Cross-validation score: 0.5998925753945391
Test score: 0.46391752577319584
Best Hyperparameters: {}
0.01731945
0.060653258
0.2156559
0.0013746538
0.016102305
0.0055766683
0.0011192871
0.0
0.0022918282
0.00063614745
0.011895561
0.0010537849
0.0
0.0
0.0015517451
0.0
0.0015134276
0.0
0.0112278145
0.0
0.00033559915
0.00028224033
0.004514912
0.0012443103
0.0011086694
5.301443e-05
0.008454538
0.0004214221
0.0012326061
0.016558679
0.00032442884
0.00039987155
0.0073139924
0.0018790098
0.008798329
0.0
0.0
0.0
0.0014842022
0.022219468
0.0070897313
0.00038706893
0.0005597954
0.006189189
0.00015107376
0.000434692
0.04145274
0.0
0.0
0.0
0.00043379408
0.0010662759
0.000401961
0.0009251212
0.0
0.00051593414
0.037046578
0.0
0.0022724238
0.012383336
0.007868201
0.003424848
0.0018999656
0.00074293144
0.00016219316
0.0004379218
0.0001643226
0.0017586065
0.0016077985
0.00048170445
0.0023394637
0.0117902085
0.064881384
0.0030919183
0.038811054
8.337925e-05
0



Elapsed time to compute best fit: 81.514 seconds
Cross-validation score: 0.5755837230154339
Test score: 0.6790123456790124
Best Hyperparameters: {}
0.020217538
0.048296
0.19651268
0.0025330135
0.017380197
0.008448788
0.0
0.0
0.0
0.023225835
0.008584983
0.00088014215
0.0
0.0
0.00021498551
0.0
0.003625367
0.00029965385
0.049170244
0.0
0.0008145666
0.0005673842
0.0042508706
0.0013026226
0.0006751225
0.0
0.0016875484
0.00013784783
0.00027724865
0.009329194
0.00030391474
0.0037900144
0.002084357
0.00057858333
0.0015191388
0.0
0.0
0.0
0.005545745
0.055053443
0.009778989
0.00037311038
0.009012577
0.007947492
0.0030624284
0.009516302
0.007349381
0.0
0.0
0.0
0.00164989
0.010269444
0.0008921704
0.0012049185
0.0018397036
0.0
0.0033108406
5.7337597e-06
0.00039378862
0.013045626
0.005505026
0.0040326365
0.004984094
0.0011314118
0.0009659347
0.006998591
0.0031577614
0.00034005998
0.013869133
0.004881041
0.0015402545
0.0032815277
0.08625601
0.0010664968
0.06191623
0.0
0.0
0.0047151963
0.004847183
0.0



Elapsed time to compute best fit: 76.524 seconds
Cross-validation score: 0.5925404503693006
Test score: 0.5913978494623656
Best Hyperparameters: {}
0.02313755
0.07189571
0.18409628
0.00061516464
0.026673341
0.0073615885
0.00026798542
8.671695e-05
0.00055498217
0.00035189078
0.009538626
0.00058925204
0.0
0.0
0.00060695794
0.0
0.05332218
0.0006413617
0.02603336
0.0
0.00070616225
0.000749627
0.009573179
0.0
0.00011269953
0.0
0.0026433542
0.0
0.0007201927
0.0017143796
0.00031279566
0.007918766
0.0012859488
0.0028240671
0.0033601357
0.0
0.0
0.0
0.007192019
0.0069280355
0.01046393
0.00077605055
0.0004629578
0.010624901
0.00026987016
0.0026531827
0.0034609707
0.0
0.0
0.0
0.0059709544
0.00973151
0.0012298452
0.0016492712
0.0019182819
0.00027951674
0.0055644535
0.037299097
0.0030067752
0.009341108
0.017859902
0.00501658
0.0002933893
2.4258443e-06
0.00010482232
0.0018304238
7.3360454e-05
0.0
0.0024455297
0.0011807116
0.0027174777
0.0036646307
0.086221226
0.008805658
0.06921901
1.814043e-06
0.004



Elapsed time to compute best fit: 80.316 seconds
Cross-validation score: 0.6154303715019898
Test score: 0.6493506493506493
Best Hyperparameters: {}
0.0015204615
0.002070643
0.2254147
0.001735025
0.050262693
0.001097385
0.0004989704
0.0011769417
0.0010645852
0.0006108372
0.0121666845
0.0042089433
0.0015992457
0.0
0.0002903612
0.0
0.018313525
0.00036048502
0.0
0.009293882
0.00044154082
0.0011766889
0.007933915
0.0024638237
0.0002332384
0.0
0.0
0.00051279995
0.002076668
0.008658324
0.00015559945
0.0028827996
0.00093853753
0.016504394
0.006153898
0.0
0.0
0.0
0.007841974
0.02665904
0.0068312725
0.0008741092
0.0016842465
0.008648407
0.0019005515
0.0023732842
0.0048235646
0.0
0.0
0.0
0.000638522
0.00096652325
0.0005432927
0.0019982515
0.0
0.001340125
0.024243379
0.0041374844
0.00051431544
0.017821845
0.013326764
0.0028317426
0.0011523546
3.0188856e-05
0.0016415184
0.008382771
0.0008220101
0.0037331867
0.0025760925
0.0014042596
0.0031110938
0.0158297
0.10453844
0.002763404
0.00028956684
0.0001



Elapsed time to compute best fit: 73.851 seconds
Cross-validation score: 0.6009620941111166
Test score: 0.4273504273504274
Best Hyperparameters: {}
0.009572202
0.027510963
0.25003082
0.00050363946
0.012198027
0.0015632556
0.0014010732
6.3530366e-05
0.00047150886
0.0
0.0043096193
0.005026406
0.0
0.0
0.0029523498
0.0
0.00019587816
0.0002634987
0.0
0.0010479088
0.0014252953
0.00041135002
0.004297981
0.028442422
2.015454e-05
0.0
0.0056637344
0.00020750305
4.149613e-05
0.0043624854
0.0002669072
0.00018082513
0.0023296897
0.0014806073
0.0012401992
0.0
0.0
0.0
0.003035763
0.015756935
0.0014684096
0.00056645955
9.0077076e-05
0.009154644
0.0112115145
0.00039041083
0.015805967
0.0
0.0
0.0
0.00078892236
0.00029493074
0.0006811767
0.003689894
0.0013436703
0.0
0.007676031
0.01289262
0.0009256215
0.01098291
0.009030039
0.047328293
0.0005832756
4.6492834e-05
0.0076490496
0.0
0.00015729383
0.0
0.0006731508
0.0004319926
0.0010871968
0.0072202245
0.08443
0.0
0.0
0.0
0.0010258268
4.2093317e-05
0.00056805



Elapsed time to compute best fit: 75.967 seconds
Cross-validation score: 0.6724326420837334
Test score: 0.5056179775280899
Best Hyperparameters: {}
0.002237188
0.0018154973
0.22287904
0.0054351473
0.034797348
0.016942678
0.010361248
0.0
0.00020000442
0.0010334322
0.022219844
0.010271702
0.0
0.0
0.00084605505
0.0
0.008422156
0.0
0.0
0.0
0.001045025
0.0058092317
0.006154767
0.0
0.0
0.0
0.0
0.0016704783
0.0
0.013811012
0.00028497117
0.0062507554
0.00072763354
0.0018704954
0.0012418822
0.0
0.0
0.0
0.00068764965
0.031602032
0.009451743
0.00012875115
0.006967706
0.013495125
0.00029146485
0.0
0.0013011765
0.0
0.0
0.0
0.0011559738
0.027032923
0.00035334387
0.0
0.00095807033
0.0
0.021671465
0.0013060451
0.0011124548
0.0068882424
0.027619107
0.005692229
0.05170929
2.9954058e-06
0.0016354524
0.0
3.956565e-05
0.0
0.0
0.002018419
0.008315313
0.0
0.09404614
0.0
0.0054863445
0.0
0.0
0.004768031
0.0023439804
0.0008326727
3.836862e-05
0.00077668804
0.026653215
0.028324684
0.102872424
0.015751632
0.0004



Elapsed time to compute best fit: 84.868 seconds
Cross-validation score: 0.5416186767200741
Test score: 0.5504587155963303
Best Hyperparameters: {}
0.0016603324
0.06614635
0.16997223
0.0019122217
0.03219873
0.019446258
0.00014807962
0.00047328914
0.0004267753
0.0006982798
0.009537077
0.0
3.2729768e-05
0.0
0.0034862969
0.0
0.013215916
0.00034034278
0.00027665508
0.0
0.0002933652
0.0012488447
0.0015816769
0.0061528883
0.00012566734
0.00193709
0.00039996504
0.00035551653
0.00010278293
0.01565323
0.00028899405
0.0010107585
0.0057213674
0.0033022242
0.0085585965
0.0
0.0
0.0
0.009560548
0.02304669
0.0054062516
0.01165854
0.0003538951
0.013644416
0.002242909
0.0028945308
0.0069798334
0.0
0.0
0.0
0.0014801979
0.0028100023
0.00050681556
0.00042719467
0.0
0.002827364
0.016085641
0.008224918
0.0010666095
0.011860939
0.016589664
0.0075582615
9.597561e-05
0.00023120311
0.0013412634
4.960333e-05
0.0031204598
0.00022919445
0.016033426
0.0058594444
0.004526321
0.00030800467
0.102847576
0.0
0.009406996



Elapsed time to compute best fit: 75.099 seconds
Cross-validation score: 0.5503072031537847
Test score: 0.6989247311827957
Best Hyperparameters: {}
0.010259512
0.08761982
0.1617732
0.003341831
0.007909393
0.0032957196
0.003171781
0.0
0.00077006547
0.00080531114
0.002597054
0.09725063
0.0
0.0
0.00028978664
0.0
0.0008544955
9.023912e-05
7.134908e-05
0.0
0.0
1.1995362e-05
0.0026899078
0.0
0.0
0.0
0.0028426757
0.0004486917
0.00285734
0.0011513083
0.000225476
0.0043412847
9.500647e-05
0.0026402194
0.008626127
0.0
0.0
0.0
0.0023926347
0.009276496
0.004661477
0.0005271946
0.0007527018
0.004724299
0.000100869205
0.00044489806
0.0036515142
0.0
0.0
0.0
0.0006913225
0.00086135993
0.0009241714
0.00015551587
0.0
0.0
0.009566794
0.000569408
0.00068390026
0.0073024957
0.01016526
0.0016356055
0.0001292808
0.0002645203
0.0052591064
0.00059615955
0.0047210655
0.0
0.0010244976
0.00036367617
0.04237001
0.000115207804
0.11262799
0.0058656815
0.04894544
0.0
0.0007846427
0.004929011
0.0024358255
4.6093213e-0



Elapsed time to compute best fit: 72.560 seconds
Cross-validation score: 0.5935297898385666
Test score: 0.7216494845360825
Best Hyperparameters: {}
0.0022126783
0.0031731003
0.22379866
0.0030227746
0.04306084
0.00015058866
0.009007912
0.00057745515
0.0003278457
0.014257455
0.013423407
0.0
0.00048478413
0.0
0.00018238547
0.0
1.4101418e-05
0.001924094
0.004605662
0.022257145
0.00013933712
0.0008628025
0.00188345
0.010116259
0.002632849
0.0
0.009891894
0.0
0.0016390206
0.0004501188
0.00068274693
0.0018075233
0.0006689532
0.0006849698
0.0018887683
0.0
0.0
0.0
0.0048966943
0.04690411
0.0076286523
0.00063675205
0.0009574858
0.0073812976
0.0002861362
0.0017582845
0.015269781
0.0
0.0
0.0
0.0020226296
0.004133211
0.001240473
0.006291702
0.0
0.0
0.00860642
0.00013531251
0.0008872339
0.017977672
0.0051524797
0.010288917
0.00054562825
3.8107446e-05
0.0025822525
0.0
2.9377223e-05
0.0041859807
0.0017198198
0.0042891344
0.0034445024
0.0006353582
0.092798814
0.00430381
0.027669901
0.00045048204
0.0007



Elapsed time to compute best fit: 79.502 seconds
Cross-validation score: 0.5438139600419588
Test score: 0.5445544554455446
Best Hyperparameters: {}
0.012003975
0.07758087
0.14162757
0.0022962824
0.024608007
0.05540096
0.0006942403
0.0006931269
0.00036756223
0.001185218
0.008579361
0.00036357658
0.0
0.0
0.0025425346
0.0
0.02441805
0.0
0.058900278
0.00013927562
0.00033130666
0.0011786881
0.002510465
0.00020823123
0.0012421444
0.00041236522
0.008030395
0.00027517352
0.00033327896
0.008138052
0.00020391631
0.00014138814
0.0034843215
0.013546665
0.0033819606
0.0
0.0
0.0
0.0036801193
0.049721792
0.0038669973
0.0021132205
4.3228567e-05
0.0032363273
0.0050057503
0.0005204071
0.0043120747
0.0
0.0
0.0
0.0020223008
0.0004663279
0.00083350844
0.0011768386
0.0
0.00082189846
0.031048043
0.0048075602
0.00036467725
0.005257846
0.009061659
0.0057135136
0.011936455
3.5775884e-05
0.00074687466
0.0009823036
0.0
0.0
0.001461556
0.0041199275
0.011609779
0.012366992
0.08871429
0.007407322
0.027522804
0.0
0.0



Elapsed time to compute best fit: 76.095 seconds
Cross-validation score: 0.6001211361285421
Test score: 0.5913978494623656
Best Hyperparameters: {}
0.01271534
0.059696488
0.20260826
0.0024187637
0.014518472
0.00057128107
0.00025459556
0.0007021923
0.00048912835
0.00053570356
0.008312278
0.00087710004
0.0003592116
0.0
0.0014263658
0.0
0.00012867099
0.0
0.00015856078
0.0001249821
0.00036790757
6.0983322e-05
0.007293428
0.0010849515
0.0
0.0
0.0013068317
0.0
0.0010871657
0.002057447
0.00026718818
0.0048394026
0.0021377588
0.0012006423
0.010754737
0.0
0.0
0.0
0.0016808765
0.007907258
0.0013781595
0.0008915606
0.0010810195
0.0051517864
0.0005371373
0.028473178
0.0035612301
0.0
0.0
0.0
0.0016201665
0.04101281
0.00031505217
0.0024128433
0.0
0.009826895
0.0027634585
0.0051629245
0.0014921157
0.00394059
0.008921797
0.0030961365
0.00024987658
0.0010056038
0.0011039706
0.0008202708
0.003851954
0.0004431272
0.0001378263
0.002989261
0.0028464668
0.0
0.08047522
0.0002847619
0.0005228064
0.0
0.0082280



Elapsed time to compute best fit: 73.208 seconds
Cross-validation score: 0.6180697861855272
Test score: 0.617283950617284
Best Hyperparameters: {}
0.0021186436
0.0051868265
0.3217337
0.0029012733
0.03700033
0.0047281897
0.0064725317
0.0008649174
0.0
0.0
0.005439191
0.00015352329
0.0
0.0
0.003990892
0.0
0.025789078
0.0
0.0
0.0
0.00066781335
0.0036907853
0.0019645381
0.00036258908
0.0
0.0
0.0
0.00040848795
0.0004277252
0.0019298263
0.0005632981
0.0015120709
0.0017914169
0.002782357
0.00732244
0.0
0.0
0.0
0.0014599863
0.009808656
0.0009949575
0.003105742
0.0014839814
0.00080955157
0.0012263886
0.0031777867
0.013028427
0.0
0.0
0.0
0.0009571658
0.005838793
0.00034447826
0.0008293964
0.0003156524
0.0002354401
0.0026241788
0.0011405207
0.0012655864
0.014432955
0.0150807705
0.0034920329
0.00030304337
0.0028352016
0.0057754884
0.0023218838
0.00014689779
0.0
0.002305422
0.00011855841
0.006858527
3.880263e-05
0.13647251
0.0035274252
0.0
0.0
0.0011233065
5.6480018e-05
0.0004627349
0.0035019806
0.0



Elapsed time to compute best fit: 71.115 seconds
Cross-validation score: 0.6045473244692521
Test score: 0.7246376811594203
Best Hyperparameters: {}
0.0201061
0.09006511
0.18653275
0.00090406946
0.027796352
0.00028968998
0.0
0.0010807955
6.197773e-05
0.0025156764
0.004447903
0.0017585644
0.0
0.0
0.0033844672
0.0
0.0
0.0
0.001095514
0.00015999298
0.000632558
0.0
0.0
0.0070769563
0.0
0.00011310876
0.0056459615
0.0053319708
0.0003567629
0.0007211788
0.0003376714
0.005265153
0.0029786178
0.0013705549
0.0038576848
0.0
0.0
0.0
0.0091768
0.011422136
0.0008435619
0.0017465438
0.0013444163
0.0025068051
0.0002918174
0.0016053424
0.0034471762
0.0
0.0
0.0
0.00047270578
0.0033877348
0.0004235939
0.002200841
0.0
0.0002974764
0.0050052153
2.4823714e-05
0.00048405284
0.0042363964
0.018327324
0.003746841
0.0071226247
0.0
0.0017828064
0.0022803328
0.0
0.0
0.0
0.00033878436
0.0017005143
0.0007845992
0.12701166
0.0
0.000807288
0.0
0.0
0.000569874
0.0037743787
0.00033522534
0.0032130936
0.09860341
0.0345574



Elapsed time to compute best fit: 74.009 seconds
Cross-validation score: 0.6304716935345338
Test score: 0.5963302752293578
Best Hyperparameters: {}
0.014278718
0.017132945
0.23936185
0.003701551
0.039809197
0.03153452
0.00029720028
0.0006336169
0.002260851
0.0024607563
0.009996338
0.003002823
0.0
0.0
0.00022233966
0.0
0.0020548243
0.00016028214
0.0
0.0
0.00029542184
0.00046683216
0.004049914
0.0031137804
0.0062408266
0.00055646687
0.019349566
0.0001924129
0.00027519104
0.0076493584
0.00023476966
0.0029535454
0.00018760448
0.003956624
0.007158655
0.0
0.0
0.0
0.0062746066
0.007534891
0.012132854
0.0007853113
0.0007868563
0.00072181894
0.0030266643
0.0012141888
0.004141342
0.0
0.0
0.0
0.00084908516
0.002961426
0.0008998189
3.0513822e-05
0.0
0.0
0.004224354
0.000444389
0.000502671
0.014772495
0.030304706
0.0010743819
0.0005264058
0.00025512007
0.0021888483
0.00054828037
0.0001280907
0.0012355379
0.008633166
0.003390112
0.010796065
0.013394287
0.08419926
0.0
0.0
0.0
0.001160878
0.00719671
0



Elapsed time to compute best fit: 75.168 seconds
Cross-validation score: 0.6076217670876567
Test score: 0.5376344086021505
Best Hyperparameters: {}
0.0030352566
0.010070903
0.2432996
0.0012891627
0.08662259
0.0001496475
0.0
0.0041805618
0.00010269801
0.00094821333
0.0073870462
0.0
0.0
0.0
0.002284176
0.0
0.0
0.0
0.03238801
0.00030593728
0.00066947774
6.5482805e-06
0.0012818516
0.009183731
3.12095e-05
0.0013807839
0.0
0.000104346385
0.0021419744
0.0022892565
0.0
0.0021458766
0.0011242686
0.00046719034
0.0022736946
0.0
0.0
0.0
0.0029647825
0.031343076
0.0054422677
0.02770538
0.0014600953
0.0071571227
0.003964562
5.87434e-05
0.0027849372
0.0
0.0
0.0
0.00094722206
0.010506115
0.00025090072
0.0012041028
0.0018557621
0.0
0.008629756
0.00023231196
0.0009041259
0.011431349
0.006563974
0.00238051
0.0030840696
0.0007093136
0.0031994646
0.0006581854
0.0
0.0
0.0003219699
0.05963928
0.017129064
3.1765354e-05
0.020174246
0.0025718128
0.056902613
0.0
0.002239493
0.007889677
0.0013862035
0.000605461
0



Elapsed time to compute best fit: 76.970 seconds
Cross-validation score: 0.6712640046644917
Test score: 0.6470588235294119
Best Hyperparameters: {}
0.0040629194
0.004344365
0.24529107
0.00193271
0.03765681
0.026122471
0.0059622694
0.0
0.0017230099
0.0051557855
0.010764417
0.0021127977
0.0019778786
0.0
0.002784751
0.0
0.017188337
0.00013471556
0.0071292925
0.002338705
0.0007340142
0.00053238124
0.002315831
0.0032438184
0.0001095947
0.0
0.0054024514
0.00020802746
0.0
0.0024147148
0.00020576952
0.0017311871
0.00046290227
0.0013167536
0.004993242
0.0
0.0
0.0
0.0013885909
0.009124509
0.005912915
0.0007158785
0.001375153
0.0064447057
0.028952409
0.0012166394
0.0020699364
0.0
0.0
0.0
0.0005611075
0.00028252817
0.001262672
0.0053820703
0.00034105827
0.0
0.010169614
0.0016594826
0.015795695
0.009188554
0.0134944115
0.0038988534
0.00018007924
0.0
0.0011316014
2.8623155e-05
6.0852028e-05
0.002234758
0.0006143771
0.004474551
0.0018656371
0.0015907906
0.10068023
0.0
0.00031997255
0.0
0.0
0.00909746



Elapsed time to compute best fit: 77.156 seconds
Cross-validation score: 0.6664895189082951
Test score: 0.6435643564356436
Best Hyperparameters: {}
0.005217873
0.050459377
0.19001016
0.0028607033
0.010204397
0.003198689
0.00052309287
0.00033121026
0.0025572127
0.00058264035
0.014845127
0.0036968384
0.0
0.0
0.0016117785
0.0
0.0029085812
2.608302e-05
0.09315224
0.021885836
0.00015636969
0.0
0.0016192981
0.016408566
0.0
0.0
0.0018515603
0.0051814117
0.0030751938
0.0063672517
0.0004624521
0.0039166734
0.0033321418
0.00041560363
0.0042599905
0.0
0.0
0.0
0.0012015974
0.029232547
0.00622613
0.0017171628
0.003285561
0.016956354
0.00037844363
0.0019043359
0.0051214495
0.0
0.0
0.0
0.0005894075
0.011562683
0.00015112756
0.0006919554
0.014839642
0.0003925859
0.003321129
0.0017875034
0.00033939467
0.012717132
0.023706723
0.004954646
0.0001637773
0.0013550522
0.0003112737
0.0075483345
0.00034856595
0.0070225983
0.0018579881
0.0016903164
0.018474437
0.017553432
0.084044404
0.0008766752
0.000592998
0.



Elapsed time to compute best fit: 73.866 seconds
Cross-validation score: 0.5712436923587172
Test score: 0.5882352941176471
Best Hyperparameters: {}
0.008404211
0.011018361
0.29621798
0.0030187257
0.03890132
0.016188303
0.00029457678
0.0010021189
0.0
0.012563985
0.0093996525
0.0
0.0002549403
0.0
0.021263955
0.0
0.0015620568
0.0005221344
0.0
0.0
6.41512e-05
0.0041543026
0.0045177764
0.0016943881
2.6186895e-05
0.0
0.0047741975
0.0044098017
0.00089971366
0.0069741583
0.00047943572
0.001182203
0.00089573074
0.0002434846
0.008221881
0.0
0.0
0.0
0.011678865
0.065442234
0.0038696462
0.0010128937
0.0001359804
0.00050231797
0.00010747153
0.0007424738
0.0076806573
0.0
0.0
0.0
0.0006234967
0.029660847
0.0010483856
0.0041233627
0.0
0.00010351916
0.0021276425
0.00052966236
0.00064061943
0.0073045027
0.0042875214
0.0002593022
0.0003716096
0.0
0.0015551588
0.0021997588
0.0033785298
0.0
0.0040270286
0.0010357819
0.01900604
0.0066595147
0.10786778
0.0011268957
0.002158058
0.0
0.004062296
0.00067511224
0



Elapsed time to compute best fit: 76.443 seconds
Cross-validation score: 0.5732325376075189
Test score: 0.6741573033707865
Best Hyperparameters: {}
0.007136783
0.036387216
0.1833895
0.0031521607
0.021643534
0.00029530682
0.0
0.00039751417
0.0016808802
0.00078576076
0.009957535
0.011851322
0.00023275372
0.0
0.00017718435
0.0
0.001050402
0.0
0.00718787
0.008641894
0.0004852271
0.00094732654
0.0040162373
0.0914372
0.0005532378
0.00017671076
0.0027452118
1.9620014e-05
0.0007894893
0.010449933
0.0006025379
0.0031876839
0.0011120066
0.00077807327
0.0046475246
0.0
0.0
0.0
0.0079350835
0.0073777013
0.0054700226
0.00016548265
0.00041581315
0.0066328463
0.0043366603
0.00069208385
0.017523937
0.0
0.0
0.0
0.0017282555
0.0027351235
0.0006267444
0.0008681521
0.0
0.0010428188
0.0026377128
0.004374887
0.002394641
0.020965876
0.024121942
0.008162025
0.0010960384
0.00036722433
0.0029677474
0.010145193
0.0
0.0005413031
9.5842675e-05
0.0006220111
0.007102787
0.0019139627
0.06427269
0.00070020143
0.0236774



Elapsed time to compute best fit: 70.909 seconds
Cross-validation score: 0.6859651761848055
Test score: 0.5376344086021505
Best Hyperparameters: {}
0.025760567
0.051292058
0.22227283
0.004100933
0.0030444004
0.016149757
0.0008471425
0.0007766618
0.0014279964
0.0021356894
0.012749274
0.019134304
0.0
0.0
0.06986746
0.0
0.0017068444
9.282483e-05
0.0058824415
0.0
0.0
0.0002760616
0.0065407977
0.0017542646
0.0052320603
0.0
0.0016889322
0.0007976945
0.0
0.0015790259
0.0002141722
0.0028787614
0.0024968705
0.02670028
0.00027806417
0.0
0.0
0.0
0.0013432102
0.04973367
0.0015024036
0.0007234935
0.0047272155
0.008006451
0.0009834343
0.009543271
0.0061562457
0.0
0.0
0.0
0.0023711761
0.00012230422
0.00021708736
0.0040944694
0.00016582602
0.0016454622
0.015970271
0.023825556
0.0014319412
0.013101187
0.017216992
0.007695981
8.006462e-05
0.0
0.00093494717
0.000111355555
0.0022522158
0.0
0.001994199
0.06722402
0.00022409282
0.0
0.03236111
0.0
0.0020959806
0.0
0.0
0.009027406
0.001036076
0.011754071
0.00



Elapsed time to compute best fit: 76.208 seconds
Cross-validation score: 0.6103587120193922
Test score: 0.5555555555555556
Best Hyperparameters: {}
0.024685778
0.01961323
0.23522587
0.0061270585
0.007276454
0.0013509223
0.0010345448
0.0
0.000106793705
0.0
0.004226381
0.021718055
0.00066364167
0.0
0.006807051
0.0
0.001130632
0.00027843204
0.0
0.005385157
0.00042734866
0.0005251665
0.010584143
0.03519049
0.0039640022
0.0
0.0
0.002860298
0.0038721913
0.0015050548
0.0006527509
0.0009942411
0.0025292612
0.0015394419
0.0021405546
0.0
0.0
0.0
0.0050316937
0.0061844555
0.0045407177
0.0008584611
0.00035853923
0.0034719408
0.010942097
0.00020200963
0.016465468
0.0
0.0
0.0
0.0014437014
0.0005611383
0.00026469777
0.0012541793
0.0
0.0
0.005389547
0.0016490079
0.00038590853
0.012547754
0.011277517
0.0043957047
0.00075345434
0.00047672947
0.0010420867
0.025035193
0.0017353208
0.0025194248
0.011945508
0.0012750609
0.009551226
0.0012360811
0.10590598
0.0
0.00082394376
0.0
0.0005245831
0.01210334
0.0015



Elapsed time to compute best fit: 81.015 seconds
Cross-validation score: 0.6531091564099816
Test score: 0.6451612903225806
Best Hyperparameters: {}
0.014358179
0.005057474
0.21696346
0.006317914
0.042102423
0.000852579
0.0027389852
0.00038916708
0.00025739172
0.0029726631
0.009589322
0.0
0.000567871
0.0
0.0
0.0
0.01853625
0.0
0.0
0.00030651907
0.00048466548
0.00072652654
0.0011194029
0.0050505586
0.0018025399
0.0
0.0
0.00021435403
0.005081787
0.021788534
0.0030644597
0.002773443
0.0030294713
0.0049017537
0.008418339
0.0
0.0
0.0
0.0064621484
0.020834507
0.002937431
0.0021230117
2.2481674e-05
0.00999051
0.000743066
0.0021905636
0.00934172
0.0
0.0
0.0
0.0010478082
0.002226241
0.0013522446
0.00053680193
0.0
0.0
0.0069895666
0.040582906
0.0016980538
0.01902581
0.0074058347
0.0014940251
0.00036745847
0.00016802324
0.0032184764
0.0002787587
0.00080203393
0.003937557
0.0040493924
0.0026571443
0.0018591335
0.0011677286
0.11791564
0.0006462604
0.0010665866
0.0
0.0015578347
0.00051392964
0.005654



Elapsed time to compute best fit: 72.502 seconds
Cross-validation score: 0.5878680626728298
Test score: 0.5294117647058824
Best Hyperparameters: {}
0.001549965
0.0014500745
0.25236702
0.007653078
0.014990551
0.0032216015
0.006623512
0.000351004
0.0
0.0
0.007062249
0.0017229313
0.0061834888
0.0
5.8064557e-05
0.0
0.008009274
0.0002324028
0.0
0.0
0.0
0.0006441658
0.00054976007
0.033206373
0.00095751765
0.0
0.0
0.0001468915
0.0064808936
0.0065400326
0.00034971358
0.0025034002
0.004766813
0.0018335334
0.0066875713
0.0
0.0
0.0
0.005374441
0.04293162
0.003147417
0.0039580655
0.00514771
0.017310599
7.021896e-05
0.010101689
0.020960452
0.0
0.0
0.0
0.00197027
0.0026419656
0.00054537033
0.00042105178
0.0013458927
0.0073627033
0.0037853676
0.0480387
0.0054212636
0.040607847
0.010144542
0.00994184
0.00028553288
0.0017209812
0.031281725
0.0041937805
0.0044100676
0.0
0.00066463335
0.002575653
0.0025506378
0.00026139524
0.082429335
0.0006102917
0.0011713216
0.0
0.003064204
0.0020643983
0.0047228336
0.

### 4.4.3 LightGBM

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

fiftyfifty_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    fiftyfifty_lightgbm_performance_normalized_df = pd.concat([fiftyfifty_lightgbm_performance_normalized_df, new_performance_df])
    

fiftyfifty_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/fiftyfifty_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 11.831 seconds
Cross-validation score: 0.6717039845647655
Test score: 0.5681818181818182
Best Hyperparameters: {}
5396.670615058392
1799.7686098180711
120045.10319454968
134.11846401169896
704.9035040140152
10.366311930119991
0.0
10.069184593856335
14.650080259889364
11.029239892959595
2622.099277164787
133.50733575224876
0.06914160028100014
0.0
387.67689675092697
0.0
460.6848460957408
0.14550599455833435
1.0003018900752068
12.164654809981585
0.5356176905333996
13.354241758584976
45.83915804326534
21.719510078430176
92.94408492743969
17.228439807891846
0.1174900010228157
1.9347860738635063
56.44881973415613
34.66618651151657
13.47803020477295
408.2693593092263
233.10002787411213
331.83079332858324
2042.7317341193557
0.0
0.0
0.0
426.89832520484924
3252.0856094919145
974.9881523400545
32.51532655581832
25.89369957894087
591.4629293754697
24.376262962818146
1.755947008728981
286.3863976188004
0.0
0.0
0.0
151.7824035063386
2228.1085266321898
253.7350911051



# 5. Modeling - Non-Normalization

In [6]:
features = processedData_mxShopFeatures
labels = processedData_mxShopLabels

## 5.1 Rebalancing Strategy - None

### 5.1.1 Random Forest

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

none_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    none_randomforest_nonnormalized_performance_df = pd.concat([none_randomforest_nonnormalized_performance_df, new_performance_df])
    
none_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/none_randomforest_nonnormalized_performance_df.csv")

### 5.1.2 XGBoost

In [14]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


none_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': [],
        'cv_score': [],
        'test_score': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=space, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(type(new_performance))
    print(new_performance_df)
    new_performance_df = pd.DataFrame(new_performance)
    
    cv_score_array = [cv_score]
    test_score_array = [test_score]
  
    # Using 'Address' as the column name
    # and equating it to the list
    new_performance_df['cv_score'] = cv_score_array
    new_performance_df['test_score'] = test_score_array
    
    none_xgboost_nonnormalized_performance_df = pd.concat([none_xgboost_nonnormalized_performance_df, new_performance_df])
    
    print(none_xgboost_nonnormalized_performance_df)

none_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/none_xgboost_nonnormalized_performance_hyperparameter_fhalf_df.csv")




Elapsed time to compute best fit: 1398.972 seconds
Cross-validation score: 0.8580835171925448
Test score: 0.5737704918032787
Best Hyperparameters: {'classifier__min_child_weight': 9, 'classifier__max_depth': 12, 'classifier__learning_rate': 0.35, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 0.8}
0.027060336
0.02657418
0.06091818
0.0
0.033658937
0.011471945
0.0
0.0
0.0
0.0
0.025657088
0.004869667
0.0
0.0
0.030228166
0.0
0.03633201
0.002460159
0.0
0.0
0.0
0.04055345
0.005087352
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0024800962
0.009542161
0.0009427847
0.0
0.0020863754
0.0054643895
0.0
0.010119024
0.0
0.0
0.0
0.0
0.0
0.0097494675
0.0
0.0
0.0
0.0
0.014767831
0.0043677106
0.102277234
0.096444346
0.01572338
0.001752062
0.0
0.004650726
0.0
0.0
0.0
0.0
0.001913054
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.010781068
0.015439428
0.0
0.0051251394
0.030844178
0.017113695
0.01313323
0.027034467
0.0025352498
0.0061360346
0.0
0.0059890784
0.0
0.0044179265
0.0
0.0
0.0122822



Elapsed time to compute best fit: 1497.546 seconds
Cross-validation score: 0.8469812328858612
Test score: 0.655737704918033
Best Hyperparameters: {'classifier__min_child_weight': 9, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.25, 'classifier__gamma': 0.7, 'classifier__colsample_bytree': 1.0}
0.031246787
0.02461585
0.05543514
0.0
0.05777382
0.020388754
0.0
0.0
0.0
0.0
0.023663945
0.0049281716
0.0
0.0
0.02924011
0.0
0.0
0.0
0.0
0.0
0.0
0.03344807
0.007264928
0.015874542
0.0
0.0
0.0
0.0
0.0
0.009750613
0.0
0.0035277444
0.0
0.0035536145
0.0061080065
0.0
0.0
0.0
0.0
0.008565672
0.0
0.0059989695
0.0
0.010863172
0.0
0.028088434
0.0061305254
0.0
0.0
0.0
0.0
0.0067714225
0.011216676
0.0
0.0
0.0
0.0
0.0
0.008336717
0.04405682
0.04864308
0.023500841
0.0
0.0
0.0067943432
0.0
0.0
0.0
0.0
0.01248832
0.0
0.0
0.021231798
0.0
0.0
0.0
0.0
0.011258664
0.0155519415
0.0
0.008903076
0.040253565
0.014384421
0.0
0.02949222
0.0076476717
0.051105417
0.0
0.0051572807
0.0
0.0
0.0
0.020478863
0.0188



Elapsed time to compute best fit: 1585.907 seconds
Cross-validation score: 0.8323520703383404
Test score: 0.8163265306122449
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 8, 'classifier__learning_rate': 0.2, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 0.6}
0.03440917
0.022945596
0.07082729
0.0
0.023787247
0.03016624
0.02156841
0.0
0.0
0.0
0.003128863
0.019886898
0.0
0.0
0.018219894
0.0
0.015219429
0.0
0.0
0.0
0.0
0.034056567
0.011109067
0.0
0.0
0.0
0.0
0.0
0.0
0.0067432355
0.0
0.004874998
0.0
0.0
0.0
0.0041693076
0.0
0.0
0.0
0.007559601
0.007913764
0.0
0.0
0.0
0.0
0.0
0.005508682
0.0
0.0
0.0
0.0
0.0
0.01731819
0.0
0.0
0.0
0.0
0.0071004536
0.0095775435
0.035266843
0.12671457
0.052889667
0.0
0.0
0.0
0.0
0.0
0.0068635796
0.0
0.011170234
0.019358454
0.0
0.0028379175
0.0
0.0
0.0
0.0
0.007790304
0.016254684
0.0
0.0
0.041853666
0.022596652
0.014705816
0.023745086
0.009308911
0.009564578
0.0
0.008830543
0.0
0.004660196
0.0
0.0015069741
0.0187



Elapsed time to compute best fit: 1627.470 seconds
Cross-validation score: 0.8545512331553521
Test score: 0.8490566037735848
Best Hyperparameters: {'classifier__min_child_weight': 9, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.4, 'classifier__gamma': 0.8, 'classifier__colsample_bytree': 0.8}
0.023422407
0.03304791
0.090817176
0.0
0.053193115
0.027778776
0.0
0.0
0.0
0.0
0.020765804
0.0
0.0
0.0
0.028397415
0.0
0.045911577
0.0
0.0
0.0
0.0
0.041613605
0.007819906
0.0
0.0
0.0
0.0
0.0
0.0
0.0139705455
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0038726858
0.0153662935
0.016256718
0.0037978108
0.0
0.0
0.009221184
0.0
0.005030212
0.0
0.0
0.0
0.007938508
0.0
0.013381723
0.0
0.0
0.0
0.0
0.0044746427
0.011828481
0.068275854
0.04075442
0.024449917
0.0087375
0.0
0.008376755
0.0
0.0
0.0
0.0
0.014467896
0.0
0.0
0.009740472
0.0
0.0
0.0
0.0
0.011718019
0.010966483
0.0
0.0055991095
0.048230346
0.007908915
0.015899265
0.01300851
0.0091036
0.009313932
0.0
0.0
0.0
0.004307201
0.0
0.0
0.0030003039
0.0




Elapsed time to compute best fit: 1575.189 seconds
Cross-validation score: 0.81053289766713
Test score: 0.9016393442622951
Best Hyperparameters: {'classifier__min_child_weight': 9, 'classifier__max_depth': 2, 'classifier__learning_rate': 0.55, 'classifier__gamma': 0.7, 'classifier__colsample_bytree': 0.9}
0.042415038
0.017314587
0.11133856
0.0
0.059716824
0.0
0.01888198
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.021411968
0.0
0.021610761
0.0
0.0
0.0
0.0
0.040782783
0.015798692
0.0
0.0
0.0
0.0
0.0
0.0
0.009489368
0.0
0.0
0.0
0.0
0.01877317
0.0
0.0
0.0
0.0
0.008178784
0.016609356
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.020094935
0.0
0.0
0.0
0.0
0.0
0.0067750467
0.20170622
0.045003857
0.0
0.00998987
0.0
0.0064825197
0.0
0.0
0.0
0.0
0.0
0.01508205
0.0
0.013966216
0.0
0.0
0.0
0.0
0.007167047
0.008354575
0.0
0.010448437
0.06432578
0.0
0.0
0.034295186
0.0043684496
0.02147952
0.0
0.0
0.0
0.0
0.0
0.0
0.022150174
0.0
0.0
0.0037598314
0.0
0.010039915
0.0
0.009797187
0.0
0.004933614
0.0
0.0
0.009946545
0



Elapsed time to compute best fit: 1628.539 seconds
Cross-validation score: 0.7973849988281227
Test score: 0.8771929824561404
Best Hyperparameters: {'classifier__min_child_weight': 9, 'classifier__max_depth': 16, 'classifier__learning_rate': 0.2, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 0.8}
0.023663705
0.02195563
0.042025615
0.0
0.044185955
0.013943908
0.0
0.0
0.0
0.0
0.041066658
0.006101533
0.0
0.0
0.02262935
0.0
0.024247115
0.0
0.0
0.0
0.0
0.005892229
0.009274564
0.018120099
0.0
0.016355418
0.0
0.0
0.0066096033
0.009189359
0.0
0.00434883
0.006461588
0.0
0.017226866
0.0
0.0
0.0
0.0
0.013299914
0.014918481
0.0
0.0
0.0077795265
0.0
0.0
0.010377186
0.0
0.0
0.0
0.013295432
0.0
0.007601033
0.0
0.0
0.0050971247
0.0
0.008489592
0.007795713
0.032840755
0.034264315
0.058833595
0.0
0.0
0.0
0.0
0.00586845
0.0
0.007893474
0.0086417
0.014052247
0.0013769238
0.012781253
0.0
0.0025262416
0.0
0.0
0.0047153467
0.017105697
0.0
0.0075047165
0.033761755
0.02842155
0.027578248
0.02796687




Elapsed time to compute best fit: 1585.669 seconds
Cross-validation score: 0.8402307193884665
Test score: 0.7317073170731707
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 18, 'classifier__learning_rate': 0.55, 'classifier__gamma': 0.7, 'classifier__colsample_bytree': 0.2}
0.061685972
0.017314395
0.059070025
0.0
0.035394896
0.0
0.0
0.0
0.0
0.0
0.03921107
0.0
0.0
0.0
0.020391827
0.0
0.019041682
0.0
0.0061031412
0.013225303
0.0
0.014796904
0.0041940026
0.0
0.0
0.0
0.0
0.0
0.0
0.0049587404
0.0
0.0
0.014444299
0.0
0.003933671
0.0
0.0
0.018904597
0.0
0.008850975
0.009723062
0.033510942
0.0
0.0
0.008865407
0.0050042206
0.0042697634
0.0
0.0
0.0
0.021708148
0.0
0.0
0.0
0.0
0.0
0.0
0.012932643
0.005970247
0.0
0.004821477
0.046222053
0.0
0.0
0.0031827602
0.012540288
0.0
0.0
0.0
0.010929249
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.007003825
0.014939473
0.0
0.0
0.026239786
0.019622933
0.009260473
0.04839845
0.013188019
0.00384391
0.0
0.0047608884
0.0
0.0
0.0
0.0
0.0229467



Elapsed time to compute best fit: 1670.892 seconds
Cross-validation score: 0.7911324651431774
Test score: 0.7692307692307693
Best Hyperparameters: {'classifier__min_child_weight': 4, 'classifier__max_depth': 12, 'classifier__learning_rate': 0.4, 'classifier__gamma': 0.0, 'classifier__colsample_bytree': 0.9}
0.014651898
0.016049547
0.032367405
0.010188894
0.025099536
0.0076975906
0.0
0.0019914191
0.01050671
0.0
0.05367147
0.0074781124
0.0
0.0
0.018533573
0.0
0.0033997938
0.026582694
0.0
0.0
0.0
0.033418704
0.005156682
0.0
0.0
0.0
0.0149444295
0.0
0.00214083
0.014495571
0.009753681
0.01149922
0.023408057
0.00859064
0.0018341261
0.0
0.0
0.0019426807
0.0067134397
0.0038017868
0.0050118626
0.004472599
0.024432227
0.0070314216
0.018197877
0.019282231
0.0
0.0
0.0
0.0
0.0039344234
0.0
0.006139644
0.0
0.0
0.0
0.007667579
0.0
0.0060908394
0.04325637
0.052321352
0.067367606
0.011061435
0.00239858
0.0029787817
0.0039755823
0.0
0.0
0.007778987
0.009077878
0.007991638
0.0
0.010238003
0.0
0.009137376



Elapsed time to compute best fit: 1605.801 seconds
Cross-validation score: 0.8276506678883588
Test score: 0.7894736842105263
Best Hyperparameters: {'classifier__min_child_weight': 7, 'classifier__max_depth': 2, 'classifier__learning_rate': 0.25, 'classifier__gamma': 0.7, 'classifier__colsample_bytree': 0.3}
0.03479883
0.016235813
0.10923325
0.0
0.03142682
0.011070861
0.011721479
0.0
0.0
0.0
0.013109973
0.007312582
0.0
0.0
0.051585503
0.0
0.024258161
0.0
0.0
0.0
0.0
0.03608265
0.0051737074
0.0093201
0.019617537
0.013788378
0.0
0.0
0.0
0.013143515
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0060430253
0.0
0.038757358
0.008249855
0.023233302
0.0
0.0
0.003835126
0.0
0.0
0.0
0.0
0.0
0.007889562
0.0
0.012478467
0.0
0.0
0.0
0.0
0.0058688545
0.0
0.0
0.07297753
0.03029146
0.0
0.0
0.0
0.0
0.0
0.0
0.007823972
0.005683502
0.010563157
0.003075074
0.008140188
0.0
0.0
0.0
0.0
0.004463916
0.005920045
0.0
0.005463874
0.041129418
0.021824896
0.016433593
0.005780097
0.009502471
0.0041888086
0.0
0.0
0.0
0.0
0.0
0.01134



Elapsed time to compute best fit: 1457.680 seconds
Cross-validation score: 0.7963967890741345
Test score: 0.8196721311475408
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.55, 'classifier__gamma': 0.9, 'classifier__colsample_bytree': 0.6}
0.03021961
0.021977192
0.07026057
0.0
0.029757163
0.0
0.0
0.0
0.0
0.0
0.0140219815
0.062335797
0.0
0.0
0.051566023
0.0
0.046323255
0.0
0.0
0.0
0.0
0.04677971
0.018396966
0.0
0.0
0.0
0.0
0.0
0.0
0.010831753
0.0037822127
0.0051650438
0.012098678
0.0
0.006512684
0.0
0.0
0.0
0.0
0.006841854
0.013939798
0.0
0.0
0.0052278573
0.0
0.0052791745
0.005263227
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.014650741
0.004600978
0.024749698
0.10955219
0.0
0.0
0.007572385
0.0
0.0
0.0
0.0
0.01364155
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.005399542
0.020683238
0.0
0.021381218
0.07250879
0.016965961
0.011372958
0.0
0.0
0.021322362
0.0
0.01964055
0.0
0.007772356
0.0
0.0
0.040327005
0.0
0.0
0.006138463
0.0
0.0



Elapsed time to compute best fit: 1638.508 seconds
Cross-validation score: 0.8066844882191452
Test score: 0.7792207792207791
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.5, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 0.8}
0.031261735
0.03108646
0.0599805
0.0
0.025391875
0.050957188
0.020008473
0.0
0.0
0.0
0.042697694
0.0039195125
0.0
0.0
0.03645991
0.0
0.03473061
0.0
0.0
0.0
0.0
0.05157139
0.011440879
0.018057711
0.0
0.0
0.0
0.0
0.0
0.0
0.0069201584
0.0
0.012594747
0.0
0.0033360422
0.0
0.0
0.024917854
0.002556083
0.008154499
0.015056738
0.005834713
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0029573985
0.0
0.0038417429
0.0
0.0
0.0
0.0
0.0
0.0065095234
0.024390882
0.113401495
0.04862199
0.0027279328
0.0
0.0039896592
0.0
0.0
0.0
0.0
0.009439437
0.0
0.0
0.009460472
0.0
0.0
0.0
0.0
0.0067520784
0.009982855
0.0
0.0
0.037503093
0.016904978
0.022190005
0.019000662
0.00091402075
0.0
0.0
0.011834588
0.0
0.0019900517
0



Elapsed time to compute best fit: 1532.185 seconds
Cross-validation score: 0.8255699635562335
Test score: 0.6666666666666666
Best Hyperparameters: {'classifier__min_child_weight': 4, 'classifier__max_depth': 8, 'classifier__learning_rate': 0.55, 'classifier__gamma': 0.5, 'classifier__colsample_bytree': 0.1}
0.024543049
0.00831028
0.038782846
0.0
0.03681212
0.0021150687
0.014999829
0.0
0.015480613
0.0
0.03372744
0.004601714
0.0141901905
0.0
0.0
0.013133407
0.043022998
0.005395967
0.012203612
0.035463613
0.016646953
0.028047865
0.0119060455
0.013960314
0.0033841114
0.0
0.0026119354
0.0029578279
0.015973585
0.007920673
0.0
0.0049278103
0.0
0.0
0.005104083
0.0034743056
0.002589165
0.0
0.0030455473
0.034465052
0.0
0.006661427
0.002774527
0.002463605
0.005790954
0.002828993
0.01235152
0.0
0.0
0.0
0.0078054415
0.0
0.0050241314
0.0
0.0
0.0
0.0064982735
0.0049138693
0.012271332
0.0059918105
0.05364388
0.009353136
0.003948906
0.0028419034
0.0038621346
0.0
0.0057247495
0.0
0.00246396
0.005761758




Elapsed time to compute best fit: 1557.480 seconds
Cross-validation score: 0.8370298656339846
Test score: 0.7142857142857142
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 20, 'classifier__learning_rate': 0.2, 'classifier__gamma': 0.7, 'classifier__colsample_bytree': 0.2}
0.022456435
0.022872915
0.07106755
0.0
0.017550718
0.016993487
0.0061123534
0.0
0.01564303
0.0
0.019006291
0.033139586
0.0
0.0
0.0042608427
0.0034895083
0.01176273
0.010018735
0.0
0.003501987
0.0
0.023634983
0.0061513656
0.0019908375
0.04694696
0.0
0.0
0.0
0.0
0.00561257
0.0
0.007390417
0.007079714
0.004921866
0.0
0.007229518
0.0
0.0021084894
0.015871705
0.016011605
0.024781806
0.027077574
0.0
0.03243608
0.0070359935
0.008467675
0.0026715396
0.0
0.0
0.0
0.0067255935
0.0054709534
0.0053796843
0.0
0.0
0.0
0.028547885
0.0038794724
0.0047538206
0.011563818
0.024760885
0.034382526
0.0
0.0
0.0026199364
0.0
0.0
0.0
0.0
0.0076042293
0.0057429974
0.0
0.006201258
0.0
0.0
0.002392902
0.0
0.00



Elapsed time to compute best fit: 1521.631 seconds
Cross-validation score: 0.8524214736623387
Test score: 0.8163265306122449
Best Hyperparameters: {'classifier__min_child_weight': 8, 'classifier__max_depth': 16, 'classifier__learning_rate': 0.55, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 0.9}
0.021643942
0.04155194
0.04053874
0.0
0.04416553
0.026722938
0.0
0.0007264727
0.00966886
0.0
0.0
0.005265869
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.030745802
0.0056598624
0.0
0.0055331625
0.0
0.0
0.0
0.0
0.0
0.0
0.0013478128
0.0042530657
0.0
0.012562218
0.0
0.0
0.0
0.0014250792
0.010861232
0.023579376
0.06546404
0.0
0.0
0.0
0.0
0.012278082
0.0
0.0
0.0
0.0044278274
0.0028117143
0.018242821
0.0
0.0
0.0
0.0
0.0
0.007561177
0.06006602
0.10063314
0.05305441
0.010637318
0.0
0.0041645435
0.017791336
0.0
0.0
0.002606198
0.023892974
0.009050708
0.0
0.0040711584
0.0
0.0
0.0
0.0
0.005676488
0.018772649
0.0
0.00644925
0.050698258
0.0036346724
0.010490921
0.0
0.0070226016
0.004804665
0.0
0.01818



Elapsed time to compute best fit: 1455.131 seconds
Cross-validation score: 0.8519444747133533
Test score: 0.7777777777777777
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 12, 'classifier__learning_rate': 0.25, 'classifier__gamma': 0.6, 'classifier__colsample_bytree': 0.1}
0.0362863
0.039779216
0.062741384
0.0
0.029504707
0.0074693686
0.013276365
0.0
0.018414157
0.0
0.0
0.012189711
0.0
0.0
0.0
0.0032284274
0.02250143
0.005032219
0.0
0.013647152
0.025813185
0.027935034
0.010861295
0.005148955
0.003384772
0.0070602763
0.0
0.014023594
0.0
0.020472387
0.0
0.0033607364
0.0
0.0
0.0
0.008346337
0.0
0.0021626092
0.00991079
0.004862918
0.0077928524
0.023484303
0.0028310865
0.0026628915
0.0
0.0038185627
0.007032476
0.0
0.0
0.0
0.002398756
0.008181687
0.0066620884
0.0
0.0
0.0
0.011716205
0.0
0.0045880037
0.012193406
0.051256612
0.03200141
0.0025362866
0.0
0.007938579
0.0
0.0
0.003096965
0.0
0.0058880867
0.0
0.004112216
0.0
0.0
0.0
0.0
0.0
0.0037270186
0.003773



Elapsed time to compute best fit: 1375.025 seconds
Cross-validation score: 0.8527972448582177
Test score: 0.660377358490566
Best Hyperparameters: {'classifier__min_child_weight': 6, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.15, 'classifier__gamma': 0.7, 'classifier__colsample_bytree': 0.4}
0.029836249
0.014501749
0.0432135
0.0
0.015269862
0.008428171
0.0
0.0
0.0073453593
0.0065262173
0.009992246
0.011997316
0.0
0.0
0.015691308
0.013891175
0.028458063
0.0
0.0
0.0
0.030084077
0.019105751
0.0081232805
0.0
0.0
0.0
0.0
0.0
0.0
0.007419197
0.006116866
0.0038081235
0.0
0.0
0.0
0.0
0.0
0.0
0.012474128
0.013121254
0.0058882087
0.013046848
0.064875826
0.013057631
0.008318499
0.007459388
0.004345794
0.0
0.0
0.0
0.006126439
0.0041454663
0.009368843
0.0
0.0
0.0
0.0
0.0
0.0
0.034651984
0.025650349
0.060548574
0.0
0.0
0.0
0.0027552329
0.0
0.0
0.008675352
0.012664734
0.017448058
0.0
0.011629012
0.0
0.0
0.0
0.0
0.005210222
0.018424299
0.0068941326
0.006541003
0.0344616
0.01514494
0.01



Elapsed time to compute best fit: 1557.767 seconds
Cross-validation score: 0.8655896988848933
Test score: 0.7407407407407408
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.6, 'classifier__gamma': 0.0, 'classifier__colsample_bytree': 0.4}
0.049029894
0.032901026
0.07706269
0.0038442763
0.034602396
0.0
0.0
0.0
0.0
0.0
0.0
0.0102778645
0.0
0.0
0.0
0.0
0.035680298
0.0
0.0
0.0
0.0
0.033453472
0.0028055601
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.009005654
0.0
0.02044309
0.0042203213
0.0
0.047693625
0.0049471357
0.0101508
0.005326472
0.0
0.0
0.0
0.007639247
0.0
0.003831852
0.0
0.0
0.0
0.0
0.0
0.0027496433
0.04262364
0.020632584
0.14511158
0.0
0.0
0.0034536205
0.0
0.0
0.0
0.0013974378
0.0
0.03093984
0.0
0.016479203
0.0
0.0
0.0
0.0
0.0063438937
0.007756429
0.002429222
0.01807963
0.032510005
0.022675447
0.015959397
0.0
0.002880803
0.009316059
0.0
0.0014441338
0.0
0.0064435555
0.0
0.0025857151
0.05185796



Elapsed time to compute best fit: 1331.115 seconds
Cross-validation score: 0.8408237105993066
Test score: 0.7777777777777777
Best Hyperparameters: {'classifier__min_child_weight': 3, 'classifier__max_depth': 2, 'classifier__learning_rate': 0.2, 'classifier__gamma': 0.3, 'classifier__colsample_bytree': 0.6}
0.030296715
0.015574882
0.09014362
0.010712637
0.03877248
0.01602085
0.017128255
0.0
0.014282883
0.0
0.016687956
0.018629476
0.0
0.0
0.02337949
0.0
0.015367319
0.0
0.0
0.0
0.0
0.029385015
0.0
0.0
0.0
0.010359352
0.0
0.0
0.0
0.0
0.0
0.003096814
0.0013682799
0.0
0.009990874
0.0
0.0
0.0033112033
0.0
0.0
0.011008514
0.0
0.0
0.016639512
0.020664291
0.008163209
0.0
0.0
0.0
0.0
0.005109932
0.0
0.008271528
0.0
0.0
0.0
0.004567231
0.0025759456
0.008710413
0.043916438
0.04680552
0.022942506
0.012809973
0.0
0.0036316735
0.0042577954
0.0
0.0
0.0075774966
0.009508162
0.006201367
0.0
0.0
0.0
0.0
0.0
0.0
0.0031355696
0.0068151397
0.0
0.0
0.052708738
0.01515592
0.0
0.037363395
0.02274633
0.0
0.0
0.0



Elapsed time to compute best fit: 1620.029 seconds
Cross-validation score: 0.82769706269005
Test score: 0.7058823529411765
Best Hyperparameters: {'classifier__min_child_weight': 7, 'classifier__max_depth': 2, 'classifier__learning_rate': 0.5, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 1.0}
0.032322057
0.018382782
0.05843469
0.0
0.025330769
0.02450468
0.0
0.0
0.045940384
0.0
0.07628528
0.0
0.0
0.0
0.03212443
0.0
0.029252537
0.0
0.0
0.0
0.0
0.03864527
0.0038189522
0.005480243
0.0
0.0
0.0
0.0
0.0
0.011373758
0.0046595945
0.00468784
0.0026619781
0.0
0.004754598
0.0
0.0
0.0
0.0042585516
0.013789252
0.008975683
0.002413596
0.0
0.0
0.0
0.0
0.0018146057
0.0
0.0
0.0
0.0035259377
0.0
0.006048507
0.0
0.0
0.0
0.003726551
0.015733069
0.0031865034
0.14292333
0.0
0.03470034
0.0
0.0
0.0043989425
0.0
0.0
0.0
0.0
0.0032029015
0.015637213
0.0
0.009513105
0.0
0.0
0.0
0.0
0.0041725496
0.0067171585
0.0016727813
0.010540143
0.041652896
0.02829771
0.0
0.0
0.0076298374
0.0
0.0
0.005899286
0.0
0.



Elapsed time to compute best fit: 1514.991 seconds
Cross-validation score: 0.8123205590726075
Test score: 0.7547169811320755
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.3, 'classifier__gamma': 0.4, 'classifier__colsample_bytree': 0.3}
0.03477623
0.0146381
0.072828285
0.0
0.025654485
0.05265948
0.008024569
0.010236505
0.0
0.0
0.018324314
0.0
0.0
0.0
0.01965831
0.014055947
0.002993202
0.0
0.0
0.0
0.0
0.034417037
0.0067689638
0.011746616
0.0
0.0
0.0
0.0
0.0
0.0026019316
0.0
0.0
0.007955087
0.0
0.0043490506
0.0
0.0
0.0
0.0
0.00777432
0.030600898
0.019166362
0.0
0.0
0.008973131
0.0
0.0041130357
0.0
0.0
0.0
0.0
0.0
0.0088985115
0.0
0.0
0.0
0.013342607
0.0043329094
0.005430892
0.032681752
0.032013718
0.04285833
0.0079679955
0.011401423
0.0055398685
0.0
0.0
0.0
0.00443522
0.008139184
0.006912443
0.0
0.009606989
0.0
0.0
0.0
0.0
0.008325484
0.022302218
0.0013506141
0.0033320882
0.03902884
0.03103534
0.024467345
0.01632143



Elapsed time to compute best fit: 1526.148 seconds
Cross-validation score: 0.8034970691352177
Test score: 0.7894736842105263
Best Hyperparameters: {'classifier__min_child_weight': 1, 'classifier__max_depth': 12, 'classifier__learning_rate': 0.15, 'classifier__gamma': 0.6, 'classifier__colsample_bytree': 0.4}
0.017323501
0.011286418
0.030507386
0.0038713363
0.012113552
0.008912943
0.0064142533
0.005442061
0.0061532524
0.006569197
0.0075518605
0.0064636376
0.0
0.0
0.009162448
0.012492706
0.0154045485
0.004692261
0.004147171
0.0
0.0062378454
0.018265462
0.0085553955
0.009379002
0.010582722
0.018378722
0.0
0.0062251533
0.0040343455
0.008127265
0.0
0.005933266
0.0077891336
0.012177072
0.0070495713
0.006363657
0.0025265033
0.006039981
0.012085311
0.010720868
0.017034644
0.010824145
0.0077618994
0.009397161
0.012699667
0.007245022
0.0071975146
0.0
0.0
0.0
0.00900556
0.0083019715
0.0052252547
0.010892463
0.0
0.0
0.014916417
0.0032462855
0.009366101
0.027556414
0.019319016
0.049003627
0.0152107



Elapsed time to compute best fit: 1517.312 seconds
Cross-validation score: 0.8130625574376958
Test score: 0.6923076923076923
Best Hyperparameters: {'classifier__min_child_weight': 1, 'classifier__max_depth': 16, 'classifier__learning_rate': 0.3, 'classifier__gamma': 0.8, 'classifier__colsample_bytree': 0.8}
0.017222555
0.02006795
0.024390291
0.012172934
0.01526878
0.010819824
0.0083431685
0.0
0.0065542897
0.004889781
0.013579792
0.0037732706
0.0
0.0
0.01128631
0.012570192
0.012685544
0.0053379377
0.0
0.0
0.011311333
0.016390273
0.0053150174
0.0046773157
0.010667261
0.0
0.0
0.0
0.0058019455
0.0067280973
0.008773183
0.0
0.0
0.0066051786
0.007379075
0.0
0.0
0.0
0.008646832
0.017182644
0.011390239
0.01855783
0.017580345
0.0071496386
0.018270891
0.0
0.003899414
0.0
0.0
0.0
0.007062233
0.0051773386
0.011613422
0.0
0.0
0.014103161
0.0046404027
0.019391984
0.009772104
0.03120659
0.04713799
0.0818144
0.0050013303
0.0
0.0
0.0
0.008824001
0.0
0.008191303
0.011810686
0.019962788
0.0
0.009271174
0.



Elapsed time to compute best fit: 1567.466 seconds
Cross-validation score: 0.8400198684763531
Test score: 0.7246376811594203
Best Hyperparameters: {'classifier__min_child_weight': 1, 'classifier__max_depth': 18, 'classifier__learning_rate': 0.15, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 0.5}
0.017783398
0.010445803
0.023265544
0.0069849156
0.013042193
0.009539294
0.0072332597
0.0044642272
0.0071337754
0.004953865
0.010919995
0.005994347
0.002657501
0.0
0.00947422
0.0
0.020102803
0.0047182417
0.005347487
0.0052059675
0.009526406
0.012553446
0.004668325
0.003976835
0.008236402
0.0
0.0
0.0043390603
0.0066785053
0.0036627545
0.004747202
0.01801832
0.011034906
0.0032005971
0.006261696
0.004800747
0.00469419
0.006768458
0.013532625
0.03761838
0.0087037375
0.0143677145
0.006070503
0.009417574
0.013436762
0.03934586
0.0042739236
0.0
0.0
0.0
0.0076544173
0.0004485746
0.0049625686
0.0055663814
0.0
0.0
0.00753941
0.0048160967
0.0036263578
0.0301554
0.031324908
0.03888488
0.006316



Elapsed time to compute best fit: 1681.048 seconds
Cross-validation score: 0.8176605875690542
Test score: 0.8461538461538461
Best Hyperparameters: {'classifier__min_child_weight': 9, 'classifier__max_depth': 8, 'classifier__learning_rate': 0.25, 'classifier__gamma': 0.6, 'classifier__colsample_bytree': 0.2}
0.027981607
0.019025546
0.039914478
0.0
0.025728663
0.012835792
0.0021307191
0.0
0.015966978
0.0
0.02019319
0.016637834
0.0
0.0
0.019855244
0.0029060037
0.010486606
0.0
0.0030290885
0.008340126
0.0
0.015449965
0.0040073553
0.009077386
0.018733641
0.008455257
0.0076214727
0.0
0.0
0.012157812
0.0
0.0
0.003479851
0.0
0.016277775
0.004577668
0.0
0.0059053013
0.012100523
0.014764965
0.005198684
0.016816402
0.0
0.012541658
0.0051368135
0.00817843
0.004454905
0.0
0.0
0.0
0.0
0.011866252
0.0075404863
0.0
0.0
0.0
0.021503156
0.0026310477
0.007552999
0.0
0.066509865
0.035368416
0.0020129674
0.0
0.0030403954
0.009506022
0.004358728
0.0
0.0
0.01479404
0.0
0.0
0.0068007526
0.0
0.0
0.0
0.0
0.0027



Elapsed time to compute best fit: 1490.370 seconds
Cross-validation score: 0.8282951177466567
Test score: 0.8771929824561404
Best Hyperparameters: {'classifier__min_child_weight': 10, 'classifier__max_depth': 20, 'classifier__learning_rate': 0.35, 'classifier__gamma': 0.5, 'classifier__colsample_bytree': 0.3}
0.04014066
0.02067864
0.058260985
0.0
0.028923104
0.0
0.011637523
0.0
0.0
0.0
0.03864932
0.0
0.0
0.0
0.040493187
0.0
0.013114424
0.0
0.0
0.027486484
0.0
0.043072112
0.0054737604
0.0
0.0
0.0
0.0
0.0
0.006639543
0.01238989
0.0
0.0048587057
0.0056446698
0.0
0.0
0.0
0.0
0.0
0.02143818
0.012465622
0.006913664
0.014057444
0.0
0.005579692
0.008386345
0.004165718
0.0030776907
0.0
0.0
0.0
0.008532318
0.0
0.015436978
0.0
0.0
0.0
0.0
0.012169782
0.0040262644
0.056593277
0.03615816
0.05604669
0.0
0.0
0.002093178
0.0
0.0
0.0
0.017853608
0.0060463552
0.022801097
0.0
0.007917485
0.0
0.0
0.0
0.0
0.004084143
0.012184312
0.0
0.0024262457
0.023723288
0.03792321
0.02349078
0.01664557
0.0054342924
0.0

### 5.1.3 LightGBM

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

none_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    none_lightgbm_performance_nonnormalized_df = pd.concat([none_lightgbm_performance_nonnormalized_df, new_performance_df])
    

none_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/none_lightgbm_performance_nonnormalized_df.csv")


## 5.2 Rebalancing Strategy - SMOTE

### 5.2.1 Random Forests

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

smote_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [['smote', SMOTE()],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    smote_randomforest_nonnormalized_performance_df = pd.concat([smote_randomforest_nonnormalized_performance_df, new_performance_df])
    
smote_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/smote_randomforest_nonnormalized_performance_df.csv")

### 5.2.2 XGBoost

In [None]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


smote_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    smote_xgboost_nonnormalized_performance_df = pd.concat([smote_xgboost_nonnormalized_performance_df, new_performance_df])
    

smote_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/smote_xgboost_nonnormalized_performance_df.csv")


### 5.2.3 LightGBM

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

smote_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    smote_lightgbm_performance_nonnormalized_df = pd.concat([smote_lightgbm_performance_nonnormalized_df, new_performance_df])
    

smote_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/smote_lightgbm_performance_nonnormalized_df.csv")


## 5.3 Rebalancing Strategy - UNDER

### 5.3.1 Random Forest

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

under_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [#['smote', SMOTE()],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    under_randomforest_nonnormalized_performance_df = pd.concat([under_randomforest_nonnormalized_performance_df, new_performance_df])
    
under_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/under_randomforest_nonnormalized_performance_df.csv")

### 5.3.2 XGBoost

In [None]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


under_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    under_xgboost_nonnormalized_performance_df = pd.concat([under_xgboost_nonnormalized_performance_df, new_performance_df])
    

under_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/under_xgboost_nonnormalized_performance_df.csv")


### 5.3.3 LightGBM

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

under_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    under_lightgbm_performance_nonnormalized_df = pd.concat([under_lightgbm_performance_nonnormalized_df, new_performance_df])
    

under_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/under_lightgbm_performance_nonnormalized_df.csv")


## 5.1 Rebalancing Strategy - 5050

### 5.4.1 Random Forest

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

fiftyfifty_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    fiftyfifty_randomforest_nonnormalized_performance_df = pd.concat([fiftyfifty_randomforest_nonnormalized_performance_df, new_performance_df])
    
fiftyfifty_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_randomforest_nonnormalized_performance_df.csv")

### 5.4.2 XGBoost

In [None]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


fiftyfifty_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    fiftyfifty_xgboost_nonnormalized_performance_df = pd.concat([fiftyfifty_xgboost_nonnormalized_performance_df, new_performance_df])
    

fiftyfifty_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_xgboost_nonnormalized_performance_df.csv")


### 5.4.3 LightGBM

In [None]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

fiftyfifty_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    fiftyfifty_lightgbm_performance_nonnormalized_df = pd.concat([fiftyfifty_lightgbm_performance_nonnormalized_df, new_performance_df])
    

fiftyfifty_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/fiftyfifty_lightgbm_performance_nonnormalized_df.csv")
