In [1]:
#Import Python Libraries
import pandas as pd
import pickle
import time
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')

#Import Self-written Functions
import os
import sys
src_dir = os.path.join(os.getcwd(), '..', 'src')
sys.path.append(src_dir)

from d00_utils.calculateTimeDifference import calculateTimeDifference #Function to calc time difference
from d01_data.loadCommits import loadCommits #Function to load SVN data
from d02_intermediate.cleanCommitData import cleanCommitData #Function to clean commit data
from d02_intermediate.cleanJiraData import cleanJiraData #Function to clean JIRA data

from d03_processing.createFittedTF_IDF import createFittedTF_IDF #Function to see if a trace is valid
from d03_processing.createCorpusFromDocumentList import createCorpusFromDocumentList #Function to create a corpus
from d03_processing.checkValidityTrace import checkValidityTrace #Function to see if a trace is valid
from d03_processing.calculateTimeDif import calculateTimeDif #Calculate the time difference between 2 dates in seconds
from d03_processing.checkFullnameEqualsEmail import checkFullnameEqualsEmail #Check if fullName is equal to the email
from d03_processing.calculateCosineSimilarity import calculateCosineSimilarity #Calculate the cos similarity
from d03_processing.calculateDocumentStatistics import calculateUniqueWordCount
from d03_processing.calculateDocumentStatistics import calculateTotalWordCount
from d03_processing.calculateDocumentStatistics import calculateOverlapBetweenDocuments

from d04_modelling.summariseClassDistribution import summariseClassDistribution #Visualize the class distribution
from d04_modelling.showModelPerformance import showModelPerformance # Show several performance measures

#Display full value of a column
pd.set_option('display.max_colwidth', None)

#Display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Load Raw Data

In [2]:
#Set dataset

datasetDirectory = ""

In [3]:
#Import raw JIRA dataset
rawData_JIRA_mxShop = pd.read_excel('../data/01_raw/JIRA Mendix Engagement export_22_06_2021.xlsx')

#import
rawData_SVN_dataProcessing = loadCommits('../data/01_raw/MxShop-dump.txt')

# 2. Clean Raw Data
## 2.1 Clean Raw Data - SVN Data
Clean the raw data of the SVN files

In [4]:
from datetime import datetime
import re
import pandas as pd
import string

#nltk for NLP 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import ngrams

#Function to transform natural text into unigram tokens
def preprocessNaturalLanguage(text, porterStemmer, cachedStopWords):
    string_text = str(text)
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_interpunction)
    
    #remove stopwords
    tokens_without_sw = [word for word in tokens if not word in cachedStopWords]
    
    #Stem the tokens
    stemmedToken = list(map(porterStemmer.stem, tokens_without_sw))

    return(stemmedToken)

#Function to transform natural text into n-gram tokens
def preprocessNGrams(text, porterStemmer, cachedStopWords, nGramSize):
    string_text = str(text)
    
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_interpunction)
    
    #Create the ngrams
    ngrams = list(nltk.ngrams(tokens, nGramSize))
    
    #remove all the n-grams containing a stopword
    cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
    #Stem the tokens
    stemmedNGrams = []
    for ngram in cleanNGrams:
        stemmed = list(map(porterStemmer.stem, ngram))
        stemmedNGrams.append(stemmed)
    return(stemmedNGrams)

#Function to transform date into a date object
def preprocessCommitDate(date_string):
    date_time_obj = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')  
    return(date_time_obj)
    
#Remove the found Issue key from the log
def removeIssueKey(log_message):
    issue_keys = re.findall(r"LRN+.[0-9]+|AFM+.[0-9]+|MA+.[0-9]+|AFI+.[0-9]+|EM+.[0-9]+|OE+.[0-9]+|EM+.[0-9]+", log_message)
    log_message_without_key = log_message
    for issue_key in issue_keys:
        log_message_without_key = log_message_without_key.replace(issue_key, "")
    return(log_message_without_key)

def unitNamesLambdaFunc(unitName, stemmer):
    #Lower case
    unitNameLowered = unitName.lower()
    
    #Remove interpunction
    noInterpunction = unitNameLowered.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    noNumbers = ''.join([i for i in noInterpunction if not i.isdigit()])
    
    stemmendUnitName = stemmer.stem(noInterpunction)
    
    
    return(stemmendUnitName)
    

def preprocessUnitNames(unitName, porterStemmer, cachedStopWords):
    if (isinstance(unitName, str)):
        #Split camelCasing
        unitNameSplitList = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', unitName)).split()
        
        porterStemmer = PorterStemmer() #create an object of class PorterStemmer
        
        #Preprocess each split found.
        unitNameLowered = list(map(lambda unitName: unitNamesLambdaFunc(unitName, porterStemmer), 
                                   unitNameSplitList))
        
        #Check for stopwords
        tokensWithoutSW = [word for word in unitNameLowered if not word in cachedStopWords]

        return(tokensWithoutSW)

def preprocessNGramsUnitNames(unitName, porterStemmer, cachedStopWords, nGramSize):
    if (isinstance(unitName, str)):
        #Split camelCasing
        unitNameSplitList = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', unitName)).split()
        
        cleanedUnitNames = []
        for unitNameSplit in unitNameSplitList:
            #Lower case unit names
            lowerCased = unitNameSplit.lower()

            #Remove interpunction
            removedInterpunction = lowerCased.translate(str.maketrans('','',string.punctuation))
            cleanedUnitNames.append(removedInterpunction)
            
        #Transform to string (needed for tokenizer
        unitNameString = ' '.join(cleanedUnitNames)

        #Tokenzize words
        tokenized = word_tokenize(unitNameString)
        
        #Create the ngrams
        ngrams = list(nltk.ngrams(tokenized, nGramSize))
        
        porterStemmer = PorterStemmer() #create an object of class PorterStemmer
        
        #remove all the n-grams containing a stopword
        cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
        #Stem the tokens
        stemmedNGrams = []
        for ngram in cleanNGrams:
            stemmed = list(map(porterStemmer.stem, ngram))
            stemmedNGrams.append(stemmed)
            
        return(stemmedNGrams)

#Method to clean all columns of the provided data
def cleanCommitData(rawCommitData): 
    #create an object of class PorterStemmer
    porterStemmer = PorterStemmer()
    
    #Find all stopwords
    cachedStopWords = stopwords.words("english")
    
    #Remove all revisions without an issue key in the log message
    commit_df = rawCommitData[rawCommitData["related_issue_key"].notna()]

    #Execute cleaning methods on dataset
    cleaned_commit_logs = commit_df['log'].apply(lambda x: removeIssueKey(x))
    processed_commit_logs = cleaned_commit_logs.apply(lambda x: preprocessNaturalLanguage(x, porterStemmer, cachedStopWords))
    processed_commit_logs_2grams = cleaned_commit_logs.apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    processed_commit_logs_3grams = cleaned_commit_logs.apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 3))
    processed_date_times = commit_df['date'].apply(lambda x: preprocessCommitDate(x))
    processed_unit_names = commit_df['impacted_unit_names'].apply(lambda x: preprocessUnitNames(x, porterStemmer, cachedStopWords))
    processed_unit_names_2grams = commit_df['impacted_unit_names'].apply(lambda x: preprocessNGramsUnitNames(x, porterStemmer, cachedStopWords, 2))
    processed_unit_names_3grams = commit_df['impacted_unit_names'].apply(lambda x: preprocessNGramsUnitNames(x, porterStemmer, cachedStopWords, 3))
    

    #Put all data together into a new dataframe
    commit_data = {'Revision': commit_df["revision"],
               'Email' : commit_df["email"],
               'Commit_date': processed_date_times,
               "Issue_key_commit": commit_df["related_issue_key"],
               'Logs': processed_commit_logs, 
               'Logs_2grams': processed_commit_logs_2grams, 
               'Logs_3grams': processed_commit_logs_3grams, 
               'Unit_names': processed_unit_names,
               'Unit_names_2grams': processed_unit_names_2grams,
               'Unit_names_3grams': processed_unit_names_3grams,
               'Commit_natural_text': processed_commit_logs + processed_unit_names,
               'Commit_natural_text_2grams': processed_commit_logs_2grams + processed_unit_names_2grams,
               'Commit_natural_text_3grams': processed_commit_logs_3grams + processed_unit_names_3grams
               }
               
    commit_processed_df = pd.DataFrame(data=commit_data)

    return(commit_processed_df)

In [5]:
#Start timer
startTime = time.time() 

intermediateData_SVN_dataProcessing = cleanCommitData(rawData_SVN_dataProcessing)

#Create a temp XLSX file for all intermediate datasets
intermediateData_SVN_dataProcessing.to_excel(excel_writer = "../data/02_intermediate/intermediateData_SVN_dataProcessing.xlsx", index = False)

#Create a pickle file for all intermediate datasets
intermediateData_SVN_dataProcessing.to_pickle(path= "../data/02_intermediate/intermediateData_SVN_dataProcessing.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished cleaning after " + timeDifference)

Finished cleaning after 0 minutes and 7.827357769012451 seconds


In [6]:
import re

import string
#nltk for NLP 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag  import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from datetime import datetime
import numpy as np
import pandas as pd
import time
nltk.download('averaged_perceptron_tagger')


#Function to clean the comments
def clean_comments(comment):
    try:
        commentDates = re.findall(r"[0-9]{2} [A-Z][a-z]{2} [0-9]{4} [0-9]{2}:[0-9]{2};[a-zA-Z0-9_]{24};", comment)
        accountIds = re.findall(r"\[~accountid:[a-zA-Z0-9]{24}\]", comment)
               
        
        cleanedComment = comment.replace("nan",'')
        for commentDate in commentDates:
            cleanedComment = cleanedComment.replace(commentDate,'')
        
        for accountId in accountIds: 
            cleanedComment = cleanedComment.replace(accountId,'')
        
        return(cleanedComment)
    except:
        return("")

def preprocess(text, porterStemmer, cachedStopwords):
    string_text = str(text)
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_numbers)
    
    #remove stopwords
    tokens_without_sw = [word for word in tokens if not word in cachedStopwords]
    
    #Stem the tokens
    stemmedToken = list(map(porterStemmer.stem, tokens_without_sw))

    return(stemmedToken)

def preprocessNGrams(text, porterStemmer, cachedStopWords, nGramSize):
    string_text = str(text)
    
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_numbers)
    
    #Create the ngrams
    ngrams = list(nltk.ngrams(tokens, nGramSize))
    
    #remove all the n-grams containing a stopword
    cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
    #Stem the tokens
    stemmedNGrams = []
    for ngram in cleanNGrams:
        stemmed = list(map(porterStemmer.stem, ngram))
        stemmedNGrams.append(stemmed)
    return(stemmedNGrams)

#Function to transform date into a date object
def preprocess_jira_date(date_string):
    if(isinstance(date_string, str)):
        try:
            date_time_obj = datetime.strptime(date_string, '%d %b %Y %H:%M')
        except:
            date_time_obj = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S:%f')
        return(date_time_obj)
    elif(isinstance(date_string, datetime)): 
        return(date_string)
    else:
        return(np.nan)
    
    
def findVerbs(tokenList):
    posTags = pos_tag(tokenList)
    verbAbrList = ['VBP', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS']
    verbList = []
    for posTag in posTags:
        if posTag[1] in verbAbrList:
            verbList.append(posTag[0])
    return(verbList)

#Preprocess all the features and transform to the format needed for further processing.
def preprocessJiraData(cleanDataFrame, preprocessComments, porterStemmer, cachedStopWords, startTime):
    if (preprocessComments == True):
        nOfSteps = '4'
    else:
        nOfSteps = '3'

    #preprocess Summaries
    jira_summaries = cleanDataFrame['Summary'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
    jira_summaries_2grams = cleanDataFrame['Summary'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    jira_summaries_3grams = cleanDataFrame['Summary'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 3))
    
    endTimeCleaningSummaries = time.time() - startTime
    print("1/" + nOfSteps + ") Finished Cleaning Summaries after " + str(endTimeCleaningSummaries) + " sec")

    #preprocess Descriptions
    jira_descriptions = cleanDataFrame['Description'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
    jira_descriptions_2grams = cleanDataFrame['Description'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    jira_descriptions_3grams = cleanDataFrame['Description'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    
    endTimeCleaningDescriptions = time.time() - startTime
    print("2/" + nOfSteps + ") Finished Cleaning Description after " + str(endTimeCleaningDescriptions) + " sec")

    #preprocess Dates
    jira_creation = cleanDataFrame['Created'].apply(lambda x: preprocess_jira_date(x))
    jira_updated = cleanDataFrame['Updated'].apply(lambda x: preprocess_jira_date(x))
    jira_resolved = cleanDataFrame['Resolved'].apply(lambda x: preprocess_jira_date(x))
    endTimeCleaningDates = time.time() - startTime
    print("3/" + nOfSteps + ") Finished Cleaning Dates after " + str(endTimeCleaningDates) + " sec")

    #Comments take too long for a test run.
    if (preprocessComments == True):
        jira_comments = cleanDataFrame['Comments'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
        jira_comments_2grams = cleanDataFrame['Comments'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
        jira_comments_3grams = cleanDataFrame['Comments'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
        endTimeCleaningComments = time.time() - startTime
        print("4/" + nOfSteps + ") Finished Cleaning Comments after " + str(endTimeCleaningComments) + " sec")

         #create JIRA corpus by merging Summary and Description
        jira_data = {'Issue_key_jira': cleanDataFrame['Issue key'], 
             'Assignee': cleanDataFrame['Assignee'],
             'Jira_created_date': jira_creation, 
             'Jira_updated_date': jira_updated, 
             'Jira_resolved_date': jira_resolved, 
             'Summary': jira_summaries, 
             'Summary_2grams': jira_summaries_2grams,
             'Summary_3grams': jira_summaries_3grams, 
             'Description': jira_descriptions,
             'Description_2grams': jira_descriptions_2grams,
             'Description_3grams': jira_descriptions_3grams,
             'Comments': jira_comments,
             'Comments_2grams': jira_comments_2grams,
             'Comments_3grams': jira_comments_3grams,
             'Jira_natural_text': jira_summaries +  jira_descriptions + jira_comments,
             'Jira_natural_text_2grams': jira_summaries_2grams +  jira_descriptions_2grams + jira_comments_2grams,
             'Jira_natural_text_3grams': jira_summaries_3grams +  jira_descriptions_3grams + jira_comments_3grams}
    else:
         #create JIRA corpus by merging Summary and Description
        jira_data = {'Issue_key_jira': cleanDataFrame['Issue key'], 
             'Assignee': cleanDataFrame['Assignee'],
             'Jira_created_date': jira_creation, 
             'Jira_updated_date': jira_updated, 
             'Jira_resolved_date': jira_resolved, 
             'Summary': jira_summaries,
             'Summary_2grams': jira_summaries_2grams,
             'Summary_3grams': jira_summaries_3grams,
             'Description': jira_descriptions,
             'Description_2grams': jira_descriptions_2grams,
             'Description_3grams': jira_descriptions_3grams,
             'Jira_natural_text': jira_summaries +  jira_descriptions,
             'Jira_natural_text_2grams': jira_summaries_2grams +  jira_descriptions_2grams,
             'Jira_natural_text_3grams': jira_summaries_3grams +  jira_descriptions_3grams}

    jira_processed_df = pd.DataFrame(data=jira_data)
    
    #Find verbs
    jira_processed_df['verbs'] = jira_processed_df['Jira_natural_text'].apply(lambda x: findVerbs(x))
    
    return(jira_processed_df)

#Input dataframe and num of_comments, and bool to determine if comments need to be cleaned
def cleanJiraData(dataFrame, cleanComments, commentAmount):
    startTime = time.time()

    #create an object of class PorterStemmer
    porterStemmer = PorterStemmer()
    
    #Find all stopwords
    cachedStopWords = stopwords.words("english")

    if (cleanComments == True):
        #Subset only all comments 
        loc_first_comment = dataFrame.columns.get_loc('Comment') # Variable storing the col location of the 1st comment
    
        dataFrame["Comments"] = dataFrame.iloc[:,loc_first_comment:loc_first_comment+commentAmount].apply(
            lambda x: " ".join(x.astype(str)), axis=1)
    
        #First remove the date and comment string from the comments
        dataFrame["Comments"] = dataFrame["Comments"].apply(lambda x: clean_comments(x))

        #Subset JIRA ID, Summary, Description, comments
        jira_issues_subset = dataFrame[["Issue key", "Assignee", "Summary", "Description", "Comments", "Created", "Resolved", "Updated"]]
        cleanedAndProcessedJiraData = preprocessJiraData(jira_issues_subset, preprocessComments = True, porterStemmer = porterStemmer, cachedStopWords = cachedStopWords, startTime = startTime)
        return(cleanedAndProcessedJiraData)
    else: 
        jira_issues_subset = dataFrame[["Issue key", "Assignee", "Summary", "Description", "Created", "Resolved", "Updated"]]
        cleanedAndProcessedJiraData = preprocessJiraData(jira_issues_subset, preprocessComments = False, porterStemmer = porterStemmer, cachedStopWords = cachedStopWords, startTime = startTime)
        return(cleanedAndProcessedJiraData)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
#Rename key to Issue key
rawData_JIRA_dataProcessing = rawData_JIRA_dataProcessing.rename({'Key': 'Issue key'}, axis=1)

#Clean Data sets
intermediateData_JIRA_dataProcessing = cleanJiraData(dataFrame = rawData_JIRA_dataProcessing, cleanComments = False, commentAmount = 39)

#Create a temp XLSX file for all intermediate datasets
intermediateData_JIRA_dataProcessing.to_excel(excel_writer = "../data/02_intermediate/intermediateData_JIRA_dataProcessing.xlsx", index = False)

#Create a pickle file for all intermediate datasets
intermediateData_JIRA_dataProcessing.to_pickle(path= "../data/02_intermediate/intermediateData_JIRA_dataProcessing.pkl")

1/3) Finished Cleaning Summaries after 0.09774184226989746 sec
2/3) Finished Cleaning Description after 0.5597755908966064 sec
3/3) Finished Cleaning Dates after 0.5667126178741455 sec


## 2.4 Clean Raw Data - Create JIRA Corpora
Create the corpora for JIRA UNIGRAM

In [8]:
def createCorpusFromDocumentList(token_column):
    token_list = token_column.tolist()
    corpus_list = []
    
    for document in token_list:
        #Only join to the string when a list. When it is not a list, then it is np.NaN, thus no changes
        if(isinstance(document, list)):
            #Transform list to a string for SKLEARN to accept the input.
            token_string = ' '.join(document)
        
            #Add string to the corpus list
            corpus_list.append(token_string)
    return(corpus_list)

In [9]:
#Create JIRA corpus for dataProcessing dataset
intermediateData_JIRA_dataProcessingCorpusSummary = createCorpusFromDocumentList(intermediateData_JIRA_dataProcessing.Summary)
intermediateData_JIRA_dataProcessingCorpusDescription = createCorpusFromDocumentList(intermediateData_JIRA_dataProcessing.Description)

#Merge all JIRA Corpora into 1 corpus
intermediateData_JIRA_dataProcessingCorpus = [i+" "+j for i,j in zip(intermediateData_JIRA_dataProcessingCorpusSummary,
                                                                             intermediateData_JIRA_dataProcessingCorpusDescription
                                                                            )]

#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_JIRA_dataProcessingCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_JIRA_dataProcessingCorpus, f)

Bigram corpora

In [10]:
def createCorpusNGrams(tokenColumn):
    tokenList = tokenColumn.tolist()
    corpusList = []
    
    #Transform to strings
    for document in tokenList:
        if(isinstance(document, list)):
            for ngram in document:
                ngramString = ' '.join(ngram)
                corpusList.append(ngramString)         
    return(corpusList)

In [11]:
#Create JIRA corpus for dataProcessing dataset
intermediateData_JIRA_dataProcessingCorpusSummary_2grams = createCorpusNGrams(intermediateData_JIRA_dataProcessing.Summary_2grams)
intermediateData_JIRA_dataProcessingCorpusDescription_2grams = createCorpusNGrams(intermediateData_JIRA_dataProcessing.Description_2grams)

#Merge all JIRA Corpora into 1 corpus
intermediateData_JIRA_dataProcessingCorpus_2gram = [i+" "+j for i,j in zip(intermediateData_JIRA_dataProcessingCorpusSummary_2grams,
                                                                             intermediateData_JIRA_dataProcessingCorpusDescription_2grams
                                                                             )]


#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_JIRA_dataProcessingCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_JIRA_dataProcessingCorpus_2gram, f)

## 2.4 Clean Raw Data - Create SVN Corpora
Create the corpora for SVN

In [12]:
intermediateData_SVN_dataProcessing = pd.read_pickle("../data/02_intermediate/intermediateData_SVN_dataProcessing.pkl")

In [13]:
#Create corpus for log messages
intermediateData_SVNLogs_dataProcessingCorpus = createCorpusFromDocumentList(intermediateData_SVN_dataProcessing.Logs)

#Create corpus for unit names
intermediateData_SVNUnitNames_dataProcessingCorpus = createCorpusFromDocumentList(intermediateData_SVN_dataProcessing.Unit_names)

#Create corpus for entire commit (log message + model)
intermediateData_SVN_dataProcessingCorpus = createCorpusFromDocumentList(intermediateData_SVN_dataProcessing.Logs + intermediateData_SVN_dataProcessing.Unit_names)
intermediateData_SVN_dataProcessingCorpusAll = createCorpusFromDocumentList(intermediateData_SVN_dataProcessing.Logs + intermediateData_SVN_dataProcessing.Unit_names)
#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_SVNLogs_dataProcessingCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNLogs_dataProcessingCorpus, f)

with open('../data/02_intermediate/intermediateData_SVNUnitNames_dataProcessingCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNUnitNames_dataProcessingCorpus, f)

with open('../data/02_intermediate/intermediateData_SVN_dataProcessingCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVN_dataProcessingCorpus, f)
    
with open('../data/02_intermediate/intermediateData_SVN_dataProcessingCorpusAll.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVN_dataProcessingCorpusAll, f)

bigram corpora

In [14]:
intermediateData_SVNLogs_dataProcessingCorpus_2gram = createCorpusNGrams(intermediateData_SVN_dataProcessing.Logs_2grams)
intermediateData_SVNUnitNames_dataProcessingCorpus_2gram = createCorpusNGrams(intermediateData_SVN_dataProcessing.Unit_names_2grams)
with open('../data/02_intermediate/intermediateData_SVNLogs_dataProcessingCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNLogs_dataProcessingCorpus_2gram, f)
    
    
with open('../data/02_intermediate/intermediateData_SVNUnitNames_dataProcessingCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNUnitNames_dataProcessingCorpus_2gram, f)

# 3. Preprocess Data

In [15]:
#Run this code block when you've restarted the kernel, and want to use previously gained results.
intermediateData_JIRA_dataProcessing = pd.read_pickle("../data/02_intermediate/intermediateData_JIRA_dataProcessing.pkl")

intermediateData_SVN_dataProcessing = pd.read_pickle("../data/02_intermediate/intermediateData_SVN_dataProcessing.pkl")

intermediateData_JIRA_dataProcessingCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_JIRA_dataProcessingCorpus.pkl')
intermediateData_JIRA_dataProcessingCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_JIRA_dataProcessingCorpus.pkl')
#intermediateData_SVN_dataProcessingCorpusAll = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_dataProcessingCorpusAll.pkl')
#intermediateData_SVN_dataProcessingCorpusModel = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_dataProcessingCorpusModel.pkl')
intermediateData_SVN_dataProcessingCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_dataProcessingCorpus.pkl')

############# Bigrams


############# Trigrams

## 3.0 Preprocess Data - Create cartesian product JIRA x Commits

In [16]:
#Create cartesian products JIRA x Commits
processedData_dataProcessingCartesian = intermediateData_JIRA_dataProcessing.merge(intermediateData_SVN_dataProcessing, how='cross')

processedData_dataProcessingCartesian = processedData_dataProcessingCartesian.drop(processedData_dataProcessingCartesian[processedData_dataProcessingCartesian.Jira_created_date > processedData_dataProcessingCartesian.Commit_date].index)

#Create a pickle file for all intermediate datasets
processedData_dataProcessingCartesian.to_pickle(path= "../data/03_processed/processedData_dataProcessingCartesian.pkl")


## 3.1 Preprocess Data - Create Labels

In [17]:
#Create new dataFrames for the time features
processedData_dataProcessingLabels = pd.DataFrame() 


#Create a column, which indicates which traces are valid.
processedData_dataProcessingLabels["is_valid"] = processedData_dataProcessingCartesian.apply(lambda x: checkValidityTrace(x.Issue_key_jira, x.Issue_key_commit), axis=1)
print("Finished creating labels for dataProcessing")

#Save intermediate results
processedData_dataProcessingLabels.to_pickle(path= "../data/03_processed/processedData_dataProcessingLabels.pkl")

processedData_dataProcessingLabels.info()

Finished creating labels for dataProcessing
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27815 entries, 451 to 33755
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   is_valid  27815 non-null  bool 
dtypes: bool(1)
memory usage: 244.5 KB


In [18]:
#processedData_dataProcessingLabels[processedData_dataProcessingLabels.is_valid == True].count()
processedData_dataProcessingLabels[processedData_dataProcessingLabels.is_valid == True].count()

is_valid    552
dtype: int64

## 3.2 Preprocess Data - Create Time-Related Features

In [19]:
#Create new dataFrames for the time features
processedData_dataProcessingFeaturesTime = pd.DataFrame() 


#Calculate the time features for data Processing Dataset
processedData_dataProcessingFeaturesTime['Creation_commit_date_dif'] = processedData_dataProcessingCartesian.apply(lambda x: calculateTimeDif(x.Jira_created_date, x.Commit_date), axis=1)
processedData_dataProcessingFeaturesTime['Updated_commit_date_dif'] = processedData_dataProcessingCartesian.apply(lambda x: calculateTimeDif(x.Jira_updated_date, x.Commit_date), axis=1)
processedData_dataProcessingFeaturesTime['Resolved_commit_date_dif'] = processedData_dataProcessingCartesian.apply(lambda x: calculateTimeDif(x.Jira_resolved_date, x.Commit_date), axis=1)
print("Finished data Processing")

#Create a pickle file for all intermediate datasets
processedData_dataProcessingFeaturesTime.to_pickle(path= "../data/03_processed/processedData_dataProcessingFeaturesTime.pkl")

Finished data Processing


## 3.3 Preprocess Data - Create Stakeholder-Related Features

In [20]:
#Create new dataFrames for the Stakeholder features
processedData_dataProcessingFeaturesStakeholder = pd.DataFrame() 

processedData_dataProcessingFeaturesStakeholder['Assignee_is_commiter'] = processedData_dataProcessingCartesian.apply(lambda x: checkFullnameEqualsEmail(x.Assignee, x.Email), axis=1)
print("Finished dataProcessing")

#Create a pickle file for all intermediate datasets
processedData_dataProcessingFeaturesStakeholder.to_pickle(path= "../data/03_processed/processedData_dataProcessingFeaturesStakeholder.pkl")


Finished dataProcessing


## 3.4 Preprocess Data - Create Cosine Similarity Features
### 3.4.1 dataProcessing - Cosine Similarity UniGrams

In [21]:
from scipy import spatial
import pandas as pd

def calc_vector_representation(document, cv, fittedTF_IDF):        
    #Transform document type to a string
    documentString = document
    
    #Calculate the Term Frequency of the document
    inputDocs = [documentString] 

    # count matrix 
    count_vector = cv.transform(inputDocs) 
 
    #tf-idf scores 
    tf_idf_vector = fittedTF_IDF.transform(count_vector)

    feature_names = cv.get_feature_names() 
 
    #get tfidf vector for first document 
    document_vector=tf_idf_vector[0] 
 
    #print the scores 
    
    # place tf-idf values in a pandas data frame 
    df = pd.DataFrame(document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
    df.sort_values(by=["tfidf"],ascending=False)

    return(document_vector.T.todense())

def calculateCosineSimilarity(document1, document2, cv, fittedTF_IDF):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    return(result)

def calculateCosineSimilarityNGrams(document1, document2, cv, fittedTF_IDF):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    return(result)


def calculateCosineSimilarityWithPOSPruning(document1, document2, cv, fittedTF_IDF, verbList):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    verbCounter = 0
    if(isinstance(document2, list)):
        for token in document2:
            if token in verbList:
                verbCounter = verbCounter + 1
    
    if verbCounter > 0:
        result = result * (1 + (0.1 * verbCounter))
    else:
        result = 0
    
    return(result)

In [22]:
#Instantiate the count vectorizer and tfidf for the corpus
from sklearn.feature_extraction.text import CountVectorizer 

######################################################
#                       dataProcessing              #
######################################################

################# Unigrams ###############
#instantiate CountVectorizer() for SVN
processedData_SVN_dataProcessingCountVectorizer = CountVectorizer()
processedData_SVN_dataProcessingCountTF_IDF = createFittedTF_IDF(processedData_SVN_dataProcessingCountVectorizer, intermediateData_SVN_dataProcessingCorpus)

processedData_SVNLogs_dataProcessingCountVectorizer = CountVectorizer()
processedData_SVNLogs_dataProcessingCountTF_IDF = createFittedTF_IDF(processedData_SVNLogs_dataProcessingCountVectorizer, intermediateData_SVNLogs_dataProcessingCorpus)

processedData_SVNUnitNames_dataProcessingCountVectorizer = CountVectorizer()
processedData_SVNUnitNames_dataProcessingCountTF_IDF = createFittedTF_IDF(processedData_SVNUnitNames_dataProcessingCountVectorizer, intermediateData_SVNUnitNames_dataProcessingCorpus)

#instantiate CountVectorizer() for JIRA - unigram
processedData_JIRA_dataProcessingCountVectorizer = CountVectorizer()
processedData_JIRA_dataProcessingCountTF_IDF = createFittedTF_IDF(processedData_JIRA_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessingCorpus)

processedData_JIRASummaries_dataProcessingCountVectorizer = CountVectorizer()
processedData_JIRASummaries_dataProcessingCountTF_IDF = createFittedTF_IDF(processedData_JIRASummaries_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessingCorpusSummary)

processedData_JIRADescriptions_dataProcessingCountVectorizer = CountVectorizer()
processedData_JIRADescriptions_dataProcessingCountTF_IDF = createFittedTF_IDF(processedData_JIRADescriptions_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessingCorpusDescription)

#processedData_JIRAComments_dataProcessingCountVectorizer = CountVectorizer()
#processedData_JIRAComments_dataProcessingCountTF_IDF = createFittedTF_IDF(processedData_JIRAComments_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessingCorpusComments)


################# Bigrams ###############
#instantiate CountVectorizer() for SVN - bigrams
processedData_SVNLogs_dataProcessingCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_SVNLogs_dataProcessingCountTF_IDF_2gram = createFittedTF_IDF(processedData_SVNLogs_dataProcessingCountVectorizer_2gram, intermediateData_SVNLogs_dataProcessingCorpus_2gram)

processedData_SVNUnitNames_dataProcessingCountVectorizer_2gram = CountVectorizer()
processedData_SVNUnitNames_dataProcessingCountTF_IDF_2gram = createFittedTF_IDF(processedData_SVNUnitNames_dataProcessingCountVectorizer_2gram, intermediateData_SVNUnitNames_dataProcessingCorpus_2gram)


#instantiate CountVectorizer() for JIRA - biigram
processedData_JIRA_dataProcessingCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRA_dataProcessingCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRA_dataProcessingCountVectorizer_2gram, intermediateData_JIRA_dataProcessingCorpus_2gram)

processedData_JIRASummaries_dataProcessingCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRASummaries_dataProcessingCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRASummaries_dataProcessingCountVectorizer_2gram, intermediateData_JIRA_dataProcessingCorpusSummary_2grams)

processedData_JIRADescriptions_dataProcessingCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRADescriptions_dataProcessingCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRADescriptions_dataProcessingCountVectorizer_2gram, intermediateData_JIRA_dataProcessingCorpusDescription_2grams)

#processedData_JIRAComments_dataProcessingCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
#processedData_JIRAComments_dataProcessingCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRAComments_dataProcessingCountVectorizer_2gram, intermediateData_JIRA_dataProcessingCorpusComments_2grams)




#### 3.4.1 [VSM unigram] Similarity between JIRA issue and Commit Log - Jira As Query

In [23]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmLogsJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmLogsJiraAsQuery["vsm_logs_jira_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_JIRA_dataProcessingCountVectorizer, processedData_JIRA_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmLogsJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmLogsJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

  dist = 1.0 - uv / np.sqrt(uu * vv)


Finished creating 'VSM Logs Jira as query' after 2 minutes and 34.81702733039856 seconds


#### 3.4.2 [VSM unigram] Similarity between JIRA issue and Commit Log - Log As Query

In [24]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmLogsLogAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmLogsLogAsQuery["vsm_logs_log_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_SVNLogs_dataProcessingCountVectorizer, processedData_SVNLogs_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmLogsLogAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmLogsLogAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 2.514415740966797 seconds


#### 3.4.3 [VSM unigram] Similarity between JIRA issue and Unit Names - JIRA As Query

In [25]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery["vsm_unit_names_jira_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_JIRA_dataProcessingCountVectorizer, processedData_JIRA_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 39.73747515678406 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and Commit Log - Jira As Query

In [26]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery["vsm_summary_logs_summary_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, processedData_JIRASummaries_dataProcessingCountVectorizer, processedData_JIRASummaries_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 1 minutes and 6.654573440551758 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and Commit Log - Log As Query

In [27]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery["vsm_summary_logs_logs_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, processedData_SVNLogs_dataProcessingCountVectorizer, processedData_SVNLogs_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 34.33582663536072 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and UnitNames - Summary As Query

In [28]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery["vsm_summary_unitNames_summary_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Unit_names, processedData_JIRASummaries_dataProcessingCountVectorizer, processedData_JIRASummaries_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 1 minutes and 28.920072555541992 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and UnitNames - UnitNames As Query

In [29]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery["vsm_summary_unitNames_unitNames_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Unit_names, processedData_SVNUnitNames_dataProcessingCountVectorizer, processedData_SVNUnitNames_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 1 minutes and 27.108842611312866 seconds


#### 3.4.3 [VSM unigram - verb pruning] Similarity between JIRA issue and Unit Names - JIRA As Query

In [30]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery["vsm_verb_pruning_unit_names_jira_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarityWithPOSPruning(x.Jira_natural_text, x.Unit_names, processedData_JIRA_dataProcessingCountVectorizer, processedData_JIRA_dataProcessingCountTF_IDF, x.verbs), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query and verb pruning' after " + timeDifference)

Finished creating 'VSM Logs Jira as query and verb pruning' after 2 minutes and 46.799163818359375 seconds


#### 3.4.4 [VSM unigram] Similarity between JIRA issue and Unit Names  - Unit Names As Query

In [31]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery["vsm_unit_names_log_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_dataProcessingCountVectorizer, processedData_SVNUnitNames_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 54.604673624038696 seconds


#### 3.4.5 [VSM unigram] Similarity between JIRA description and commit log - Description as query

In [32]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery["vsm_description_description_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, processedData_JIRADescriptions_dataProcessingCountVectorizer, processedData_JIRADescriptions_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 44.85198760032654 seconds


#### 3.4.5 [VSM unigram Silarity between JIRA description and commit log - Log as descrintion

In [33]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery["vsm_description_log_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_dataProcessingCountVectorizer, processedData_SVNUnitNames_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmDescriptionLogsAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 40.53455090522766 seconds


#### 3.4.5 [VSM unigram Silarity between JIRA Comment and unitnames - Comment as query

#### 3.4.5 [VSM unigram Silarity between JIRA Comment and unitnames - Comment as query

#### 3.4.5 [VSM unigram Silarity between JIRA Comment and commit log - Comment as description

#### 3.4.5 [VSM unigram Silarity between JIRA description and commit log - Log as description

#### [VSM bigram] Similarity between JIRA comments and Commit Logs - Logs as query

#### 3.4.5 [VSM bigram] Silarity between JIRA Comment and commit log - Comment as query

#### [VSM Unigram] Similarity between Unit Names and Description - Unit Names as query

In [34]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery["vsm_unitnames_description_unitnames_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_dataProcessingCountVectorizer, processedData_SVNUnitNames_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 51.6970591545105 seconds


#### [VSM Unigram] Similarity between Unit Names and Description - Description as query

In [35]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery["vsm_unitnames_description_description_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_JIRADescriptions_dataProcessingCountVectorizer, processedData_JIRADescriptions_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 4.561413288116455 seconds


#### [VSM Unigram] Similarity between Unit Names and Comments - Unit Names as query

#### [VSM Unigram] Similarity between Unit Names and Comments - Comments as query

In [36]:
#### [VSM Unigram] Similarity between SVN (entirely) and JIRA (entirely)- JIRA as query

In [37]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery["vsm_svn_jira_jira_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Commit_natural_text, processedData_JIRA_dataProcessingCountVectorizer, processedData_JIRA_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 3 minutes and 40.83568096160889 seconds


In [38]:
#### [VSM Unigram] Similarity between SVN (entirely) and JIRA (entirely) - SVN as query

In [39]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery["vsm_svn_jira_svn_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Commit_natural_text, processedData_SVN_dataProcessingCountVectorizer, processedData_SVN_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 28.447618007659912 seconds


In [40]:
#### [VSM Unigram] Similarity between SVN (entirely) and Summary - SVN as query

In [41]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery["vsm_svn_summary_svn_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Summary, processedData_SVN_dataProcessingCountVectorizer, processedData_SVN_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 3 minutes and 6.155735969543457 seconds


In [42]:
#### [VSM Unigram] Similarity between SVN (entirely) and Summary - Summary as query

In [43]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery["vsm_svn_summary_summary_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Summary, processedData_JIRASummaries_dataProcessingCountVectorizer, processedData_JIRASummaries_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 32.9554648399353 seconds


In [44]:
#### [VSM Unigram] Similarity between SVN (entirely) and Description - SVN as query

In [45]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery["vsm_svn_description_svn_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Description, processedData_SVN_dataProcessingCountVectorizer, processedData_SVN_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 6.522587299346924 seconds


In [46]:
#### [VSM Unigram] Similarity between SVN (entirely) and Description - Description as query

In [47]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery["vsm_svn_description_description_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Description, processedData_JIRADescriptions_dataProcessingCountVectorizer, processedData_JIRADescriptions_dataProcessingCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 0.14793992042541504 seconds


In [48]:
#### [VSM Unigram] Similarity between SVN (entirely) and Comments - SVN as query

In [49]:
#### [VSM Unigram] Similarity between SVN (entirely) and Comments - Comments as query

#### 3.4.3 [VSM unigram - verb pruning] Similarity between JIRA issue and Unit Names and verb pruning - Unit Names As Query

In [50]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery["vsm_verb_pruning_unit_names_log_as_query"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarityWithPOSPruning(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_dataProcessingCountVectorizer, processedData_SVNUnitNames_dataProcessingCountTF_IDF, x.verbs), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 52.05518364906311 seconds


#### 3.4.5 [VSM bigram] Similarity between JIRA issue and Commit Log - Jira As Query

In [51]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram["vsm_logs_jira_as_query_2gram"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_JIRA_dataProcessingCountVectorizer_2gram, processedData_JIRA_dataProcessingCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 30.792415142059326 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Commit Log - Logs As Query

In [52]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram["vsm_logs_log_as_query_2gram"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_SVNLogs_dataProcessingCountVectorizer_2gram, processedData_SVNLogs_dataProcessingCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 3 minutes and 51.17773699760437 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Unit Names - Jira As Query

In [53]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram["vsm_unit_names_jira_as_query_2gram"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_JIRA_dataProcessingCountVectorizer_2gram, processedData_JIRA_dataProcessingCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 2 minutes and 14.736358165740967 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Unit Names - UnitNames As Query

In [54]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram["vsm_unit_names_log_as_query_2gram"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_dataProcessingCountVectorizer_2gram, processedData_SVNUnitNames_dataProcessingCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 4.138108253479004 seconds


#### [VSM bigram] Similarity between Logs and Description - Logs as Query

In [55]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery_2gram["vsm_description_log_as_query_2gram"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_dataProcessingCountVectorizer_2gram, processedData_SVNUnitNames_dataProcessingCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmDescriptionLogsAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 2 minutes and 32.443113565444946 seconds


#### [VSM bigram] Similarity between Logs and Description - Description as Query

In [56]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery_2gram["vsm_description_description_as_query_2gram"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, processedData_JIRADescriptions_dataProcessingCountVectorizer_2gram, processedData_JIRADescriptions_dataProcessingCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Bigrams' after " + timeDifference)

Finished creating 'VSM Bigrams' after 1 minutes and 22.284845113754272 seconds


#### [VSM bigram] Similarity between Logs and Summary - Logs as Query

#### [VSM bigram] Similarity between Logs and Summary - Summary as Query

In [57]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery_2gram["vsm_summary_logs_summary_as_query_2gram"] = processedData_dataProcessingCartesian.apply(lambda x: calculateCosineSimilarityNGrams(x.Summary, x.Logs, processedData_JIRASummaries_dataProcessingCountVectorizer_2gram, processedData_JIRASummaries_dataProcessingCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 1 minutes and 6.003474950790405 seconds


## 3.6 Document Statistics

### dataProcessing

In [58]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesUniqueWordCount = pd.DataFrame() 
processedData_SVN_dataProcessingFeaturesUniqueWordCount = pd.DataFrame() 
processedData_JIRA_dataProcessingFeaturesTotalWordCount = pd.DataFrame() 
processedData_SVN_dataProcessingFeaturesTotalWordCount = pd.DataFrame()

processedData_JIRA_dataProcessingFeaturesOverlapPercentage = pd.DataFrame()
processedData_SVN_dataProcessingFeaturesOverlapPercentage = pd.DataFrame()
processedData_UNION_dataProcessingFeaturesOverlapPercentage = pd.DataFrame()

#Calculate unique terms JIRA for each trace
processedData_JIRA_dataProcessingFeaturesUniqueWordCount["unique_term_count_jira"] = processedData_dataProcessingCartesian.apply(lambda x: calculateUniqueWordCount(x.Jira_natural_text), 
                                                            axis=1)
#Calculate unique terms JIRA for each trace
processedData_SVN_dataProcessingFeaturesUniqueWordCount["unique_term_count_svn"] = processedData_dataProcessingCartesian.apply(lambda x: calculateUniqueWordCount(x.Commit_natural_text), 
                                                            axis=1)

#Calculate total terms JIRA for each trace
processedData_JIRA_dataProcessingFeaturesTotalWordCount["total_term_count_jira"] = processedData_dataProcessingCartesian.apply(lambda x: calculateTotalWordCount(x.Jira_natural_text), 
                                                            axis=1)
#Calculate total terms JIRA for each trace
processedData_SVN_dataProcessingFeaturesTotalWordCount["total_term_count_svn"] = processedData_dataProcessingCartesian.apply(lambda x: calculateTotalWordCount(x.Commit_natural_text), 
                                                            axis=1)

processedData_JIRA_dataProcessingFeaturesOverlapPercentage["overlap_percentage_compared_to_jira"] = processedData_dataProcessingCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list1'),
                                                            axis=1)
processedData_SVN_dataProcessingFeaturesOverlapPercentage["overlap_percentage_compared_to_svn"] = processedData_dataProcessingCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list2'),
                                                            axis=1)
processedData_UNION_dataProcessingFeaturesOverlapPercentage["overlap_percentage_compared_to_union"] = processedData_dataProcessingCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'union'),
                                                            axis=1)





#Save results in pickle
processedData_JIRA_dataProcessingFeaturesUniqueWordCount.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesUniqueWordCount.pkl")
processedData_SVN_dataProcessingFeaturesUniqueWordCount.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesUniqueWordCount.pkl")
processedData_JIRA_dataProcessingFeaturesTotalWordCount.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesTotalWordCount.pkl")
processedData_SVN_dataProcessingFeaturesTotalWordCount.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesTotalWordCount.pkl")

processedData_JIRA_dataProcessingFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesOverlapPercentage.pkl")
processedData_SVN_dataProcessingFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesOverlapPercentage.pkl")
processedData_UNION_dataProcessingFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_UNION_dataProcessingFeaturesOverlapPercentage.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating document statistics in " + timeDifference)

Finished creating document statistics in 0 minutes and 5.150083780288696 seconds


## 3.7 Query Quality

In [59]:
#Instantiate the count vectorizer and tfidf for the corpus
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from statistics import mean, median, mode, stdev, variance
from math import log, sqrt
import itertools

#Function calculating the IDFs of all query terms. Returns a list containing all IDFs
def calcIDFList(document, cv, tfidf_transformer):
    idfScoreList=[]
    if isinstance(document, list):
        termCount = len(document)
        for term in document:
            try:
                indexOfWord = cv.get_feature_names().index(term)
                idfScore = tfidf_transformer.idf_[indexOfWord]
                idfScoreList.append(idfScore)
            except:
                idfScoreList.append(0)
    else:
        termCount = 0
    return(idfScoreList)


def calcAvgIDF(IDFList):
    termCount = len(IDFList)
    if(termCount != 0):
        avgIdf = sum(IDFList) / termCount
    else:
        avgIdf = 0
    return(avgIdf)

def calcMaxIDF(IDFList): 
    termCount = len(IDFList)
    if(termCount != 0):
        maxIdf = np.amax(IDFList)
    else: 
        maxIdf = 0
    return(maxIdf)

def calcDevIDF(IDFList):
    termCount = len(IDFList)
    if(termCount > 1):
        stdevIdf = stdev(IDFList)
    else: 
        stdevIdf = 0
    return(stdevIdf)

#Function calculating the ICTF of all query terms. Returns a list containing all IDFs
def calcICTFList(document, cv, documentCount):
    ICTFList = []
        #For all terms in query, find how often they occur in the Corpus
    if isinstance(document, list):
        for term in document:
            try:
            #Find out how often the term occurs in the corpus
                termFrequency = (cv.vocabulary_[term])
                
                #Compute the log
                ictF = log(documentCount/termFrequency)
            except:
                ictF = 0
            
            ICTFList.append(ictF)
    return(ICTFList)

def calcAvgICTF(ICTFList, documentCount):
    avgICTF = sum(ICTFList) / documentCount
    return(avgICTF)


def calcMaxICTF(ICTFList): 
    termCount = len(ICTFList)
    if(termCount != 0):
        maxICTF = np.amax(ICTFList)
    else: 
        maxICTF = 0
    return(maxICTF)

def calcDevICTF(ICTFList):
    termCount = len(ICTFList)
    if(termCount > 1):
        stdevICTF = stdev(ICTFList)
    else: 
        stdevICTF = 0
    return(stdevICTF)


def calcEntropyList(query, cv, documentCount, docCollection):
    #entropy(t) = ∑ (d∈Dt)  ( tf(t,d) / tf(t, D) ) * log |D|(tf(t,d) / tf(t, D) )
        
    entropyValueList = []
    #for each term in the query, calculate the entropy of the query
    if isinstance(query, list):
        for queryTerm in query:
            #For each d ∈ D
            
            partialEntropyList = []
            
            for d in docCollection:
                #Check if queryTerm occurs in D (i.e/ d∈Dt)
                if (isinstance(d, list)):
                    if queryTerm in d:
                        try:
                            #Calculate the frequency of the term occurs in the document (i.e tf(t,d))
                            queryTermFrequencyInDocument = d.count(queryTerm)
                            
                            #calculate the frequency the term occurs in the query corpus (i.e tf(t,D))
                            queryTermFrequencyInCorpus = (cv.vocabulary_[queryTerm])
                             
                            # This part of the calculation tf(t,d) / tf(t, D)  * log |D|(tf(t,d) / tf(t, D))
                            partialEntropy1stHalf = queryTermFrequencyInDocument / queryTermFrequencyInCorpus
                            partialEntropy2ndHalf = log((queryTermFrequencyInDocument / queryTermFrequencyInCorpus), documentCount)
                            partialEntropy = partialEntropy1stHalf
                            partialEntropyList.append(partialEntropy)
                        except:
                            partialEntropyList.append(0) #If term not found entropy is 0
            #this part of the calculation ∑ (d∈Dt)
            entropyValueOfQueryTerm = sum(partialEntropyList)
            entropyValueList.append(entropyValueOfQueryTerm)
    
    return(entropyValueList)


def calcAvgEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        #Calculate the average of all the entropies
        avgEntropy = sum(entropyValueList) / len(entropyValueList)
    else:
        avgEntropy = 0
    return(avgEntropy)

    
def calcMedEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        #Calculate the average of all the entropies
        medEntropy = median(entropyValueList)
    else:
        medEntropy = 0
    return(medEntropy)
    
def calcMaxEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        maxEntropy = np.amax(entropyValueList)
    else: 
        maxEntropy = 0
    return(maxEntropy)
    
def calcDevEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount > 1):
        #Calculate the average of all the entropies
        devEntropy = stdev(entropyValueList)
    else:
        devEntropy = 0
    return(devEntropy)

#The percentage of documents in the collection containing at least one of the query terms
def calcQueryScope(query, docCollection): 
    counter = 0
    if isinstance(query, list):
        for document in docCollection:
            #check if query occurs in term. 
            if(isinstance(document, list)):
                for queryTerm in query:
                    if queryTerm in document:
                        counter = counter + 1
                        break
    queryScope = counter / len(docCollection)
    return(queryScope)

#The Kullback-Leiber divergence of the query language model from the collection language model
def calcSCS(query, cv, docCount):
    divergenceList = []
    if isinstance(query, list):
        for queryTerm in query:
            try:
                #frequency of term in query - tf(q, Q)/|Q|
                pqQ = query.count(queryTerm) / len(query)
                
                #frequency of term in documentlist - tf(q, D)/|D|
                pqD = cv.vocabulary_[queryTerm]
                
                divergence = pqQ * log(pqQ / pqD)
                divergenceList.append(divergence)
            except:
                continue
    SCS = sum(divergenceList)
    return(SCS)

#The average of the collection-query similarity (SCQ) over all query terms
def calcSCQList(query, docCollection, cv, fittedTF_IDF, documentCount):
    SCQList = []
    if isinstance(query, list):
        documentString = ' '.join(query)
        
        #Calculate the Term Frequency of the document
        inputDocs = [documentString] 
        
        # count matrix 
        count_vector = cv.transform(inputDocs) 
 
        #tf-idf scores 
        tf_idf_vector = fittedTF_IDF.transform(count_vector)
        
        feature_names = cv.get_feature_names() 
        # place tf-idf values in a pandas data frame 
        df = pd.DataFrame(tf_idf_vector.T.todense(), 
                          index=feature_names, columns=["tfidf"])
    
        
        #Find the tfidf of the term
        for queryTerm in query:    
            try:
                tfidf = df["tfidf"][queryTerm]
                SCQ = (1 + log(tfidf))
                SCQList.append(SCQ)
            except:
                continue
        
    avgSCQ = sum(SCQList) / documentCount
    return(SCQList)

#The average of the collection-query similarity (SCQ) over all query terms
def calcAvgSCQ(SCQList, documentCount):
    avgSCQ = sum(SCQList) / documentCount
    return(avgSCQ)
    
#The average of the collection-query similarity (SCQ) over all query terms
def calcMaxSCQ(SCQList):
    termCount = len(SCQList)
    if(termCount != 0):
        maxSCQ = np.amax(SCQList)
    else:
        maxSCQ = np.NaN
    return(maxSCQ)

#The average of the collection-query similarity (SCQ) over all query terms
def calcSumSCQ(SCQList):
    sumSCQ = sum(SCQList)
    return(sumSCQ)

def createTermPairs(cv):
    terms = list(cv.vocabulary_.keys())
    #Create all possible pair combinations from the terms in the query 
    pairCombinationList = list(itertools.combinations(terms, 2))
    return(pairCombinationList)

#Method to find out how often a term occurs in a document
def findTermFrequencies(cv, docCollection):
    terms = list(cv.vocabulary_.keys())
    termFrequencies = {}
    for term in terms:
        termCounter = 0
        for document in docCollection:
            if isinstance(document, list):
                if term in document: 
                    termCounter = termCounter + 1
        termFrequencies[term] = termCounter
    return(termFrequencies)

#Method to find out how often both terms occur in a document. 
def findTermPairFrequencies(termPairs, docCollection):
    termPairFrequencies = {}
    for termPair in termPairs:
        termPairCount = 0
        for document in docCollection:
            if (isinstance(document, list)):
                if all(i in document for i in termPair):
                    termPairCount = termPairCount + 1
        termPairFrequencies[termPair] = termPairCount
    return(termPairFrequencies)   

def calcPMIList(query, termFrequencies, termPairFrequencies, docCollection):
    if isinstance(query, list):
    #Find the frequencies of the individual terms and the pairs
        pairCombinationList = list(itertools.combinations(query, 2))
        termOccurances = []
        for pair in pairCombinationList:
            try:
                q1Freq = termFrequencies[pair[0]]
            except:
                q1Freq = 0
            try:
                q2Freq = termFrequencies[pair[1]]
            except:
                q2Freq = 0
            try:
                q1q2Freq = termPairFrequencies[pair]
            except:
                q1q2Freq = 0
                    
            termOccurances.append({'q1Freq': q1Freq, 
                                   'q2Freq': q2Freq, 
                                   'q1q2Freq': q1q2Freq})
    
        docCount = len(docCollection)
        pmiList = []
        for term in termOccurances:
            pq1 = term['q1Freq'] / docCount
            pq2 = term['q2Freq'] / docCount
            pq1q2 = term['q1q2Freq'] / docCount

            try:
                pmi = log(pq1q2 /(pq1 * pq2))
            except:
                pmi = np.nan
            pmiList.append(pmi)
        return(pmiList)
    else:
        return(np.nan)

def calcAvgPMI(pmiList):
    if(isinstance(pmiList, list)):
        pairCount = len(pmiList)
        if(pairCount != 0):
            #Calculate the average of all the entropies
            avgPMI= np.nansum(pmiList) / pairCount
        else:
            avgPMI = 0
        return(avgPMI)
    return(np.nan)

def calcMaxPMI(pmiList): 
    if(isinstance(pmiList, list)):
        pairCount = len(pmiList)
        if(pairCount != 0):
            maxPMI = np.nanmax(pmiList)
        else: 
            maxPMI = np.nan
        return(maxPMI)
    return(np.nan)

In [60]:
#Read datasets from disk
processedData_dataProcessingCartesian = pd.read_pickle(r"../data/03_processed/processedData_dataProcessingCartesian.pkl")

#instantiate CountVectorizer() for SVN
processedData_SVN_dataProcessingCountVectorizer = CountVectorizer()
processedData_SVN_dataProcessingTF_IDF = createFittedTF_IDF(processedData_SVN_dataProcessingCountVectorizer, intermediateData_SVN_dataProcessingCorpusAll)

#instantiate CountVectorizer() for JIRA
processedData_JIRA_dataProcessingCountVectorizer = CountVectorizer()
processedData_JIRA_dataProcessingTF_IDF = createFittedTF_IDF(processedData_JIRA_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessingCorpus)

#Determine document counts
intermediateData_JIRA_dataProcessing_documentCount = len(intermediateData_JIRA_dataProcessing.index)
intermediateData_SVN_dataProcessing_documentCount = len(intermediateData_SVN_dataProcessing.index)



#### IDF Scores (SVN as Query)

In [61]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dataProcessingFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dataProcessingFeaturesIDF["SvnAsQuery_IDF"] = processedData_dataProcessingCartesian.apply(lambda x: calcIDFList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dataProcessingCountVectorizer, 
                                                                                                                processedData_SVN_dataProcessingTF_IDF),axis=1)

processedData_SVN_dataProcessingFeaturesIDF["SvnAsQuery_avgIDF"] = processedData_SVN_dataProcessingFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnAsQuery_IDF), axis=1)
processedData_SVN_dataProcessingFeaturesIDF["SvnAsQuery_maxIDF"] = processedData_SVN_dataProcessingFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnAsQuery_IDF), axis=1)
processedData_SVN_dataProcessingFeaturesIDF["SvnAsQuery_devIDF"] = processedData_SVN_dataProcessingFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVN_dataProcessingFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 9 minutes and 16.324886083602905 seconds


#### IDF Scores (SVNLogs as Query)

In [62]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dataProcessingFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dataProcessingFeaturesIDF["SvnLogsAsQuery_IDF"] = processedData_dataProcessingCartesian.apply(lambda x: calcIDFList(x.Logs, 
                                                                                                                processedData_SVNLogs_dataProcessingCountVectorizer, 
                                                                                                                processedData_SVNLogs_dataProcessingCountTF_IDF),axis=1)

processedData_SVNLogs_dataProcessingFeaturesIDF["SvnLogsAsQuery_avgIDF"] = processedData_SVNLogs_dataProcessingFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnLogsAsQuery_IDF), axis=1)
processedData_SVNLogs_dataProcessingFeaturesIDF["SvnLogsAsQuery_maxIDF"] = processedData_SVNLogs_dataProcessingFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnLogsAsQuery_IDF), axis=1)
processedData_SVNLogs_dataProcessingFeaturesIDF["SvnLogsAsQuery_devIDF"] = processedData_SVNLogs_dataProcessingFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnLogsAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVNLogs_dataProcessingFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 2 minutes and 56.161683320999146 seconds


#### IDF Scores (SVNUnitNames as Query

In [63]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dataProcessingFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dataProcessingFeaturesIDF["SvnUnitNamesAsQuery_IDF"] = processedData_dataProcessingCartesian.apply(lambda x: calcIDFList(x.Logs, 
                                                                                                                processedData_SVNLogs_dataProcessingCountVectorizer, 
                                                                                                                processedData_SVNLogs_dataProcessingCountTF_IDF),axis=1)

processedData_SVNUnitNames_dataProcessingFeaturesIDF["SvnUnitNamesAsQuery_avgIDF"] = processedData_SVNUnitNames_dataProcessingFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesIDF["SvnUnitNamesAsQuery_maxIDF"] = processedData_SVNUnitNames_dataProcessingFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesIDF["SvnUnitNamesAsQuery_devIDF"] = processedData_SVNUnitNames_dataProcessingFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVNUnitNames_dataProcessingFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 2 minutes and 2.4939610958099365 seconds


##### IDF Scores (JIRA as Query)

In [64]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dataProcessingFeaturesIDF["JiraAsQuery_IDF"] = processedData_dataProcessingCartesian.apply(lambda x: calcIDFList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dataProcessingCountVectorizer, 
                                                                                                                processedData_JIRA_dataProcessingTF_IDF),axis=1)

processedData_JIRA_dataProcessingFeaturesIDF["JiraAsQuery_avgIDF"] = processedData_JIRA_dataProcessingFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraAsQuery_IDF), axis=1)
processedData_JIRA_dataProcessingFeaturesIDF["JiraAsQuery_maxIDF"] = processedData_JIRA_dataProcessingFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraAsQuery_IDF), axis=1)
processedData_JIRA_dataProcessingFeaturesIDF["JiraAsQuery_devIDF"] = processedData_JIRA_dataProcessingFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRA_dataProcessingFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 5 minutes and 30.488741636276245 seconds


##### IDF Scores (JIRA Summaries as Query)

In [65]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dataProcessingFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dataProcessingFeaturesIDF["JiraSummariesAsQuery_IDF"] = processedData_dataProcessingCartesian.apply(lambda x: calcIDFList(x.Summary, 
                                                                                                                processedData_JIRASummaries_dataProcessingCountVectorizer, 
                                                                                                                processedData_JIRASummaries_dataProcessingCountTF_IDF),axis=1)

processedData_JIRASummaries_dataProcessingFeaturesIDF["JiraSummariesAsQuery_avgIDF"] = processedData_JIRASummaries_dataProcessingFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraSummariesAsQuery_IDF), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesIDF["JiraSummariesAsQuery_maxIDF"] = processedData_JIRASummaries_dataProcessingFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraSummariesAsQuery_IDF), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesIDF["JiraSummariesAsQuery_devIDF"] = processedData_JIRASummaries_dataProcessingFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraSummariesAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRASummaries_dataProcessingFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 1 minutes and 47.57470226287842 seconds


##### IDF Scores (JIRA Descriptions as Query)

In [66]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dataProcessingFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dataProcessingFeaturesIDF["JiraDescriptionsAsQuery_IDF"] = processedData_dataProcessingCartesian.apply(lambda x: calcIDFList(x.Description, 
                                                                                                                processedData_JIRADescriptions_dataProcessingCountVectorizer, 
                                                                                                                processedData_JIRADescriptions_dataProcessingCountTF_IDF),axis=1)

processedData_JIRADescriptions_dataProcessingFeaturesIDF["JiraDescriptionsAsQuery_avgIDF"] = processedData_JIRADescriptions_dataProcessingFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesIDF["JiraDescriptionsAsQuery_maxIDF"] = processedData_JIRADescriptions_dataProcessingFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesIDF["JiraDescriptionsAsQuery_devIDF"] = processedData_JIRADescriptions_dataProcessingFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRADescriptions_dataProcessingFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 5 minutes and 26.57950496673584 seconds


##### IDF Scores (JIRA Comments as Query)

#### ICTF Scores (SVN as query)

In [67]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dataProcessingFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dataProcessingFeaturesICTF["SvnAsQuery_ICTF"] = processedData_dataProcessingCartesian.apply(lambda x: calcIDFList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount),axis=1)

processedData_SVN_dataProcessingFeaturesICTF["SvnAsQuery_avgICTF"] = processedData_SVN_dataProcessingFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnAsQuery_ICTF, intermediateData_SVN_dataProcessing_documentCount), axis=1)
processedData_SVN_dataProcessingFeaturesICTF["SvnAsQuery_maxICTF"] = processedData_SVN_dataProcessingFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnAsQuery_ICTF), axis=1)
processedData_SVN_dataProcessingFeaturesICTF["SvnAsQuery_devICTF"] = processedData_SVN_dataProcessingFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVN_dataProcessingFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesICTF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 7 minutes and 2.560788631439209 seconds


#### ICTF Scores (SVNLogs as query)

In [68]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dataProcessingFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dataProcessingFeaturesICTF["SvnLogsAsQuery_ICTF"] = processedData_dataProcessingCartesian.apply(lambda x: calcICTFList(x.Logs, 
                                                                                                                processedData_SVNLogs_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount),axis=1)
##
processedData_SVNLogs_dataProcessingFeaturesICTF["SvnLogsAsQuery_avgICTF"] = processedData_SVNLogs_dataProcessingFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnLogsAsQuery_ICTF, intermediateData_SVN_dataProcessing_documentCount), axis=1)
processedData_SVNLogs_dataProcessingFeaturesICTF["SvnLogsAsQuery_maxICTF"] = processedData_SVNLogs_dataProcessingFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnLogsAsQuery_ICTF), axis=1)
processedData_SVNLogs_dataProcessingFeaturesICTF["SvnLogsAsQuery_devICTF"] = processedData_SVNLogs_dataProcessingFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnLogsAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVNLogs_dataProcessingFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 5.917165994644165 seconds


#### ICTF Scores (SVNUnitNames as query)

In [69]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dataProcessingFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dataProcessingFeaturesICTF["SvnUnitNamesAsQuery_ICTF"] = processedData_dataProcessingCartesian.apply(lambda x: calcICTFList(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount),axis=1)
##
processedData_SVNUnitNames_dataProcessingFeaturesICTF["SvnUnitNamesAsQuery_avgICTF"] = processedData_SVNUnitNames_dataProcessingFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnUnitNamesAsQuery_ICTF, intermediateData_SVN_dataProcessing_documentCount), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesICTF["SvnUnitNamesAsQuery_maxICTF"] = processedData_SVNUnitNames_dataProcessingFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnUnitNamesAsQuery_ICTF), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesICTF["SvnUnitNamesAsQuery_devICTF"] = processedData_SVNUnitNames_dataProcessingFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnUnitNamesAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVNUnitNames_dataProcessingFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 6.788835048675537 seconds


#### ICTF Scores (JIRA as query)

In [70]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dataProcessingFeaturesICTF["JiraAsQuery_ICTF"] = processedData_dataProcessingCartesian.apply(lambda x: calcICTFList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount),axis=1)
##
processedData_JIRA_dataProcessingFeaturesICTF["JiraAsQuery_avgICTF"] = processedData_JIRA_dataProcessingFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraAsQuery_ICTF, intermediateData_JIRA_dataProcessing_documentCount), axis=1)
processedData_JIRA_dataProcessingFeaturesICTF["JiraAsQuery_maxICTF"] = processedData_JIRA_dataProcessingFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraAsQuery_ICTF), axis=1)
processedData_JIRA_dataProcessingFeaturesICTF["JiraAsQuery_devICTF"] = processedData_JIRA_dataProcessingFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRA_dataProcessingFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 9.56740140914917 seconds


#### ICTF Scores (JIRA Summaries as query)

In [71]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dataProcessingFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dataProcessingFeaturesICTF["JiraSummariesAsQuery_ICTF"] = processedData_dataProcessingCartesian.apply(lambda x: calcICTFList(x.Summary, 
                                                                                                                processedData_JIRASummaries_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount),axis=1)
##
processedData_JIRASummaries_dataProcessingFeaturesICTF["JiraSummariesAsQuery_avgICTF"] = processedData_JIRASummaries_dataProcessingFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraSummariesAsQuery_ICTF, intermediateData_JIRA_dataProcessing_documentCount), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesICTF["JiraSummariesAsQuery_maxICTF"] = processedData_JIRASummaries_dataProcessingFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraSummariesAsQuery_ICTF), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesICTF["JiraSummariesAsQuery_devICTF"] = processedData_JIRASummaries_dataProcessingFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraSummariesAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRASummaries_dataProcessingFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 8.337110996246338 seconds


#### ICTF Scores (JIRA Descriptions as query)

In [72]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dataProcessingFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dataProcessingFeaturesICTF["JiraDescriptionsAsQuery_ICTF"] = processedData_dataProcessingCartesian.apply(lambda x: calcICTFList(x.Description, 
                                                                                                                processedData_JIRADescriptions_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount),axis=1)
##
processedData_JIRADescriptions_dataProcessingFeaturesICTF["JiraDescriptionsAsQuery_avgICTF"] = processedData_JIRADescriptions_dataProcessingFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraDescriptionsAsQuery_ICTF, intermediateData_JIRA_dataProcessing_documentCount), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesICTF["JiraDescriptionsAsQuery_maxICTF"] = processedData_JIRADescriptions_dataProcessingFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraDescriptionsAsQuery_ICTF), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesICTF["JiraDescriptionsAsQuery_devICTF"] = processedData_JIRADescriptions_dataProcessingFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraDescriptionsAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRADescriptions_dataProcessingFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 9.83654522895813 seconds


#### ICTF Scores (JIRA Comments as query)

#### Entropy (SVN as query)

In [73]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dataProcessingFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_Entropy"] = processedData_dataProcessingCartesian.apply(lambda x: calcEntropyList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount,
                                                                                                                intermediateData_SVN_dataProcessing.Commit_natural_text),axis=1)

processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_avgEntropy"] = processedData_SVN_dataProcessingFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_medEntropy"] = processedData_SVN_dataProcessingFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_maxEntropy"] = processedData_SVN_dataProcessingFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_devEntropy"] = processedData_SVN_dataProcessingFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnAsQuery_Entropy), axis=1)

#Save results in pickle
processedData_SVN_dataProcessingFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesEntropy.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 13 minutes and 14.405963659286499 seconds


#### Entropy (SVNLogs as query)

In [74]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dataProcessingFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_Entropy"] = processedData_dataProcessingCartesian.apply(lambda x: calcEntropyList(x.Logs, 
                                                                                                                processedData_SVNLogs_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount,
                                                                                                                intermediateData_SVN_dataProcessing.Logs),axis=1)
##
processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_avgEntropy"] = processedData_SVNLogs_dataProcessingFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_medEntropy"] = processedData_SVNLogs_dataProcessingFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_maxEntropy"] = processedData_SVNLogs_dataProcessingFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_devEntropy"] = processedData_SVNLogs_dataProcessingFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnLogsAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_SVNLogs_dataProcessingFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 1 minutes and 16.272884607315063 seconds


#### Entropy (SVNUnitNames as query)

In [75]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dataProcessingFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_Entropy"] = processedData_dataProcessingCartesian.apply(lambda x: calcEntropyList(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount,
                                                                                                                intermediateData_SVN_dataProcessing.Unit_names),axis=1)
##
processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_avgEntropy"] = processedData_SVNUnitNames_dataProcessingFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_medEntropy"] = processedData_SVNUnitNames_dataProcessingFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_maxEntropy"] = processedData_SVNUnitNames_dataProcessingFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_devEntropy"] = processedData_SVNUnitNames_dataProcessingFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_SVNUnitNames_dataProcessingFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 10 minutes and 56.19137644767761 seconds


#### Entropy (JIRA as query)

In [76]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_Entropy"] = processedData_dataProcessingCartesian.apply(lambda x: calcEntropyList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount,
                                                                                                                intermediateData_JIRA_dataProcessing.Jira_natural_text),axis=1)
##
processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_avgEntropy"] = processedData_JIRA_dataProcessingFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_medEntropy"] = processedData_JIRA_dataProcessingFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_maxEntropy"] = processedData_JIRA_dataProcessingFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_devEntropy"] = processedData_JIRA_dataProcessingFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRA_dataProcessingFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 2 minutes and 45.43707466125488 seconds


#### Entropy (JIRA Summaries as query)

In [77]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dataProcessingFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_Entropy"] = processedData_dataProcessingCartesian.apply(lambda x: calcEntropyList(x.Summary, 
                                                                                                                processedData_JIRASummaries_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount,
                                                                                                                intermediateData_JIRA_dataProcessing.Summary),axis=1)
##
processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_avgEntropy"] = processedData_JIRASummaries_dataProcessingFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_medEntropy"] = processedData_JIRASummaries_dataProcessingFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_maxEntropy"] = processedData_JIRASummaries_dataProcessingFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_devEntropy"] = processedData_JIRASummaries_dataProcessingFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRASummaries_dataProcessingFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 17.999998092651367 seconds


#### Entropy (JIRA Descriptions as query)

In [78]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dataProcessingFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_Entropy"] = processedData_dataProcessingCartesian.apply(lambda x: calcEntropyList(x.Description, 
                                                                                                                processedData_JIRADescriptions_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount,
                                                                                                                intermediateData_JIRA_dataProcessing.Description),axis=1)
##
processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_avgEntropy"] = processedData_JIRADescriptions_dataProcessingFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_medEntropy"] = processedData_JIRADescriptions_dataProcessingFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_maxEntropy"] = processedData_JIRADescriptions_dataProcessingFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_devEntropy"] = processedData_JIRADescriptions_dataProcessingFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRADescriptions_dataProcessingFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 1 minutes and 1.3077232837677002 seconds


#### Entropy (JIRA Comments as query)

##### Query Scope (SVN as query)

In [79]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dataProcessingFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dataProcessingFeaturesQueryScope["SvnAsQuery_QueryScope"] = processedData_dataProcessingCartesian.apply(lambda x: calcQueryScope(x.Commit_natural_text, 
                                                                                                                intermediateData_SVN_dataProcessing.Commit_natural_text),axis=1)

#Save results in pickle
processedData_SVN_dataProcessingFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 8.568105459213257 seconds


##### Query Scope (SVNLogs as query)

In [80]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dataProcessingFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dataProcessingFeaturesQueryScope["SvnLogsAsQuery_QueryScope"] = processedData_dataProcessingCartesian.apply(lambda x: calcQueryScope(x.Logs, 
                                                                                                                intermediateData_SVN_dataProcessing.Logs),axis=1)

#Save results in pickle
processedData_SVNLogs_dataProcessingFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 27.32073998451233 seconds


##### Query Scope (SVNUnitNames as query)

In [81]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dataProcessingFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dataProcessingFeaturesQueryScope["SvnUnitNamesAsQuery_QueryScope"] = processedData_dataProcessingCartesian.apply(lambda x: calcQueryScope(x.Unit_names, 
                                                                                                                intermediateData_SVN_dataProcessing.Unit_names),axis=1)

#Save results in pickle
processedData_SVNUnitNames_dataProcessingFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 21.544811010360718 seconds


##### Query Scope (JIRA as query)

In [82]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dataProcessingFeaturesQueryScope["JiraAsQuery_QueryScope"] = processedData_dataProcessingCartesian.apply(lambda x: calcQueryScope(x.Jira_natural_text, 
                                                                                                                intermediateData_JIRA_dataProcessing.Jira_natural_text),axis=1)

#Save results in pickle
processedData_JIRA_dataProcessingFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 7.517747163772583 seconds


##### Query Scope (JIRA Summaries as query)

In [83]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dataProcessingFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dataProcessingFeaturesQueryScope["JiraSummariesAsQuery_QueryScope"] = processedData_dataProcessingCartesian.apply(lambda x: calcQueryScope(x.Summary, 
                                                                                                                intermediateData_JIRA_dataProcessing.Summary),axis=1)

#Save results in pickle
processedData_JIRASummaries_dataProcessingFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 2.391801595687866 seconds


##### Query Scope (JIRA Descriptions as query)

In [84]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dataProcessingFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dataProcessingFeaturesQueryScope["JiraDescriptionsAsQuery_QueryScope"] = processedData_dataProcessingCartesian.apply(lambda x: calcQueryScope(x.Description, 
                                                                                                                intermediateData_JIRA_dataProcessing.Description),axis=1)

#Save results in pickle
processedData_JIRADescriptions_dataProcessingFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 7.506921291351318 seconds


##### Query Scope (JIRA Comments as query)

#### Kullback-Leiber divergence (SVN as query)

In [85]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dataProcessingFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dataProcessingFeaturesSCS["SvnAsQuery_SCS"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCS(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount),axis=1)

#Save results in pickle
processedData_SVN_dataProcessingFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 6.048178434371948 seconds


#### Kullback-Leiber divergence (SVNLogs as query)

In [86]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dataProcessingFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dataProcessingFeaturesSCS["SvnLogsAsQuery_SCS"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCS(x.Logs, 
                                                                                                                processedData_SVNLogs_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount),axis=1)

#Save results in pickle
processedData_SVNLogs_dataProcessingFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 0.7778475284576416 seconds


#### Kullback-Leiber divergence (SVNUnitNames as query)

In [87]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dataProcessingFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dataProcessingFeaturesSCS["SvnUnitNamesAsQuery_SCS"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCS(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_SVN_dataProcessing_documentCount),axis=1)

#Save results in pickle
processedData_SVNUnitNames_dataProcessingFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 5.156197547912598 seconds


#### Kullback-Leiber divergence (JIRA as query)

In [88]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dataProcessingFeaturesSCS["JiraAsQuery_SCS"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCS(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount),axis=1)

#Save results in pickle
processedData_JIRA_dataProcessingFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 2.2250781059265137 seconds


#### Kullback-Leiber divergence (JIRA Summaries as query)

In [89]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dataProcessingFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dataProcessingFeaturesSCS["JiraSummariesAsQuery_SCS"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCS(x.Summary, 
                                                                                                                processedData_JIRASummaries_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount),axis=1)

#Save results in pickle
processedData_JIRASummaries_dataProcessingFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 0.8609614372253418 seconds


In [90]:
##### Kullback-Leiber divergence (JIRA Description as query)

In [91]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dataProcessingFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dataProcessingFeaturesSCS["JiraDescriptionsAsQuery_SCS"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCS(x.Description, 
                                                                                                                processedData_JIRADescriptions_dataProcessingCountVectorizer, 
                                                                                                                intermediateData_JIRA_dataProcessing_documentCount),axis=1)

#Save results in pickle
processedData_JIRADescriptions_dataProcessingFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 1.734570026397705 seconds


In [92]:
##### Kullback-Leiber divergence (JIRA Comments as query)

#### SCQ (SVN as Query)

In [93]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dataProcessingFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dataProcessingFeaturesSCQ["SvnAsQuery_SCQ"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCQList(x.Commit_natural_text, intermediateData_SVN_dataProcessing.Commit_natural_text,
                                                                                                                                         processedData_SVN_dataProcessingCountVectorizer,
                                                                                                                                         processedData_SVN_dataProcessingCountTF_IDF,
                                                                                                                                         intermediateData_SVN_dataProcessing_documentCount),axis=1)

processedData_SVN_dataProcessingFeaturesSCQ["SvnAsQuery_avgSCQ"] = processedData_SVN_dataProcessingFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnAsQuery_SCQ, intermediateData_SVN_dataProcessing_documentCount), axis=1)
processedData_SVN_dataProcessingFeaturesSCQ["SvnAsQuery_maxSCQ"] = processedData_SVN_dataProcessingFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnAsQuery_SCQ), axis=1)
processedData_SVN_dataProcessingFeaturesSCQ["SvnAsQuery_sumSCQ"] = processedData_SVN_dataProcessingFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVN_dataProcessingFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 42.76312828063965 seconds


#### SCQ (SVNLogs as Query)

In [94]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dataProcessingFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dataProcessingFeaturesSCQ["SvnLogsAsQuery_SCQ"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCQList(x.Logs, intermediateData_SVN_dataProcessing.Logs,
                                                                                                                                         processedData_SVNLogs_dataProcessingCountVectorizer,
                                                                                                                                         processedData_SVNLogs_dataProcessingCountTF_IDF,
                                                                                                                                         intermediateData_SVN_dataProcessing_documentCount),axis=1)

processedData_SVNLogs_dataProcessingFeaturesSCQ["SvnLogsAsQuery_avgSCQ"] = processedData_SVNLogs_dataProcessingFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnLogsAsQuery_SCQ, intermediateData_SVN_dataProcessing_documentCount), axis=1)
processedData_SVNLogs_dataProcessingFeaturesSCQ["SvnLogsAsQuery_maxSCQ"] = processedData_SVNLogs_dataProcessingFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnLogsAsQuery_SCQ), axis=1)
processedData_SVNLogs_dataProcessingFeaturesSCQ["SvnLogsAsQuery_sumSCQ"] = processedData_SVNLogs_dataProcessingFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnLogsAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVNLogs_dataProcessingFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 46.11970615386963 seconds


#### SCQ (SVNUnitNames as Query)

In [95]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dataProcessingFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dataProcessingFeaturesSCQ["SvnUnitNamesAsQuery_SCQ"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCQList(x.Unit_names, intermediateData_SVN_dataProcessing.Unit_names,
                                                                                                                                         processedData_SVNUnitNames_dataProcessingCountVectorizer,
                                                                                                                                         processedData_SVNUnitNames_dataProcessingCountTF_IDF,
                                                                                                                                         intermediateData_SVN_dataProcessing_documentCount),axis=1)

processedData_SVNUnitNames_dataProcessingFeaturesSCQ["SvnUnitNamesAsQuery_avgSCQ"] = processedData_SVNUnitNames_dataProcessingFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnUnitNamesAsQuery_SCQ, intermediateData_SVN_dataProcessing_documentCount), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesSCQ["SvnUnitNamesAsQuery_maxSCQ"] = processedData_SVNUnitNames_dataProcessingFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnUnitNamesAsQuery_SCQ), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesSCQ["SvnUnitNamesAsQuery_sumSCQ"] = processedData_SVNUnitNames_dataProcessingFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnUnitNamesAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVNUnitNames_dataProcessingFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 44.2932505607605 seconds


#### SCQ (JIRA as Query)

In [96]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dataProcessingFeaturesSCQ["JiraAsQuery_SCQ"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCQList(x.Jira_natural_text, intermediateData_JIRA_dataProcessing.Jira_natural_text,
                                                                                                                                         processedData_JIRA_dataProcessingCountVectorizer,
                                                                                                                                         processedData_JIRA_dataProcessingTF_IDF,
                                                                                                                                         intermediateData_JIRA_dataProcessing_documentCount),axis=1)

processedData_JIRA_dataProcessingFeaturesSCQ["JiraAsQuery_avgSCQ"] = processedData_JIRA_dataProcessingFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraAsQuery_SCQ, intermediateData_JIRA_dataProcessing_documentCount), axis=1)
processedData_JIRA_dataProcessingFeaturesSCQ["JiraAsQuery_maxSCQ"] = processedData_JIRA_dataProcessingFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraAsQuery_SCQ), axis=1)
processedData_JIRA_dataProcessingFeaturesSCQ["JiraAsQuery_sumSCQ"] = processedData_JIRA_dataProcessingFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRA_dataProcessingFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 45.523656368255615 seconds


#### SCQ (JIRA Summaries as Query)

In [97]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dataProcessingFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dataProcessingFeaturesSCQ["JiraSummariesAsQuery_SCQ"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCQList(x.Summary, intermediateData_JIRA_dataProcessing.Summary,
                                                                                                                                         processedData_JIRASummaries_dataProcessingCountVectorizer,
                                                                                                                                         processedData_JIRASummaries_dataProcessingCountTF_IDF,
                                                                                                                                         intermediateData_JIRA_dataProcessing_documentCount),axis=1)

processedData_JIRASummaries_dataProcessingFeaturesSCQ["JiraSummariesAsQuery_avgSCQ"] = processedData_JIRASummaries_dataProcessingFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraSummariesAsQuery_SCQ, intermediateData_JIRA_dataProcessing_documentCount), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesSCQ["JiraSummariesAsQuery_maxSCQ"] = processedData_JIRASummaries_dataProcessingFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraSummariesAsQuery_SCQ), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesSCQ["JiraSummariesAsQuery_sumSCQ"] = processedData_JIRASummaries_dataProcessingFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraSummariesAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRASummaries_dataProcessingFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 39.17560577392578 seconds


#### SCQ (JIRA Descriptions as Query)

In [98]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dataProcessingFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dataProcessingFeaturesSCQ["JiraDescriptionsAsQuery_SCQ"] = processedData_dataProcessingCartesian.apply(lambda x: calcSCQList(x.Description, intermediateData_JIRA_dataProcessing.Description,
                                                                                                                                         processedData_JIRADescriptions_dataProcessingCountVectorizer,
                                                                                                                                         processedData_JIRADescriptions_dataProcessingCountTF_IDF,
                                                                                                                                         intermediateData_JIRA_dataProcessing_documentCount),axis=1)

processedData_JIRADescriptions_dataProcessingFeaturesSCQ["JiraDescriptionsAsQuery_avgSCQ"] = processedData_JIRADescriptions_dataProcessingFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraDescriptionsAsQuery_SCQ, intermediateData_JIRA_dataProcessing_documentCount), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesSCQ["JiraDescriptionsAsQuery_maxSCQ"] = processedData_JIRADescriptions_dataProcessingFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraDescriptionsAsQuery_SCQ), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesSCQ["JiraDescriptionsAsQuery_sumSCQ"] = processedData_JIRADescriptions_dataProcessingFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraDescriptionsAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRADescriptions_dataProcessingFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 43.849058866500854 seconds


#### SCQ (JIRA Comments as Query)

#### PMI (SVN as query)

In [99]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVN_dataProcessingCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVN_dataProcessingCountVectorizer, intermediateData_SVN_dataProcessing.Commit_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_dataProcessing.Commit_natural_text)

#Create new dataFrame
processedData_SVN_dataProcessingFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dataProcessingFeaturesPMI["SvnAsQuery_PMI"] = processedData_dataProcessingCartesian.apply(lambda x: calcPMIList(x.Commit_natural_text, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_dataProcessing.Commit_natural_text),axis=1)

processedData_SVN_dataProcessingFeaturesPMI["SvnAsQuery_avgPMI"] = processedData_SVN_dataProcessingFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnAsQuery_PMI), axis=1)
processedData_SVN_dataProcessingFeaturesPMI["SvnAsQuery_maxPMI"] = processedData_SVN_dataProcessingFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnAsQuery_PMI), axis=1)



processedData_SVN_dataProcessingFeaturesPMI.drop('SvnAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVN_dataProcessingFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVN_dataProcessingFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 13 minutes and 28.791770935058594 seconds


#### PMI (SVNLogs as query)

In [100]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVNLogs_dataProcessingCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVNLogs_dataProcessingCountVectorizer, intermediateData_SVN_dataProcessing.Logs)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_dataProcessing.Logs)

#Create new dataFrame
processedData_SVNLogs_dataProcessingFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dataProcessingFeaturesPMI["SvnLogsAsQuery_PMI"] = processedData_dataProcessingCartesian.apply(lambda x: calcPMIList(x.Logs, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_dataProcessing.Logs),axis=1)

processedData_SVNLogs_dataProcessingFeaturesPMI["SvnLogsAsQuery_avgPMI"] = processedData_SVNLogs_dataProcessingFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnLogsAsQuery_PMI), axis=1)
processedData_SVNLogs_dataProcessingFeaturesPMI["SvnLogsAsQuery_maxPMI"] = processedData_SVNLogs_dataProcessingFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnLogsAsQuery_PMI), axis=1)



processedData_SVNLogs_dataProcessingFeaturesPMI.drop('SvnLogsAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVNLogs_dataProcessingFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 5 minutes and 31.293832063674927 seconds


#### PMI (SVNUnitNames as query)

In [101]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVNUnitNames_dataProcessingCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVNUnitNames_dataProcessingCountVectorizer, intermediateData_SVN_dataProcessing.Unit_names)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_dataProcessing.Unit_names)

#Create new dataFrame
processedData_SVNUnitNames_dataProcessingFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dataProcessingFeaturesPMI["SvnUnitNamesAsQuery_PMI"] = processedData_dataProcessingCartesian.apply(lambda x: calcPMIList(x.Unit_names, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_dataProcessing.Unit_names),axis=1)

processedData_SVNUnitNames_dataProcessingFeaturesPMI["SvnUnitNamesAsQuery_avgPMI"] = processedData_SVNUnitNames_dataProcessingFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnUnitNamesAsQuery_PMI), axis=1)
processedData_SVNUnitNames_dataProcessingFeaturesPMI["SvnUnitNamesAsQuery_maxPMI"] = processedData_SVNUnitNames_dataProcessingFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnUnitNamesAsQuery_PMI), axis=1)



processedData_SVNUnitNames_dataProcessingFeaturesPMI.drop('SvnUnitNamesAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVNUnitNames_dataProcessingFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 5 minutes and 34.49775004386902 seconds


#### PMI (JIRA as query)

In [102]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRA_dataProcessingCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRA_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessing.Jira_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_dataProcessing.Jira_natural_text)

#Create new dataFrame
processedData_JIRA_dataProcessingFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dataProcessingFeaturesPMI["JiraAsQuery_PMI"] = processedData_dataProcessingCartesian.apply(lambda x: calcPMIList(x.Jira_natural_text, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_dataProcessing.Jira_natural_text),axis=1)

processedData_JIRA_dataProcessingFeaturesPMI["JiraAsQuery_avgPMI"] = processedData_JIRA_dataProcessingFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraAsQuery_PMI), axis=1)
processedData_JIRA_dataProcessingFeaturesPMI["JiraAsQuery_maxPMI"] = processedData_JIRA_dataProcessingFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraAsQuery_PMI), axis=1)



processedData_JIRA_dataProcessingFeaturesPMI.drop('JiraAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRA_dataProcessingFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRA_dataProcessingFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 17.765934705734253 seconds


#### PMI (JIRA Summaries as query)

In [103]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRASummaries_dataProcessingCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRASummaries_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessing.Summary)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_dataProcessing.Summary)

#Create new dataFrame
processedData_JIRASummaries_dataProcessingFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dataProcessingFeaturesPMI["JiraSummariesAsQuery_PMI"] = processedData_dataProcessingCartesian.apply(lambda x: calcPMIList(x.Summary, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_dataProcessing.Summary),axis=1)

processedData_JIRASummaries_dataProcessingFeaturesPMI["JiraSummariesAsQuery_avgPMI"] = processedData_JIRASummaries_dataProcessingFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraSummariesAsQuery_PMI), axis=1)
processedData_JIRASummaries_dataProcessingFeaturesPMI["JiraSummariesAsQuery_maxPMI"] = processedData_JIRASummaries_dataProcessingFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraSummariesAsQuery_PMI), axis=1)



processedData_JIRASummaries_dataProcessingFeaturesPMI.drop('JiraSummariesAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRASummaries_dataProcessingFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 0 minutes and 6.765865325927734 seconds


#### PMI (JIRA Descriptions as query)

In [104]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRADescriptions_dataProcessingCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRADescriptions_dataProcessingCountVectorizer, intermediateData_JIRA_dataProcessing.Description)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_dataProcessing.Description)

#Create new dataFrame
processedData_JIRADescriptions_dataProcessingFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dataProcessingFeaturesPMI["JiraDescriptionsAsQuery_PMI"] = processedData_dataProcessingCartesian.apply(lambda x: calcPMIList(x.Description, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_dataProcessing.Description),axis=1)

processedData_JIRADescriptions_dataProcessingFeaturesPMI["JiraDescriptionsAsQuery_avgPMI"] = processedData_JIRADescriptions_dataProcessingFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraDescriptionsAsQuery_PMI), axis=1)
processedData_JIRADescriptions_dataProcessingFeaturesPMI["JiraDescriptionsAsQuery_maxPMI"] = processedData_JIRADescriptions_dataProcessingFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraDescriptionsAsQuery_PMI), axis=1)



processedData_JIRADescriptions_dataProcessingFeaturesPMI.drop('JiraDescriptionsAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRADescriptions_dataProcessingFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 1 minutes and 0.5736644268035889 seconds


#### PMI (JIRA Comments as query)

In [105]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

def normalizeData(dataFrame):
    scaler = preprocessing.MinMaxScaler()
    names = dataFrame.columns
    d = scaler.fit_transform(dataFrame)
    scaledDataFrame = pd.DataFrame(d, columns=names)
    return(scaledDataFrame)

# Normalize all data

In [106]:
from sklearn import preprocessing
import numpy as np

################################## Loading #################################
#Load Process-Related Features
processedData_dataProcessingFeaturesTime = pd.read_pickle(r'../data/03_processed/processedData_dataProcessingFeaturesTime.pkl')
processedData_dataProcessingFeaturesStakeholder = pd.read_pickle(r'../data/03_processed/processedData_dataProcessingFeaturesStakeholder.pkl')

#Load IR-Related Features - unigram
processedData_dataProcessing_features_VsmLogsJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmLogsJiraAsQuery.pkl')
processedData_dataProcessing_features_VsmLogsLogAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmLogsLogAsQuery.pkl')
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery.pkl')
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery.pkl')

#processedData_dataProcessing_features_VsmUnitNamesCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesCommentsCommentsAsQuery.pkl')
#processedData_dataProcessing_features_VsmUnitNamesCommentsUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesCommentsUnitNamesAsQuery.pkl')
processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery.pkl')
processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery.pkl')

#processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery.pkl')
#processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.pkl')
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery.pkl')
processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery.pkl')
processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery.pkl')
processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery.pkl')
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery.pkl')
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmDescriptionLogsAsQuery.pkl')
#processedData_dataProcessing_features_VsmCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmCommentsCommentsAsQuery.pkl')
#processedData_dataProcessing_features_VsmCommentsLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmCommentsLogsAsQuery.pkl')

processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery.pkl')
processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery.pkl')
processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery.pkl')
processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery.pkl')
processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery.pkl')
processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery.pkl')
#processedData_dataProcessing_features_VsmSvnCommentsSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnCommentsSvnAsQuery.pkl')
#processedData_dataProcessing_features_VsmSvnCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmSvnCommentsCommentsAsQuery.pkl')


#Load IR-Related Features - bigram
#processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram.pkl')
#processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram.pkl')
#processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram.pkl')
#processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram.pkl')
#processedData_dataProcessing_features_VsmCommentsLogsAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmCommentsLogsAsQuery_2gram.pkl')
#processedData_dataProcessing_features_VsmCommentsCommentsAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dataProcessing_features_VsmCommentsCommentsAsQuery_2gram.pkl')


#Load Document Statistics Features
processedData_JIRA_dataProcessingFeaturesUniqueWordCount = pd.read_pickle(r"../data/03_processed/processedData_JIRA_dataProcessingFeaturesUniqueWordCount.pkl")
processedData_SVN_dataProcessingFeaturesUniqueWordCount = pd.read_pickle(r"../data/03_processed/processedData_SVN_dataProcessingFeaturesUniqueWordCount.pkl")
processedData_JIRA_dataProcessingFeaturesTotalWordCount = pd.read_pickle(r"../data/03_processed/processedData_JIRA_dataProcessingFeaturesTotalWordCount.pkl")
processedData_SVN_dataProcessingFeaturesTotalWordCount = pd.read_pickle(r"../data/03_processed/processedData_SVN_dataProcessingFeaturesTotalWordCount.pkl")
processedData_JIRA_dataProcessingFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_JIRA_dataProcessingFeaturesOverlapPercentage.pkl")
processedData_SVN_dataProcessingFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_SVN_dataProcessingFeaturesOverlapPercentage.pkl")
processedData_UNION_dataProcessingFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_UNION_dataProcessingFeaturesOverlapPercentage.pkl")

#Load Query Quality Features
#processedData_dataProcessingFeaturesQueryQuality = pd.read_pickle(r'../data/03_processed/processedData_dataProcessingFeaturesQueryQuality.pkl')
processedData_SVN_dataProcessingFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVN_dataProcessingFeaturesIDF.pkl')
processedData_SVNLogs_dataProcessingFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesIDF.pkl')
processedData_SVNUnitNames_dataProcessingFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesIDF.pkl')
processedData_JIRA_dataProcessingFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dataProcessingFeaturesIDF.pkl')
processedData_JIRASummaries_dataProcessingFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesIDF.pkl')
processedData_JIRADescriptions_dataProcessingFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesIDF.pkl')
#processedData_JIRAComments_dataProcessingFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dataProcessingFeaturesIDF.pkl')


processedData_SVN_dataProcessingFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVN_dataProcessingFeaturesICTF.pkl')
processedData_SVNLogs_dataProcessingFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesICTF.pkl')
processedData_SVNUnitNames_dataProcessingFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesICTF.pkl')
processedData_JIRA_dataProcessingFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dataProcessingFeaturesICTF.pkl')
processedData_JIRASummaries_dataProcessingFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesICTF.pkl')
processedData_JIRADescriptions_dataProcessingFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesICTF.pkl')
#processedData_JIRAComments_dataProcessingFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dataProcessingFeaturesICTF.pkl')


processedData_SVN_dataProcessingFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVN_dataProcessingFeaturesEntropy.pkl')
processedData_SVNLogs_dataProcessingFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesEntropy.pkl')
processedData_SVNUnitNames_dataProcessingFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesEntropy.pkl')
processedData_JIRA_dataProcessingFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dataProcessingFeaturesEntropy.pkl')
processedData_JIRASummaries_dataProcessingFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesEntropy.pkl')
processedData_JIRADescriptions_dataProcessingFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesEntropy.pkl')
#processedData_JIRAComments_dataProcessingFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dataProcessingFeaturesEntropy.pkl')


processedData_SVN_dataProcessingFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVN_dataProcessingFeaturesQueryScope.pkl')
processedData_SVNLogs_dataProcessingFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesQueryScope.pkl')
processedData_SVNUnitNames_dataProcessingFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesQueryScope.pkl')
processedData_JIRA_dataProcessingFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dataProcessingFeaturesQueryScope.pkl')
processedData_JIRASummaries_dataProcessingFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesQueryScope.pkl')
processedData_JIRADescriptions_dataProcessingFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesQueryScope.pkl')
#processedData_JIRAComments_dataProcessingFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dataProcessingFeaturesQueryScope.pkl')


processedData_SVN_dataProcessingFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVN_dataProcessingFeaturesSCS.pkl')
processedData_SVNLogs_dataProcessingFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesSCS.pkl')
processedData_SVNUnitNames_dataProcessingFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesSCS.pkl')
processedData_JIRA_dataProcessingFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dataProcessingFeaturesSCS.pkl')
processedData_JIRASummaries_dataProcessingFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesSCS.pkl')
processedData_JIRADescriptions_dataProcessingFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesSCS.pkl')
#processedData_JIRAComments_dataProcessingFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dataProcessingFeaturesSCS.pkl')


processedData_SVN_dataProcessingFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVN_dataProcessingFeaturesSCQ.pkl')
processedData_SVNLogs_dataProcessingFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesSCQ.pkl')
processedData_SVNUnitNames_dataProcessingFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesSCQ.pkl')
processedData_JIRA_dataProcessingFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dataProcessingFeaturesSCQ.pkl')
processedData_JIRASummaries_dataProcessingFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesSCQ.pkl')
processedData_JIRADescriptions_dataProcessingFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesSCQ.pkl')
#processedData_JIRAComments_dataProcessingFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dataProcessingFeaturesSCQ.pkl')


#processedData_SVN_dataProcessingFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVN_dataProcessingFeaturesPMI.pkl')
processedData_SVNLogs_dataProcessingFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dataProcessingFeaturesPMI.pkl')
#processedData_SVNUnitNames_dataProcessingFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dataProcessingFeaturesPMI.pkl')
#processedData_JIRA_dataProcessingFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dataProcessingFeaturesPMI.pkl')
processedData_JIRASummaries_dataProcessingFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dataProcessingFeaturesPMI.pkl')
#processedData_JIRADescriptions_dataProcessingFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dataProcessingFeaturesPMI.pkl')
#processedData_JIRAComments_dataProcessingFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dataProcessingFeaturesPMI.pkl')


################################## Drop query array for normalization ###############################################


processedData_SVN_dataProcessingFeaturesIDF.drop('SvnAsQuery_IDF', axis = 1, inplace=True)
processedData_SVNLogs_dataProcessingFeaturesIDF.drop('SvnLogsAsQuery_IDF', axis = 1, inplace=True)
processedData_SVNUnitNames_dataProcessingFeaturesIDF.drop('SvnUnitNamesAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRA_dataProcessingFeaturesIDF.drop('JiraAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRASummaries_dataProcessingFeaturesIDF.drop('JiraSummariesAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRADescriptions_dataProcessingFeaturesIDF.drop('JiraDescriptionsAsQuery_IDF', axis = 1, inplace=True)
#processedData_JIRAComments_dataProcessingFeaturesIDF.drop('JiraCommentsAsQuery_IDF', axis = 1, inplace=True)

processedData_SVN_dataProcessingFeaturesICTF.drop('SvnAsQuery_ICTF', axis = 1, inplace=True)
processedData_SVNLogs_dataProcessingFeaturesICTF.drop('SvnLogsAsQuery_ICTF', axis = 1, inplace=True)
processedData_SVNUnitNames_dataProcessingFeaturesICTF.drop('SvnUnitNamesAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRA_dataProcessingFeaturesICTF.drop('JiraAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRASummaries_dataProcessingFeaturesICTF.drop('JiraSummariesAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRADescriptions_dataProcessingFeaturesICTF.drop('JiraDescriptionsAsQuery_ICTF', axis = 1, inplace=True)
#processedData_JIRAComments_dataProcessingFeaturesICTF.drop('JiraCommentsAsQuery_ICTF', axis = 1, inplace=True)

processedData_SVN_dataProcessingFeaturesEntropy.drop('SvnAsQuery_Entropy', axis = 1, inplace=True)
processedData_SVNLogs_dataProcessingFeaturesEntropy.drop('SvnLogsAsQuery_Entropy', axis = 1, inplace=True)
processedData_SVNUnitNames_dataProcessingFeaturesEntropy.drop('SvnUnitNamesAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRA_dataProcessingFeaturesEntropy.drop('JiraAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRASummaries_dataProcessingFeaturesEntropy.drop('JiraSummariesAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRADescriptions_dataProcessingFeaturesEntropy.drop('JiraDescriptionsAsQuery_Entropy', axis = 1, inplace=True)
#processedData_JIRAComments_dataProcessingFeaturesEntropy.drop('JiraCommentsAsQuery_Entropy', axis = 1, inplace=True)

processedData_SVN_dataProcessingFeaturesSCQ.drop('SvnAsQuery_SCQ', axis = 1, inplace=True)
processedData_SVNLogs_dataProcessingFeaturesSCQ.drop('SvnLogsAsQuery_SCQ', axis = 1, inplace=True)
processedData_SVNUnitNames_dataProcessingFeaturesSCQ.drop('SvnUnitNamesAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRA_dataProcessingFeaturesSCQ.drop('JiraAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRASummaries_dataProcessingFeaturesSCQ.drop('JiraSummariesAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRADescriptions_dataProcessingFeaturesSCQ.drop('JiraDescriptionsAsQuery_SCQ', axis = 1, inplace=True)
#processedData_JIRAComments_dataProcessingFeaturesSCQ.drop('JiraCommentsAsQuery_SCQ', axis = 1, inplace=True)

################################## Normalizing ################################################

processedData_dataProcessingFeaturesTime_normalized = normalizeData(processedData_dataProcessingFeaturesTime)
processedData_dataProcessingFeaturesStakeholder_normalized = normalizeData(processedData_dataProcessingFeaturesStakeholder)

#Load IR-Related Features - unigram
processedData_dataProcessing_features_VsmLogsJiraAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmLogsJiraAsQuery)
processedData_dataProcessing_features_VsmLogsLogAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmLogsLogAsQuery)
processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery)
processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery)
#processedData_dataProcessing_features_VsmUnitNamesCommentsCommentsAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesCommentsCommentsAsQuery)
#processedData_dataProcessing_features_VsmUnitNamesCommentsUnitNamesAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesCommentsUnitNamesAsQuery)
processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery)
processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery)

#processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery)
#processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery)
processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery)
processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery)
processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery)
processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery)
processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery)
processedData_dataProcessing_features_VsmDescriptionLogsAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmDescriptionLogsAsQuery)
#processedData_dataProcessing_features_VsmCommentsCommentsAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmCommentsCommentsAsQuery)
#processedData_dataProcessing_features_VsmCommentsLogsAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmCommentsLogsAsQuery)

processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery)
processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery)
processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery)
processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery)
processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery)
processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery)
#processedData_dataProcessing_features_VsmSvnCommentsSvnAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnCommentsSvnAsQuery)
#processedData_dataProcessing_features_VsmSvnCommentsCommentsAsQuery_normalized = normalizeData(processedData_dataProcessing_features_VsmSvnCommentsCommentsAsQuery)



#Load IR-Related Features - bigram
#processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram_normalized = normalizeData(processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram)
#processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram_normalized = normalizeData(processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram)
#processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram)
#processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram_normalized = normalizeData(processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram)
#processedData_dataProcessing_features_VsmCommentsLogsAsQuery_2gram_normalized = normalizeData(processedData_dataProcessing_features_VsmCommentsLogsAsQuery_2gram)
#processedData_dataProcessing_features_VsmCommentsCommentsAsQuery_2gram_normalized = normalizeData(processedData_dataProcessing_features_VsmCommentsCommentsAsQuery_2gram)


#Load Document Statistics Features
processedData_JIRA_dataProcessingFeaturesUniqueWordCount_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesUniqueWordCount)
processedData_SVN_dataProcessingFeaturesUniqueWordCount_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesUniqueWordCount)
processedData_JIRA_dataProcessingFeaturesTotalWordCount_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesTotalWordCount)
processedData_SVN_dataProcessingFeaturesTotalWordCount_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesTotalWordCount)
processedData_JIRA_dataProcessingFeaturesOverlapPercentage_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesOverlapPercentage)
processedData_SVN_dataProcessingFeaturesOverlapPercentage_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesOverlapPercentage)
processedData_UNION_dataProcessingFeaturesOverlapPercentage_normalized = normalizeData(processedData_UNION_dataProcessingFeaturesOverlapPercentage)

#Load Query Quality Features
processedData_SVN_dataProcessingFeaturesIDF_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesIDF)
processedData_SVNLogs_dataProcessingFeaturesIDF_normalized = normalizeData(processedData_SVNLogs_dataProcessingFeaturesIDF)
processedData_SVNUnitNames_dataProcessingFeaturesIDF_normalized = normalizeData(processedData_SVNUnitNames_dataProcessingFeaturesIDF)
processedData_JIRA_dataProcessingFeaturesIDF_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesIDF)
processedData_JIRASummaries_dataProcessingFeaturesIDF_normalized = normalizeData(processedData_JIRASummaries_dataProcessingFeaturesIDF)
processedData_JIRADescriptions_dataProcessingFeaturesIDF_normalized = normalizeData(processedData_JIRADescriptions_dataProcessingFeaturesIDF)
#processedData_JIRAComments_dataProcessingFeaturesIDF_normalized = normalizeData(processedData_JIRAComments_dataProcessingFeaturesIDF)

processedData_SVN_dataProcessingFeaturesICTF_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesICTF)
processedData_SVNLogs_dataProcessingFeaturesICTF_normalized = normalizeData(processedData_SVNLogs_dataProcessingFeaturesICTF)
processedData_SVNUnitNames_dataProcessingFeaturesICTF_normalized = normalizeData(processedData_SVNUnitNames_dataProcessingFeaturesICTF)
processedData_JIRA_dataProcessingFeaturesICTF_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesICTF)
processedData_JIRASummaries_dataProcessingFeaturesICTF_normalized = normalizeData(processedData_JIRASummaries_dataProcessingFeaturesICTF)
processedData_JIRADescriptions_dataProcessingFeaturesICTF_normalized = normalizeData(processedData_JIRADescriptions_dataProcessingFeaturesICTF)
#processedData_JIRAComments_dataProcessingFeaturesICTF_normalized = normalizeData(processedData_JIRAComments_dataProcessingFeaturesICTF)

processedData_SVN_dataProcessingFeaturesEntropy_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesEntropy)
processedData_SVNLogs_dataProcessingFeaturesEntropy_normalized = normalizeData(processedData_SVNLogs_dataProcessingFeaturesEntropy)
processedData_SVNUnitNames_dataProcessingFeaturesEntropy_normalized = normalizeData(processedData_SVNUnitNames_dataProcessingFeaturesEntropy)
processedData_JIRA_dataProcessingFeaturesEntropy_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesEntropy)
processedData_JIRASummaries_dataProcessingFeaturesEntropy_normalized = normalizeData(processedData_JIRASummaries_dataProcessingFeaturesEntropy)
processedData_JIRADescriptions_dataProcessingFeaturesEntropy_normalized = normalizeData(processedData_JIRADescriptions_dataProcessingFeaturesEntropy)
#processedData_JIRAComments_dataProcessingFeaturesEntropy_normalized = normalizeData(processedData_JIRAComments_dataProcessingFeaturesEntropy)

processedData_SVN_dataProcessingFeaturesQueryScope_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesQueryScope)
processedData_SVNLogs_dataProcessingFeaturesQueryScope_normalized = normalizeData(processedData_SVNLogs_dataProcessingFeaturesQueryScope)
processedData_SVNUnitNames_dataProcessingFeaturesQueryScope_normalized = normalizeData(processedData_SVNUnitNames_dataProcessingFeaturesQueryScope)
processedData_JIRA_dataProcessingFeaturesQueryScope_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesQueryScope)
processedData_JIRASummaries_dataProcessingFeaturesQueryScope_normalized = normalizeData(processedData_JIRASummaries_dataProcessingFeaturesQueryScope)
processedData_JIRADescriptions_dataProcessingFeaturesQueryScope_normalized = normalizeData(processedData_JIRADescriptions_dataProcessingFeaturesQueryScope)
#processedData_JIRAComments_dataProcessingFeaturesQueryScope_normalized = normalizeData(processedData_JIRAComments_dataProcessingFeaturesQueryScope)

processedData_SVN_dataProcessingFeaturesSCS_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesSCS)
processedData_SVNLogs_dataProcessingFeaturesSCS_normalized = normalizeData(processedData_SVNLogs_dataProcessingFeaturesSCS)
processedData_SVNUnitNames_dataProcessingFeaturesSCS_normalized = normalizeData(processedData_SVNUnitNames_dataProcessingFeaturesSCS)
processedData_JIRA_dataProcessingFeaturesSCS_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesSCS)
processedData_JIRASummaries_dataProcessingFeaturesSCS_normalized = normalizeData(processedData_JIRASummaries_dataProcessingFeaturesSCS)
processedData_JIRADescriptions_dataProcessingFeaturesSCS_normalized = normalizeData(processedData_JIRADescriptions_dataProcessingFeaturesSCS)
#processedData_JIRAComments_dataProcessingFeaturesSCS_normalized = normalizeData(processedData_JIRAComments_dataProcessingFeaturesSCS)

processedData_SVN_dataProcessingFeaturesSCQ_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesSCQ)
processedData_SVNLogs_dataProcessingFeaturesSCQ_normalized = normalizeData(processedData_SVNLogs_dataProcessingFeaturesSCQ)
processedData_SVNUnitNames_dataProcessingFeaturesSCQ_normalized = normalizeData(processedData_SVNUnitNames_dataProcessingFeaturesSCQ)
processedData_JIRA_dataProcessingFeaturesSCQ_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesSCQ)
processedData_JIRASummaries_dataProcessingFeaturesSCQ_normalized = normalizeData(processedData_JIRASummaries_dataProcessingFeaturesSCQ)
processedData_JIRADescriptions_dataProcessingFeaturesSCQ_normalized = normalizeData(processedData_JIRADescriptions_dataProcessingFeaturesSCQ)
#processedData_JIRAComments_dataProcessingFeaturesSCQ_normalized = normalizeData(processedData_JIRAComments_dataProcessingFeaturesSCQ)

#processedData_SVN_dataProcessingFeaturesPMI_normalized = normalizeData(processedData_SVN_dataProcessingFeaturesPMI)
processedData_SVNLogs_dataProcessingFeaturesPMI_normalized = normalizeData(processedData_SVNLogs_dataProcessingFeaturesPMI)
#processedData_SVNUnitNames_dataProcessingFeaturesPMI_normalized = normalizeData(processedData_SVNUnitNames_dataProcessingFeaturesPMI)
#processedData_JIRA_dataProcessingFeaturesPMI_normalized = normalizeData(processedData_JIRA_dataProcessingFeaturesPMI)
processedData_JIRASummaries_dataProcessingFeaturesPMI_normalized = normalizeData(processedData_JIRASummaries_dataProcessingFeaturesPMI)
#processedData_JIRADescriptions_dataProcessingFeaturesPMI_normalized = normalizeData(processedData_JIRADescriptions_dataProcessingFeaturesPMI)
#processedData_JIRAComments_dataProcessingFeaturesPMI_normalized = normalizeData(processedData_JIRAComments_dataProcessingFeaturesPMI)


## 3.8 Preprocess Data - Load and transform feature families needed for training

In [107]:

#Merge features into 1 dataframe
processedData_dataProcessingFeatures_normalized = pd.concat([processedData_dataProcessingFeaturesTime_normalized,
                                                  processedData_dataProcessingFeaturesStakeholder_normalized,
                                                  #IR-based
                                                  processedData_dataProcessing_features_VsmLogsJiraAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmLogsLogAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_normalized,
                                                #  processedData_dataProcessing_features_VsmUnitNamesCommentsCommentsAsQuery_normalized,
                                                #  processedData_dataProcessing_features_VsmUnitNamesCommentsUnitNamesAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmDescriptionLogsAsQuery_normalized,
                                                 # processedData_dataProcessing_features_VsmCommentsCommentsAsQuery_normalized,
                                                #  processedData_dataProcessing_features_VsmCommentsLogsAsQuery_normalized,
                                                 # processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram_normalized,
                                                 # processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram_normalized,
                                                 # processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram_normalized,
                                                 # processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram_normalized,
                                                  #processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery_normalized,
                                                 # processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery_normalized,
                                                  processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery_normalized,
                                                #  processedData_dataProcessing_features_VsmSvnCommentsSvnAsQuery_normalized,
                                                #  processedData_dataProcessing_features_VsmSvnCommentsCommentsAsQuery_normalized,

                                                  
                                                  #Document Statistics
                                                  processedData_JIRA_dataProcessingFeaturesUniqueWordCount_normalized,
                                                  processedData_SVN_dataProcessingFeaturesUniqueWordCount_normalized,
                                                  processedData_JIRA_dataProcessingFeaturesTotalWordCount_normalized,
                                                  processedData_SVN_dataProcessingFeaturesTotalWordCount_normalized,
                                                  processedData_JIRA_dataProcessingFeaturesOverlapPercentage_normalized,
                                                  processedData_SVN_dataProcessingFeaturesOverlapPercentage_normalized,
                                                  processedData_UNION_dataProcessingFeaturesOverlapPercentage_normalized,
                                                 #Query Quality
                                                  processedData_SVN_dataProcessingFeaturesIDF_normalized['SvnAsQuery_avgIDF'],
                                                  processedData_SVN_dataProcessingFeaturesIDF_normalized['SvnAsQuery_maxIDF'],
                                                  processedData_SVN_dataProcessingFeaturesIDF_normalized['SvnAsQuery_devIDF'],
                                                  processedData_SVNLogs_dataProcessingFeaturesIDF_normalized['SvnLogsAsQuery_avgIDF'],
                                                  processedData_SVNLogs_dataProcessingFeaturesIDF_normalized['SvnLogsAsQuery_maxIDF'],
                                                  processedData_SVNLogs_dataProcessingFeaturesIDF_normalized['SvnLogsAsQuery_devIDF'],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesIDF_normalized['SvnUnitNamesAsQuery_avgIDF'],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesIDF_normalized['SvnUnitNamesAsQuery_maxIDF'],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesIDF_normalized['SvnUnitNamesAsQuery_devIDF'],
                                                  processedData_JIRA_dataProcessingFeaturesIDF_normalized['JiraAsQuery_avgIDF'],
                                                  processedData_JIRA_dataProcessingFeaturesIDF_normalized['JiraAsQuery_maxIDF'],
                                                  processedData_JIRA_dataProcessingFeaturesIDF_normalized['JiraAsQuery_devIDF'],  
                                                  processedData_JIRASummaries_dataProcessingFeaturesIDF_normalized['JiraSummariesAsQuery_avgIDF'],
                                                  processedData_JIRASummaries_dataProcessingFeaturesIDF_normalized['JiraSummariesAsQuery_maxIDF'],
                                                  processedData_JIRASummaries_dataProcessingFeaturesIDF_normalized['JiraSummariesAsQuery_devIDF'],  
                                                  processedData_JIRADescriptions_dataProcessingFeaturesIDF_normalized['JiraDescriptionsAsQuery_avgIDF'],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesIDF_normalized['JiraDescriptionsAsQuery_maxIDF'],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesIDF_normalized['JiraDescriptionsAsQuery_devIDF'],  
                                                #  processedData_JIRAComments_dataProcessingFeaturesIDF_normalized['JiraCommentsAsQuery_avgIDF'],
                                                #  processedData_JIRAComments_dataProcessingFeaturesIDF_normalized['JiraCommentsAsQuery_maxIDF'],
                                                #  processedData_JIRAComments_dataProcessingFeaturesIDF_normalized['JiraCommentsAsQuery_devIDF'],  
                                                  
                                                  processedData_SVN_dataProcessingFeaturesICTF_normalized["SvnAsQuery_avgICTF"],
                                                  processedData_SVN_dataProcessingFeaturesICTF_normalized["SvnAsQuery_maxICTF"],
                                                  processedData_SVN_dataProcessingFeaturesICTF_normalized["SvnAsQuery_devICTF"],
                                                  processedData_SVNLogs_dataProcessingFeaturesICTF_normalized["SvnLogsAsQuery_avgICTF"],
                                                  processedData_SVNLogs_dataProcessingFeaturesICTF_normalized["SvnLogsAsQuery_maxICTF"],
                                                  processedData_SVNLogs_dataProcessingFeaturesICTF_normalized["SvnLogsAsQuery_devICTF"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesICTF_normalized["SvnUnitNamesAsQuery_avgICTF"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesICTF_normalized["SvnUnitNamesAsQuery_maxICTF"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesICTF_normalized["SvnUnitNamesAsQuery_devICTF"],
                                                  processedData_JIRA_dataProcessingFeaturesICTF_normalized["JiraAsQuery_avgICTF"],
                                                  processedData_JIRA_dataProcessingFeaturesICTF_normalized["JiraAsQuery_maxICTF"],
                                                  processedData_JIRA_dataProcessingFeaturesICTF_normalized["JiraAsQuery_devICTF"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesICTF_normalized["JiraSummariesAsQuery_avgICTF"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesICTF_normalized["JiraSummariesAsQuery_maxICTF"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesICTF_normalized["JiraSummariesAsQuery_devICTF"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesICTF_normalized["JiraDescriptionsAsQuery_avgICTF"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesICTF_normalized["JiraDescriptionsAsQuery_maxICTF"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesICTF_normalized["JiraDescriptionsAsQuery_devICTF"],
                                              #    processedData_JIRAComments_dataProcessingFeaturesICTF_normalized["JiraCommentsAsQuery_avgICTF"],
                                              #    processedData_JIRAComments_dataProcessingFeaturesICTF_normalized["JiraCommentsAsQuery_maxICTF"],
                                              #    processedData_JIRAComments_dataProcessingFeaturesICTF_normalized["JiraCommentsAsQuery_devICTF"],
                                                  
                                                  processedData_SVN_dataProcessingFeaturesEntropy_normalized["SvnAsQuery_avgEntropy"],
                                                  processedData_SVN_dataProcessingFeaturesEntropy_normalized["SvnAsQuery_medEntropy"],
                                                  processedData_SVN_dataProcessingFeaturesEntropy_normalized["SvnAsQuery_maxEntropy"],
                                                  processedData_SVN_dataProcessingFeaturesEntropy_normalized["SvnAsQuery_devEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy_normalized["SvnLogsAsQuery_avgEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy_normalized["SvnLogsAsQuery_medEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy_normalized["SvnLogsAsQuery_maxEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy_normalized["SvnLogsAsQuery_devEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy_normalized["SvnUnitNamesAsQuery_avgEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy_normalized["SvnUnitNamesAsQuery_medEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy_normalized["SvnUnitNamesAsQuery_maxEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy_normalized["SvnUnitNamesAsQuery_devEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy_normalized["JiraAsQuery_avgEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy_normalized["JiraAsQuery_medEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy_normalized["JiraAsQuery_maxEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy_normalized["JiraAsQuery_devEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy_normalized["JiraSummariesAsQuery_avgEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy_normalized["JiraSummariesAsQuery_medEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy_normalized["JiraSummariesAsQuery_maxEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy_normalized["JiraSummariesAsQuery_devEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy_normalized["JiraDescriptionsAsQuery_avgEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy_normalized["JiraDescriptionsAsQuery_medEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy_normalized["JiraDescriptionsAsQuery_maxEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy_normalized["JiraDescriptionsAsQuery_devEntropy"],
                                               #   processedData_JIRAComments_dataProcessingFeaturesEntropy_normalized["JiraCommentsAsQuery_avgEntropy"],
                                               #   processedData_JIRAComments_dataProcessingFeaturesEntropy_normalized["JiraCommentsAsQuery_medEntropy"],
                                               #   processedData_JIRAComments_dataProcessingFeaturesEntropy_normalized["JiraCommentsAsQuery_maxEntropy"],
                                               #   processedData_JIRAComments_dataProcessingFeaturesEntropy_normalized["JiraCommentsAsQuery_devEntropy"],
                                                  
                                                  processedData_SVN_dataProcessingFeaturesQueryScope_normalized,
                                                  processedData_SVNLogs_dataProcessingFeaturesQueryScope_normalized,
                                                  processedData_SVNUnitNames_dataProcessingFeaturesQueryScope_normalized,
                                                  processedData_JIRA_dataProcessingFeaturesQueryScope_normalized,
                                                  processedData_JIRASummaries_dataProcessingFeaturesQueryScope_normalized,
                                                  processedData_JIRADescriptions_dataProcessingFeaturesQueryScope_normalized,
                                                #  processedData_JIRAComments_dataProcessingFeaturesQueryScope_normalized,
                                                  
                                                  processedData_SVN_dataProcessingFeaturesSCS_normalized,
                                                  processedData_SVNLogs_dataProcessingFeaturesSCS_normalized,
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCS_normalized,
                                                  processedData_JIRA_dataProcessingFeaturesSCS_normalized,
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCS_normalized,
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCS_normalized,
                                                #  processedData_JIRAComments_dataProcessingFeaturesSCS_normalized,
                                                  
                                                  processedData_SVN_dataProcessingFeaturesSCQ_normalized["SvnAsQuery_avgSCQ"],
                                                  processedData_SVN_dataProcessingFeaturesSCQ_normalized["SvnAsQuery_maxSCQ"],
                                                  processedData_SVN_dataProcessingFeaturesSCQ_normalized["SvnAsQuery_sumSCQ"],
                                                  processedData_SVNLogs_dataProcessingFeaturesSCQ_normalized["SvnLogsAsQuery_avgSCQ"],
                                                  processedData_SVNLogs_dataProcessingFeaturesSCQ_normalized["SvnLogsAsQuery_maxSCQ"],
                                                  processedData_SVNLogs_dataProcessingFeaturesSCQ_normalized["SvnLogsAsQuery_sumSCQ"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCQ_normalized["SvnUnitNamesAsQuery_avgSCQ"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCQ_normalized["SvnUnitNamesAsQuery_maxSCQ"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCQ_normalized["SvnUnitNamesAsQuery_sumSCQ"],
                                                  processedData_JIRA_dataProcessingFeaturesSCQ_normalized["JiraAsQuery_avgSCQ"],
                                                  processedData_JIRA_dataProcessingFeaturesSCQ_normalized["JiraAsQuery_maxSCQ"],
                                                  processedData_JIRA_dataProcessingFeaturesSCQ_normalized["JiraAsQuery_sumSCQ"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCQ_normalized["JiraSummariesAsQuery_avgSCQ"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCQ_normalized["JiraSummariesAsQuery_maxSCQ"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCQ_normalized["JiraSummariesAsQuery_sumSCQ"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCQ_normalized["JiraDescriptionsAsQuery_avgSCQ"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCQ_normalized["JiraDescriptionsAsQuery_maxSCQ"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCQ_normalized["JiraDescriptionsAsQuery_sumSCQ"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesSCQ_normalized["JiraCommentsAsQuery_avgSCQ"],
                                                #  processedData_JIRAComments_dataProcessingFeaturesSCQ_normalized["JiraCommentsAsQuery_maxSCQ"],
                                                #  processedData_JIRAComments_dataProcessingFeaturesSCQ_normalized["JiraCommentsAsQuery_sumSCQ"],
                                                  
                                                 # processedData_SVN_dataProcessingFeaturesPMI_normalized["SvnAsQuery_avgPMI"],
                                                 # processedData_SVN_dataProcessingFeaturesPMI_normalized["SvnAsQuery_maxPMI"],
                                                  processedData_SVNLogs_dataProcessingFeaturesPMI_normalized["SvnLogsAsQuery_avgPMI"],
                                                  processedData_SVNLogs_dataProcessingFeaturesPMI_normalized["SvnLogsAsQuery_maxPMI"],
                                                 # processedData_SVNUnitNames_dataProcessingFeaturesPMI_normalized["SvnUnitNamesAsQuery_avgPMI"],
                                                 # processedData_SVNUnitNames_dataProcessingFeaturesPMI_normalized["SvnUnitNamesAsQuery_maxPMI"],
                                                 # processedData_JIRA_dataProcessingFeaturesPMI_normalized["JiraAsQuery_avgPMI"],
                                                 # processedData_JIRA_dataProcessingFeaturesPMI_normalized["JiraAsQuery_maxPMI"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesPMI_normalized["JiraSummariesAsQuery_avgPMI"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesPMI_normalized["JiraSummariesAsQuery_maxPMI"],
                                                 # processedData_JIRADescriptions_dataProcessingFeaturesPMI_normalized["JiraDescriptionsAsQuery_avgPMI"],
                                                 # processedData_JIRADescriptions_dataProcessingFeaturesPMI_normalized["JiraDescriptionsAsQuery_maxPMI"],
                                                #  processedData_JIRAComments_dataProcessingFeaturesPMI_normalized["JiraCommentsAsQuery_avgPMI"],
                                               #   processedData_JIRAComments_dataProcessingFeaturesPMI_normalized["JiraCommentssAsQuery_maxPMI"],                                                  
                                                 ], axis=1)
#Set the NaN to 0
processedData_dataProcessingFeatures_normalized = processedData_dataProcessingFeatures_normalized.fillna(0)

#Saving feature names for later use
processedData_dataProcessingFeatureNames_normalized = list(processedData_dataProcessingFeatures_normalized.columns)

#Transform pandas data frame into numpy arrays
processedData_dataProcessingFeatures_normalized = np.array(processedData_dataProcessingFeatures_normalized)

#Load labels
processedData_dataProcessingLabels_normalized = pd.read_pickle(r'../data/03_processed/processedData_dataProcessingLabels.pkl')
processedData_dataProcessingLabels_normalized = np.array(processedData_dataProcessingLabels_normalized["is_valid"])


In [108]:

#Merge features into 1 dataframe
processedData_dataProcessingFeatures = pd.concat([processedData_dataProcessingFeaturesTime,
                                                  processedData_dataProcessingFeaturesStakeholder,
                                                  #IR-based
                                                  processedData_dataProcessing_features_VsmLogsJiraAsQuery,
                                                  processedData_dataProcessing_features_VsmLogsLogAsQuery,
                                                  processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery,
                                                  processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery,
                                                 # processedData_dataProcessing_features_VsmUnitNamesCommentsCommentsAsQuery,
                                                 # processedData_dataProcessing_features_VsmUnitNamesCommentsUnitNamesAsQuery,
                                                  processedData_dataProcessing_features_VsmUnitNamesDescriptionDescriptionAsQuery,
                                                  processedData_dataProcessing_features_VsmUnitNamesDescriptionUnitNamesAsQuery,
                                                  processedData_dataProcessing_features_VsmSummaryLogsSummaryAsQuery,
                                                  processedData_dataProcessing_features_VsmSummaryLogsLogsAsQuery,
                                                  processedData_dataProcessing_features_VsmSummaryUnitNamesSummaryAsQuery,
                                                  processedData_dataProcessing_features_VsmSummaryUnitNamesUnitNamesAsQuery,
                                                  processedData_dataProcessing_features_VsmDescriptionDescriptionAsQuery,
                                                  processedData_dataProcessing_features_VsmDescriptionLogsAsQuery,
                                                 # processedData_dataProcessing_features_VsmLogsJiraAsQuery_2gram,
                                                 # processedData_dataProcessing_features_VsmLogsLogAsQuery_2gram,
                                                 # processedData_dataProcessing_features_VsmUnitNamesJiraAsQuery_2gram,
                                                 # processedData_dataProcessing_features_VsmUnitNamesUnitNamesAsQuery_2gram,
                                                 # processedData_dataProcessing_features_VsmVerbPruningUnitNamesJiraAsQuery,
                                                 # processedData_dataProcessing_features_VsmVerbPruningUnitNamesUnitNamesAsQuery,
                                                  processedData_dataProcessing_features_VsmSvnJiraJiraAsQuery,
                                                  processedData_dataProcessing_features_VsmSvnJiraSvnAsQuery,
                                                  processedData_dataProcessing_features_VsmSvnSummarySvnAsQuery,
                                                  processedData_dataProcessing_features_VsmSvnSummarySummaryAsQuery,
                                                  processedData_dataProcessing_features_VsmSvnDescriptionSvnAsQuery,
                                                  processedData_dataProcessing_features_VsmSvnDescriptionDescriptionAsQuery,
                                                #  processedData_dataProcessing_features_VsmSvnCommentsSvnAsQuery,
                                                #  processedData_dataProcessing_features_VsmSvnCommentsCommentsAsQuery,

                                                  
                                                  #Document Statistics
                                                  processedData_JIRA_dataProcessingFeaturesUniqueWordCount,
                                                  processedData_SVN_dataProcessingFeaturesUniqueWordCount,
                                                  processedData_JIRA_dataProcessingFeaturesTotalWordCount,
                                                  processedData_SVN_dataProcessingFeaturesTotalWordCount,
                                                  processedData_JIRA_dataProcessingFeaturesOverlapPercentage,
                                                  processedData_SVN_dataProcessingFeaturesOverlapPercentage,
                                                  processedData_UNION_dataProcessingFeaturesOverlapPercentage,
                                                 #Query Quality
                                                  processedData_SVN_dataProcessingFeaturesIDF['SvnAsQuery_avgIDF'],
                                                  processedData_SVN_dataProcessingFeaturesIDF['SvnAsQuery_maxIDF'],
                                                  processedData_SVN_dataProcessingFeaturesIDF['SvnAsQuery_devIDF'],
                                                  processedData_SVNLogs_dataProcessingFeaturesIDF['SvnLogsAsQuery_avgIDF'],
                                                  processedData_SVNLogs_dataProcessingFeaturesIDF['SvnLogsAsQuery_maxIDF'],
                                                  processedData_SVNLogs_dataProcessingFeaturesIDF['SvnLogsAsQuery_devIDF'],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesIDF['SvnUnitNamesAsQuery_avgIDF'],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesIDF['SvnUnitNamesAsQuery_maxIDF'],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesIDF['SvnUnitNamesAsQuery_devIDF'],
                                                  processedData_JIRA_dataProcessingFeaturesIDF['JiraAsQuery_avgIDF'],
                                                  processedData_JIRA_dataProcessingFeaturesIDF['JiraAsQuery_maxIDF'],
                                                  processedData_JIRA_dataProcessingFeaturesIDF['JiraAsQuery_devIDF'], 
                                                  processedData_JIRASummaries_dataProcessingFeaturesIDF['JiraSummariesAsQuery_avgIDF'],
                                                  processedData_JIRASummaries_dataProcessingFeaturesIDF['JiraSummariesAsQuery_maxIDF'],
                                                  processedData_JIRASummaries_dataProcessingFeaturesIDF['JiraSummariesAsQuery_devIDF'], 
                                                  processedData_JIRADescriptions_dataProcessingFeaturesIDF['JiraDescriptionsAsQuery_avgIDF'],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesIDF['JiraDescriptionsAsQuery_maxIDF'],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesIDF['JiraDescriptionsAsQuery_devIDF'], 
                                                #  processedData_JIRAComments_dataProcessingFeaturesIDF['JiraCommentsAsQuery_avgIDF'],
                                               #   processedData_JIRAComments_dataProcessingFeaturesIDF['JiraCommentsAsQuery_maxIDF'],
                                               #   processedData_JIRAComments_dataProcessingFeaturesIDF['JiraCommentsAsQuery_devIDF'], 
                                                  
                                                  processedData_SVN_dataProcessingFeaturesICTF["SvnAsQuery_avgICTF"],
                                                  processedData_SVN_dataProcessingFeaturesICTF["SvnAsQuery_maxICTF"],
                                                  processedData_SVN_dataProcessingFeaturesICTF["SvnAsQuery_devICTF"],
                                                  processedData_SVNLogs_dataProcessingFeaturesICTF["SvnLogsAsQuery_avgICTF"],
                                                  processedData_SVNLogs_dataProcessingFeaturesICTF["SvnLogsAsQuery_maxICTF"],
                                                  processedData_SVNLogs_dataProcessingFeaturesICTF["SvnLogsAsQuery_devICTF"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesICTF["SvnUnitNamesAsQuery_avgICTF"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesICTF["SvnUnitNamesAsQuery_maxICTF"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesICTF["SvnUnitNamesAsQuery_devICTF"],
                                                  processedData_JIRA_dataProcessingFeaturesICTF["JiraAsQuery_avgICTF"],
                                                  processedData_JIRA_dataProcessingFeaturesICTF["JiraAsQuery_maxICTF"],
                                                  processedData_JIRA_dataProcessingFeaturesICTF["JiraAsQuery_devICTF"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesICTF["JiraSummariesAsQuery_avgICTF"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesICTF["JiraSummariesAsQuery_maxICTF"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesICTF["JiraSummariesAsQuery_devICTF"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesICTF["JiraDescriptionsAsQuery_avgICTF"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesICTF["JiraDescriptionsAsQuery_maxICTF"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesICTF["JiraDescriptionsAsQuery_devICTF"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesICTF["JiraCommentsAsQuery_avgICTF"],
                                                #  processedData_JIRAComments_dataProcessingFeaturesICTF["JiraCommentsAsQuery_maxICTF"],
                                               #   processedData_JIRAComments_dataProcessingFeaturesICTF["JiraCommentsAsQuery_devICTF"],
                                                  
                                                  processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_avgEntropy"],
                                                  processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_medEntropy"],
                                                  processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_maxEntropy"],
                                                  processedData_SVN_dataProcessingFeaturesEntropy["SvnAsQuery_devEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_avgEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_medEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_maxEntropy"],
                                                  processedData_SVNLogs_dataProcessingFeaturesEntropy["SvnLogsAsQuery_devEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_avgEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_medEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_maxEntropy"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesEntropy["SvnUnitNamesAsQuery_devEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_avgEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_medEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_maxEntropy"],
                                                  processedData_JIRA_dataProcessingFeaturesEntropy["JiraAsQuery_devEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_avgEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_medEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_maxEntropy"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesEntropy["JiraSummariesAsQuery_devEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_avgEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_medEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_maxEntropy"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesEntropy["JiraDescriptionsAsQuery_devEntropy"],
                                                #  processedData_JIRAComments_dataProcessingFeaturesEntropy["JiraCommentsAsQuery_avgEntropy"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesEntropy["JiraCommentsAsQuery_medEntropy"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesEntropy["JiraCommentsAsQuery_maxEntropy"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesEntropy["JiraCommentsAsQuery_devEntropy"],
                                                  
                                                  processedData_SVN_dataProcessingFeaturesQueryScope,
                                                  processedData_SVNLogs_dataProcessingFeaturesQueryScope,
                                                  processedData_SVNUnitNames_dataProcessingFeaturesQueryScope,
                                                  processedData_JIRA_dataProcessingFeaturesQueryScope,
                                                  processedData_JIRASummaries_dataProcessingFeaturesQueryScope,
                                                  processedData_JIRADescriptions_dataProcessingFeaturesQueryScope,
                                                #  processedData_JIRAComments_dataProcessingFeaturesQueryScope,
                                                  
                                                  processedData_SVN_dataProcessingFeaturesSCS,
                                                  processedData_SVNLogs_dataProcessingFeaturesSCS,
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCS,
                                                  processedData_JIRA_dataProcessingFeaturesSCS,
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCS,
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCS,
                                                 # processedData_JIRAComments_dataProcessingFeaturesSCS,
                                                  
                                                  processedData_SVN_dataProcessingFeaturesSCQ["SvnAsQuery_avgSCQ"],
                                                  processedData_SVN_dataProcessingFeaturesSCQ["SvnAsQuery_maxSCQ"],
                                                  processedData_SVN_dataProcessingFeaturesSCQ["SvnAsQuery_sumSCQ"],
                                                  processedData_SVNLogs_dataProcessingFeaturesSCQ["SvnLogsAsQuery_avgSCQ"],
                                                  processedData_SVNLogs_dataProcessingFeaturesSCQ["SvnLogsAsQuery_maxSCQ"],
                                                  processedData_SVNLogs_dataProcessingFeaturesSCQ["SvnLogsAsQuery_sumSCQ"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCQ["SvnUnitNamesAsQuery_avgSCQ"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCQ["SvnUnitNamesAsQuery_maxSCQ"],
                                                  processedData_SVNUnitNames_dataProcessingFeaturesSCQ["SvnUnitNamesAsQuery_sumSCQ"],
                                                  processedData_JIRA_dataProcessingFeaturesSCQ["JiraAsQuery_avgSCQ"],
                                                  processedData_JIRA_dataProcessingFeaturesSCQ["JiraAsQuery_maxSCQ"],
                                                  processedData_JIRA_dataProcessingFeaturesSCQ["JiraAsQuery_sumSCQ"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCQ["JiraSummariesAsQuery_avgSCQ"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCQ["JiraSummariesAsQuery_maxSCQ"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesSCQ["JiraSummariesAsQuery_sumSCQ"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCQ["JiraDescriptionsAsQuery_avgSCQ"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCQ["JiraDescriptionsAsQuery_maxSCQ"],
                                                  processedData_JIRADescriptions_dataProcessingFeaturesSCQ["JiraDescriptionsAsQuery_sumSCQ"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesSCQ["JiraCommentsAsQuery_avgSCQ"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesSCQ["JiraCommentsAsQuery_maxSCQ"],
                                                 # processedData_JIRAComments_dataProcessingFeaturesSCQ["JiraCommentsAsQuery_sumSCQ"],
                                                  
                                                  #processedData_SVN_dataProcessingFeaturesPMI["SvnAsQuery_avgPMI"],
                                                  #processedData_SVN_dataProcessingFeaturesPMI["SvnAsQuery_maxPMI"],
                                                  processedData_SVNLogs_dataProcessingFeaturesPMI["SvnLogsAsQuery_avgPMI"],
                                                  processedData_SVNLogs_dataProcessingFeaturesPMI["SvnLogsAsQuery_maxPMI"],
                                                 # processedData_SVNUnitNames_dataProcessingFeaturesPMI["SvnUnitNamesAsQuery_avgPMI"],
                                                 # processedData_SVNUnitNames_dataProcessingFeaturesPMI["SvnUnitNamesAsQuery_maxPMI"],
                                                 # processedData_JIRA_dataProcessingFeaturesPMI["JiraAsQuery_avgPMI"],
                                                 # processedData_JIRA_dataProcessingFeaturesPMI["JiraAsQuery_maxPMI"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesPMI["JiraSummariesAsQuery_avgPMI"],
                                                  processedData_JIRASummaries_dataProcessingFeaturesPMI["JiraSummariesAsQuery_maxPMI"],
                                                #  processedData_JIRADescriptions_dataProcessingFeaturesPMI["JiraDescriptionsAsQuery_avgPMI"],
                                                #  processedData_JIRADescriptions_dataProcessingFeaturesPMI["JiraDescriptionsAsQuery_maxPMI"],
                                                  #processedData_JIRAComments_dataProcessingFeaturesPMI["JiraCommentsAsQuery_avgPMI"],
                                                  #processedData_JIRAComments_dataProcessingFeaturesPMI["JiraCommentssAsQuery_maxPMI"],
                                                 ], axis=1)
#Set the NaN to 0
processedData_dataProcessingFeatures = processedData_dataProcessingFeatures.fillna(0)

#Saving feature names for later use
processedData_dataProcessingFeatureNames = list(processedData_dataProcessingFeatures.columns)

#Transform pandas data frame into numpy arrays
processedData_dataProcessingFeatures = np.array(processedData_dataProcessingFeatures)

#Load labels
processedData_dataProcessingLabels = pd.read_pickle(r'../data/03_processed/processedData_dataProcessingLabels.pkl')
processedData_dataProcessingLabels = np.array(processedData_dataProcessingLabels["is_valid"])


# 4. Modeling - Normalization
First select which data set to train:


In [109]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve



def showModelPerformance(trainedModel, testFeatures, testLabels):
    # Use the forest's predict method on the test data
    predictionLabels = trainedModel.predict(testFeatures)
    
    accuracyValue = accuracy_score(testLabels.astype(bool), predictionLabels)
    precisionValue = precision_score(testLabels.astype(bool), predictionLabels, average='binary')
    f1Value = f1_score(testLabels.astype(bool), predictionLabels)
    f2Value = fbeta_score(testLabels.astype(bool), predictionLabels, beta=2.0)
    f05Value = fbeta_score(testLabels.astype(bool), predictionLabels, beta=0.5)
    recallValue = recall_score(testLabels.astype(bool), predictionLabels)
    averagePrecisionValue = average_precision_score(testLabels.astype(bool), predictionLabels)
          
    performanceData = {'Accuracy':  [accuracyValue],
                       'Precision': [precisionValue],
                       'Recall': [recallValue],
                       'F1': [f1Value],
                       'F2': [f2Value],
                       'F0.5': [f05Value],
                       'Average Precision': [averagePrecisionValue]
                      }
    performanceDf = pd.DataFrame(performanceData)
    return(performanceDf)

In [110]:
features_normalized = processedData_dataProcessingFeatures_normalized
labels_normalized = processedData_dataProcessingLabels_normalized

## 4.1 Rebalancing Strategy - None

### 4.1.1 Random Forests

In [111]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

none_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    none_randomforest_normalized_performance_df = pd.concat([none_randomforest_normalized_performance_df, new_performance_df])
    
none_randomforest_normalized_performance_df.to_csv("../data/05_model_output/none_randomforest_normalized_performance_df.csv")



### 4.1.2 XGBoost

In [112]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


none_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    none_xgboost_normalized_performance_df = pd.concat([none_xgboost_normalized_performance_df, new_performance_df])
    

none_xgboost_normalized_performance_df.to_csv("../data/05_model_output/none_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 3855.377 seconds
Cross-validation score: 0.8093347203584663
Test score: 0.6912442396313364
Best Hyperparameters: {}
0.0072720307
0.021310592
0.042551234
0.041790124
0.0071374574
0.020196013
0.040487297
0.009820546
0.009628219
0.011895632
0.015249805
0.016339842
0.006326181
0.0
0.012182788
0.0
0.021711517
0.009050754
0.010176172
0.003454906
0.012495866
0.0050516785
0.0032558648
0.004819459
0.0099877035
0.0019276367
0.0071886512
0.008187002
0.004741479
0.012125912
0.0068566403
0.0112151345
0.0047219396
0.004134175
0.005348617
0.0
0.0
0.0
0.0057823127
0.0
0.008733912
0.001005452
0.06949598
0.003701428
0.0076242057
0.0
0.004628717
0.0
0.0
0.0
0.003037463
0.002444094
0.007412124
0.004630415
0.0019190412
0.0054477425
0.019590078
0.0031403196
0.010805098
0.021323213
0.0006576915
0.0062329173
0.009194413
0.003305996
0.0036867426
0.0015126237
0.0069820113
0.0034024795
0.0071812216
0.0058243675
0.007825753
0.0152278915
0.0051702056
0.0077145654
0.00245673
0.0
0.



Elapsed time to compute best fit: 47.663 seconds
Cross-validation score: 0.7693349987540813
Test score: 0.8222222222222223
Best Hyperparameters: {}
0.01106955
0.027757319
0.037940055
0.04873534
0.0049756966
0.015374304
0.012101854
0.009139158
0.012552656
0.009599595
0.014585914
0.02406329
0.006178337
0.0
0.008648909
0.0
0.032592252
0.013725939
0.0090093
0.008676277
0.013127737
0.0058285953
0.0045383107
0.0092392955
0.0052630343
0.0016542511
0.00903647
0.010684227
0.006041401
0.015838021
0.00667638
0.009026396
0.003582986
0.0033241166
0.004376499
0.0
0.0
0.0
0.0072723012
0.0010496869
0.0058284164
0.0054376777
0.0
0.007791955
0.006062348
0.0
0.0060000145
0.0
0.0
0.0
0.004057158
0.0031926713
0.0057668155
0.007310436
0.0043208753
0.004576024
0.0042422214
0.0086544985
0.012705629
0.020394994
0.01100128
0.0078090737
0.003146409
0.0016561693
0.009271374
0.0065961774
0.0083372295
0.0072680837
0.004530058
0.0036843547
0.0077722296
0.016847676
0.0057897735
0.0065081147
0.0029520027
0.0
0.0014746



Elapsed time to compute best fit: 61.498 seconds
Cross-validation score: 0.7915729806767688
Test score: 0.8076923076923077
Best Hyperparameters: {}
0.008777367
0.026658379
0.039915107
0.034173284
0.008579033
0.013329699
0.009336428
0.010615255
0.016008036
0.010790913
0.019093784
0.023461793
0.008049302
0.0
0.007552781
0.0
0.040489674
0.025841888
0.008942084
0.006999017
0.012628826
0.01107321
0.00866805
0.00265404
0.004636035
0.0013708824
0.009226533
0.009616894
0.0044344296
0.005987857
0.00090078136
0.0068989308
0.0037885166
0.0044540884
0.004356441
0.0
0.0
0.0
0.00859013
0.0
0.0110078035
0.0047950014
0.03430454
0.0057167383
0.0058858795
0.0
0.008088092
0.0
0.0
0.0
0.0042737997
0.002309125
0.0050701983
0.0044947476
0.0064771045
0.0034833925
0.012448268
0.014497395
0.008884413
0.02182633
0.017934568
0.0075403536
0.005115181
0.03543003
0.008070343
0.0018999563
0.008599408
0.0076130787
0.006523615
0.0064100525
0.005185987
0.013716677
0.0065577957
0.014082145
0.0032712067
0.0
0.0051178583




Elapsed time to compute best fit: 69.065 seconds
Cross-validation score: 0.7948657756808954
Test score: 0.7860262008733626
Best Hyperparameters: {}
0.008268814
0.017356342
0.04814626
0.042234715
0.011997478
0.013494013
0.009967134
0.0049449126
0.015616371
0.011163279
0.015319905
0.022351382
0.0061995042
0.0
0.009916468
0.0
0.03048722
0.021613296
0.016842732
0.00860204
0.011218986
0.007509896
0.0019081603
0.0034568035
0.002401422
0.00088325836
0.0097572375
0.0049424465
0.00434491
0.013298701
0.004727901
0.0037671367
0.007911184
0.0019726795
0.0058434037
0.0
0.0
0.0
0.006111858
0.0
0.009037683
0.016638765
0.02321306
0.0054112067
0.013042808
0.0
0.0070226197
0.0
0.0
0.0
0.0034730127
0.0038960988
0.009865763
0.0073520476
0.0035480764
0.0018391093
0.009374121
0.0017275255
0.0065176683
0.015195966
0.027905587
0.014728867
0.014672984
0.0052374466
0.008212981
0.008029019
0.0036558092
0.0006790941
0.01011233
0.005735784
0.007635197
0.025898084
0.0071072364
0.0049031735
0.0054054568
0.0014092788



Elapsed time to compute best fit: 62.325 seconds
Cross-validation score: 0.7695958962564495
Test score: 0.8333333333333334
Best Hyperparameters: {}
0.0062973658
0.024636654
0.037785813
0.04123964
0.004883937
0.00932267
0.015648322
0.0055592745
0.013369267
0.01069951
0.013814198
0.023663064
0.005137663
0.0
0.010335272
0.0
0.019721903
0.027508002
0.0056538335
0.008454163
0.021246921
0.0053080735
0.011504576
0.0033116806
0.016608052
0.0015248482
0.008343672
0.007273103
0.0036291112
0.009285806
0.0072829495
0.0059099523
0.0028044942
0.00613415
0.0037114157
0.0
0.0
0.0
0.0053288112
0.0
0.008694571
0.009293525
0.04474182
0.002589704
0.013406323
0.0
0.004185069
0.0
0.0
0.0
0.0027277444
0.004617458
0.006547253
0.005658588
0.001302171
0.0035199833
0.003176492
0.019614372
0.014605799
0.014050373
0.01150064
0.00816687
0.0012000096
0.012617663
0.009039882
0.0044658664
0.007967335
0.014554722
0.0038719587
0.004653864
0.0070957737
0.022709481
0.005474918
0.0060454854
0.005934338
0.0
0.008414091
0.00



Elapsed time to compute best fit: 56.268 seconds
Cross-validation score: 0.7604130623788384
Test score: 0.7981220657276995
Best Hyperparameters: {}
0.011140352
0.023530059
0.040480975
0.03740592
0.009774723
0.016422795
0.011158168
0.00548748
0.017508388
0.00985937
0.018229414
0.01608066
0.009459737
0.0
0.010900135
0.0
0.022123823
0.033386003
0.008582776
0.007160563
0.013188277
0.005292561
0.008709848
0.0034581004
0.005744124
0.0023707235
0.008242364
0.009609753
0.004798412
0.00834643
0.0034788905
0.0059040636
0.006402474
0.005910664
0.0033164
0.0
0.0
0.0
0.005040065
0.0
0.012657541
0.0058689597
0.033711914
0.0066741826
0.0047565624
0.0
0.0043761623
0.0
0.0
0.0
0.0029576211
0.0029238868
0.007828832
0.009232194
0.005262221
0.0007657983
0.005267405
0.0074461214
0.010947408
0.015295054
0.011251461
0.008486698
0.0034126234
0.004988788
0.0071375235
0.0026765687
0.0062131654
0.0047116703
0.003001685
0.006283363
0.0082689095
0.0073657236
0.005677341
0.0093064215
0.020520322
0.0
0.0013247529
0.



Elapsed time to compute best fit: 52.875 seconds
Cross-validation score: 0.783497937924798
Test score: 0.8414634146341465
Best Hyperparameters: {}
0.009040501
0.018886887
0.03811755
0.03982933
0.004148468
0.014178224
0.008092469
0.008381379
0.016110377
0.0075921444
0.017358843
0.018114384
0.0077339783
0.0
0.014633755
0.0
0.022138251
0.029781004
0.008266265
0.0068442724
0.011815821
0.011124205
0.0012054049
0.007143619
0.0018960952
0.0011580216
0.005843674
0.0059082983
0.0047477824
0.012322464
0.0059421463
0.0025600244
0.0022636591
0.0033693465
0.0027930832
0.0
0.0
0.0
0.0057996036
0.0
0.009633488
0.00442187
0.057856314
0.009734032
0.0031676206
0.0
0.0084217
0.0
0.0
0.0
0.0037567671
0.0028207873
0.0054997094
0.013282697
0.013336824
0.0068688123
0.008752356
0.02059164
0.006277954
0.014406184
0.0086949635
0.013133247
0.0130436625
0.003402007
0.0075581474
0.007065174
0.0087822005
0.0
0.010427884
0.0078519555
0.00762084
0.012412069
0.0036291548
0.008582496
0.009343979
0.00089770835
0.0048381



Elapsed time to compute best fit: 50.989 seconds
Cross-validation score: 0.7721942189189919
Test score: 0.8292682926829268
Best Hyperparameters: {}
0.010328123
0.024368089
0.041044295
0.041168578
0.006636619
0.009610578
0.01231133
0.007963934
0.009661402
0.005642456
0.019575316
0.024685023
0.004793508
0.0
0.0111403875
0.0
0.026258929
0.01834335
0.0146007575
0.005774555
0.015552926
0.0067225965
0.011709586
0.0044320137
0.005235901
0.0052454798
0.0054467595
0.0070332116
0.0055880933
0.0116013335
0.0038194095
0.006671351
0.0037840386
0.002273777
0.005017379
0.0
0.0
0.0
0.013322881
0.0
0.015734648
0.002638747
0.08874446
0.008253174
0.009718988
0.0
0.003939069
0.0
0.0
0.0
0.004041539
0.002890071
0.004224303
0.004820158
0.007251517
0.006612545
0.001342629
0.0039735613
0.011635357
0.023272175
0.0029724285
0.012868728
0.016974555
0.003130285
0.007097074
0.0067021903
0.009237292
0.002871847
0.005491932
0.006647467
0.008378511
0.0062723095
0.00576546
0.006993256
0.003055109
0.0047808466
0.008823



Elapsed time to compute best fit: 57.038 seconds
Cross-validation score: 0.7692511381372505
Test score: 0.7775119617224882
Best Hyperparameters: {}
0.010117233
0.028320367
0.043458156
0.043099232
0.0099946
0.015646001
0.00826731
0.007970349
0.0160201
0.01555053
0.024639029
0.01707815
0.005043994
0.0
0.00962614
0.0
0.018301018
0.027903937
0.016809598
0.0060231257
0.009583237
0.010752375
0.0039711087
0.0058636894
0.007207447
0.0022294084
0.0033211892
0.011991709
0.006296415
0.009241761
0.0021262325
0.0055399216
0.0049169282
0.0029727838
0.0039396915
0.0
0.0
0.0
0.00854286
0.0
0.008750412
0.016544195
0.04208243
0.0066577094
0.015614144
0.00048657454
0.008787503
0.0
0.0
0.0
0.00402532
0.0026607623
0.0038390935
0.0011594757
0.0014703518
0.006296559
0.008047634
0.029397443
0.00992667
0.020215966
0.009969306
0.0042628506
0.0070630005
0.013654117
0.004468709
0.0018439686
0.0095804
0.0034177224
0.0034996606
0.006632459
0.012299591
0.008964296
0.005075293
0.0059063556
0.005565005
0.0
0.003983526



Elapsed time to compute best fit: 59.788 seconds
Cross-validation score: 0.7828636244430898
Test score: 0.7535885167464114
Best Hyperparameters: {}
0.0074877436
0.015600114
0.043884847
0.041787513
0.0069421097
0.02687854
0.006442406
0.00593432
0.009183056
0.011550814
0.02684439
0.011708984
0.0052832216
0.0
0.011550249
0.0
0.015287959
0.021792768
0.016606975
0.0082122935
0.012873781
0.0029332985
0.010601374
0.0074300305
0.0061361715
0.0022005653
0.0021586646
0.0068199104
0.004754813
0.012291529
0.006584107
0.0043528536
0.0034947824
0.003803914
0.003850272
0.0
0.0
0.0
0.0120227765
0.0
0.006844433
0.0041067987
0.04067483
0.0057131657
0.010330755
0.0
0.0026484574
0.0
0.0
0.0
0.0056950827
0.007035271
0.005931881
0.012824582
0.012050418
0.0098044975
0.014569679
0.0
0.0093042515
0.01723533
0.025787283
0.009251116
0.004972011
0.027579226
0.0064613614
0.002994875
0.006032131
0.014085718
0.002916508
0.0046828557
0.0056168595
0.009929804
0.0040173368
0.006296785
0.005386843
0.020582134
0.01697867



Elapsed time to compute best fit: 61.518 seconds
Cross-validation score: 0.7866405309037225
Test score: 0.8050847457627119
Best Hyperparameters: {}
0.009097801
0.01882496
0.04080728
0.04093147
0.0064932136
0.017714782
0.016258307
0.007855554
0.011054167
0.009709007
0.013600587
0.02022436
0.0041389875
0.0
0.010713671
0.0
0.015839253
0.021276038
0.008109864
0.006340629
0.0115983635
0.004968368
0.012596633
0.020504585
0.003703556
0.002201273
0.0028074693
0.012686278
0.004113225
0.011373039
0.008932783
0.0033102264
0.0055025904
0.00353163
0.0034862952
0.0
0.0
0.0
0.0053014136
0.0
0.008575818
0.008426946
0.04019765
0.008816085
0.008656055
0.0
0.00780847
0.0
0.0
0.0
0.0040859114
0.005098699
0.0052820225
0.0138465
0.0032435283
0.004009058
0.004585745
0.012689771
0.008866291
0.013788816
0.012568109
0.012005324
0.0049600457
0.005352866
0.0029844316
0.006432182
0.0098335305
0.012552665
0.0056270505
0.00514917
0.006706442
0.01354999
0.008235135
0.00867219
0.005770343
0.0
0.0116989715
0.006416511




Elapsed time to compute best fit: 59.889 seconds
Cross-validation score: 0.7678431055325743
Test score: 0.7329842931937173
Best Hyperparameters: {}
0.008640365
0.021159105
0.04367635
0.042216346
0.009224978
0.02796826
0.016607674
0.008867302
0.015780797
0.007812663
0.018566636
0.013153021
0.007345635
0.0
0.0077644996
0.0
0.012989495
0.025065847
0.012753375
0.010913027
0.015670272
0.007548425
0.0042768433
0.006597342
0.007402469
0.0087226
0.0030114392
0.010564488
0.008325917
0.007211032
0.012440743
0.0054911273
0.005165629
0.009053804
0.0044690147
0.0
0.0
0.0
0.011684975
0.0
0.012405197
0.005984386
0.05663279
0.0072923964
0.0037826628
0.0
0.009812053
0.0
0.0
0.0
0.0034628857
0.003773327
0.0043315883
0.0055265822
0.004446047
0.004123941
0.013045224
0.011236926
0.009813439
0.01689016
0.01005859
0.005770096
0.006324267
0.0041721244
0.0039188014
0.009245063
0.005450554
0.0015174834
0.004348764
0.0060853614
0.010169922
0.013186748
0.006386462
0.0056430064
0.0016135183
0.0
0.011889012
0.00463



Elapsed time to compute best fit: 66.036 seconds
Cross-validation score: 0.7934270316207854
Test score: 0.8014354066985646
Best Hyperparameters: {}
0.010217933
0.024279589
0.03727286
0.049029827
0.0053580753
0.019703275
0.013697799
0.008740754
0.012074061
0.008616764
0.015638795
0.017570976
0.009649378
0.0
0.0056372383
0.0
0.022586891
0.026953233
0.01057488
0.010196088
0.010269068
0.0033110606
0.0050438996
0.0049970835
0.0017450309
0.0013921799
0.008069269
0.0064731436
0.0062756236
0.0077680754
0.0010601165
0.005217812
0.0068417233
0.007991321
0.006425706
0.0
0.0
0.0
0.0049934858
0.0
0.010219653
0.00807941
0.0
0.004040796
0.0065599442
0.005815424
0.007393159
0.0
0.0
0.0
0.0024109709
0.0037571283
0.005336175
0.0030718343
0.009484339
0.0031848738
0.008786053
0.013430024
0.0072848243
0.016368158
0.021753129
0.009494839
0.006157014
0.0011797533
0.0129491305
0.010325558
0.005988577
0.011333762
0.0034380131
0.0040509766
0.006878934
0.015106085
0.006322958
0.007952531
0.0034000827
0.013775961



Elapsed time to compute best fit: 62.040 seconds
Cross-validation score: 0.7662333066508599
Test score: 0.8039647577092509
Best Hyperparameters: {}
0.010643337
0.023115015
0.04133927
0.03286206
0.013167368
0.011915283
0.007861121
0.008836924
0.017795054
0.009005965
0.013955383
0.016683355
0.005565697
0.0
0.012278064
0.0
0.025495416
0.021512972
0.01656915
0.005570445
0.00950021
0.01155037
0.004338223
0.007559195
0.0038546447
0.0014243858
0.009267242
0.004656523
0.004473581
0.009050101
0.0026362794
0.0044402443
0.004794437
0.0019269015
0.0028671145
0.0
0.0
0.0
0.004655781
0.026606722
0.014368434
0.018298144
0.05284929
0.011474867
0.0065909685
0.0
0.003926104
0.0
0.0
0.0
0.0028079774
0.0067893276
0.006029429
0.0042059696
0.009031699
0.008075512
0.003874037
0.009513943
0.007278259
0.026131252
0.010738687
0.009535836
0.009841898
0.010081216
0.004841115
0.009752571
0.010020747
0.0030078324
0.0046013305
0.0060327253
0.0054094233
0.021693615
0.005740216
0.006323834
0.013142652
0.0
0.0023410511



Elapsed time to compute best fit: 56.142 seconds
Cross-validation score: 0.7661365564812613
Test score: 0.8098591549295775
Best Hyperparameters: {}
0.008596899
0.031160768
0.038418803
0.039482187
0.014235964
0.011918311
0.010883081
0.0065019336
0.015843716
0.012008346
0.012595889
0.026365686
0.010160897
0.0
0.008038389
0.0
0.029525291
0.021561263
0.00907973
0.0024076193
0.0140581755
0.009578231
0.007825629
0.0059521883
0.0043884176
0.0010740584
0.005903713
0.009151918
0.0026879902
0.010535454
0.0062505202
0.0047597867
0.00360369
0.0035140694
0.0055546057
0.0
0.0
0.0
0.009302109
0.0
0.008507315
0.008663501
0.04044108
0.0075835134
0.012109196
0.0
0.007823131
0.0
0.0
0.0
0.0035440098
0.002772971
0.00896204
0.0053077806
0.0
0.0033982247
0.010463209
0.008090769
0.0057585244
0.016873388
0.026828634
0.008974121
0.0051369113
0.0034062548
0.011999262
0.013850852
0.0063599464
0.016590279
0.004357434
0.0039133187
0.0067170537
0.0011258629
0.0075313915
0.009363307
0.0073014502
0.00048002825
0.0039



Elapsed time to compute best fit: 57.345 seconds
Cross-validation score: 0.7889570646796772
Test score: 0.778894472361809
Best Hyperparameters: {}
0.007530308
0.021421576
0.041847505
0.04511774
0.012060384
0.01450843
0.0076066707
0.01034419
0.00971136
0.012644576
0.018118015
0.020920414
0.01117788
0.0
0.010006965
0.0
0.018049583
0.020798596
0.011135194
0.0027343945
0.007340081
0.008697755
0.005233988
0.0029568977
0.006724743
0.0029938663
0.0044129877
0.0059013274
0.0047477186
0.014951488
0.009252648
0.006336814
0.004177569
0.0020873107
0.0033218188
0.0
0.0
0.0
0.005102855
0.0
0.011395153
0.010532475
0.041838147
0.008104067
0.008237011
0.0
0.011882672
0.0
0.0
0.0
0.003573442
0.0045733196
0.0051451586
0.0121977255
0.0043079965
0.006726631
0.0035899698
0.009501794
0.017906645
0.018943692
0.0057343454
0.010107069
0.005511656
0.003943043
0.0048167976
0.006036724
0.00839179
0.00911244
0.011593119
0.0076132487
0.0065390505
0.011154283
0.0045600436
0.0142461695
0.00424275
0.004358325
0.0039913



Elapsed time to compute best fit: 56.333 seconds
Cross-validation score: 0.7958800077732814
Test score: 0.8056872037914693
Best Hyperparameters: {}
0.0112417955
0.025977053
0.04421836
0.042213786
0.007230941
0.014234866
0.015674163
0.0134812165
0.010320677
0.008974717
0.023205232
0.013360264
0.0048218663
0.0
0.011395453
0.0
0.022320762
0.01770784
0.015573109
0.0062833927
0.0136415325
0.005090765
0.003426177
0.006453313
0.005301153
0.0015194126
0.0067537185
0.0033328868
0.005441469
0.006096541
0.008491881
0.010721018
0.0071942643
0.002235228
0.0056493436
0.0
0.0
0.0
0.00947316
0.0
0.005606669
0.0060360734
0.034894943
0.006851567
0.010704549
0.0
0.005763649
0.0
0.0
0.0
0.0038717343
0.0074118134
0.004330423
0.004455151
0.0
0.009800137
0.0077290554
0.011185516
0.012686451
0.014693233
0.016596898
0.014151853
0.003924805
0.005917165
0.008449991
0.0064461054
0.009291118
0.002037795
0.0047341543
0.0039244397
0.00598746
0.010425681
0.0062694
0.0037538987
0.0046120575
0.0
0.0067295833
0.00318953



Elapsed time to compute best fit: 59.012 seconds
Cross-validation score: 0.8020061950469437
Test score: 0.746606334841629
Best Hyperparameters: {}
0.008821782
0.017592007
0.04594738
0.035693638
0.009967553
0.013948845
0.008367034
0.008691069
0.010868806
0.008283003
0.013685507
0.017702924
0.0063053383
0.0
0.010293106
0.0
0.02924331
0.012019385
0.020532768
0.003950585
0.01873862
0.007655516
0.005172141
0.007153474
0.005959771
0.010427473
0.0025573047
0.008564491
0.005603231
0.0069432757
0.0005926909
0.010730144
0.0028667247
0.005393296
0.0044539194
0.0
0.0
0.0
0.004535894
0.0
0.01571075
0.0060019204
0.058102794
0.009887965
0.011250692
0.0
0.008170963
0.0
0.0
0.0
0.0021058347
0.00875365
0.0041776136
0.0076129227
0.009293141
0.0057141394
0.004439945
0.013710605
0.012265761
0.021889286
0.010427487
0.009559896
0.005348625
0.01886979
0.009468897
0.005416063
0.011300894
0.00084108417
0.0038499834
0.008604911
0.006495955
0.026899144
0.004607072
0.011906281
0.004259172
0.0
0.009390768
0.0104574



Elapsed time to compute best fit: 53.607 seconds
Cross-validation score: 0.7583762440201958
Test score: 0.824607329842932
Best Hyperparameters: {}
0.007032947
0.02001645
0.044614438
0.04257268
0.007206734
0.01104397
0.004197523
0.0072752014
0.017767265
0.009862458
0.019002503
0.018363971
0.010921461
0.0
0.009470319
0.0
0.026321493
0.030048236
0.010828892
0.008162059
0.013461606
0.0051763295
0.009573462
0.0
0.0115103815
0.0014518298
0.004175457
0.009803783
0.0057558673
0.008448968
0.0063895197
0.0063286833
0.006225401
0.010580153
0.003869691
0.0
0.0
0.0
0.009075676
0.0
0.01210025
0.009987166
0.048778325
0.0096117165
0.0072976756
0.0
0.0075153406
0.0
0.0
0.0
0.0033160346
0.0037145126
0.004134174
0.015827844
0.0
0.006560524
0.0035592888
0.019186437
0.009746712
0.013494471
0.0076744994
0.01031378
0.0070781885
0.0009991385
0.010782619
0.006057413
0.010281243
0.0042491993
0.0047834218
0.0046460815
0.009129432
0.0090438165
0.0067802942
0.009073345
0.0060886964
0.0050642416
0.0035680674
0.0065



Elapsed time to compute best fit: 57.611 seconds
Cross-validation score: 0.7681383688156428
Test score: 0.7746478873239437
Best Hyperparameters: {}
0.007573598
0.01612204
0.042465948
0.03980355
0.004302463
0.015761698
0.014471467
0.012990926
0.0079920385
0.0065250513
0.013240876
0.026820062
0.004257746
0.0
0.009298226
0.0
0.022923395
0.025506878
0.009675002
0.004652886
0.009384764
0.0034194053
0.008030585
0.051801037
0.0054488378
0.003084732
0.0047669876
0.007639439
0.004863104
0.010773371
0.002677956
0.0056846915
0.004740609
0.0025897445
0.0044568507
0.0
0.0
0.0
0.014894874
0.04027537
0.007318161
0.011336859
0.033611324
0.0053878212
0.0078506265
0.0
0.006520606
0.0
0.0
0.0
0.001849908
0.005391642
0.010314464
0.003983335
0.005072757
0.0045212884
0.0017039577
0.006561507
0.008055092
0.017257651
0.010025952
0.006435952
0.004998477
0.0020417727
0.00458128
0.008293412
0.00896813
0.011480167
0.00681197
0.007318599
0.0059030773
0.010861564
0.005144114
0.0057992428
0.006632355
0.0
0.008928242



Elapsed time to compute best fit: 57.803 seconds
Cross-validation score: 0.7943651232365309
Test score: 0.8091787439613527
Best Hyperparameters: {}
0.01087888
0.021387657
0.039543238
0.04082273
0.0076298513
0.016793549
0.00445769
0.008161405
0.015376237
0.009680373
0.020669635
0.020167794
0.004901908
0.0
0.01295419
0.0
0.029555518
0.027350452
0.010516785
0.0051729744
0.010168122
0.006852328
0.007397613
0.002666512
0.011323313
0.001047845
0.0050199297
0.0063744476
0.005140796
0.009199085
0.0029296686
0.0066233594
0.0053088735
0.00055065803
0.004179055
0.0
0.0
0.0
0.0041266396
0.0
0.01544245
0.005649271
0.041326456
0.0030829767
0.004617733
0.0
0.0076197144
0.0
0.0
0.0
0.0039172266
0.0064459043
0.0061732694
0.004416546
0.0016465987
0.004361922
0.01535318
0.008619253
0.006780077
0.019384593
0.01445462
0.012212642
0.0040809833
0.020197112
0.004441536
0.0044114897
0.005330874
0.0039733853
0.006625506
0.0073665427
0.007574136
0.012132649
0.0029608929
0.015380662
0.0077429283
0.0
0.00470597
0.



Elapsed time to compute best fit: 58.114 seconds
Cross-validation score: 0.7713524660084474
Test score: 0.7971014492753622
Best Hyperparameters: {}
0.010712989
0.022229288
0.041404206
0.04306175
0.0054945243
0.0108074
0.005312722
0.014278583
0.011864149
0.010270972
0.020766377
0.01863864
0.0058446475
0.0
0.014928374
0.0
0.01683828
0.021693602
0.01449223
0.006211707
0.009600592
0.012133764
0.0031541267
0.0067070127
0.0033203922
0.0044331006
0.005921402
0.005746745
0.00818224
0.006265149
0.0031442991
0.00583695
0.005956342
0.004032883
0.004185613
0.0
0.0
0.0
0.017837754
0.0
0.00998979
0.011390425
0.0
0.00487346
0.008939097
0.0
0.007226898
0.0
0.0
0.0
0.004181721
0.0041198297
0.005154217
0.013503724
0.0005294989
0.0021450715
0.009161433
0.00862529
0.0077789994
0.016638052
0.013615757
0.013788667
0.03270744
0.008662664
0.0091475705
0.003535378
0.008906619
0.004328875
0.007063595
0.004912809
0.0046576755
0.008187598
0.0072716875
0.004300379
0.00535172
0.0
0.0064610993
0.0079997135
0.0107905



Elapsed time to compute best fit: 56.937 seconds
Cross-validation score: 0.7751676722868777
Test score: 0.8208955223880596
Best Hyperparameters: {}
0.00861013
0.0240494
0.044718936
0.04070112
0.005407694
0.018937483
0.016883444
0.010964841
0.010345057
0.007848759
0.015147562
0.01572743
0.0059948685
0.0
0.011360878
0.0
0.017536514
0.035227593
0.005692007
0.00549247
0.009914211
0.009756856
0.012316607
0.009293135
0.0030378997
0.0016499743
0.008031726
0.0090119345
0.004016762
0.008069263
0.0071204435
0.0040544756
0.004837778
0.0024691883
0.0049808696
0.0
0.0
0.0
0.0058809873
0.0
0.007242549
0.0041463557
0.03722671
0.009000299
0.004947218
0.0
0.004907612
0.0
0.0
0.0
0.0022106806
0.005732062
0.004969722
0.0077566137
0.009121432
0.0028756156
0.0092523275
0.008099208
0.008474288
0.015637266
0.016166158
0.0090910755
0.010423214
0.002529909
0.009332942
0.010230341
0.0064768065
0.0014911167
0.0018256811
0.004401
0.005855612
0.026242025
0.0041944296
0.01034525
0.0031457886
0.012516392
0.004739023



Elapsed time to compute best fit: 56.356 seconds
Cross-validation score: 0.773681969910238
Test score: 0.7850241545893719
Best Hyperparameters: {}
0.010512614
0.025555968
0.034643132
0.045586217
0.0064643323
0.018135898
0.025073927
0.0077410825
0.012521809
0.0073815123
0.008789041
0.018565364
0.006406236
0.0
0.0106963925
0.0
0.019074313
0.03012308
0.009807511
0.0071833185
0.008859904
0.0056093754
0.0050731283
0.012689624
0.014797073
0.001235311
0.010451722
0.0051372997
0.006997855
0.0076936483
0.0014181623
0.0025276109
0.0059698983
0.0043161404
0.004797971
0.0
0.0
0.0
0.015098564
0.0029884328
0.005889114
0.007882111
0.030590342
0.013696832
0.005498906
0.0
0.008179024
0.0
0.0
0.0
0.002517483
0.004591153
0.0037271075
0.008463863
0.00070902065
0.0032881158
0.0022401253
0.014973396
0.011100464
0.014929364
0.013355708
0.011873118
0.001947837
0.0045850473
0.005234389
0.0054494254
0.0103393
0.00797428
0.009964139
0.004239455
0.0052613025
0.016011966
0.0045586145
0.009546034
0.009398378
0.0014



Elapsed time to compute best fit: 58.221 seconds
Cross-validation score: 0.7609834726051107
Test score: 0.8154506437768241
Best Hyperparameters: {}
0.008819696
0.0246664
0.037739318
0.041535966
0.004965337
0.0117491055
0.005466909
0.019001393
0.017352944
0.0077672205
0.02018629
0.0209104
0.0067978823
0.0
0.011754483
0.0
0.028357722
0.021544797
0.015715858
0.010976634
0.012596728
0.0037771382
0.0126614515
0.005644427
0.008830655
0.0047636568
0.006957495
0.006057494
0.0039817737
0.008612314
0.0033585809
0.0060495674
0.0063391957
0.001201567
0.0052277013
0.0
0.0
0.0
0.01907878
0.0
0.010912555
0.009519917
0.066175334
0.009484135
0.008733344
0.0
0.0055751414
0.0
0.0
0.0
0.0049854927
0.004461651
0.003374166
0.005537759
0.0
0.009554935
0.008495286
0.007951605
0.0061042616
0.021867184
0.007392077
0.0057128244
0.007314897
0.014460364
0.0054272837
0.0040433705
0.0073693492
0.0005370241
0.007198448
0.0067893704
0.008844059
0.009292898
0.009164304
0.006803976
0.0074388436
0.0
0.0022648098
0.000854

### 4.1.3 LightGBM

In [113]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

none_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    none_lightgbm_performance_normalized_df = pd.concat([none_lightgbm_performance_normalized_df, new_performance_df])
    

none_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/none_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 10.876 seconds
Cross-validation score: 0.6130595047845278
Test score: 0.6262042389210021
Best Hyperparameters: {}
191.88784258067608
2053.0064616948366
5995.148761853576
2742.865093037486
124.0150734782219
223.44090953469276
208.3677591085434
57.57013027369976
6.975838005542755
116.62000703811646
585.0425031036139
1567.2417119294405
90.69214855134487
0.0
388.4070818424225
0.0
65.49207955598831
132.17896434664726
842.3364489674568
105.81433418393135
606.3076569288969
52.98091706633568
182.91041186451912
280.9616495668888
26.02056995034218
14.418460801243782
41.03659442067146
46.1312021613121
88.50659747421741
652.5974779129028
23.762918159365654
128.57492031157017
200.60718332231045
11.79903382062912
74.5416550040245
0.0
0.0
0.0
23.7052640914917
0.0
98.07051086425781
17.289277851581573
0.0
52.9365394115448
221.90993124246597
0.0
182.34940072894096
0.0
0.0
0.0
78.89345373213291
60.33679601550102
92.93217471241951
35.09442600607872
8.801426082849503
13.74



Elapsed time to compute best fit: 8.794 seconds
Cross-validation score: 0.6254132068932542
Test score: 0.607843137254902
Best Hyperparameters: {}
343.1979813724756
2339.639214962721
7365.555709928274
1822.3814313709736
84.8719447851181
219.7590023726225
31.50845703482628
240.20041528344154
53.19367977976799
159.02342176437378
1353.9968592971563
263.5371376425028
120.59759506583214
0.0
252.60600097477436
0.0
54.53525823354721
768.6433652043343
203.71977245807648
55.87404605746269
680.5429578274488
110.89653792977333
45.07080543041229
18.9390330016613
16.912796318531036
7.758906006813049
59.6374858468771
130.9705749452114
52.243033081293106
376.85334999859333
28.965620756149292
78.25717575848103
34.15343318879604
8.644087970256805
56.2928426861763
0.0
0.0
0.0
25.49891099333763
6.7543699741363525
187.1465027630329
69.61893451213837
0.0
88.69135275483131
135.4867094308138
0.0
136.67896151542664
0.0
0.0
0.0
64.20261372625828
54.26268953084946
170.79089763760567
11.98884505033493
26.11409845



Elapsed time to compute best fit: 9.445 seconds
Cross-validation score: 0.6192256989597503
Test score: 0.6358381502890174
Best Hyperparameters: {}
430.319569259882
1799.4049165397882
7607.4380773454905
1337.3627945035696
155.21598514914513
385.6656542122364
96.94008827209473
77.37309534847736
36.86665350198746
146.3593946993351
557.0869827717543
1551.304821997881
44.86775590479374
0.0
377.330889120698
0.0
198.25619255006313
46.79861697554588
625.7128523886204
82.24555438756943
1259.6215595006943
44.9892954826355
24.71471405029297
20.85259872674942
16.807254940271378
122.98060962557793
43.032136261463165
85.03680385649204
104.37726163864136
211.78685918450356
12.362460166215897
88.74278286099434
78.42189040780067
86.9669497013092
28.908849984407425
0.0
0.0
0.0
40.80458377301693
6.150909900665283
132.15319445729256
27.534037053585052
0.289806991815567
44.898754835128784
100.48990915715694
4.827173888683319
402.2169969379902
0.0
0.0
0.0
96.44595223665237
104.15096890926361
139.80896943807



Elapsed time to compute best fit: 8.923 seconds
Cross-validation score: 0.6134353078862647
Test score: 0.6454720616570327
Best Hyperparameters: {}
388.21036264300346
2148.568078055978
7441.188230544329
1710.31727963686
144.65521889925003
1477.3222226798534
74.06029132008553
56.92203739285469
89.62696135044098
113.92299234867096
629.2649148106575
400.4153459817171
99.96229749917984
0.0
163.3634394109249
0.0
102.44994127750397
972.8056979775429
335.36504831910133
27.975145667791367
188.88732682168484
110.9989767074585
66.09740108251572
171.25152876973152
22.334538727998734
50.174476474523544
37.81763659417629
363.3672913610935
63.116788417100906
106.18086144328117
29.697547554969788
69.6114551126957
259.3741612583399
10.514414936304092
60.83459794521332
0.0
0.0
0.0
32.336258351802826
6.636437803506851
77.75241136550903
41.377142518758774
2.8085399866104126
41.85640826821327
80.62000544369221
0.0
98.81898310780525
0.0
0.0
0.0
205.4900031387806
58.006280332803726
137.05054703354836
38.9177



Elapsed time to compute best fit: 8.833 seconds
Cross-validation score: 0.5692739187878398
Test score: 0.6800766283524904
Best Hyperparameters: {}
461.71330443024635
1335.7671325057745
8138.701161205769
1238.2120775133371
111.92823445796967
403.36978329718113
336.5069902986288
52.568564891815186
159.40977489948273
121.57748395204544
471.5698393136263
1422.8278991430998
71.60192902386189
0.0
376.10423393547535
0.0
1006.282327324152
198.03072926402092
58.79797984659672
73.49903610348701
90.68388397991657
58.14958196878433
65.52987125515938
17.769331634044647
23.771742582321167
25.911326974630356
31.022978991270065
85.70270423591137
57.43674051761627
316.9922291338444
13.266120731830597
60.92155835032463
108.88862639665604
37.69192510843277
65.70786318182945
0.0
0.0
0.0
27.182779878377914
0.0
150.3941294848919
464.1445364058018
67.71720123291016
37.410757556557655
82.46520744264126
0.0
70.93953359127045
0.0
0.0
0.0
86.66836060583591
85.87490448355675
75.2163618505001
54.10147851705551
17.



Elapsed time to compute best fit: 8.180 seconds
Cross-validation score: 0.6074365204426938
Test score: 0.5784313725490196
Best Hyperparameters: {}
355.81077517569065
3019.73494926095
6547.084606438875
1294.8738379776478
60.53490027785301
371.5814400911331
77.17389675974846
99.42728425562382
120.66966956853867
195.1508769840002
354.6603474766016
1466.101422086358
20.254092887043953
0.0
224.298932492733
0.0
114.95817166566849
1172.3973253667355
99.50767719745636
62.96489527821541
183.68833942711353
107.2290607392788
51.525392323732376
59.02251219749451
17.128064841032028
7.838290154933929
136.4354173541069
90.03208428621292
63.675598084926605
63.57794992625713
16.592790186405182
77.08637776970863
64.56835652887821
12.260128796100616
95.9430913478136
0.0
0.0
0.0
52.3672434091568
0.0
160.3590776771307
35.51113936305046
0.0
44.00631484389305
309.6287733465433
0.0
21.54290673136711
0.0
0.0
0.0
104.69916243851185
66.93477445840836
108.1096253991127
29.386404395103455
0.33891400694847107
43.14



Elapsed time to compute best fit: 9.676 seconds
Cross-validation score: 0.5806124998350815
Test score: 0.6176470588235293
Best Hyperparameters: {}
668.7349506914616
1941.340067654848
7721.143969491124
796.4132509678602
60.891486749053
278.7822916954756
44.71762990951538
92.27740997076035
90.19524666666985
82.87727549672127
402.59259171783924
1443.069958627224
133.96860593557358
0.0
288.0475986003876
0.0
51.73724764585495
953.2739354968071
620.3396798074245
56.958498656749725
185.48008926212788
31.75403681397438
321.2515267431736
16.164545238018036
12.816315084695816
4.645603775978088
32.38533502817154
112.34551627933979
76.46511533856392
145.19294574856758
20.742261677980423
61.16765196621418
100.23621235787868
26.874431535601616
105.55019226670265
0.0
0.0
0.0
40.19760498404503
7.398894935846329
192.97745244204998
23.145233795046806
20.16444993019104
63.39273747801781
422.7997899353504
0.0
85.84468086063862
0.0
0.0
0.0
127.706000238657
84.95817357301712
127.02179029583931
44.8235303759



Elapsed time to compute best fit: 9.344 seconds
Cross-validation score: 0.6072073247837257
Test score: 0.5825242718446602
Best Hyperparameters: {}
356.6805800944567
2080.6720032691956
8175.330147072673
862.2285888940096
68.41092047095299
448.3839731961489
63.97766596078873
114.59449110925198
81.35570032894611
68.6594810783863
1292.8478401452303
278.2570942044258
244.81553228199482
0.0
179.89127279818058
0.0
80.87565995752811
680.8917411118746
660.129553437233
29.22302746772766
232.18594187498093
31.03858122229576
38.706538289785385
34.52870199084282
11.230124950408936
22.34425640106201
23.01689922809601
193.64278115332127
62.1647936552763
196.3406082689762
20.979909658432007
67.35946950316429
107.29286581277847
9.125116884708405
125.55627024173737
0.0
0.0
0.0
77.18064337968826
0.0
325.59567096829414
20.57675075531006
3.5588070154190063
43.97069761157036
92.94941461086273
0.2305150032043457
18.597442984580994
0.0
0.0
0.0
115.22733730077744
110.19551312923431
297.91932329535484
7.4838040



Elapsed time to compute best fit: 9.890 seconds
Cross-validation score: 0.6431075832785786
Test score: 0.6393129770992366
Best Hyperparameters: {}
357.5837723016739
4062.9672277122736
5742.485748589039
877.3847667425871
55.007219076156616
228.739771515131
19.168934017419815
114.59845477342606
93.86246514320374
38.02131196856499
254.91218276321888
1517.6893932819366
131.42898559570312
0.0
145.80973526835442
0.0
323.71686463057995
1178.621700823307
444.2501187324524
150.18623384833336
143.57022386789322
28.350890338420868
216.93148800730705
9.924629807472229
12.094724178314209
235.75673082470894
24.549294397234917
88.12161350250244
71.57340851426125
104.82374662160873
23.02703222632408
78.11277994513512
99.35019114613533
10.791809916496277
159.76660583913326
0.0
0.0
0.0
16.772519916296005
7.294400215148926
125.74107855558395
46.33552986383438
9.103729844093323
32.46366523206234
65.31394176185131
0.4362879991531372
407.61522203683853
0.0
0.0
0.0
90.21565794944763
67.6302497535944
111.4645



Elapsed time to compute best fit: 9.225 seconds
Cross-validation score: 0.6014210781055906
Test score: 0.6310679611650486
Best Hyperparameters: {}
434.1876778304577
1427.5487801134586
7170.081066146493
2501.802797883749
125.09432530403137
358.46814925968647
88.14973005652428
99.75336989760399
105.0193213224411
120.73431622982025
246.12961447238922
1409.312697187066
46.72607780992985
0.0
193.64421252906322
0.0
123.1537966132164
156.41057699918747
887.4719959497452
46.53564700484276
549.6690424531698
35.26393246650696
59.774787694215775
491.52833887934685
19.12382537126541
5.851063072681427
26.376190692186356
141.73162510991096
46.69946366548538
153.94774816930294
23.94593194127083
67.39606477320194
78.02078300714493
14.71006491780281
78.33150693774223
0.0
0.0
0.0
32.57457995414734
0.0
173.89345821738243
111.18964970111847
0.0
42.44530317187309
100.35508692264557
5.098480224609375
256.31594628095627
0.0
0.0
0.0
97.32444107532501
77.36621049046516
561.7517584562302
17.957394123077393
29.6



Elapsed time to compute best fit: 10.135 seconds
Cross-validation score: 0.6135817423721063
Test score: 0.6309751434034416
Best Hyperparameters: {}
360.1702536344528
2578.3261777609587
6715.16212156415
1275.0425142496824
53.01936513185501
260.952027708292
455.77533185482025
70.57608616352081
70.50807589292526
98.20063298940659
488.64845675230026
1541.3864831477404
89.24911516904831
0.0
406.469925314188
0.0
83.06010821461678
1038.4848932623863
400.6899506896734
16.15014088153839
138.29727283120155
274.57733088731766
37.41294077038765
31.203457444906235
14.30454432964325
13.193460464477539
31.09319919347763
205.82125033438206
171.20236855745316
286.0413018465042
31.24594357609749
72.92530274391174
218.25430020689964
6.950541019439697
104.7566877156496
0.0
0.0
0.0
378.6498934030533
0.0
232.48956403136253
16.524368166923523
0.0
80.03178280591965
73.8975505977869
0.0
273.53213399648666
0.0
0.0
0.0
118.85456019639969
45.151178032159805
116.06242689490318
74.6211903989315
1.892087996006012
12



Elapsed time to compute best fit: 9.562 seconds
Cross-validation score: 0.5990824117966413
Test score: 0.6152343750000001
Best Hyperparameters: {}
311.17102897167206
2116.740668132901
6292.831032052636
3137.675000369549
309.627704679966
1293.0412653684616
67.2870564609766
113.78481122851372
122.05591401457787
174.62160037457943
559.430001527071
579.3809398263693
48.92823204398155
0.0
175.8162228912115
0.0
104.50429806113243
271.0847714841366
824.8944159448147
54.76407943665981
68.17198878526688
264.45446369051933
31.197396010160446
17.460190802812576
166.67843371629715
30.204386591911316
30.234576493501663
119.17571690678596
45.347448378801346
115.2555328309536
28.768956065177917
68.2439292371273
94.81741784512997
16.39541983604431
98.45973062515259
0.0
0.0
0.0
20.599179327487946
0.0
198.30852091312408
69.78213495016098
31.972738206386566
61.15812060236931
70.0398368537426
2.557999014854431
117.42022722959518
0.0
0.0
0.0
59.98036779463291
53.883134961128235
93.25107601284981
20.4251263



Elapsed time to compute best fit: 9.805 seconds
Cross-validation score: 0.6376272617293962
Test score: 0.603112840466926
Best Hyperparameters: {}
474.3132252693176
2795.0607775598764
7091.52537766099
1222.5330173373222
127.07048636674881
169.96960246562958
33.2538720369339
212.07795411348343
194.2115518450737
126.35129418969154
554.5985872894526
1309.6840412914753
486.8015538305044
0.0
246.74724301695824
0.0
214.61144146323204
1091.4312761425972
102.88416811823845
92.18214264512062
177.5962108373642
76.56194558739662
13.799642890691757
21.137795478105545
7.593226045370102
22.724704027175903
53.38850501179695
94.11052741110325
83.60066057741642
208.1271940767765
22.424446552991867
50.226732298731804
283.6038638204336
3.2245779931545258
70.62666344642639
0.0
0.0
0.0
31.468327552080154
12.590755686163902
92.67553381621838
49.1521281003952
0.9615240097045898
79.4931589961052
90.62686458230019
0.0
29.307377099990845
0.0
0.0
0.0
78.59331128001213
68.55580352246761
84.07589164376259
22.355188



Elapsed time to compute best fit: 8.644 seconds
Cross-validation score: 0.6260300291603403
Test score: 0.6395348837209303
Best Hyperparameters: {}
188.59949353337288
2204.1763559877872
7053.072183504701
1962.3445943444967
399.5247097313404
1259.3382821679115
869.1615127623081
93.07912793755531
225.31671196222305
42.888063073158264
574.4886046946049
449.67762994766235
97.99218666553497
0.0
237.65253484249115
0.0
41.75294786691666
92.93542352318764
212.02859038114548
29.84380081295967
95.62834726274014
65.59702515602112
41.0436649620533
35.08294939994812
3.2269350588321686
17.155003309249878
16.59549707174301
152.71603919565678
138.051559522748
317.0860768556595
48.986083805561066
75.01922354102135
226.09651440382004
30.658730179071426
95.22587537765503
0.0
0.0
0.0
129.50320728123188
2.5988099575042725
59.775460839271545
34.95271757245064
24.199100494384766
30.63144099712372
137.4318489730358
1.0983599424362183
32.85684430599213
0.0
0.0
0.0
95.78328067064285
271.41821336746216
176.007464



Elapsed time to compute best fit: 9.311 seconds
Cross-validation score: 0.6086390521218923
Test score: 0.6333973128598848
Best Hyperparameters: {}
507.4408460557461
1463.5305521935225
6568.387496858835
2722.0515902787447
110.8309174478054
335.65711492300034
135.45226721465588
552.5337136387825
64.36468142271042
128.50085666775703
505.21854673326015
1623.131344139576
59.69611991941929
0.0
215.16641601920128
0.0
558.8479103446007
176.7919685691595
150.0266709625721
60.39037275314331
177.30197483301163
24.078366547822952
61.79997429251671
23.446709156036377
36.73512536287308
11.507384300231934
40.00922864675522
79.2411595582962
56.963617369532585
244.00783014297485
8.529179811477661
71.95589983463287
123.19857077300549
9.898164868354797
59.72939604520798
0.0
0.0
0.0
62.85662063956261
0.0
165.70809119939804
418.64972361922264
0.0
31.101489454507828
106.38502186536789
0.0
59.14464098215103
0.0
0.0
0.0
196.67426112294197
55.633207857608795
195.37895691394806
78.54988151788712
3.9915660023689



Elapsed time to compute best fit: 10.385 seconds
Cross-validation score: 0.5933577374789913
Test score: 0.6912878787878789
Best Hyperparameters: {}
388.7924869507551
2984.897285670042
6307.259518459439
1274.937948897481
81.1071304678917
443.62810638546944
69.22982662916183
131.629042416811
99.154820099473
149.17462304234505
433.3081884086132
1425.981778666377
119.1475759446621
0.0
214.11813461780548
0.0
125.04597422480583
184.33834832906723
752.2211510688066
82.57425659894943
1010.8046582341194
47.107676565647125
43.304246455430984
19.856001287698746
2.802040010690689
7.8250532746315
43.693301290273666
97.52355751395226
40.19260489940643
222.91956475377083
24.216876208782196
94.39870864152908
127.97892823815346
21.299229443073273
67.30399015545845
0.0
0.0
0.0
46.29082536697388
2.795370101928711
170.24503675103188
22.571674913167953
0.0
58.53373971581459
76.07044765353203
0.678272008895874
20.700434029102325
0.0
0.0
0.0
163.9672253727913
62.96485711634159
67.03294098377228
44.9037971496



Elapsed time to compute best fit: 9.458 seconds
Cross-validation score: 0.6225789135081066
Test score: 0.6448598130841122
Best Hyperparameters: {}
248.6560999006033
2768.798889413476
6897.212041884661
857.3078832179308
208.7455171495676
317.38288989663124
77.16801336407661
181.77491082251072
47.44501228630543
103.87242704629898
308.90863709151745
1464.8009660243988
95.6990515589714
0.0
297.7548940628767
0.0
164.4658691585064
1105.3117539286613
395.3049319386482
47.28139278292656
93.64416581392288
193.20175391435623
46.07139602303505
18.807708829641342
5.514043182134628
9.617320016026497
33.35869437456131
78.25719064474106
73.10911467671394
319.68350237607956
16.2817519903183
96.00388099253178
139.49098706245422
12.839751809835434
101.80497217178345
0.0
0.0
0.0
26.911237835884094
2.755189895629883
166.4310021698475
39.85409390926361
0.0
73.76826117932796
424.05504244565964
2.732122093439102
330.5864290893078
0.0
0.0
0.0
103.03119264543056
64.91087973117828
94.88858354091644
43.084266632



Elapsed time to compute best fit: 9.637 seconds
Cross-validation score: 0.6245972008626375
Test score: 0.6106870229007633
Best Hyperparameters: {}
165.22992515563965
3216.0402237176895
5747.828407585621
2814.951358243823
48.70241579413414
271.35767313838005
195.0721397101879
67.01079654693604
45.13269433379173
165.2931872010231
646.6671838611364
1506.8185188770294
67.08638171851635
0.0
319.92452198266983
0.0
86.886344820261
169.3649465739727
714.3089457899332
74.74308556318283
603.0677088797092
56.18425476551056
29.6525037586689
255.52669885754585
17.17315897345543
20.29610323905945
30.331446170806885
115.75562797486782
132.13602942228317
67.54285594820976
4.35711807012558
48.11001327633858
140.6455739736557
7.75583690404892
77.6092047393322
0.0
0.0
0.0
35.663293182849884
0.0
212.3305432498455
95.85546424984932
0.0
73.380187779665
104.90827271342278
0.9702330231666565
268.5688037276268
0.0
0.0
0.0
51.78969103097916
148.5735994130373
76.4812980890274
54.371558368206024
0.475970000028610



Elapsed time to compute best fit: 9.262 seconds
Cross-validation score: 0.6241397641499807
Test score: 0.5831739961759083
Best Hyperparameters: {}
375.0067367553711
2082.8413732349873
7922.11585842073
1822.352146834135
104.05040717124939
1377.2155137062073
860.6202957332134
67.52048632502556
155.15196579694748
117.57728138566017
489.2228580713272
391.8020171672106
137.92389200627804
0.0
187.90031960606575
0.0
149.08528017997742
301.3154107630253
199.96707464754581
18.2778417468071
132.0837585926056
52.613170236349106
27.5032077729702
123.32741525769234
15.165818452835083
14.07442282140255
29.845964819192886
38.05852510035038
83.59483209252357
215.23134475946426
8.728961914777756
46.67368087172508
194.4201701581478
16.325167953968048
57.76955269277096
0.0
0.0
0.0
62.625181928277016
3.8280530869960785
220.1682978272438
48.504980981349945
2.2580199241638184
126.78162837028503
49.910867750644684
0.0
58.79650527238846
0.0
0.0
0.0
86.20435117185116
62.49238854646683
98.64980414509773
8.10023



Elapsed time to compute best fit: 9.414 seconds
Cross-validation score: 0.6230795678810096
Test score: 0.625
Best Hyperparameters: {}
594.8029153347015
2129.1557783335447
7943.238493651152
1288.1537075787783
97.01500318944454
242.41411954164505
594.6090791523457
110.43436741828918
57.42947790026665
93.16721266508102
408.5939488708973
1809.1566264629364
121.03729620575905
0.0
213.19907028973103
0.0
80.71835297346115
992.0745775550604
65.64403113722801
63.54748997092247
54.14989212155342
72.73017010092735
32.53489500284195
8.790645837783813
7.113375961780548
30.619122356176376
26.300251990556717
96.22697335481644
53.178741842508316
77.88717858493328
3.8687930703163147
83.05737766623497
149.14603447914124
11.224560141563416
92.1859447658062
0.0
0.0
0.0
65.73608082532883
8.73697018623352
107.49116784334183
60.99396914243698
1.5451799631118774
48.938606560230255
66.49719527363777
0.0
154.08398216962814
0.0
0.0
0.0
68.68756556510925
72.02879375219345
223.22744596004486
20.665339291095734
4.1



Elapsed time to compute best fit: 9.836 seconds
Cross-validation score: 0.6217281616106156
Test score: 0.6250000000000001
Best Hyperparameters: {}
248.64053611457348
2822.9926892071962
5854.022866472602
2028.385982453823
51.428224086761475
288.44156008958817
41.37376430630684
79.51469188928604
51.148249447345734
101.56022581458092
611.0949877947569
1540.8344255238771
95.16485759615898
0.0
153.26774659752846
0.0
120.91864901781082
907.7100674510002
145.7976998835802
140.4723765552044
167.38464641571045
46.78617785871029
28.518912106752396
10.619059920310974
22.45253899693489
13.445969969034195
147.14942735433578
263.2453564107418
61.38393212854862
181.28274078667164
21.56015956401825
77.94672352075577
78.12668463587761
16.481020212173462
70.5365187227726
0.0
0.0
0.0
245.40038393437862
2.423945963382721
82.92400960624218
22.53516387939453
0.0
80.31936672329903
308.98932203650475
2.253649950027466
38.23305922746658
0.0
0.0
0.0
81.2416250705719
86.08577355742455
127.95253956317902
23.37087



Elapsed time to compute best fit: 8.678 seconds
Cross-validation score: 0.6202336416140591
Test score: 0.6104651162790699
Best Hyperparameters: {}
295.4861110448837
3786.1787021905184
4782.57114687562
2494.336791664362
128.64024060964584
191.98056495189667
85.64598685503006
64.9462708979845
46.91145023703575
168.90630486607552
518.4199798703194
1413.7511544972658
95.1946836411953
0.0
402.28600588440895
0.0
208.58003097772598
527.8624988794327
615.4457599520683
74.33464100956917
106.06788308918476
67.1874523460865
78.93232822418213
10.246206104755402
6.802827060222626
9.458068937063217
21.26175794005394
208.6664194315672
214.82501009106636
281.333411693573
16.39619432389736
39.37796977162361
72.36725002527237
33.609185099601746
43.87324284017086
0.0
0.0
0.0
10.254972904920578
2.475160002708435
109.14992243051529
82.37964698672295
21.179310083389282
102.38348492980003
272.6520221531391
0.0
264.7140166461468
0.0
0.0
0.0
75.30007313191891
29.109699934720993
118.87873101234436
25.8895238637



Elapsed time to compute best fit: 9.689 seconds
Cross-validation score: 0.6173296180976963
Test score: 0.6439393939393939
Best Hyperparameters: {}
218.66122686862946
2912.5379432290792
5277.2456344515085
2734.4584579467773
85.7657675743103
217.51692356169224
221.90233844518661
51.44055160880089
95.9822958111763
150.99624279141426
464.2822914123535
1468.361674591899
53.408327460289
0.0
309.12211705744267
0.0
180.62196354568005
116.81427776813507
526.3576332628727
38.40033407509327
898.9240940213203
127.98019379377365
39.30581200122833
130.78615453839302
36.5829091668129
3.9381099939346313
144.2866189479828
54.04309546947479
44.75468799471855
387.2331974953413
34.44236516952515
238.08254712820053
104.67789110541344
12.489466905593872
83.39995451271534
0.0
0.0
0.0
67.37481588125229
8.837560176849365
143.0986429452896
39.45849680900574
133.58984825015068
56.259587198495865
63.03594945371151
0.0
200.4509238600731
0.0
0.0
0.0
75.11180007457733
61.21068063378334
142.52269886434078
58.85437130



Elapsed time to compute best fit: 9.641 seconds
Cross-validation score: 0.6312676388839261
Test score: 0.6165703275529866
Best Hyperparameters: {}
333.5839621424675
2342.0539768338203
6098.981605798006
2645.702203243971
98.91315421462059
273.87561532855034
586.836973041296
41.610378459095955
40.536258071660995
130.8668359220028
540.2337262034416
1423.4366916269064
153.49807426333427
0.0
342.68353478610516
0.0
587.3047384917736
92.5134397894144
479.18351274728775
73.29350247979164
145.12886396050453
29.701776832342148
51.101879596710205
20.119399070739746
7.162970006465912
1.289588987827301
29.150090992450714
95.29418104887009
75.97152578830719
310.2455896139145
15.521442115306854
36.017919197678566
124.93670439720154
30.15763634443283
127.78215056657791
0.0
0.0
0.0
21.46857200562954
0.0
57.92913427948952
52.255668610334396
13.23900032043457
35.70122566819191
108.77055239677429
0.0
20.956349551677704
0.0
0.0
0.0
76.52663457393646
95.14138194918633
137.90066614747047
18.271903038024902
1



Elapsed time to compute best fit: 10.812 seconds
Cross-validation score: 0.6270002360499511
Test score: 0.5098039215686274
Best Hyperparameters: {}
610.9753415733576
2384.7465918809175
6829.1952602267265
1751.1178721040487
94.08435155451298
254.64000983536243
67.7188626229763
100.40210692584515
82.94490987062454
97.42041692137718
707.9156710505486
1654.9327713698149
54.00328651070595
0.0
259.3860004991293
0.0
570.1969384551048
573.8575703501701
94.1754307448864
58.747768342494965
183.34516961872578
70.32459914684296
57.81854099035263
15.75396990776062
11.871670991182327
11.924949884414673
41.962646931409836
47.51361280679703
88.27191165089607
275.6112014055252
106.61053398251534
79.10774847865105
154.15846410393715
15.60291713476181
154.1970375776291
0.0
0.0
0.0
723.1200341880322
5.412090063095093
341.8224932849407
56.899364680051804
1.435539960861206
140.4234277755022
98.83370023965836
0.34644201397895813
115.95608958601952
0.0
0.0
0.0
66.15472459793091
81.8527161180973
101.8206339180

## 4.2 Rebalancing Strategy - SMOTE

### 4.2.1 Random Forest

In [114]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

smote_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [['smote', SMOTE()],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    smote_randomforest_normalized_performance_df = pd.concat([smote_randomforest_normalized_performance_df, new_performance_df])
    
smote_randomforest_normalized_performance_df.to_csv("../data/05_model_output/smote_randomforest_normalized_performance_df.csv")



### 4.2.3 XGBoost

In [115]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


smote_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    smote_xgboost_normalized_performance_df = pd.concat([smote_xgboost_normalized_performance_df, new_performance_df])
    

smote_xgboost_normalized_performance_df.to_csv("../data/05_model_output/smote_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 205.574 seconds
Cross-validation score: 0.7319861127811474
Test score: 0.7279693486590038
Best Hyperparameters: {}
0.004862949
0.026670773
0.09125809
0.17417248
0.015631163
0.014042722
0.0010881393
0.012723474
0.0056096716
0.0034629188
0.027317617
0.021861134
0.0053214324
0.0
0.012583963
0.0
0.0062061134
0.012615289
0.0051923557
0.0027200207
0.0076167467
0.006976148
0.0073543363
0.008510416
0.0024630583
0.0026963123
0.0013072419
0.0020713133
0.00097414147
0.0043674293
0.017694535
0.0032860283
0.0007870265
0.002283094
0.0016020744
0.0
0.0
0.0
0.00394157
0.06075678
0.0049347505
0.014890602
0.0011547727
0.007841366
0.012269386
0.0008488871
0.0053509097
0.0
0.0
0.0
0.0024308946
0.0020295954
0.0024432845
0.007845447
0.011920727
0.00083968014
0.005971131
0.006680582
0.011025372
0.008201467
0.007589162
0.007109618
0.0069505405
0.0018057665
0.002819124
0.00261015
0.0010830822
0.0045403508
0.006319358
0.0025531512
0.0028903477
0.00670986
0.0025759153
0.00036758



Elapsed time to compute best fit: 195.756 seconds
Cross-validation score: 0.7332448073973903
Test score: 0.7452830188679245
Best Hyperparameters: {}
0.011535673
0.056033727
0.07826227
0.1687427
0.012280455
0.014689027
0.0041439137
0.006746475
0.013224888
0.0022422192
0.025485778
0.024755334
0.0037168025
0.0
0.011202426
0.0
0.009263915
0.0060294014
0.009443424
0.0005846728
0.007225416
0.030655261
0.009163436
0.0008511524
0.0013698778
0.00078853365
0.0029107684
0.0012948436
0.0009988254
0.0116905775
0.013652782
0.0006791101
0.002109481
0.0051916917
0.0006785936
0.0
0.0
0.0
0.004874945
0.057166845
0.002840801
0.014781757
0.0016460114
0.0021500583
0.0095750075
0.0068914536
0.004293213
0.0
0.0
0.0
0.0011747993
0.0064738346
0.001125527
0.009019412
0.0029700657
0.0010933978
0.0013954942
0.0025217133
0.013063816
0.014401252
0.0020748386
0.009165702
0.006307625
0.009477249
0.0047385637
0.006090416
0.0016980713
0.007986457
0.0013390873
0.0017061702
0.0020982737
0.008911508
0.00081887364
0.001519



Elapsed time to compute best fit: 197.478 seconds
Cross-validation score: 0.7242912665504102
Test score: 0.7768924302788845
Best Hyperparameters: {}
0.0067882924
0.0336435
0.13780315
0.058819283
0.007666077
0.004651363
0.0053207693
0.00823581
0.018139856
0.005142905
0.03593537
0.05922468
0.002904221
0.0
0.031071596
0.0
0.007075414
0.008607786
0.01005334
0.00172374
0.0076501085
0.0036219063
0.013040473
0.006650099
0.002989401
0.0012972481
0.0028827072
0.000907617
0.0014271311
0.0036419341
0.020288646
0.00074585294
0.0012070381
0.011691881
0.0027215187
0.0
0.0
0.0
0.0019406915
0.020422455
0.008515724
0.0142604625
0.0060900683
0.011486105
0.0059613865
0.0090190405
0.0055932156
0.0
0.0
0.0
0.0016968508
0.0021113649
0.00086534536
0.0019432823
0.0016023004
0.0013344568
0.0009515673
0.0025391947
0.010771462
0.010763883
0.003289421
0.0078006634
0.008534315
0.0025861745
0.0051075197
0.003880634
0.0010963794
0.008533948
0.0029255704
0.0020926506
0.0010372676
0.0085577285
0.001537745
0.0022990191



Elapsed time to compute best fit: 199.880 seconds
Cross-validation score: 0.7333235467924086
Test score: 0.6313993174061434
Best Hyperparameters: {}
0.011381312
0.04502202
0.082826234
0.14624819
0.012260029
0.009662006
0.0038328376
0.009132907
0.004787152
0.005563364
0.036348376
0.029610056
0.0043021743
0.0
0.017676372
0.0
0.005175186
0.013238559
0.003891008
0.0051595694
0.006781634
0.009251729
0.010406357
0.0058269827
0.002450694
0.0021974829
0.0031253854
0.004025433
0.00058638625
0.0029632335
0.013700508
0.0011779543
0.0014940633
0.0016550019
0.0009643181
0.0
0.0
0.0
0.0022854435
0.10267164
0.0024633948
0.008160372
0.0011292567
0.0016542041
0.0053280704
0.0006347222
0.0028474184
0.0
0.0
0.0
0.0012496758
0.0010991574
0.0014294205
0.004046088
0.0010302323
0.0033111232
0.0020398744
0.00179722
0.0096753845
0.008276618
0.00280648
0.0050764596
0.0023933605
0.008033723
0.0032105895
0.0023776132
0.004974533
0.0053833313
0.0006734766
0.0006002986
0.0009017373
0.0049476023
0.0016895917
0.00181



Elapsed time to compute best fit: 203.456 seconds
Cross-validation score: 0.7228451317315894
Test score: 0.7142857142857143
Best Hyperparameters: {}
0.010207167
0.024059845
0.1447877
0.065368384
0.014363447
0.004626365
0.0019768337
0.013066062
0.0030274272
0.002129915
0.03399217
0.05134183
0.0024552592
0.0
0.016895521
0.0
0.005108411
0.0077900942
0.0043384125
0.016987802
0.008517165
0.002661094
0.009582819
0.0010533839
0.004140054
0.0010562477
0.0022233066
0.0010311025
0.000578187
0.0037308976
0.011678389
0.0006187359
0.0024415157
0.010247211
0.0011166357
0.0
0.0
0.0
0.003951251
0.096990824
0.0032208732
0.008825374
0.011008428
0.0035316234
0.013273338
0.0007010526
0.0046516713
0.0
0.0
0.0
0.0019702227
0.0029898006
0.0022335462
0.0016091418
0.02547447
0.0059717535
0.003290895
0.0020693478
0.0070641832
0.008770032
0.0054923883
0.004170169
0.0042777546
0.0076183793
0.0034289898
0.0036427013
0.0023062613
0.0105204685
0.0060450267
0.0024938865
0.0028122514
0.00937408
0.0012083367
0.00134785



Elapsed time to compute best fit: 188.656 seconds
Cross-validation score: 0.7094905754469918
Test score: 0.7608695652173912
Best Hyperparameters: {}
0.007302327
0.029564299
0.09587919
0.1454118
0.01208552
0.009454606
0.0020650302
0.011836686
0.015604221
0.004043895
0.024483323
0.052532874
0.0044964976
0.0
0.011578124
0.0
0.0084512215
0.013009292
0.0038228792
0.0005570619
0.0064555816
0.0020667522
0.0061827297
0.0016247043
0.010211038
0.0007938563
0.004540797
0.0009137312
0.0009176398
0.0021540944
0.013053088
0.0031718628
0.001530203
0.010170742
0.0004770784
0.0
0.0
0.0
0.0028500748
0.0697901
0.0030031651
0.009679392
0.001831495
0.004618536
0.010064595
0.00048061777
0.0052196123
0.0
0.0
0.0
0.0010663291
0.0031475532
0.00089471176
0.0036661972
0.0033468471
0.0013905657
0.0078049237
0.0049397964
0.007931526
0.0073668147
0.008096791
0.005670254
0.005797563
0.0019681945
0.0039041303
0.0071336143
0.001650945
0.0046821753
0.0012803955
0.0034128425
0.0015223671
0.0071856673
0.0018839238
0.0020



Elapsed time to compute best fit: 196.137 seconds
Cross-validation score: 0.7302643951227976
Test score: 0.7116104868913857
Best Hyperparameters: {}
0.0099370405
0.05786025
0.08035552
0.18103695
0.008906963
0.016421305
0.0028615578
0.0044332524
0.0031005153
0.010261573
0.03064885
0.03352565
0.002563923
0.0
0.012574038
0.0
0.0069046444
0.008388344
0.00759145
0.0022231955
0.0071834293
0.005949666
0.0047048833
0.001085964
0.0035127355
0.0036097209
0.001176219
0.0007428595
0.0018558914
0.0067959675
0.019370902
0.003035223
0.0012522559
0.007839034
0.001800993
0.0
0.0
0.0
0.0016909197
0.0
0.0043942155
0.008796939
0.00022765894
0.008349098
0.0038480877
0.00085498026
0.0050466377
0.0
0.0
0.0
0.0018115091
0.0014188632
0.0021714275
0.007423691
0.009017468
0.0021424552
0.0072050528
0.0043527144
0.011365614
0.0148769645
0.0031589766
0.00471186
0.004389926
0.010396002
0.0023661575
0.001854807
0.0028186105
0.001654075
0.0013828898
0.0026403253
0.0028111546
0.0089792805
0.0019796228
0.0019115215
0.00



Elapsed time to compute best fit: 190.707 seconds
Cross-validation score: 0.7263592243327293
Test score: 0.676470588235294
Best Hyperparameters: {}
0.0064796377
0.028162405
0.17256667
0.055437572
0.0071545346
0.018258272
0.0077631488
0.007837152
0.0020671317
0.0027556277
0.029408142
0.03338499
0.001606637
0.0
0.010433452
0.0
0.025420966
0.005623391
0.0067096855
0.012141391
0.010118559
0.0031798491
0.004701101
0.0021154545
0.0074049532
0.0036027667
0.0015340266
0.001491864
0.0005599358
0.0020488345
0.023263251
0.001618905
0.0019224356
0.0027190035
0.0020612252
0.0
0.0
0.0
0.0023707303
0.042224683
0.008609823
0.0056556696
0.0011351535
0.013411203
0.007643094
0.0034971747
0.0024842026
0.0
0.0
0.0
0.0015787849
0.0019738043
0.00188567
0.011323601
0.0005576537
0.0036549626
0.0031031468
0.0024819833
0.0089968825
0.008536146
0.0102505665
0.006998934
0.0033122045
0.00052557536
0.004789417
0.00084278313
0.0008104041
0.00957101
0.0008159267
0.0019543823
0.0048726615
0.008052555
0.00095548335
0.00



Elapsed time to compute best fit: 187.631 seconds
Cross-validation score: 0.7172256150711454
Test score: 0.7875457875457876
Best Hyperparameters: {}
0.008231448
0.05372028
0.07029135
0.21221218
0.011197981
0.0038341826
0.0025893005
0.016287865
0.0038210708
0.006640141
0.038720004
0.022003777
0.0031958919
0.0
0.021796573
0.0
0.01139594
0.014880019
0.0052783536
0.0024328434
0.0057797567
0.0023179778
0.0048549934
0.0107623
0.003811984
0.0014707159
0.004024546
0.009382592
0.0030088727
0.0027235872
0.015989492
0.0014774274
0.0013059088
0.0030607
0.0010426102
0.0
0.0
0.0
0.0040967003
0.021087015
0.0018521091
0.030945994
0.00017041249
0.011427297
0.009013634
0.00017022638
0.0024938427
0.0
0.0
0.0
0.0011050791
0.003181273
0.0009754104
0.0025425907
0.0064988844
0.0013915686
0.0028624027
0.005568952
0.011875818
0.010196344
0.0034077035
0.0034123769
0.006007056
0.0118322205
0.007898009
0.0010253931
0.001949933
0.00034063592
0.00075128826
0.0049175075
0.0023107668
0.008799789
0.0015872737
0.001387



Elapsed time to compute best fit: 199.829 seconds
Cross-validation score: 0.7041002746148058
Test score: 0.7578397212543554
Best Hyperparameters: {}
0.011164854
0.030673387
0.10427279
0.11249377
0.009120506
0.010713674
0.0027882673
0.014035766
0.012146145
0.01024182
0.037956193
0.044774946
0.0032749793
0.0
0.008923824
0.0
0.008724282
0.0062575187
0.0059193945
0.009060528
0.0067262393
0.0066526993
0.012597724
0.0031291796
0.0021134517
0.0003636236
0.0009923254
0.0026347802
0.0014073529
0.004666869
0.037843753
0.0014545615
0.002915241
0.0026783613
0.0016200908
0.0
0.0
0.0
0.0028649687
0.06743918
0.005714817
0.00778877
0.0028305762
0.0039498936
0.014046768
0.016061256
0.0032580963
0.0
0.0
0.0
0.0012172479
0.001345582
0.0022259534
0.002279514
0.013856023
0.003941207
0.0013620132
0.0106460275
0.008998033
0.0068884953
0.006303809
0.0064993245
0.0056149694
0.0036545827
0.005410007
0.0038371077
0.0014775152
0.011389126
0.002101052
0.0034890578
0.001729133
0.007103417
0.0031230298
0.0026964617




Elapsed time to compute best fit: 199.886 seconds
Cross-validation score: 0.7386479538306603
Test score: 0.7490974729241877
Best Hyperparameters: {}
0.009473659
0.024180088
0.12554878
0.060220666
0.012874671
0.0051190695
0.0029702804
0.013816581
0.02005037
0.003966435
0.032175425
0.034036495
0.0017447325
0.0
0.014120037
0.0
0.012509175
0.004898554
0.008217503
0.021830447
0.004695748
0.0057602627
0.013920341
0.0017285785
0.0016221172
0.0010333182
0.0005973006
0.0013927001
0.0021437996
0.0019394903
0.023588292
0.0010913373
0.0011414716
0.0028736661
0.0016689093
0.0
0.0
0.0
0.0034089522
0.119682774
0.002458106
0.005681224
0.009329188
0.003241522
0.015885204
0.00103225
0.0046028392
0.0
0.0
0.0
0.0020798056
0.0025341818
0.0026760676
0.0077283704
0.0053117536
0.01300989
0.0007228627
0.008004057
0.0043087695
0.015683796
0.002247909
0.0052081384
0.0023546894
0.0023145175
0.004119031
0.0022073966
0.0023405056
0.00930207
0.0018843807
0.0014569326
0.0023570922
0.009000389
0.00050128973
0.00109248



Elapsed time to compute best fit: 196.677 seconds
Cross-validation score: 0.7311344188403852
Test score: 0.6405693950177936
Best Hyperparameters: {}
0.010403744
0.026152793
0.109929115
0.13731286
0.008305975
0.0043472084
0.0023878834
0.0052624913
0.0068635195
0.007903805
0.026342668
0.04717962
0.001733058
0.0
0.013532065
0.0
0.008765281
0.007617157
0.0145258885
0.0013604262
0.0061989347
0.006392291
0.006548364
0.0020689664
0.003415306
0.0005938626
0.0036659536
0.0013711711
0.0012707708
0.0009397572
0.014509193
0.0026190605
0.0005882814
0.005269808
0.0009628273
0.0
0.0
0.0
0.00452157
0.05588901
0.002522957
0.008472499
0.007442454
0.0027362166
0.004668094
0.0051412615
0.0032843305
0.0
0.0
0.0
0.0011261624
0.0029877413
0.0013474984
0.007445608
0.017523922
0.0022224884
0.0018849882
0.0038507134
0.0051668026
0.015392342
0.006762538
0.0044306903
0.002997716
0.008129573
0.0062938165
0.0011797786
0.0051688137
0.001822616
0.0018775635
0.0029280584
0.0012470855
0.006716722
0.0012595473
0.0013525



Elapsed time to compute best fit: 185.667 seconds
Cross-validation score: 0.7417083812364081
Test score: 0.6800766283524904
Best Hyperparameters: {}
0.008212082
0.056932453
0.06519321
0.20926392
0.013040192
0.0047833864
0.0029715234
0.013763064
0.005871829
0.0017800477
0.028864542
0.017977323
0.004221272
0.0
0.011223263
0.0
0.0021938987
0.012537719
0.008192141
0.000787391
0.007889809
0.006299487
0.0053609572
0.003990042
0.002502757
0.0028435786
0.001972842
0.0009987758
0.0007659277
0.0034272042
0.0103305755
0.00057353044
0.0017860803
0.00990462
0.0009049947
0.0
0.0
0.0
0.004395874
0.0
0.006667247
0.009055439
0.005091166
0.0033717218
0.012074356
0.001074508
0.0021008796
0.0
0.0
0.0
0.000882781
0.0019967186
0.0033116562
0.0026489256
0.0047145444
0.00066230167
0.003225049
0.006227824
0.00664366
0.012098568
0.0067266733
0.005493208
0.0074565695
0.0023267134
0.003271925
0.0027806684
0.0009310091
0.0060652434
0.0017878655
0.0018982252
0.0025450026
0.0078883255
0.002004397
0.0021584528
0.0021



Elapsed time to compute best fit: 185.428 seconds
Cross-validation score: 0.7044194385615944
Test score: 0.7841328413284133
Best Hyperparameters: {}
0.007308859
0.050543062
0.09334707
0.11543376
0.010860086
0.0076963888
0.004791481
0.012304759
0.007375068
0.0051209275
0.039719515
0.040645532
0.00237215
0.0
0.025837751
0.0
0.015354245
0.0041787494
0.005408503
0.0040780758
0.008554493
0.009556928
0.011929801
0.027844023
0.005602624
0.0013694579
0.0018385076
0.0010628748
0.0017587042
0.0065314183
0.02629653
0.0011936686
0.0038467855
0.0073358626
0.0013693023
0.0
0.0
0.0
0.0028845347
0.047255695
0.0031700714
0.018853888
0.0023417696
0.004421795
0.007184116
0.000911275
0.0027999259
0.0
0.0
0.0
0.00095273805
0.0028530404
0.0014352327
0.0030870303
0.023998138
0.0014821119
0.0043645664
0.0026267425
0.009628841
0.018424677
0.005997102
0.0059346766
0.0035934185
0.006145337
0.00885093
0.0025011143
0.0038999626
0.008155456
0.0012599309
0.0010524221
0.0012270351
0.0068848906
0.00081750815
0.0025728



Elapsed time to compute best fit: 186.751 seconds
Cross-validation score: 0.7457460768349955
Test score: 0.6583629893238435
Best Hyperparameters: {}
0.0068846364
0.015826881
0.13669346
0.06137533
0.0047354526
0.008949917
0.003350161
0.009473154
0.018727973
0.009398043
0.03686371
0.05754357
0.0015707358
0.0
0.012498742
0.0
0.010513518
0.0098871
0.006254811
0.00063980056
0.0044307453
0.007475367
0.011706408
0.009947746
0.0022176774
0.004681194
0.0010126929
0.0011116805
0.00066040823
0.0026874808
0.012243683
0.0010737729
0.0014197674
0.008096929
0.0014914207
0.0
0.0
0.0
0.0064604813
0.12963356
0.0021318533
0.0044376454
0.004625773
0.015510671
0.021599665
0.0019540116
0.0017824332
0.0
0.0
0.0
0.00094048056
0.0032525042
0.0010765117
0.012855336
0.0002043981
0.0021072784
0.0027155424
0.0048679276
0.006128977
0.021124925
0.0027768093
0.007949774
0.006601277
0.010050482
0.0032302584
0.00092541764
0.0010640819
0.008135969
0.00092540437
0.0019986357
0.0011924318
0.0071159974
0.0013969352
0.00298



Elapsed time to compute best fit: 206.823 seconds
Cross-validation score: 0.729117794877675
Test score: 0.756457564575646
Best Hyperparameters: {}
0.011083596
0.03824399
0.08257985
0.1748719
0.012179987
0.0067858133
0.0068891384
0.0098330835
0.004732131
0.0042565404
0.029243885
0.022939283
0.0052461475
0.0
0.0068787714
0.0
0.0027516342
0.004189381
0.007424467
0.0015318141
0.008788575
0.0049096984
0.0085193105
0.004000365
0.0012604244
0.005220647
0.0016847949
0.001356007
0.00038954525
0.0023560592
0.015661526
0.001688351
0.0010381676
0.003734945
0.0007681958
0.0
0.0
0.0
0.0063517075
0.043392334
0.0022154343
0.010211294
0.033861
0.0073052174
0.007246003
0.0047140974
0.0034597076
0.0
0.0
0.0
0.0017774415
0.003376024
0.000782444
0.0019908198
0.004260711
0.0011859516
0.0011887922
0.007355759
0.011525181
0.012988412
0.0045664455
0.005537651
0.004194776
0.009049497
0.002159241
0.0011454668
0.0015894369
0.0047848145
0.0007786223
0.0009710695
0.001869617
0.006647319
0.00088771636
0.0017059217
0



Elapsed time to compute best fit: 195.055 seconds
Cross-validation score: 0.703215715284675
Test score: 0.756457564575646
Best Hyperparameters: {}
0.012238336
0.048486236
0.08997497
0.17758521
0.015919408
0.011280955
0.0046920464
0.021910008
0.016946312
0.005275365
0.029198885
0.036830626
0.0023918678
0.0
0.00855891
0.0
0.0127339205
0.0045752474
0.006020498
0.0040200404
0.005122343
0.0073024416
0.015227321
0.0013256212
0.004124495
0.0036233903
0.0029007983
0.0022494895
0.0021474943
0.0018247745
0.015794653
0.0008571667
0.0014288196
0.0059535457
0.0007224039
0.0
0.0
0.0
0.0036734894
0.04178382
0.0024721862
0.012781348
0.0018214295
0.008099109
0.0058692573
0.0010624448
0.0038886156
0.0
0.0
0.0
0.0032012775
0.0025866008
0.0011949795
0.003342297
0.0029527394
0.0014888233
0.0059262994
0.0043320176
0.009452425
0.006105926
0.0034872168
0.004606133
0.005518576
0.014958922
0.0017927734
0.0030673537
0.001349296
0.005230958
0.0020298287
0.0013702598
0.001995027
0.0075765266
0.0011295987
0.0036186



Elapsed time to compute best fit: 196.434 seconds
Cross-validation score: 0.7209325519421845
Test score: 0.8054393305439331
Best Hyperparameters: {}
0.0145534985
0.037876166
0.090270296
0.15452485
0.022563852
0.014470166
0.003599648
0.011361009
0.014022115
0.011168024
0.039925702
0.028932698
0.0055450066
0.0
0.008045339
0.0
0.0037486984
0.011261818
0.007532045
0.0010933497
0.0075327917
0.0051938156
0.025292022
0.009110658
0.0036652994
0.006052742
0.0016885259
0.0009403617
0.0013097049
0.021018486
0.015425632
0.00064107525
0.0020874175
0.0047498164
0.0010249136
0.0
0.0
0.0
0.0064464817
0.048817173
0.004597658
0.030615652
0.0002264096
0.0022493848
0.0028092912
0.0048455633
0.0054761344
0.0
0.0
0.0
0.0019589649
0.001313255
0.0018465122
0.008072784
0.0010000473
0.0011333866
0.00074916176
0.0077520683
0.007875762
0.010107542
0.005981378
0.00859345
0.002955802
0.00082490186
0.008636326
0.0014450327
0.00069322716
0.0075668343
0.001401282
0.0025063646
0.001785619
0.007264699
0.002111149
0.0023



Elapsed time to compute best fit: 194.798 seconds
Cross-validation score: 0.7318853412274354
Test score: 0.7384341637010677
Best Hyperparameters: {}
0.00844156
0.04132814
0.07943815
0.18092857
0.009733294
0.0036379467
0.0017036572
0.0029497028
0.00567889
0.010600564
0.020745544
0.032476846
0.0029642321
0.0
0.009462707
0.0
0.012088313
0.0032186098
0.005739567
0.001637852
0.005530342
0.005258789
0.0076110656
0.0031406726
0.0032275827
0.0009130485
0.0017941548
0.0015803252
0.00035508638
0.0018119195
0.015968323
0.0014601141
0.0019012487
0.0035781479
0.0013683446
0.0
0.0
0.0
0.005026226
0.100990884
0.003253562
0.010778169
0.039091226
0.003931201
0.010523807
0.0
0.0015561095
0.0
0.0
0.0
0.0010827406
0.002159394
0.0026048268
0.00422474
0.0047874767
0.000766506
0.003897522
0.0032414556
0.010426971
0.007926189
0.008862997
0.0064041126
0.006931142
0.000582996
0.0039382246
0.0013498239
0.0011238917
0.003442327
0.0012003832
0.0022344305
0.0012037086
0.0055929297
0.000493883
0.0015559517
0.0030934



Elapsed time to compute best fit: 204.048 seconds
Cross-validation score: 0.7324452314292782
Test score: 0.7114624505928854
Best Hyperparameters: {}
0.015250603
0.0371033
0.09378683
0.18828239
0.011340965
0.0056377687
0.0031677752
0.015160882
0.009140488
0.020101309
0.024385577
0.048883915
0.0072157374
0.0
0.01462385
0.0
0.004602613
0.003298713
0.008962732
0.0018567945
0.0073114554
0.0042283386
0.010158108
0.0028146326
0.0012636196
0.0015852773
0.00035975376
0.003006706
0.0024070495
0.004328264
0.021352228
0.0011109161
0.0007699684
0.0086329365
0.0014117158
0.0
0.0
0.0
0.00093335734
0.050280765
0.006277124
0.0010926768
0.00040210751
0.0015054804
0.0069483244
0.00097217906
0.0039880727
0.0
0.0
0.0
0.0011442144
0.002593944
0.0018524184
0.0066890246
0.0017260599
0.0018747909
0.0023587476
0.005826239
0.0069871745
0.014493333
0.0054793097
0.0039925408
0.0032259983
0.0037950515
0.0024918783
0.00038465756
0.005427692
0.0025910556
0.0014468076
0.0010127157
0.0014008962
0.006235407
0.0018150156



Elapsed time to compute best fit: 222.353 seconds
Cross-validation score: 0.7218459528912791
Test score: 0.7581227436823105
Best Hyperparameters: {}
0.011628102
0.036580447
0.09578112
0.19054164
0.011884586
0.011463412
0.004750007
0.010558234
0.008145058
0.004967843
0.031230047
0.03312606
0.005400052
0.0
0.011469464
0.0
0.011941567
0.003848099
0.007240555
0.0013979527
0.008264723
0.0068535083
0.014865901
0.0005590043
0.008467628
0.00079081225
0.0017505112
0.0008338755
0.0013021139
0.0018782304
0.01912939
0.0016917019
0.001125209
0.0061389655
0.0014653757
0.0
0.0
0.0
0.0066894093
0.032219227
0.0026606773
0.008024859
0.0054995473
0.010523855
0.013484037
0.0031750202
0.0037511063
0.0
0.0
0.0
0.0026325388
0.0056667402
0.0011252133
0.006951273
0.0026770735
0.0028603915
0.0025734184
0.008170411
0.01007327
0.008563469
0.0045368476
0.0034582543
0.0072431164
0.0032457283
0.0030655703
0.004347139
0.0015738559
0.009546502
0.001509422
0.0030280114
0.0021924877
0.0057101874
0.0017529984
0.000379985



Elapsed time to compute best fit: 196.825 seconds
Cross-validation score: 0.7363766052348645
Test score: 0.7142857142857143
Best Hyperparameters: {}
0.009238318
0.04031944
0.086358674
0.19982962
0.013299839
0.01902462
0.0036275503
0.009724546
0.0018264507
0.0088528795
0.024671588
0.032955013
0.0040315683
0.0
0.007983866
0.0
0.00864555
0.0072075506
0.007989568
0.0023931712
0.011138928
0.004043549
0.004115436
0.0034799078
0.0004959733
0.0031981363
0.001992001
0.0013135895
0.0012617431
0.0026761338
0.013407187
0.001194319
0.0013241362
0.0066331937
0.001019307
0.0
0.0
0.0
0.0011385028
0.03504837
0.0042761685
0.008123712
0.0015445242
0.0061069774
0.017676193
0.007592331
0.0021496054
0.0
0.0
0.0
0.0023047556
0.0025658738
0.001146681
0.006124599
0.0037405102
0.0027664946
0.0068516005
0.0034838654
0.013293949
0.021628287
0.003855668
0.0067692366
0.0030232463
0.0034205203
0.004437184
0.0017105767
0.0029075672
0.0041114273
0.005966725
0.0015725882
0.0014355734
0.006283888
0.0016190374
0.00144949



Elapsed time to compute best fit: 219.043 seconds
Cross-validation score: 0.7115056144798646
Test score: 0.6872509960159361
Best Hyperparameters: {}
0.005730587
0.029684963
0.12066512
0.106892265
0.009689512
0.0058909836
0.003323088
0.009770717
0.0014833403
0.01526792
0.02677468
0.04228451
0.0032126796
0.0
0.010981956
0.0
0.011822876
0.009987202
0.0060588312
0.003075186
0.0039985674
0.0032727236
0.0122455275
0.0033138718
0.0009700221
0.001032737
0.0017292455
0.002293794
0.00023605331
0.0019559846
0.020741055
0.00089658675
0.0017742086
0.002498633
0.0012984962
0.0
0.0
0.0
0.0019660105
0.031161742
0.0048260866
0.013278912
0.008351295
0.010661549
0.0030095496
0.0039069965
0.0038377545
0.0
0.0
0.0
0.0026451799
0.0030161536
0.0027488193
0.003557916
0.03555178
0.00218696
0.0026483557
0.0033997344
0.011507894
0.017594459
0.0034204377
0.007023965
0.0044259154
0.0070050997
0.005111878
0.0016725329
0.002283124
0.010909053
0.0030267455
0.00068047084
0.00084534066
0.00852173
0.002138587
0.00084557



Elapsed time to compute best fit: 194.160 seconds
Cross-validation score: 0.7604541094607576
Test score: 0.7365145228215768
Best Hyperparameters: {}
0.0041124118
0.046049118
0.06335358
0.19060086
0.018074103
0.0069315573
0.0097382255
0.011645311
0.019727094
0.0023675892
0.024431355
0.029310979
0.009030829
0.0
0.008358031
0.0
0.0029139882
0.005047622
0.0090702195
0.0022387577
0.0057557663
0.0056373747
0.0031057387
0.0014269752
0.00038663158
0.0026104902
0.0027993906
0.0012632956
0.001569039
0.0012419081
0.010386137
0.00067075767
0.0013061673
0.0040340126
0.0008922118
0.0
0.0
0.0
0.0040233126
0.12773061
0.0033407994
0.010394664
0.0
0.0043926816
0.013415578
0.0011090878
0.0035816273
0.0
0.0
0.0
0.0010259077
0.0019070038
0.0014549467
0.0019162963
0.007097327
0.0017125282
0.00058267475
0.0055605625
0.0054687676
0.009924497
0.0033491899
0.0040630624
0.003225747
0.010270293
0.0040465244
0.0014227037
0.0033857268
0.0053465767
0.0017286611
0.0014562056
0.0005241961
0.00582792
0.0013825078
0.003



Elapsed time to compute best fit: 191.034 seconds
Cross-validation score: 0.7382150198993085
Test score: 0.7033898305084744
Best Hyperparameters: {}
0.009252361
0.035994772
0.07650478
0.20620301
0.019204024
0.020065807
0.0033014175
0.0051057837
0.0046963505
0.024208434
0.03206965
0.015374003
0.0029772832
0.0
0.012197045
0.0
0.0051257084
0.005675164
0.0081812
0.0012543091
0.006606116
0.0037503531
0.0100193275
0.004182403
0.0042783683
0.0010133705
0.001642974
0.0009832996
0.0020659794
0.0019614913
0.024627406
0.0008538835
0.0011891059
0.009375646
0.0012220946
0.0
0.0
0.0
0.0035589267
0.021522539
0.002810375
0.013088352
0.00050622836
0.0017850135
0.0041241553
0.00086447404
0.0037949802
0.0
0.0
0.0
0.0012741943
0.009341735
0.0013803104
0.0014082829
0.009706735
0.0014026816
0.0021847095
0.00409525
0.011010783
0.012733828
0.0063220775
0.0044302093
0.0023849488
0.01487584
0.005543472
0.0005450408
0.0017650055
0.0106751565
0.0039445655
0.002723759
0.0024955731
0.010125486
0.0009802599
0.000894

### 4.2.4 LightGBM

In [116]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

smote_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    smote_lightgbm_performance_normalized_df = pd.concat([smote_lightgbm_performance_normalized_df, new_performance_df])
    

smote_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/smote_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 17.651 seconds
Cross-validation score: 0.7298782599613796
Test score: 0.6896551724137931
Best Hyperparameters: {}
3034.591679096222
33337.158732414246
147618.53709089756
31074.656392097473
2235.2383992671967
4670.16300535202
718.9539787769318
1342.0890762805939
175.7460277080536
1907.755302786827
11572.093983769417
9562.472279548645
302.24202704429626
0.0
2089.987751722336
0.0
867.9952418804169
659.2112362384796
1494.7923467159271
483.88556241989136
1450.8499910831451
396.9516489505768
562.4954183101654
191.46344637870789
287.04684138298035
192.97569751739502
344.01887035369873
244.62214875221252
26.222009658813477
256.25847148895264
3599.443489074707
688.7226173877716
278.07943058013916
1492.4691672325134
127.86178970336914
0.0
0.0
0.0
532.002934217453
1099.574119567871
817.9205839633942
697.9205069541931
8.193770170211792
629.8832519054413
480.37126445770264
13.427290439605713
569.0507462024689
0.0
0.0
0.0
277.9173493385315
335.8749556541443
576.5611



Elapsed time to compute best fit: 18.777 seconds
Cross-validation score: 0.7345091383669751
Test score: 0.7245080500894455
Best Hyperparameters: {}
3482.219325184822
38102.22701907158
95465.03714728355
83011.42061376572
2322.9722142219543
4702.607264518738
184.62203979492188
1071.2131071090698
2352.534098625183
648.4707469940186
6432.079422831535
11445.186803817749
165.82537937164307
0.0
1675.331224322319
0.0
703.5020341873169
1458.0351576805115
678.9207074642181
128.20154881477356
1226.6227725744247
579.4893131256104
756.448703289032
241.5705989599228
116.02860260009766
201.0756025314331
50.01275086402893
322.3584535121918
46.25312089920044
626.1804845333099
3982.219881772995
171.27481889724731
152.60327053070068
821.1081993579865
144.25212144851685
0.0
0.0
0.0
681.9570713043213
793.7309036254883
479.97062826156616
900.6100537776947
828.2062911987305
1183.6271705627441
2060.6209802627563
37.68008899688721
803.3861703872681
0.0
0.0
0.0
293.93253111839294
702.8808617591858
441.419055461



Elapsed time to compute best fit: 17.170 seconds
Cross-validation score: 0.689624816988399
Test score: 0.7446808510638298
Best Hyperparameters: {}
2799.348131418228
34049.72413814068
94859.11818611622
81837.13396835327
2283.4248340129852
2235.1912758350372
515.0942134857178
1920.360960841179
1218.0896072387695
328.4677040576935
11320.347797870636
8656.913625240326
242.0346703529358
0.0
1830.089147567749
0.0
604.6210656166077
942.6178328990936
1566.2832295894623
287.6931321620941
1429.177418231964
484.85669136047363
1028.7540862560272
469.14963698387146
198.18576049804688
17.56383991241455
15.438289642333984
233.53330063819885
58.16543960571289
907.7045311927795
4454.618639945984
129.05872106552124
412.1093153953552
4141.454426288605
205.100111246109
0.0
0.0
0.0
370.48914194107056
747.927218914032
546.993748664856
553.7776775360107
48.62779998779297
973.0940260887146
866.3586800098419
35.78156018257141
588.9399063587189
0.0
0.0
0.0
299.4655566215515
367.3185234069824
327.3437616825104
1



Elapsed time to compute best fit: 16.946 seconds
Cross-validation score: 0.725213779474884
Test score: 0.7293497363796133
Best Hyperparameters: {}
3747.398884534836
31431.344694375992
96233.07019329071
84865.46729075909
1875.9186263084412
2602.290107488632
209.7799596786499
891.180233001709
1715.8654761314392
300.2483067512512
11218.715385556221
10416.262865543365
763.0088303089142
0.0
2490.0225784778595
0.0
432.32202982902527
808.2033681869507
1034.7182894945145
225.28408694267273
1448.160623550415
184.039879322052
849.7561440467834
168.49957180023193
187.01831126213074
42.59971046447754
113.64985036849976
135.75303888320923
170.06046772003174
24.914459943771362
2828.8505284786224
173.77700209617615
438.12974739074707
3091.093372821808
75.17706990242004
0.0
0.0
0.0
768.7132592201233
997.5597991943359
550.9810688495636
2195.7307381629944
74.40693044662476
1292.989134311676
883.0452075004578
4.0378899574279785
553.4621179103851
0.0
0.0
0.0
339.40417671203613
733.8896579742432
416.594582



Elapsed time to compute best fit: 17.554 seconds
Cross-validation score: 0.7205164491418843
Test score: 0.7015985790408525
Best Hyperparameters: {}
3240.705456018448
40321.893689990044
95745.67917513847
83962.53625130653
2125.990600347519
2731.450898885727
268.86356019973755
340.176118850708
768.8573319911957
216.03306651115417
13197.10570693016
5118.723520755768
384.029390335083
0.0
2297.8355860710144
0.0
363.1184585094452
666.4508378505707
1948.0261752605438
369.08505153656006
1210.8256947994232
518.5113883018494
1135.2115924358368
555.8706092834473
141.37560033798218
12.854049921035767
152.72154903411865
265.8514702320099
22.582510232925415
108.24621033668518
3318.8197162151337
164.6955804824829
171.134752035141
1625.0918035507202
192.8307912349701
0.0
0.0
0.0
902.7992618083954
128.7620964050293
319.06901597976685
2762.580863714218
0.0
782.4894778728485
263.494642496109
4.218939781188965
840.4551365375519
0.0
0.0
0.0
113.27270150184631
445.07132744789124
160.7897675037384
260.203006



Elapsed time to compute best fit: 16.260 seconds
Cross-validation score: 0.7186859814145803
Test score: 0.7155635062611807
Best Hyperparameters: {}
4456.408297538757
20667.904536008835
160423.72867250443
27244.634830355644
2311.8475575447083
896.0149958133698
309.18121790885925
757.39382147789
1643.168471813202
1500.565808057785
9733.690930724144
19423.024414420128
298.3253879547119
0.0
2352.5861122608185
0.0
887.6110394001007
1011.0890748500824
1217.2502760887146
210.34191417694092
911.4900357723236
459.31095838546753
594.1269419193268
83.99168014526367
72.55576980113983
57.296799659729004
37.930750131607056
83.36845970153809
50.75832152366638
294.67926955223083
3449.767720937729
122.84290945529938
215.19807291030884
1875.9242305755615
156.0046901702881
0.0
0.0
0.0
366.8241603374481
927.7501888275146
772.8587329387665
1649.6951036453247
336.92666935920715
834.8380155563354
3017.749575138092
8.291720390319824
267.5994383096695
0.0
0.0
0.0
171.4420783519745
760.230959892273
307.91363167



Elapsed time to compute best fit: 16.980 seconds
Cross-validation score: 0.7079147044453448
Test score: 0.7194899817850636
Best Hyperparameters: {}
1876.417086839676
43499.73503947258
83467.19553637505
93149.3694422245
2555.4111652374268
2843.845088005066
338.3442416191101
1745.9538657665253
621.448086977005
265.5166379213333
11648.883986473083
4518.488857984543
594.8066309690475
0.0
2143.4127044677734
0.0
543.5879812240601
280.8837580680847
1566.186048746109
445.31339478492737
1262.0129222869873
684.4111397266388
701.2527234554291
51.243350982666016
169.91780924797058
57.129169940948486
81.2063102722168
228.1587998867035
74.45099949836731
204.3908383846283
4533.349340200424
245.44885802268982
303.98162031173706
1400.1848413944244
92.15381932258606
0.0
0.0
0.0
438.88857984542847
876.9945542812347
770.9058763980865
2864.918335914612
3.0188701152801514
1108.7757730484009
1644.2964823246002
0.0
536.6907098293304
0.0
0.0
0.0
501.0417733192444
736.3833181858063
301.5968885421753
225.1996135



Elapsed time to compute best fit: 17.110 seconds
Cross-validation score: 0.6972342161098746
Test score: 0.7678571428571429
Best Hyperparameters: {}
4220.160111427307
39401.22453093529
88877.05530452728
80121.41170287132
2011.039183139801
3611.782778978348
213.3035237789154
1633.5155756473541
441.54925060272217
2099.245169401169
11786.889198184013
6094.035541176796
183.67101049423218
0.0
3387.2613220214844
0.0
1590.24276304245
1354.3475861549377
1129.755405664444
422.40728425979614
2014.5208508968353
316.28748393058777
536.7278833389282
237.98398613929749
300.2156307697296
111.5285005569458
65.53467106819153
209.8015410900116
70.77529001235962
440.4777901172638
5378.590913295746
127.68697810173035
116.97681879997253
2511.1834902763367
149.61095118522644
0.0
0.0
0.0
821.1082639694214
219.0736961364746
619.36865234375
395.65639328956604
2.151859998703003
981.4795138835907
1582.0482449531555
11.384200096130371
601.378481388092
0.0
0.0
0.0
193.4144983291626
637.0349159240723
406.35540986061



Elapsed time to compute best fit: 16.214 seconds
Cross-validation score: 0.7193188680584882
Test score: 0.7142857142857143
Best Hyperparameters: {}
3152.1170721054077
30547.809064388275
109302.64442956448
77101.99717307091
3128.2985520362854
2225.717441558838
447.821674823761
1594.5348128080368
762.8795976638794
666.8334646224976
10485.176701784134
10429.127922058105
332.51751828193665
0.0
3343.4303209781647
0.0
1249.8250024318695
722.6342828273773
1404.8706212043762
111.99143838882446
1295.1306993961334
508.5614068508148
1384.7750296592712
109.70769047737122
416.90803956985474
5.042220115661621
48.66133975982666
496.61502146720886
63.11543941497803
277.87263655662537
2753.4702048301697
121.88696098327637
256.34458327293396
2359.518581390381
211.94731736183167
0.0
0.0
0.0
802.8045506477356
272.044456243515
601.236762046814
1141.1602110862732
38.5968017578125
1433.4756321907043
1154.0883897542953
63.42711925506592
269.1626765727997
0.0
0.0
0.0
232.13177013397217
1699.436858177185
142.37



Elapsed time to compute best fit: 16.764 seconds
Cross-validation score: 0.7323376091439747
Test score: 0.6939501779359432
Best Hyperparameters: {}
3948.9007456302643
31668.592492818832
135951.08629918098
38888.51367485523
1478.392930984497
3976.9889624118805
382.8524897098541
604.409924030304
2081.6011571884155
361.9755611419678
14185.25082540512
10279.705929517746
266.1868805885315
0.0
2885.0664430856705
0.0
802.9220972061157
859.6122722625732
978.9173805713654
398.26072788238525
1933.3020904064178
711.3316061496735
948.6917159557343
126.16572070121765
396.8429572582245
83.36209058761597
34.22089982032776
394.8558827638626
116.08745980262756
589.7911670207977
3393.211893796921
63.85983061790466
350.77984952926636
2516.360703587532
167.09529185295105
0.0
0.0
0.0
154.98019242286682
322.05269622802734
509.6163375377655
1931.8612027168274
0.0
1031.71311211586
2308.1317343711853
11.836360216140747
834.9389727115631
0.0
0.0
0.0
309.2751383781433
332.7246296405792
130.5967779159546
233.0894



Elapsed time to compute best fit: 17.202 seconds
Cross-validation score: 0.7166025865609977
Test score: 0.7142857142857143
Best Hyperparameters: {}
1917.4158852100372
31447.33625125885
113959.34388029575
62610.575491786
2140.09587597847
3767.096952199936
443.0462987422943
2155.8532497882843
936.8422849178314
281.5413866043091
12730.245020747185
11222.754175662994
411.8523790836334
0.0
1833.9477849006653
0.0
911.9100878238678
575.5230073928833
1371.9566917419434
285.16631603240967
1882.4313762187958
243.28637218475342
552.8788571357727
11.347999572753906
208.01080012321472
14.191709756851196
79.45787858963013
437.97970724105835
95.3254086971283
283.1043999195099
4762.305089712143
182.82841110229492
150.65259194374084
1456.3735420703888
152.51426911354065
0.0
0.0
0.0
122.45258784294128
803.9635848999023
569.8559412956238
1897.9625625610352
0.0
1755.5789613723755
1766.9342690706253
9.379019737243652
371.8948493003845
0.0
0.0
0.0
198.64563012123108
254.70145654678345
211.9518587589264
139.



Elapsed time to compute best fit: 17.625 seconds
Cross-validation score: 0.7012323985027671
Test score: 0.7205623901581723
Best Hyperparameters: {}
3079.390825986862
43160.27579188347
96140.21007239819
76393.18254947662
3301.6036009788513
2542.2365280389786
240.70359826087952
326.4661898612976
69.01568031311035
623.6831746101379
13617.195855021477
10084.874093055725
279.36342191696167
0.0
2870.8944630622864
0.0
1902.4414601325989
781.029991030693
1017.9060997962952
285.98215103149414
950.8142158985138
136.21596908569336
988.51531457901
80.4685697555542
75.16059947013855
322.2458038330078
69.86127090454102
901.8825304508209
61.79682111740112
199.58206820487976
4539.765466451645
224.77321100234985
294.439804315567
654.1967384815216
222.96462082862854
0.0
0.0
0.0
330.9993407726288
338.1263961791992
780.3982076644897
920.0960645675659
5.7123799324035645
433.23582124710083
914.6351153850555
155.8003716468811
783.0716874599457
0.0
0.0
0.0
277.1594898700714
932.9186911582947
227.5647239685058



Elapsed time to compute best fit: 17.254 seconds
Cross-validation score: 0.6889534203604589
Test score: 0.7726465364120781
Best Hyperparameters: {}
2672.674691438675
45075.33616602421
136042.7744820118
27368.504724144936
1283.2274043560028
1614.3206205368042
262.10311555862427
1613.5381605625153
275.32432985305786
470.4075937271118
10138.547471761703
17754.822768211365
371.1438624858856
0.0
2269.8002853393555
0.0
389.7377812862396
900.0458359718323
1043.7098388671875
90.36275005340576
1269.2703864574432
705.0773019790649
482.99838876724243
177.97686910629272
58.6307897567749
234.4944896697998
24.507969856262207
403.49351501464844
31.901679515838623
296.6259880065918
4489.161177158356
476.32697439193726
118.58903956413269
1393.1119694709778
284.60088086128235
0.0
0.0
0.0
122.98159074783325
1169.9944152832031
689.7265889644623
1105.0751764774323
17.998050212860107
1023.8079025745392
2309.677301645279
0.0
1035.9808690547943
0.0
0.0
0.0
229.02339816093445
340.3587486743927
311.230063199996



Elapsed time to compute best fit: 17.058 seconds
Cross-validation score: 0.7166415264257431
Test score: 0.6713780918727915
Best Hyperparameters: {}
4174.54129242897
36354.681509017944
100378.83212888241
88391.783233881
2321.5425264835358
2602.9788838624954
466.67069697380066
921.0532073974609
540.2339766025543
207.5047309398651
7798.536891222
4115.640128135681
129.36243152618408
0.0
1932.1756957769394
0.0
2457.6346368789673
916.3800311088562
1147.1866781711578
165.2456238269806
1559.1866509914398
1332.9519038200378
681.4787567853928
110.02470874786377
185.40186071395874
57.451589822769165
112.59033012390137
292.93699193000793
57.065959334373474
289.28487491607666
2891.9210535287857
206.93975257873535
103.68862891197205
1699.6461299657822
122.20483899116516
0.0
0.0
0.0
284.87367033958435
470.5836067199707
641.9938863515854
1394.507112145424
80.12777090072632
1022.5796718597412
838.7452218532562
19.265939950942993
440.1483404636383
0.0
0.0
0.0
253.53618240356445
368.1496214866638
446.486



Elapsed time to compute best fit: 16.995 seconds
Cross-validation score: 0.685827589476131
Test score: 0.6927175843694494
Best Hyperparameters: {}
3126.5914862155914
29155.88857638836
117432.14362835884
60145.17150628567
1829.887374162674
1888.3455629348755
569.5167226791382
890.2166783809662
243.29644775390625
977.973147392273
19672.776182055473
7836.355012655258
486.15754675865173
0.0
4564.026119232178
0.0
1247.130654335022
773.7138566970825
1192.0068249702454
329.7758004665375
1295.5143446922302
987.692521572113
629.5036816596985
386.8888509273529
208.48243856430054
6.064620018005371
81.4817099571228
659.2166569232941
36.46260952949524
1637.405993938446
5128.5300624370575
116.20965075492859
238.60328102111816
1649.968335866928
235.2846896648407
0.0
0.0
0.0
279.9814872741699
600.8337020874023
449.57782459259033
307.8883821964264
275.78532695770264
667.083247423172
1607.2907502651215
0.0
765.2889840602875
0.0
0.0
0.0
221.36075091362
413.0202851295471
197.80115675926208
425.29991269111



Elapsed time to compute best fit: 17.511 seconds
Cross-validation score: 0.7149514337316087
Test score: 0.6991150442477876
Best Hyperparameters: {}
2824.591398000717
33444.82302951813
103627.96853327751
85729.44045186043
2120.7005383968353
1858.5343704223633
236.0988576412201
980.1912560462952
250.616947889328
425.56148624420166
17899.150161862373
5169.338451623917
659.771693944931
0.0
1999.9860572814941
0.0
1417.685941696167
361.7230999469757
2223.319410800934
128.1525993347168
1156.5393869876862
357.0256426334381
805.795661687851
45.11501979827881
67.76104974746704
18.970030307769775
28.410439252853394
783.5622320175171
30.271609783172607
321.41039276123047
4251.211800813675
366.12998604774475
38.12794017791748
209.76158809661865
42.01739954948425
0.0
0.0
0.0
143.68924140930176
160.9421739578247
334.4483013153076
2986.466856241226
5.68202018737793
546.4651441574097
509.21344542503357
6.091169834136963
571.1918997764587
0.0
0.0
0.0
706.2986316680908
683.1164112091064
234.0693290233612



Elapsed time to compute best fit: 16.938 seconds
Cross-validation score: 0.7370391515107023
Test score: 0.7078039927404719
Best Hyperparameters: {}
2360.5347967147827
32382.280844449997
110104.14720618725
77200.17706823349
1265.661013841629
1770.5540673732758
611.5394127368927
1677.9838485717773
1238.2958623170853
919.0079810619354
10440.098959445953
8211.298578977585
117.09716200828552
0.0
2240.745315551758
0.0
868.0285437107086
1000.1057875156403
1659.5985023975372
110.01621103286743
1343.2068041563034
1076.980115890503
984.0576643943787
52.99131917953491
82.05815052986145
58.29607129096985
0.0
280.05442929267883
61.00014066696167
357.77796936035156
3200.442016363144
347.5480351448059
165.3853497505188
1872.6907255649567
64.88804912567139
0.0
0.0
0.0
749.5180757045746
0.0
555.4464416503906
288.59089851379395
0.0
1493.3232278823853
1990.1676738262177
0.0
348.551069021225
0.0
0.0
0.0
242.51403093338013
434.439341545105
141.46218872070312
96.05448961257935
10.067300081253052
34.15067005



Elapsed time to compute best fit: 16.645 seconds
Cross-validation score: 0.7099918180546088
Test score: 0.7685512367491166
Best Hyperparameters: {}
1747.3818299770355
34597.324009656906
98132.04168617725
85384.62254595757
1670.2043347358704
2544.70858502388
205.38097095489502
2604.9991312026978
1592.695074081421
279.9057183265686
10055.93643784523
8504.785269975662
163.90725016593933
0.0
2949.4062082767487
0.0
572.3191957473755
854.066251039505
1373.8569777011871
172.5335841178894
1062.6678714752197
574.9105117321014
1186.4264543056488
54.63616943359375
294.0963146686554
126.45141243934631
119.7899215221405
163.18499088287354
78.44414854049683
332.3445494174957
2321.041426181793
247.23851919174194
50.634100914001465
3939.4760179519653
241.0319104194641
0.0
0.0
0.0
605.0954728126526
158.96460342407227
777.0676510334015
1193.8813161849976
27.757829666137695
1358.161628961563
513.3536038398743
31.606689453125
469.2107000350952
0.0
0.0
0.0
249.27340018749237
616.7703585624695
215.410325765



Elapsed time to compute best fit: 16.386 seconds
Cross-validation score: 0.7003843773895141
Test score: 0.75591985428051
Best Hyperparameters: {}
2599.607355117798
39925.57579278946
90971.50414443016
87048.62661981583
2229.961614370346
2688.769159436226
389.8808877468109
934.8297622203827
627.1376104354858
173.07804155349731
12314.442161679268
6629.366549253464
645.3297326564789
0.0
2950.5612983703613
0.0
1493.1660029888153
927.2969183921814
1678.0786242485046
40.10628962516785
1591.1551177501678
252.33413314819336
780.9767217636108
109.27741861343384
69.93980932235718
31.872768878936768
73.34885954856873
303.76741671562195
29.70631980895996
352.419531583786
3229.6423461437225
131.64888095855713
115.60163927078247
3566.6607451438904
299.23772740364075
0.0
0.0
0.0
1133.5993337631226
394.4239959716797
267.3865385055542
4376.835140943527
23.55171036720276
1733.9364247322083
623.9621832370758
16.611900329589844
551.6803305149078
0.0
0.0
0.0
312.72172951698303
567.746178150177
561.982589483



Elapsed time to compute best fit: 17.332 seconds
Cross-validation score: 0.7229629311251875
Test score: 0.7012750455373405
Best Hyperparameters: {}
4318.063259840012
38160.21926963329
99216.23126411438
76455.82616305351
1891.3758170604706
3551.3305385112762
300.5993254184723
1304.8626947402954
2322.2784185409546
369.31676840782166
10464.63623547554
8372.278844833374
341.5647065639496
0.0
1979.879145860672
0.0
821.3234471082687
935.1046800613403
960.1678364276886
347.4462080001831
1184.42227602005
472.2543821334839
431.06122839450836
276.96747756004333
347.83138060569763
0.0
56.90500998497009
1365.246902704239
239.98596096038818
113.91848039627075
3708.3696002960205
134.50816988945007
106.09168934822083
2001.4407052993774
123.65639162063599
0.0
0.0
0.0
181.96001195907593
470.0915493965149
858.8870384693146
712.7938652038574
81.6615982055664
549.8847970962524
825.2297897338867
3.5787100791931152
1208.1578376293182
0.0
0.0
0.0
203.58572721481323
296.6512351036072
259.44583797454834
188.21



Elapsed time to compute best fit: 16.653 seconds
Cross-validation score: 0.7210040168745872
Test score: 0.6773618538324421
Best Hyperparameters: {}
2363.197832584381
32943.008544802666
98224.86213588715
84432.55028820038
4318.940388083458
3146.9236731529236
357.7966833114624
1688.7448253631592
2634.9165921211243
759.0098774433136
13093.943224668503
5415.339228391647
867.9711780548096
0.0
2178.9382667541504
0.0
697.1423850059509
429.76424908638
865.3283064365387
202.74159288406372
638.9299736022949
362.304563999176
704.2080941200256
329.8541886806488
69.80528950691223
55.25399160385132
173.5796880722046
167.60888123512268
69.67571926116943
776.3105030059814
3729.134350299835
125.56550073623657
216.2057192325592
3390.434499025345
141.64568865299225
0.0
0.0
0.0
201.76202774047852
136.03844666481018
399.76637172698975
2185.8575971126556
24.07298970222473
685.2783608436584
1158.9642488956451
0.0
580.2028694152832
0.0
0.0
0.0
232.69040703773499
257.76690125465393
126.01027989387512
33.919270



Elapsed time to compute best fit: 19.213 seconds
Cross-validation score: 0.7106841582631317
Test score: 0.7255244755244756
Best Hyperparameters: {}
2601.5868294239044
31548.313980340958
104950.31928944588
69952.98694312572
2669.157086133957
3345.512600660324
162.4495005607605
2506.2660229206085
966.149783372879
472.677020072937
9276.713557720184
12510.10584950447
1113.6254801750183
0.0
4450.238193511963
0.0
1011.2951726913452
1174.260845899582
1183.3398594856262
705.4270210266113
1378.5412499904633
588.8430681228638
1390.4953734874725
189.493718624115
220.95880389213562
115.94778633117676
57.00664949417114
706.6895425319672
61.73816990852356
195.54170846939087
3640.107560634613
176.96954941749573
177.58071994781494
1974.7770450115204
182.49878787994385
0.0
0.0
0.0
473.98597955703735
1247.1992814540863
866.352002620697
1971.7402896881104
230.14405941963196
1902.0825779438019
1076.9011821746826
13.69573974609375
647.122168302536
0.0
0.0
0.0
265.27919268608093
759.7523589134216
259.096550



Elapsed time to compute best fit: 17.946 seconds
Cross-validation score: 0.7220141500586789
Test score: 0.7142857142857144
Best Hyperparameters: {}
3699.177755355835
39837.5236222744
100096.4614956379
77587.19555354118
1214.952479839325
823.3930594921112
199.03270864486694
1620.9013829231262
871.3653225898743
1017.7703819274902
11191.9698741436
9236.800731658936
328.8068916797638
0.0
2218.2783187627792
0.0
1620.0275602340698
1018.0364565849304
978.4542171955109
15.817749977111816
1130.1082077026367
341.29632472991943
1299.3771657943726
249.40626120567322
344.6453146934509
65.91109848022461
142.1182198524475
485.30841279029846
21.75137948989868
160.89091086387634
3713.8858783245087
125.14388966560364
106.15370965003967
1721.5223505496979
172.89402890205383
0.0
0.0
0.0
535.1684441566467
749.5032849311829
755.5632525682449
1299.356273651123
75.82757234573364
565.4613773822784
505.9783706665039
78.94912195205688
364.9935748577118
0.0
0.0
0.0
538.5337841510773
826.2648038864136
540.36493515



Elapsed time to compute best fit: 17.059 seconds
Cross-validation score: 0.7124256368443951
Test score: 0.7282415630550622
Best Hyperparameters: {}
2853.985150575638
38741.33837735653
111544.09031224251
59374.520918011665
3304.5300221443176
2358.093722820282
408.5057773590088
1785.750606060028
1557.3843274116516
213.79636883735657
13832.974668741226
8826.911336779594
394.5018358230591
0.0
3202.0480976104736
0.0
808.5714948177338
1249.6345417499542
578.5599586963654
1107.026170015335
1385.91894197464
584.820027589798
782.4936056137085
258.5407018661499
241.48430061340332
99.51291084289551
218.04622077941895
188.81529188156128
62.679710388183594
479.4339563846588
4994.235903739929
146.37567138671875
100.75054144859314
2294.292315721512
233.65665793418884
0.0
0.0
0.0
526.6902048587799
993.3970947265625
390.99431133270264
1689.8383975028992
10.317000389099121
633.7296214103699
1992.5721580982208
12.461150169372559
602.7115201950073
0.0
0.0
0.0
196.73114109039307
301.38504695892334
256.7879



Elapsed time to compute best fit: 17.054 seconds
Cross-validation score: 0.7210048564475062
Test score: 0.7597173144876326
Best Hyperparameters: {}
3743.731620788574
33730.937896728516
112800.95230710506
63002.732958078384
1744.64293217659
2320.8736267089844
171.16558837890625
1034.0709710121155
2394.017004966736
282.982919216156
13770.555013179779
11067.331617355347
103.66134285926819
0.0
1772.8117897510529
0.0
1421.6437001228333
847.951719045639
1111.2094762325287
610.2886877059937
1926.7417542934418
212.23660731315613
931.908613204956
23.667459964752197
581.8268423080444
5.14247989654541
108.91076040267944
849.8581681251526
62.187909841537476
437.62067699432373
5012.600026369095
176.14613890647888
240.22036743164062
420.3347170352936
128.3548595905304
0.0
0.0
0.0
492.1343779563904
687.8299012184143
398.05021619796753
1088.3731701374054
4.34119987487793
1241.1470057964325
805.835366487503
3.7035698890686035
736.1745111942291
0.0
0.0
0.0
135.720641374588
461.12433075904846
195.9979197

## 4.3 Rebalancing Strategy - UNDER

### 4.3.1 Random Forest

In [117]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

under_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    under_randomforest_normalized_performance_df = pd.concat([under_randomforest_normalized_performance_df, new_performance_df])
    
under_randomforest_normalized_performance_df.to_csv("../data/05_model_output/under_randomforest_normalized_performance_df.csv")



### 4.3.2 XGBoost

In [118]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


under_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    under_xgboost_normalized_performance_df = pd.concat([under_xgboost_normalized_performance_df, new_performance_df])
    

under_xgboost_normalized_performance_df.to_csv("../data/05_model_output/under_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 3.094 seconds
Cross-validation score: 0.2315612560299824
Test score: 0.236455463728191
Best Hyperparameters: {}
0.024590397
0.055719294
0.07608847
0.03383977
0.008579299
0.027488172
0.019522497
0.013734335
0.0325786
0.01371684
0.06162375
0.04444205
0.0042158826
0.0
0.010131153
0.0
0.021370815
0.0063929595
0.0062145656
0.0331617
0.011451882
0.006209108
0.027327664
0.010353279
0.0
0.0
0.0032890323
0.015345754
0.0063491366
0.0034848568
0.0022542474
0.002347688
0.010142761
0.0033574502
0.0013334291
0.0
0.0
0.0
0.008091014
0.0
0.00819637
0.013041448
0.0165735
0.0040502204
0.0026693644
0.0
0.003529423
0.0
0.0
0.0
0.0029999975
0.005099159
0.003231781
0.005731475
0.0
0.018778471
0.0013303887
0.0025255983
0.008515087
0.0068344735
0.0018128009
0.0053905416
0.0060415114
0.0
0.018255364
0.0009772327
0.014456003
0.0
0.011401154
0.0059545054
0.005605433
0.0022344734
0.007171535
0.0
0.0035371692
0.0
0.0
0.0031330483
0.0048638307
0.0
0.001515818
0.003751223
0.00475602



Elapsed time to compute best fit: 3.201 seconds
Cross-validation score: 0.23215825459774075
Test score: 0.24976437323279924
Best Hyperparameters: {}
0.024178626
0.022355894
0.087369785
0.028630069
0.01728237
0.027845168
0.0030612263
0.019248642
0.0
0.0013070946
0.0075795893
0.06494202
0.010022506
0.0
0.027627643
0.0
0.015646553
0.012203532
0.020748341
0.013385388
0.031119341
0.005382332
0.008352293
0.020925485
0.0014603359
0.0010603332
0.0012558589
0.0053300858
0.00075066416
0.0032703045
0.00133088
0.0067408043
0.0038200216
0.0021528816
0.0031761532
0.0
0.0
0.0
0.012466918
0.0
0.0055948836
0.0049747
0.0
0.031361625
0.009486273
0.0
0.0020320066
0.0
0.0
0.0
0.0012029173
0.002117792
0.012776575
0.0050015342
0.0
0.0068320027
0.013274018
0.0033813582
0.0017634974
0.0010122457
0.0
0.0012826214
0.00932866
0.0
0.01776921
0.004695982
0.012834289
0.004218415
0.0037534507
0.016191514
0.0028553922
0.0004941886
0.002100887
0.004211328
0.001780538
0.0
0.0
0.0018837652
0.0060264575
0.0
0.008661038
0.



Elapsed time to compute best fit: 3.036 seconds
Cross-validation score: 0.23648849110971462
Test score: 0.23202911737943585
Best Hyperparameters: {}
0.007453574
0.053315148
0.08218297
0.029443609
0.033476837
0.044978
0.0013553833
0.0
0.0
0.011579647
0.019197106
0.032986533
0.0
0.0
0.008342188
0.0
0.022900397
0.0053453394
0.016235983
0.002773832
0.024778886
0.009615376
0.01624135
0.0020976048
0.0071019894
0.0066516977
0.006417544
0.011553844
0.0027706248
0.0029685905
0.03957841
0.00069729215
0.0009826353
0.00714606
0.003008726
0.0
0.0
0.0
0.0051229117
0.0
0.0022840337
0.0019627172
0.0
0.005670088
0.0009147654
0.010018775
0.0034077708
0.0
0.0
0.0
0.0021476683
0.00386711
0.006135608
0.009687177
0.0
0.0036336698
0.0037206404
0.006721393
0.0030274813
0.01054316
0.0073643383
0.013156569
0.00029388562
0.003820525
0.011110441
0.0048248186
0.011178597
0.0081925765
0.010110664
0.0028116559
0.0046422835
0.0076489155
0.006925221
0.013588001
0.00898785
0.0
0.0029123381
0.0051198653
0.009991496
0.0




Elapsed time to compute best fit: 2.760 seconds
Cross-validation score: 0.22907067213483331
Test score: 0.22589052997393572
Best Hyperparameters: {}
0.006053674
0.04279351
0.067738764
0.038271386
0.0090121785
0.03284415
0.009100325
0.001095409
0.010201781
0.017581727
0.0064764093
0.014148587
0.0057361405
0.0
0.006334905
0.0
0.048742015
0.003784368
0.012727436
0.00028912502
0.00535589
0.0057047005
0.023390453
0.103838
0.030744387
0.0008868234
0.0026390867
0.0045350674
0.0010908789
0.0021899275
0.0071485876
0.00610321
0.0028915904
0.0022365279
0.0067644008
0.0
0.0
0.0
0.008503024
0.0
0.004070186
0.0010152592
0.0
0.0048629614
0.012072595
0.0
0.004970555
0.0
0.0
0.0
0.008088137
0.0019476982
0.0048017134
0.0
0.001871239
0.00812849
0.005468063
0.0
0.0012845082
0.0061092274
0.050835274
0.0019103744
0.007617148
0.0051334826
0.0006990416
0.0039319494
0.015999729
0.0018831515
0.00067973137
0.003348957
0.0033980669
0.011567367
0.0026993742
0.00051812734
0.002998181
0.0
0.0040149167
0.0012106042
0



Elapsed time to compute best fit: 3.263 seconds
Cross-validation score: 0.23010044512904884
Test score: 0.22825150732127475
Best Hyperparameters: {}
0.010112929
0.021943707
0.07520002
0.029222172
0.04029902
0.008569701
0.01485129
0.0061931172
0.007417752
0.008814081
0.12610224
0.018394684
0.000707727
0.0
0.023153603
0.0
0.010573127
0.02959664
0.0014916975
0.013976593
0.019213889
0.002957098
0.045569964
0.017645733
0.0049235793
0.00097656203
0.013968692
0.009158168
0.011841598
0.0051208767
0.005985792
0.0019741342
0.012330682
0.0025962424
0.0021683588
0.0
0.0
0.0
0.008139836
0.0
0.009936419
0.0030250621
0.0
0.004518435
0.021882374
0.0
0.00083245634
0.0
0.0
0.0
0.006019593
0.0045030406
0.008322428
0.0
0.0
0.0076886076
0.0035870743
0.02249258
0.0032333264
0.009770942
0.0022202835
0.011478252
0.0
0.0
0.017358925
0.0017661266
0.006641142
0.0006095311
0.000976532
0.0036287026
0.007379798
0.00077971554
0.00856225
0.00053058146
0.0141567495
0.0
0.0
0.0072744414
0.0035966106
0.0
0.000658218
0.0



Elapsed time to compute best fit: 2.998 seconds
Cross-validation score: 0.23306115453657678
Test score: 0.24178403755868544
Best Hyperparameters: {}
0.018461842
0.043526586
0.07326489
0.030826198
0.010451916
0.06585034
0.007868084
0.013832236
0.0
0.0033672731
0.031813085
0.032491706
0.019106643
0.0
0.023633195
0.0
0.010064507
0.012180029
0.005596139
0.0076292395
0.015787123
0.012417826
0.0
0.033383552
0.0011687231
0.0039566546
0.00039917076
0.006821065
0.0047276397
0.0023096695
0.0053320257
0.018303825
0.018001955
0.000583725
0.0016839161
0.0
0.0
0.0
0.0024999548
0.0
0.014277897
0.008065859
0.0
0.006462649
0.002799284
0.0
0.0033188288
0.0
0.0
0.0
0.0020244936
0.010651454
0.0065749595
0.0023578124
0.0
0.009051642
0.0020785544
0.0
0.012477224
0.010642319
0.003059811
0.006811955
0.0039565694
0.0
0.0047725234
0.0029560702
0.012614557
0.0
0.010939689
0.0028277766
0.006858191
0.0
0.0056411196
0.0051253773
0.005865184
0.0
0.014849612
0.011065401
0.0042690253
0.0
0.016730014
0.0060749254
0.010



Elapsed time to compute best fit: 2.970 seconds
Cross-validation score: 0.24068212402064612
Test score: 0.22129710780017528
Best Hyperparameters: {}
0.014970441
0.07064836
0.070561625
0.04911373
0.027456947
0.030866314
0.017346289
0.033212435
0.009377892
0.014419274
0.026294235
0.029121902
0.002695711
0.0
0.010532326
0.0
0.07603657
0.010346606
0.028128205
0.0025648251
0.011750787
0.0048994804
0.00263734
0.001253513
0.00033410432
0.033247113
0.008157939
0.003988912
0.002023811
0.0022893543
0.007999132
0.0067423065
0.0028745376
0.00027353867
0.0019762306
0.0
0.0
0.0
0.0016630344
0.0
0.008774127
0.0066405465
0.0011954827
0.0025058817
0.00434691
0.0027498906
0.002234822
0.0
0.0
0.0
0.0065524266
0.0055789687
0.0073297326
0.0004996367
0.012381728
0.0019942701
0.0007865483
0.0015664496
0.0027607381
0.0033321166
0.0
0.0045242803
0.0025340947
0.00071764673
0.0040504546
0.0011017722
0.010181487
0.016951801
0.0027498237
0.0034769417
0.003149011
0.007340255
0.0041725985
0.0055694473
0.004571712
0.



Elapsed time to compute best fit: 3.210 seconds
Cross-validation score: 0.23335837069182283
Test score: 0.2312013828867761
Best Hyperparameters: {}
0.008679917
0.02912235
0.09472655
0.044589628
0.0044471323
0.048513867
0.0034899476
0.011239726
0.00717954
0.006696914
0.017405866
0.05466143
0.012921004
0.0
0.013972387
0.0
0.017447606
0.006889916
0.012328549
0.007863511
0.014245929
0.0046026153
0.00738214
0.042017132
0.0043210415
0.0
0.0
0.01197183
0.012430491
0.0028382007
0.0109781455
0.0015587654
0.0056468113
0.0
0.004657697
0.0
0.0
0.0
0.007860019
0.0
0.0044024587
0.004968975
0.0
0.0095649455
0.023216095
0.007948757
0.0074474025
0.0
0.0
0.0
0.003787803
0.0046805693
0.00611204
0.012121939
0.0007865729
0.0031352553
0.0084910095
0.005191125
0.0023857118
0.0036025548
0.00021139524
0.0039556497
0.0058175246
0.0
0.0
0.007925612
0.004388932
0.0
0.004138733
0.001758564
0.0064575467
0.0
0.003211597
0.0012592379
0.0018093297
0.0
0.025291152
0.0028814168
0.025146082
0.010241855
0.0050593177
0.003



Elapsed time to compute best fit: 2.948 seconds
Cross-validation score: 0.2387442925541738
Test score: 0.2408793264733396
Best Hyperparameters: {}
0.021170594
0.039427582
0.12702839
0.03181581
0.031665362
0.0140399765
0.0226237
0.012810145
0.0
0.00070617074
0.045447204
0.032381177
0.0
0.0
0.0014118108
0.0
0.023874443
0.020989053
0.0047629434
0.0013622086
0.021642076
0.018395254
0.009211768
0.062125098
0.011657439
0.0023595938
0.001480992
0.0049696974
0.0026036934
0.0062672035
0.0021583564
0.010945862
0.002831089
0.0
0.0037907478
0.0
0.0
0.0
0.017261362
0.0
0.004219055
0.015814412
0.0
0.012211512
0.020789396
0.0
0.0013790292
0.0
0.0
0.0
0.0041176677
0.0007107159
0.0021780764
0.009682549
0.0
0.000828809
0.01524141
0.0
0.0049234126
0.0033014978
0.001559106
0.002006549
0.0
0.0
0.021745903
0.0066619385
0.007424322
0.002616168
0.01182226
0.0020109592
0.006242919
0.015989967
0.009880717
0.007677947
0.0039388645
0.0
0.0004987473
0.006536756
0.005580116
0.0
0.0074297544
0.0069054887
0.006784699



Elapsed time to compute best fit: 3.059 seconds
Cross-validation score: 0.2429256777954778
Test score: 0.23515981735159816
Best Hyperparameters: {}
0.0063179033
0.026006939
0.07866695
0.026140343
0.031584103
0.017442977
0.008217458
0.0141664
0.01259238
0.003814559
0.009253769
0.03680634
0.0058752894
0.0
0.023363622
0.0
0.012231493
0.009871603
0.037583064
0.018765356
0.005910229
0.010258911
0.020110603
0.0033038668
0.004190399
0.00085897476
0.0009829621
0.006842152
0.0005811833
0.0014833943
0.001428081
0.0026966643
0.002981454
0.0005394155
0.00056752487
0.0
0.0
0.0
0.016558502
0.06827498
0.020530786
0.0037809722
0.0
0.012103953
0.011535344
0.0
0.008269257
0.0
0.0
0.0
0.0027314017
0.0021308288
0.002503519
0.005835114
0.003973185
0.0025839496
0.02604204
0.0039292085
0.005086708
0.03354239
0.05291475
0.0034849497
0.0
0.008097488
0.01356506
0.0019585858
0.008088401
0.0014697288
0.0046130135
0.0028855584
0.005226805
0.014065136
0.0042920234
0.017532507
0.00044017285
0.0011384619
0.0009911562



Elapsed time to compute best fit: 3.006 seconds
Cross-validation score: 0.2250590516125693
Test score: 0.23843248347497642
Best Hyperparameters: {}
0.010748932
0.030819314
0.08783364
0.032676585
0.028144771
0.021712497
0.029859813
0.018618913
0.034135528
0.0012367412
0.013508242
0.07334712
0.0033051346
0.0
0.008693515
0.0
0.016634058
0.024041181
0.019635128
0.0068843253
0.0084516425
0.015083645
0.012775746
0.0
0.003923007
0.0
0.00867447
0.005209727
0.004828659
0.0029308363
0.0030874629
0.0068552326
0.0043582153
0.0
0.0028409006
0.0
0.0
0.0
0.010625039
0.0
0.01309029
0.0013585268
0.0
0.0006488214
0.011424455
0.0
0.0033531326
0.0
0.0
0.0
0.005884482
0.0017763233
0.021860452
0.002210124
0.00045523926
0.0046938104
0.0023012124
0.015748214
0.002892574
0.008321745
0.0
0.0014730289
0.01103872
0.014436609
0.00629673
0.009505325
0.011972813
0.0
0.003029397
0.0011788914
0.005888245
0.0007735519
0.007860568
0.002248853
0.00079344725
0.00048026175
0.0079496205
0.004804557
0.0034686478
0.0
0.003774



Elapsed time to compute best fit: 2.908 seconds
Cross-validation score: 0.22544251588909886
Test score: 0.22101751459549626
Best Hyperparameters: {}
0.016546037
0.07246301
0.07502703
0.022611264
0.039224997
0.042618793
0.0061577936
0.00032047738
0.044610195
0.014621603
0.005294337
0.021326825
0.0009422576
0.0
0.023294684
0.0
0.0147613855
0.021189352
0.0048001115
0.0
0.019991726
0.0338422
0.0058889138
0.028537856
0.0042752707
0.00037660165
0.0069801994
0.008705601
0.0004655768
0.017413123
0.0023393594
0.00084166636
0.0075923856
0.0009092067
0.008802517
0.0
0.0
0.0
0.004185272
0.033697408
0.0016565395
0.005200394
0.0
0.0014255157
0.006783763
0.0
0.004479776
0.0
0.0
0.0
0.0030637612
0.009298604
0.0023156353
0.0
0.00067092804
0.01666081
0.0032921205
0.0
0.0021283587
0.006618937
0.0013272825
0.0026707533
0.005894994
0.003438312
0.00045048
0.0047228327
0.010084747
0.0
0.0045858356
0.0018295813
0.0111303665
0.0
0.010264015
0.0026460935
0.0020605347
0.0
0.003769876
0.0
0.0087020965
0.003701713



Elapsed time to compute best fit: 2.934 seconds
Cross-validation score: 0.240579300459198
Test score: 0.20987654320987653
Best Hyperparameters: {}
0.029595688
0.044986095
0.09780224
0.034906495
0.015171349
0.08156096
0.014232549
0.009897459
0.02472855
0.01108997
0.009455832
0.020719089
0.0009977035
0.0
0.0056611095
0.0
0.01111863
0.0038279821
0.00037859648
0.002692022
0.028587926
0.00805428
0.019560559
0.037484262
0.001559569
0.005242305
0.02040838
0.022609921
0.0043932856
0.004877473
0.0042583854
0.009079028
0.0018858378
0.0
0.0031595202
0.0
0.0
0.0
0.000562428
0.0
0.009618343
0.0038592366
0.0
0.0126203
0.01492716
0.0
0.0058603757
0.0
0.0
0.0
0.0036479237
0.0023969342
0.0016514832
0.0024891926
0.0012828024
0.002056621
0.00082710414
0.002531327
0.00045067302
0.018750092
0.00078215013
0.0034749014
0.0
0.0
0.018282533
0.0023302252
0.0038421906
0.0
0.0024032807
0.008132012
0.005061331
0.0051015224
0.009930835
0.0058182753
0.0010313211
0.000729051
0.00093643897
0.0009685263
0.007880245
0.0



Elapsed time to compute best fit: 2.821 seconds
Cross-validation score: 0.23433919067097406
Test score: 0.2266894781864842
Best Hyperparameters: {}
0.01317804
0.022809217
0.09713382
0.027771963
0.026633235
0.029806415
0.026155159
0.0046210596
0.0016161841
0.010686124
0.08208194
0.029470194
0.007452042
0.0
0.029992782
0.0
0.013908344
0.009969427
0.016547732
0.001688097
0.043107554
0.00092838384
0.0143936165
0.001316034
0.0028160324
0.0
0.0156158935
0.0026978168
0.0005035903
0.0013307933
0.012077406
0.0059243026
0.012515336
0.004173368
0.002464883
0.0
0.0
0.0
0.012963124
0.0
0.0014153573
0.0029103544
0.0
0.011161877
0.004027776
0.006425802
0.0040822295
0.0
0.0
0.0
0.0011672912
0.0016296877
0.002279803
0.007416094
0.0
0.004376408
0.0020160065
0.0109578
0.0054803602
0.026185725
0.023785297
0.0039285095
0.0065486273
0.0
0.002905532
0.010697313
0.022284493
0.0
0.0020416626
0.0030271183
0.00549936
0.0018702208
0.0037065155
0.0013036025
0.0036274851
0.0
0.0015737795
0.0034046308
0.006266134
0.



Elapsed time to compute best fit: 3.137 seconds
Cross-validation score: 0.2308666621579789
Test score: 0.24262607040913414
Best Hyperparameters: {}
0.009747784
0.019870274
0.07988763
0.026859429
0.026696345
0.020986296
0.004376952
0.006993492
0.0070303697
0.005472907
0.04771497
0.023860551
0.0
0.0
0.016676234
0.0
0.05870514
0.04157664
0.0091314865
0.0014914054
0.027309261
0.0017740563
0.006719523
0.01542077
0.0
0.0
0.004258275
0.0027784763
0.0053493963
0.008150296
0.0005728703
0.003221578
0.002117981
0.0014750512
0.0030577653
0.0
0.0
0.0
0.011639435
0.104451045
0.007795993
0.0037160187
0.0
0.0040735626
0.011900505
0.0
0.0043883934
0.0
0.0
0.0
0.0019234155
0.0016871163
0.006032178
0.0033986475
0.0
0.004939726
0.0047986843
0.0
0.011380332
0.015953321
0.0038696346
0.0033938636
0.000667218
0.0
0.0037128623
0.0020661277
0.013919714
0.0
0.005276275
0.011543884
0.006575927
0.004713711
0.007164134
0.00080202916
0.003463307
0.0028875852
0.0013956372
0.002144394
0.02043267
0.0014767224
0.0019361



Elapsed time to compute best fit: 2.907 seconds
Cross-validation score: 0.23376483337323375
Test score: 0.2078521939953811
Best Hyperparameters: {}
0.0039545554
0.06426889
0.07943324
0.024056982
0.028310543
0.05222067
0.016896259
0.02021232
0.0
0.011032573
0.0058874767
0.020260671
0.009511028
0.0
0.0052847695
0.0
0.024904002
0.009459183
0.0077817943
0.00086556276
0.03403722
0.000895362
0.0
0.053576827
0.0019238916
0.000899597
0.010121255
0.006388738
0.0017112694
0.009198449
0.0009730132
0.0062501845
0.007875702
0.0015149171
0.0019433512
0.0
0.0
0.0
0.020300698
0.0
0.0035173784
0.0065915585
0.0
0.0042724977
0.011675406
0.0
0.0017794376
0.0
0.0
0.0
0.0042212848
0.0059011704
0.0020486952
0.002330086
0.0
0.0064574927
0.0061902353
0.013686772
0.019869743
0.022679128
0.0012283273
0.004169159
0.0018441762
0.019067213
0.012434579
0.012786098
0.04039739
0.00015906342
0.0029045544
0.003358602
0.0019296629
0.02085289
0.0050082477
0.004083326
0.0028124563
0.00065064355
0.001651437
0.0049894284
0.0



Elapsed time to compute best fit: 3.094 seconds
Cross-validation score: 0.2379860566374607
Test score: 0.24514338575393158
Best Hyperparameters: {}
0.02527804
0.03522738
0.09175304
0.022026401
0.007862271
0.0162877
0.0009511971
0.0056078588
0.0
0.009965623
0.040100336
0.014086248
0.005703173
0.0
0.006883967
0.0
0.025493532
0.03978961
0.010452324
0.060341805
0.006201718
0.0034844868
0.009621094
0.068533845
0.004302095
0.0
0.0
0.003733399
0.00370405
0.0060900804
0.0019407574
0.002683547
0.005007336
0.0
0.0056338296
0.0
0.0
0.0
0.011026276
0.0
0.0063034287
0.011826214
0.013496697
0.0014735522
0.021124244
0.0
0.0013140942
0.0
0.0
0.0
0.0012018584
0.002038066
0.0052350955
0.0006251031
0.05262042
0.0061607766
0.0044656545
0.0
0.019126916
0.01387349
0.0
0.00023603305
0.005088485
0.00041421354
0.00089128927
0.011895467
0.010723663
0.0
0.0012704193
0.004585914
0.0018125512
0.02505963
0.00462896
0.0007500506
0.0
0.0
0.0073449244
0.005941399
0.004623323
0.0
0.003847839
0.0029633993
0.02293991
0.0



Elapsed time to compute best fit: 3.129 seconds
Cross-validation score: 0.22133063009248843
Test score: 0.24128686327077745
Best Hyperparameters: {}
0.01241478
0.024154201
0.09359897
0.025726562
0.023536889
0.03169042
0.0030237057
0.026049454
0.0
0.007758209
0.00626651
0.046169087
0.0016952063
0.0
0.011433585
0.0
0.025997581
0.009752058
0.0026780944
0.0010735164
0.010001758
0.0064421
0.018834157
0.09702272
0.0
0.011109718
0.0034916203
0.0006434152
0.0062753246
0.0025632964
0.003333135
0.004635225
0.008073597
0.0021220692
0.002375136
0.0
0.0
0.0
0.0070901695
0.0
0.0015564475
0.00861142
0.0
0.0043359646
0.0053790207
0.0
0.0050984337
0.0
0.0
0.0
0.0012603248
0.017801605
0.0074543557
0.0005435989
0.0
0.008695389
0.0010210961
0.004560431
0.0005620488
0.002271008
0.028976206
0.009012613
0.001411021
0.014557228
0.004465529
0.0026772239
0.016683798
0.0
0.0028728417
0.0015133781
0.0061197453
0.0012426109
0.0032682053
0.0014945299
0.002607617
0.0012046634
0.0
0.0
0.0017650835
0.004044676
0.00252



Elapsed time to compute best fit: 2.870 seconds
Cross-validation score: 0.25032812464454923
Test score: 0.27191129883843723
Best Hyperparameters: {}
0.017306566
0.072639555
0.08346681
0.0433445
0.012312542
0.029804524
0.021017797
0.0087670395
0.005684189
0.0035916069
0.016731162
0.04283855
0.008797856
0.0
0.004740197
0.0
0.030128423
0.0071925367
0.0070692454
0.027132204
0.011666138
0.0028852732
0.002751517
0.047961537
0.017954426
0.00307788
0.0
0.0058646314
0.0004346656
0.020360518
0.0026262766
0.0013436767
0.005063982
0.0
0.0038352879
0.0
0.0
0.0
0.010361468
0.0
0.006179817
0.0075981063
0.0
0.017851261
0.005427424
0.0008770301
0.008108361
0.0
0.0
0.0
0.0036240746
0.007344153
0.0016644057
0.008592718
0.0
0.0
0.003161108
0.0050405534
0.027253808
0.0049700937
0.0
0.010736037
0.0
0.027602406
0.009090269
0.0012269331
0.014731965
0.0
0.00076426263
0.0048820307
0.004083122
0.005714935
0.006965852
0.00047523584
0.0091721825
0.0
0.0003485817
0.020259855
0.007467448
0.0029986906
0.011165728
0.0



Elapsed time to compute best fit: 3.065 seconds
Cross-validation score: 0.2298860138218164
Test score: 0.21632653061224488
Best Hyperparameters: {}
0.0147719765
0.054605804
0.08916528
0.029652365
0.01826846
0.04389049
0.008995253
0.019749323
0.0077561466
0.014773398
0.052912302
0.028779665
0.014691579
0.0
0.0054011964
0.0
0.040911537
0.04186742
0.0039624344
0.0042417636
0.005260841
0.0044708196
0.009552833
0.014063727
0.001005878
0.0
0.0054248953
0.005988618
0.015696943
0.001351834
0.0006966133
0.0024127632
0.0039791656
0.00037727391
0.0015328485
0.0
0.0
0.0
0.0067342427
0.0
0.007764462
0.0023272736
0.0
0.0050956574
0.018719746
0.014048595
0.0019455974
0.0
0.0
0.0
0.0020689548
0.0033334224
0.0025178809
0.002233199
0.0
0.0
0.03550142
0.0025168157
0.031008592
0.01110252
0.0
0.00025205093
0.0056986385
0.0
0.011665003
0.00494246
0.0073620277
0.0053096097
0.004072
0.0036509181
0.0029045062
0.0014244848
0.010020899
0.0014522685
0.0027389894
0.0
0.0005764037
0.0009697221
0.0055022365
0.008498



Elapsed time to compute best fit: 2.917 seconds
Cross-validation score: 0.2234635465216574
Test score: 0.23580786026200876
Best Hyperparameters: {}
0.020576904
0.027018288
0.11410254
0.038583696
0.01892838
0.041258555
0.0
0.0058110645
0.0078014424
0.008650571
0.007866796
0.040784195
0.004641306
0.0
0.012974122
0.0
0.009951503
0.019029586
0.009160396
0.017967526
0.011639675
0.02009633
0.008658296
0.046122696
0.004318243
0.00022633171
0.0
0.012693618
0.0007349681
0.0030086576
0.0
0.009819285
0.0028995497
0.015482259
0.002799467
0.0
0.0
0.0
0.006709109
0.0
0.010690913
0.006365421
0.033520333
0.008176031
0.014889475
0.0
0.00652851
0.0
0.0
0.0
0.0021203877
0.0034918983
0.0022617574
0.0041430998
0.0002773695
0.0
0.0036858222
0.0
0.014179589
0.008260918
0.0
0.010931592
0.000339249
0.012880772
0.013493267
0.0058187144
0.0026927807
0.001360605
0.0018186672
0.0069109807
0.0099885855
0.028320992
0.004011698
0.0037503978
0.0028618332
0.0
0.019275455
0.0
0.004312047
0.0
0.0
0.0109005645
0.002743366



Elapsed time to compute best fit: 3.083 seconds
Cross-validation score: 0.2295967821008375
Test score: 0.23529411764705882
Best Hyperparameters: {}
0.021998363
0.023271773
0.10641677
0.042436454
0.011649582
0.026360147
0.00032913272
0.020612186
0.003344406
0.05253723
0.008666562
0.039399605
0.013841016
0.0
0.00138286
0.0
0.012949377
0.015854493
0.022485312
0.002856042
0.014252212
0.001059963
0.011056129
0.049494937
0.011079111
0.0006549126
0.015936619
0.0085226605
0.00057556044
0.0068174545
0.0
0.0009592491
0.009179302
0.0048626037
0.002515919
0.0
0.0
0.0
0.0052847634
0.005865783
0.0018314088
0.0047975415
0.0
0.0076235696
0.0001757414
0.0
0.0058051217
0.0
0.0
0.0
0.0023157508
0.0019285658
0.0064540105
0.0011552396
0.0
0.0
0.003222303
0.01192142
0.009718324
0.051871113
0.0
0.002750471
0.003428532
0.0074334154
0.005910397
0.008799487
0.0054682847
0.0
0.0
0.0027354758
0.0042490033
0.0272208
0.0036099511
0.0
0.000976923
0.0
0.0
0.0021083371
0.0011920249
0.0
0.009671176
0.0013308891
0.00240



Elapsed time to compute best fit: 2.960 seconds
Cross-validation score: 0.2416747136832725
Test score: 0.23360287511230904
Best Hyperparameters: {}
0.0100640105
0.025303144
0.063018486
0.017489713
0.018059405
0.037308
0.03610322
0.020070346
0.011033511
0.0012730658
0.0845556
0.029188571
0.004343167
0.0
0.005512045
0.0
0.010528291
0.024826577
0.004251544
0.04396354
0.0050745658
0.004913436
0.005962412
0.003312756
0.00039464835
0.003245951
0.0026214551
0.010080627
0.010682728
0.0077310847
0.0003910292
0.009162426
0.00767324
0.0039445423
0.0027094157
0.0
0.0
0.0
0.011671859
0.027146734
0.00084970135
0.009868742
0.0
0.0026417417
0.004302631
0.0
0.021060454
0.0
0.0
0.0
0.0009546327
0.002863938
0.005569001
0.0015342527
0.0
0.0045542503
0.0006827459
0.009498854
0.0020163693
0.011515251
0.0
0.0005360536
0.0005702338
0.0013402546
0.0005586473
0.012582636
0.009064172
0.0002977906
0.0030019737
0.0026365814
0.008239764
0.0015025034
0.004897668
0.004674701
0.0026680755
0.00042542085
0.00066743436
0



Elapsed time to compute best fit: 2.883 seconds
Cross-validation score: 0.23353205029961002
Test score: 0.22707034728406056
Best Hyperparameters: {}
0.021618338
0.06596094
0.103140324
0.04479475
0.04620706
0.010278514
0.0051767114
0.028696189
0.00067414006
0.00473572
0.03367223
0.04330443
0.0019118927
0.0
0.0057944963
0.0
0.022007078
0.0
0.012591175
0.0033989814
0.006794223
0.00675655
0.0108971605
0.0043821647
0.0
0.0
0.0034805897
0.011021068
0.009558966
0.0017211421
0.0052740434
0.0019481445
0.0061991382
0.0016566876
0.005307745
0.0
0.0
0.0
0.0032736182
0.0
0.0039998186
0.005566769
0.012489411
0.014638996
0.0072984896
0.0
0.0048192726
0.0
0.0
0.0
0.0040977946
0.0026636044
0.018598588
0.00065111986
0.0
0.001878222
0.0005352003
0.0047691897
0.0005974377
0.020249767
0.0
0.0016103999
0.0031103678
0.0
0.0057422654
0.0067024752
0.0094840955
0.0
0.0005523703
0.0072910655
0.0051458324
0.0011696443
0.0030440527
0.0
0.012282636
0.0
0.00063825806
0.015952032
0.009562431
0.0
0.0054486985
0.009510



Elapsed time to compute best fit: 3.221 seconds
Cross-validation score: 0.21918892071290147
Test score: 0.21589991928974978
Best Hyperparameters: {}
0.017636048
0.02652873
0.095669694
0.0201544
0.030522775
0.03146034
0.001249305
0.030191228
0.012940425
0.0060857954
0.007900918
0.040023863
0.005362893
0.0
0.011085982
0.0
0.026361395
0.027731836
0.01664898
0.0
0.012597937
0.051852163
0.00879181
0.0
0.009619163
0.0
0.009909413
0.0045733633
0.0061325696
0.0033135512
0.0055943266
0.019443149
0.0019643232
0.0011035858
0.0031139709
0.0
0.0
0.0
0.0023998634
0.0
0.0126089975
0.0057844333
0.02955026
0.0011289036
0.012999816
0.0
0.015632521
0.0
0.0
0.0
0.0034564452
0.0029193438
0.0012890728
0.0
0.011790523
0.0039265775
0.0
0.0
0.002118491
0.003063746
0.0023881486
0.007107381
0.016302329
0.0
0.0027410926
0.001688013
0.011245967
0.0008071655
0.0057476237
0.012239897
0.013702169
0.0021892493
0.005281389
0.0008171035
0.00558659
0.0
0.00084551144
0.06173938
0.018953502
0.0
0.0
0.0011745278
0.003626309

### 4.2.3 LightGBM

In [119]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

under_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    under_lightgbm_performance_normalized_df = pd.concat([under_lightgbm_performance_normalized_df, new_performance_df])
    

under_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/under_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 2.438 seconds
Cross-validation score: 0.5208624463029276
Test score: 0.5368421052631579
Best Hyperparameters: {}
128.17763460675533
897.6135786655759
2529.4968725752406
396.99222721987235
15.446151219703253
319.2199864705285
2.6299168462865055
51.06762977573089
5.064322017133236
11.632966913282871
10.244008139884727
536.3242331639176
2.2614343781024218
0.0
11.697231274899877
0.0
66.3174445856589
5.299690263058778
14.196107375543534
2.3016856581361935
36.46961541962128
67.3978357411532
16.511767738920753
3.6317044515453745
6.5127006305419854
1.2784424056229682
3.346276456461055
14.385104182121722
2.695970993489027
51.53921023647945
0.9539300131145865
10.652039762419008
16.430634188694512
5.843560919165611
10.31432556641812
0.0
0.0
0.0
13.985399647302984
0.0
13.05900316409273
22.54590149669093
0.36308398842811584
5.433005164144561
15.073137813936142
1.0969224572181702
6.560041961216484
0.0
0.0
0.0
17.428112116316996
8.465549324861684
13.443375804825337
1



Elapsed time to compute best fit: 2.460 seconds
Cross-validation score: 0.5367762091838465
Test score: 0.5291005291005292
Best Hyperparameters: {}
101.72705585170516
1431.4538779212285
2427.649737984232
451.27907614465875
19.515555351097092
148.91068813248577
1.0216245064220857
19.29664921760559
14.240076314577891
23.386872703421343
131.14235769967416
438.52568015590913
2.0695759417971615
0.0
3.2590327997796322
0.0
49.6857596068628
53.69222203053852
16.605028021695034
3.871395389083773
15.575913982378552
11.360082634580863
2.316550866713442
13.620897568574946
1.7279710979346419
3.7332321256399155
3.8002824830109603
18.78311291852242
1.4414915535598993
5.1029024714797195
0.0018487054476281628
23.725476919062203
7.291331639400596
1.1041745879801965
19.56156380835455
0.0
0.0
0.0
13.128788183613324
0.0
3.5735335928325753
17.846795896925414
0.0
32.262276544293854
9.82470382529269
0.0
3.81235777575057
0.0
0.0
0.0
12.539496568184518
12.096285934521859
20.818547950983728
0.4638010002672672
0.4



Elapsed time to compute best fit: 2.071 seconds
Cross-validation score: 0.5249060763625601
Test score: 0.5244399185336049
Best Hyperparameters: {}
88.51795671809285
970.8751778409076
3107.3353815406504
438.4849214544304
70.86880737333665
222.3583088216089
18.28858184790134
19.299695117762894
0.5393980449371156
25.87145135353785
30.36321024764038
167.7836792543967
21.99703614858572
0.0
40.8614320603169
0.0
38.94976390185006
19.77661792612333
46.7467624189933
43.875570624566535
9.070144460739357
9.4518140736036
4.528035776227966
0.012398682986805909
3.0358400145632345
3.24372199177742
2.8033377294777893
36.84570753502848
2.8280339193931923
14.62476732658979
23.60931306348175
3.7318671702676625
8.344102981167822
1.9080511046850361
33.004997810803616
0.0
0.0
0.0
10.214677098934771
0.0
5.721225652243959
0.3474169103077429
0.004863426089286804
23.23450970751128
1.2415296850466575
0.0
23.449255268045818
0.0
0.0
0.0
14.8662914450004
19.7412706821151
18.399770538381745
1.2746004216686089
0.7729



Elapsed time to compute best fit: 2.281 seconds
Cross-validation score: 0.5305794593624641
Test score: 0.48521256931608137
Best Hyperparameters: {}
76.18148547333517
1018.7052982631664
2823.624006263716
287.373650449603
98.15339985216733
136.91670297975054
2.3492803457193077
23.711757898097858
1.6015853993594646
22.569149792194366
55.26797061289386
407.39299773074595
6.653296336531639
0.0
12.237620450890411
0.0
55.55562968473953
31.62660601201725
64.54699445211915
0.6637616898515262
48.17932315117622
30.850509669211306
4.750698250143614
0.6806720048189163
1.0038164053112268
0.5581301935017109
9.939217524505239
46.79582313822017
43.658218724992594
25.010258336757033
6.520906703962282
12.189669923996917
32.55184722616795
2.9441970139741898
9.639059545737155
0.0
0.0
0.0
20.1791698591256
0.0
9.497124917177423
26.171858189698128
0.0
29.714295853532064
15.761863892897964
0.08498819917440414
22.175985513575142
0.0
0.0
0.0
8.882757803957816
15.89834909479805
91.8061761518602
1.4002455970435221



Elapsed time to compute best fit: 2.057 seconds
Cross-validation score: 0.5267399789406735
Test score: 0.5369807497467071
Best Hyperparameters: {}
106.47292049177219
918.3068336866037
2757.7752476079254
293.41246756140276
59.957573431704006
402.1586406009599
1.3598781898617744
31.284673014229575
3.560665886849165
74.22053885087371
71.42847177629875
182.13324955000127
1.4058355577290058
0.0
60.31798214947776
0.0
4.778112453150811
55.725297777763544
5.095222709569498
18.927945172366208
85.14959474191437
13.137943846872076
13.101143666506232
5.849202881450765
5.055244938648684
1.0929204835665587
5.361046332396654
8.931113624493264
8.449199535592925
37.227835330963046
0.13776266816421412
7.397935169625271
15.672651610088636
2.1901727907066686
11.08042934624973
0.0
0.0
0.0
20.93982069721973
0.0
2.8164649652317166
5.452770183015822
0.0
13.569221230194671
12.97052678997943
11.142407905310392
4.731449018786236
0.0
0.0
0.0
0.7968555405823281
10.634854733158136
22.86263708325214
8.14258507522754



Elapsed time to compute best fit: 2.286 seconds
Cross-validation score: 0.5378608828346844
Test score: 0.5579868708971553
Best Hyperparameters: {}
33.150577122260486
1011.2864326630176
2510.3447307338347
445.90252670270974
44.42381955284321
264.58130883654223
0.7009926768951118
51.320177384069076
10.61107208389376
14.737436225633132
35.61787262229586
355.08916664195215
0.5127160102128983
0.0
44.85680476950017
0.0
149.24258600868188
27.7288621184899
22.822011139161358
4.478858292539371
29.15803652342821
12.136642050897521
6.506284562910196
8.00378019715572
4.084504278719166
1.026132565792068
2.2738252949862385
23.86273769607419
0.9277200172382436
12.00354877966413
0.0014323450159281492
9.811282590026394
8.960763693954789
1.9104574457258963
8.783957058106271
0.0
0.0
0.0
28.983239278989913
2.1067789532244205
6.451483985650611
10.514829065250787
7.390870138124228e-08
16.156957167071482
22.848209419278305
2.061340040881987e-07
11.069374176594458
0.0
0.0
0.0
4.003424311650633
4.3124897368979



Elapsed time to compute best fit: 2.115 seconds
Cross-validation score: 0.5076125253666266
Test score: 0.5174129353233832
Best Hyperparameters: {}
100.80887316853051
730.7624522986812
3235.4238975672283
294.19537876753907
111.38208356832163
250.78921623307176
22.171402591310652
1.460690160325612
0.4691616370109841
4.486466172988287
34.397348835507856
223.86408375965806
4.853271923959255
0.0
30.10838559885392
0.0
85.15451165119686
6.30086274015995
52.0339744403027
20.037349173158873
21.525591372104827
47.326715451044606
3.4631707835144425
8.916512922696711
0.025765980233359187
1.3235523708863184
8.961075320479722
8.865800819840103
4.636620571880485
5.837509828401693
12.114545156713575
19.161365668977368
10.425221503610373
4.003289498388767
23.882996799540127
0.0
0.0
0.0
6.975325434615115
0.5296940207481384
6.663590889540501
34.55266577139082
0.0
43.74706517001607
19.189700440715797
0.0
7.720598112101854
0.0
0.0
0.0
18.468571025555008
5.611373243943035
25.865161335921584
0.57499341844231



Elapsed time to compute best fit: 2.113 seconds
Cross-validation score: 0.5151534990621673
Test score: 0.5484558040468583
Best Hyperparameters: {}
62.43305291155119
919.8004038092622
2757.584320628779
231.59536156822287
4.783353176724631
242.64760060424575
10.674497909995807
40.01078585162759
8.37623883940978
6.211319004975422
40.01739194547372
374.335307600267
6.947994727641344
0.0
25.95209109813004
0.0
23.739307771618414
49.1390882933556
10.898918663664517
10.901298820982504
85.25765917587773
12.94540788861923
9.15270506219349
6.939406471065013
2.0124709013002757
2.155806759605184
8.497479525558731
9.167877631232841
0.4727598041645251
48.3333787815518
2.68150834672997
29.012628165646674
24.9453180946914
11.366654194891453
19.441246730522835
0.0
0.0
0.0
6.699402070608613
0.0
5.596316176321125
16.072829320822166
0.00041032899753190577
5.573467945585435
23.385503677651286
0.0
4.6406046544952915
0.0
0.0
0.0
17.20923658955809
29.700521477239818
9.405562568873506
1.1867620274424553
0.38575



Elapsed time to compute best fit: 2.309 seconds
Cross-validation score: 0.5372003346929363
Test score: 0.5470459518599563
Best Hyperparameters: {}
62.03913323828877
734.0980766168027
3027.3698951323813
342.4454710618518
41.23665061673431
173.6427419758883
30.22104517383704
8.583275340497494
1.9873792009984754
15.751707738847472
45.085160085267376
344.15184612960593
1.3245795740076574
0.0
25.250424132622285
0.0
27.07766448669218
6.211710394418262
147.1056171311211
2.7236467078530637
82.30904810554483
64.47380660358613
8.848040493335454
9.999004841665737
0.46511817794817034
3.394142977311276
4.598716640844941
19.53496932515344
24.343729527667165
29.270604559867962
7.697872520424426
4.767326367924397
33.78899105947676
2.454271855764091
19.74651552284422
0.0
0.0
0.0
32.82508112927901
0.0
8.483020941380346
6.081676628357059
2.9726551601370375
45.91899832242416
27.983642031322233
0.0
2.101906006407944
0.0
0.0
0.0
20.234316528719546
5.669154161106235
15.033816997377734
0.40205609967233613
1.9



Elapsed time to compute best fit: 2.070 seconds
Cross-validation score: 0.5204564040028247
Test score: 0.502439024390244
Best Hyperparameters: {}
76.95493008221912
1466.2376715193684
2173.2874498912447
298.3529502052115
84.54170392001694
201.13174944559788
56.41078884280046
24.814697883817644
2.0278721468289262
13.408534290269017
16.152696772953846
468.20648946352054
4.890804616094101
0.0
53.75884894444607
0.0
133.35555177473543
19.66695811263139
14.85913417581105
5.4843134151933555
14.142862550041173
6.9151881887810305
13.342706644377614
37.2384587327324
1.3964616955745512
10.345170070882887
20.63220122642815
18.281337287614548
10.018613801862784
5.022012598818037
2.9563787726306145
8.193733002524823
19.587700912138693
0.9322977885603905
13.649459420092171
0.0
0.0
0.0
2.167486360410294
0.0
5.707705000741839
52.238711153375334
0.0
13.393228941010648
11.472260938433465
0.0
38.78501007205341
0.0
0.0
0.0
36.95509993837399
11.74890017714813
16.392793689246503
0.7373809086140852
1.812735021



Elapsed time to compute best fit: 2.285 seconds
Cross-validation score: 0.522816960036027
Test score: 0.5526315789473684
Best Hyperparameters: {}
94.25711944017071
708.248240239064
3085.706218493548
236.14362148669352
13.06620043325627
259.66441776594775
1.1501330137252808
2.706206016242504
9.011462797701824
3.97219528912683
2.903728721223729
345.49524891827
5.8039383558189
0.0
23.671737235959597
0.0
110.92519989089999
4.488812131665213
85.75592022552817
8.659550078788016
94.38918772481986
2.3281971998403606
2.2336964637506753
0.40608997137132974
3.047938585940388
4.453367606169195
4.264386620023288
74.33758070402389
4.734856991795823
21.484178198558766
4.3432826064527035
4.020973976003006
10.67659219456948
2.651533991098404
16.28319253181462
0.0
0.0
0.0
8.520306098491346
0.0
9.997312859166414
10.226493669373667
0.0
16.784239899367094
36.9124839444321
4.112710030312883e-06
11.005826266700751
0.0
0.0
0.0
21.903035044524586
15.12905114935711
15.203821632413508
8.324766555801034
1.3243579



Elapsed time to compute best fit: 2.148 seconds
Cross-validation score: 0.5031241312932664
Test score: 0.5018939393939393
Best Hyperparameters: {}
52.7341930260668
1395.889846095719
2599.3497026609057
356.9925029089063
26.285881080268805
176.14438932640036
6.81654808709726
52.17833470188452
2.849288111727219
36.760650062715285
68.7245634452874
198.52835236205283
2.8968881943095255
0.0
31.43994756126267
0.0
5.561212336889753
24.344171852229124
11.119299754849635
6.876340558581433
6.838194115500073
35.41664632027096
14.74988207977492
5.465799633663792
7.733370121568441
0.5213986858725548
8.541039773263037
33.79839957692184
19.80376994637063
17.2891350755026
8.357798580662347
15.813676325923552
14.20836728641325
1.4800631281668757
36.446212507587916
0.0
0.0
0.0
17.779504896648756
3.178538039326668
12.673921561363386
49.09985857177526
3.5806899070739746
24.904848698398155
17.039413318881998
0.7478770017623901
9.290608809943478
0.0
0.0
0.0
13.688886675983667
6.529695047714995
6.904664974066



Elapsed time to compute best fit: 2.261 seconds
Cross-validation score: 0.5164117604716114
Test score: 0.537525354969574
Best Hyperparameters: {}
82.03561450302647
448.52699936057786
3450.544807766371
245.81241112577146
90.96159782232917
128.61522688804837
3.2958583452818857
38.32524948245009
1.6715798923978582
11.570775577798486
188.97294633401987
168.71227464808499
2.2048630288336426
0.0
68.62225400361785
0.0
31.527528829003643
22.57501872774128
125.03006068409414
9.258009055125854
51.93917124287691
50.98377175397604
5.5919491986278445
16.778500527667347
3.8713630245092645
2.527356456790585
9.382380061539674
10.590415309022546
0.9254507565310632
23.300694441571437
54.27459194061521
24.30511799058877
10.196009406254085
13.963948763906956
7.586040114232105
0.0
0.0
0.0
4.449136199428551
0.0
15.751952644757694
16.668245446751826
0.00021843215415628947
25.407724536307796
18.90275050854595
0.0
6.679625911099265
0.0
0.0
0.0
14.936070261004296
12.894977878779173
16.94369459456736
4.761452378



Elapsed time to compute best fit: 2.155 seconds
Cross-validation score: 0.5191877270205593
Test score: 0.5658324265505984
Best Hyperparameters: {}
56.83827638590269
375.1970992832689
2907.727723989926
489.8293508640754
234.7957109269612
118.81120763467334
13.910029360000408
7.27856478583476
6.42398869106546
20.213115524609748
50.63427144055618
352.33805198353343
15.702945608592927
0.0
47.96275978266385
0.0
45.7212672869141
93.93877164514163
29.44136141839408
6.6546629756036
28.289384065850072
18.92369622729933
20.17864004214232
1.2273174414626737
0.36373580037616193
5.269938953220844
2.3126878356561065
28.50266877971717
0.11243513864246779
14.893278670146685
25.71724531223299
12.046071568645857
33.33180054140939
1.7465759921672088
27.129928615526296
0.0
0.0
0.0
11.037696759207165
0.0
17.386584701685933
33.57898796722293
0.0
53.72808149215962
78.19730473264235
0.0
8.049036297088605
0.0
0.0
0.0
6.673953933008306
12.960552670701873
35.758670663774126
1.3923199815476437
0.00045762224544887



Elapsed time to compute best fit: 2.247 seconds
Cross-validation score: 0.5246929077949172
Test score: 0.5675675675675677
Best Hyperparameters: {}
43.46686471202588
1180.6687897431082
2478.03184329058
453.94440192072807
111.96632643515795
301.65471359517767
20.89985900465399
16.231026394991204
2.681466738345989
10.062945868819952
38.63832518947311
80.90949144757565
8.437448452340405
0.0
106.86598921811625
0.0
24.881067028909
29.141184556887794
77.20110893628487
13.450451379441034
14.95028102396202
6.87694506040134
3.857339364107929
9.144869350588124
2.093648704004181
0.739024905487895
12.362946629989892
25.852574589079737
7.243305782594689
15.957608328843548
9.736287258565426
18.707020301964384
44.76851528597035
1.8176916868251283
13.150010167795472
0.0
0.0
0.0
24.025374143414865
0.0
9.855557978589786
35.469068316376706
0.0034981349745066836
24.437455067876726
9.056259260836384
0.20982348088364233
5.258213816371381
0.0
0.0
0.0
24.6733790775761
18.22757669299608
26.356172099092092
4.661



Elapsed time to compute best fit: 2.146 seconds
Cross-validation score: 0.5186126615874355
Test score: 0.5172413793103449
Best Hyperparameters: {}
59.6999040397205
1518.9121904182039
2347.4576101618663
417.1605826576149
76.79686461499728
127.86658192761911
9.212213639169931
18.98799653723836
2.9778960198163986
1.5355654917780157
77.05265985777623
258.81341051566415
0.31656520161777735
0.0
38.509046180697226
0.0
20.26643833915398
103.20819915582604
26.256583853873153
4.6638084599926515
23.873954407954216
58.97468072909396
16.064084051799
13.282466586531882
4.637933732214151
11.290725173212877
25.86638826701528
28.112631885371968
4.3166487266425975
4.771850179834132
7.677126277238131
20.538240424088144
15.145395121022887
1.8464040915459918
8.616320703662723
0.0
0.0
0.0
8.805264379121581
0.0
19.41214176066692
9.32043124660413
0.11069799959659576
3.2887897749897093
30.847885805853366
0.0
13.264871197119646
0.0
0.0
0.0
16.149310870884335
8.673111521022435
27.305975125760803
0.93208489613607



Elapsed time to compute best fit: 2.420 seconds
Cross-validation score: 0.5350924650598072
Test score: 0.5345911949685535
Best Hyperparameters: {}
128.97712581094038
686.5917596320229
2828.967086445581
335.05010658476385
29.969278461034378
294.1180430898794
2.1573917402420193
35.448492186078056
16.547171935692404
46.11208054947201
28.83855806647989
352.77792748854074
1.9760421952232718
0.0
29.88812168209632
0.0
74.03842558592441
42.57032770588444
11.559679098019842
1.4907466475442561
72.85784303532455
26.971513113268884
6.131925147220045
4.346838872879744
9.737238715191324
1.6362609714269638
7.3563391752541065
23.457441560414736
16.656334321247414
26.32083118818378
2.259123077383265
20.356408348758123
8.568234934689826
8.691620530606087
7.24622022500202
0.0
0.0
0.0
9.380577645522862
7.709764167666435
4.739399938378483
14.067699505064411
0.0
34.386581976609705
16.218246967560844
0.0
1.3459518241907062
0.0
0.0
0.0
9.29967256022087
4.4495282645802945
91.33617443703406
1.530127303674817
5.



Elapsed time to compute best fit: 2.038 seconds
Cross-validation score: 0.5069012485459548
Test score: 0.5433646812957157
Best Hyperparameters: {}
69.96894126456341
734.8182410695628
3060.8545629374694
240.18455569454105
145.46603681496987
402.95312770100213
8.222783863544464
9.584262061864138
3.1465515457093716
12.290732927337558
45.73725555282042
137.17552035332204
0.6682854816317558
0.0
12.99058492266613
0.0
29.41417974692581
41.58528970433183
21.819074594148077
6.7520740571198985
67.21964222951416
3.6441224558059417
38.50449665822089
14.467821039492264
5.770304855890572
1.749480348055883
3.3150798595743254
8.81403261104225
8.645595129853518
25.329783470049733
20.181736571219517
17.33158024237963
21.712031359073933
3.465030960738659
11.200741543641925
0.0
0.0
0.0
4.504366508737576
0.0
40.98794784299662
26.706578440572684
0.0
18.574863535759505
45.83684919807638
0.0
8.309315433580196
0.0
0.0
0.0
22.807619027631517
9.222149297454223
11.031888184187437
1.4750630024354905
0.659719899296



Elapsed time to compute best fit: 2.176 seconds
Cross-validation score: 0.5124846868517846
Test score: 0.5538302277432713
Best Hyperparameters: {}
84.482842805601
1432.5900111632577
2134.7271419980607
404.6144976690157
48.0447085037669
237.24973687517738
22.309005862362824
10.859979767235927
5.461164638400078
23.638770387042314
154.36819113296042
162.38193059246163
1.2785477599827573
0.0
105.32397631455699
0.0
37.53142027754744
26.246019984046367
61.66333127185044
37.7250572018022
7.8107739505929885
6.658129532946987
10.420632834131077
5.911790276128158
0.18674034663399652
3.7620770878347685
2.9963393712532707
96.26443989910149
9.277372232958442
35.12648255705321
2.3876616487544737
8.570568695016746
21.061508981425277
1.2240381427109241
11.684342678170651
0.0
0.0
0.0
4.320020481944084
0.0
13.850499612906788
39.83948097376606
0.0
4.247612681298051
57.96807802703802
0.0
30.214456508811566
0.0
0.0
0.0
19.414917622169014
5.294949807532248
10.14064870373295
0.8370254039764404
0.387519869836



Elapsed time to compute best fit: 2.213 seconds
Cross-validation score: 0.5341632742106873
Test score: 0.49124513618677046
Best Hyperparameters: {}
27.99329366578526
715.6549487437264
3195.4594791229033
341.7242120228034
64.56850577092564
452.1885564212006
12.137874815613031
15.73727783366212
4.749401617795229
1.1653459966182709
52.17559930225215
162.42865323673323
13.559769641608
0.0
25.35778526458796
0.0
32.23415266811537
18.7623523198329
47.81724847610637
3.434761039024977
14.527338877748562
15.33861395840745
3.7860571841010824
10.708683598812058
10.11862554012356
0.5577983240136746
1.8238413165090606
6.025020256132848
6.7372378372404
8.920164267857217
33.515779679368734
10.439165202732012
11.729154016208668
2.131692047143588
13.22590922425286
0.0
0.0
0.0
11.807113881487595
0.0
17.715268723724876
39.32443047564266
0.0
27.464606850123232
7.440994741998793
0.0
9.439319427357987
0.0
0.0
0.0
2.8100770277636684
21.047629669897248
23.56435987891564
7.311914711317513
3.7087856605648994
1.4



Elapsed time to compute best fit: 2.149 seconds
Cross-validation score: 0.5195116092083343
Test score: 0.500467726847521
Best Hyperparameters: {}
140.04509314049304
1518.9887688628373
2348.251745731473
286.3368317016259
12.62735267933877
233.92326513349738
0.7837754930478695
6.378131166100502
1.689933903515339
4.824991619371303
10.470130962188009
362.7762228906721
7.8127661645412445
0.0
26.227727992929154
0.0
11.136411580563163
193.10915529728373
33.8285278225535
3.9421139538753778
74.59888687586499
11.599358556756954
4.080973027739674
2.5237081246086746
9.351638830732554
1.399085790850222
2.3826089780486654
28.253647050253033
8.216359316497737
32.076884230130474
6.367211958778759
3.699862416895691
26.04059320431145
2.73128220718354
10.65207010492125
0.0
0.0
0.0
10.173873715510126
2.7483458830902237
6.333293368726345
14.164684710169011
0.15869110822677612
14.060921320691705
20.40945262641905
0.0
7.454296268351754
0.0
0.0
0.0
11.54625903093256
11.58818897488527
49.153123429525294
21.276



Elapsed time to compute best fit: 2.326 seconds
Cross-validation score: 0.5266220847533334
Test score: 0.5255402750491159
Best Hyperparameters: {}
51.007856767109956
786.9190239145722
3070.338608186704
251.7596333665606
143.4530968052539
122.43616165728542
34.49807823658921
57.628728722178494
1.7625749036669731
13.626936497719726
127.79020928755621
193.93710367990752
3.1297032609581947
0.0
222.19121372025242
0.0
28.9288545365294
11.13918191677493
7.135188292298579
6.707486300668279
26.138788179930998
15.719640905011602
0.6071288843629361
3.8198483736487105
3.361582141609972
9.98915410041809
0.4942033680690656
8.756162214015603
5.797516702557914
8.16917898538577
0.7448856744522345
22.854526821567106
24.7176617297564
0.6782607957720757
7.558403595889672
0.0
0.0
0.0
21.856082919004763
0.05071280151605606
25.16773926111864
0.7712681627917846
0.054676301777362823
27.242797576549492
23.0480309327977
20.308793613687158
11.636570905972803
0.0
0.0
0.0
33.22275038134012
12.929848265387136
5.8186



Elapsed time to compute best fit: 2.082 seconds
Cross-validation score: 0.5340328737745244
Test score: 0.5721925133689839
Best Hyperparameters: {}
52.28521724835459
642.4945465437102
3132.6062123438674
317.0019174426799
27.032731273631157
142.0274218161249
18.095866666873917
12.998665608465672
3.1982356312219054
12.163864817470312
11.640082266825061
499.37364960292757
1.1441226033493876
0.0
24.74088329790893
0.0
60.66997075798355
93.1385634379892
18.821952532042175
4.172723020169656
24.09485054045217
7.360459642135872
3.867288671899587
20.265437969297636
1.723152806982398
4.156017254455946
0.899350450723432
17.456046478868075
2.4124184052824376
5.144919643387141
4.787428878247738
4.986283067904878
24.39191323681007
2.288975223636953
12.923449616035214
0.0
0.0
0.0
17.386006080330212
0.03177110105752945
18.639200685778633
27.74039539138721
0.03874969854950905
7.0675287686831325
40.44140003375088
0.0
4.838682579342276
0.0
0.0
0.0
46.31143687106669
16.42189151213097
53.68393376173481
0.533



Elapsed time to compute best fit: 2.402 seconds
Cross-validation score: 0.5219927864197407
Test score: 0.5379098360655737
Best Hyperparameters: {}
35.93679201813325
814.5200844873979
2773.5200697255873
460.37247953076496
246.40027112768675
209.23885036267347
21.690617072745226
16.150217264753095
2.0394301740452647
9.39798990637064
97.4340217208337
140.84409405570477
8.763730511360336
0.0
15.000508131694765
0.0
30.772092991271734
12.514945029265846
29.18275129387621
13.483246063406114
53.499597086296944
25.329775126185268
5.858807003638503
4.068677143739478
12.897395834326744
2.0856625379199727
2.366050683078356
38.68755070684547
25.221961115390513
7.0727473947779345
1.7361074150539935
11.722410879970994
8.176385496869159
3.3718064893619157
10.408814996002263
0.0
0.0
0.0
20.290834128157258
0.0
13.511450044164121
34.47645440616907
0.0
7.160171312279999
13.018234052477055
0.0
30.25349925208866
0.0
0.0
0.0
18.60944515515439
15.258674375079863
13.904789511194394
0.09770160168409348
2.671838



Elapsed time to compute best fit: 2.038 seconds
Cross-validation score: 0.5297735264349687
Test score: 0.547872340425532
Best Hyperparameters: {}
45.74920008496439
866.5431191290842
2913.6130443271863
464.3929242839749
45.73225411370692
215.2162403388998
18.265842999797314
12.782711520703742
6.318191834376194
8.440920068482797
92.42872106503052
228.88049284633965
13.864394584186812
0.0
22.95063956581845
0.0
48.64594395843335
46.58404570405563
38.264953731297
6.3545448501035935
38.90772369972933
11.985531194672603
6.688867648523228
1.209338936212589
2.297323778271675
1.590325545752421
10.659795840878814
61.363982141952306
10.158654846795798
22.78422634195192
2.0645270467502996
10.439628556618118
26.09150174818086
4.996890441220984
10.028933387582583
0.0
0.0
0.0
16.442022099721896
0.0
32.50120375765255
65.15208839286402
0.0
18.024697798316083
20.764340837926284
0.0
7.996500440844102
0.0
0.0
0.0
9.478942094493014
0.9722989307238095
5.306539534688454
1.2461999133229256
0.039806898683309555

## 4.4 Rebalancing Strategy - 5050

### 4.4.1 Random Forest

In [120]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

fiftyfifty_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    fiftyfifty_randomforest_normalized_performance_df = pd.concat([fiftyfifty_randomforest_normalized_performance_df, new_performance_df])
    
fiftyfifty_randomforest_normalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_randomforest_normalized_performance_df.csv")



### 4.4.2 XGBoost

In [121]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


fiftyfifty_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    fiftyfifty_xgboost_normalized_performance_df = pd.concat([fiftyfifty_xgboost_normalized_performance_df, new_performance_df])
    

fiftyfifty_xgboost_normalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_xgboost_normalized_performance_df.csv")



Elapsed time to compute best fit: 93.617 seconds
Cross-validation score: 0.6542222956258827
Test score: 0.6789137380191692
Best Hyperparameters: {}
0.0106297
0.037096053
0.06911636
0.19779211
0.018936576
0.013144382
0.003597119
0.014104634
0.0063070944
0.014423884
0.030415202
0.03158292
0.0027424726
0.0
0.019230993
0.0
0.006317593
0.013707835
0.006882311
0.0032925995
0.0034680725
0.0013378866
0.010380429
0.003039499
0.0062365034
0.0018018563
0.0016879594
0.001479366
0.0016679103
0.0017482352
0.023221394
0.0015191814
0.0017612918
0.0073791468
0.0015134866
0.0
0.0
0.0
0.009688829
0.0
0.0026973067
0.012645856
0.033510227
0.004513615
0.0052753794
0.0
0.0034747748
0.0
0.0
0.0
0.0015291603
0.0014847999
0.0013645359
0.0012008359
0.0047232886
0.0022870505
0.00025052042
0.0031061503
0.012578895
0.007821428
0.0045793243
0.010332603
0.004614126
0.0021705758
0.00816047
0.0023719112
0.0034785257
0.0029496478
0.004081366
0.0020705622
0.0037515745
0.007626381
0.0029602803
0.0019383974
0.0028918164
0.



Elapsed time to compute best fit: 90.889 seconds
Cross-validation score: 0.6549732254225227
Test score: 0.5626780626780626
Best Hyperparameters: {}
0.0058857603
0.04489248
0.12060093
0.058306728
0.019993156
0.007093753
0.004824447
0.023188269
0.006996318
0.0056961533
0.013635336
0.063130744
0.004582089
0.0
0.013366641
0.0
0.009538685
0.003392958
0.009047751
0.02307728
0.0057767634
0.0026875732
0.009184337
0.013744788
0.00089500035
0.005835647
0.004504006
0.0020573575
0.0010723418
0.006468636
0.02106064
0.0019719768
0.0016396764
0.005113802
0.00091664086
0.0
0.0
0.0
0.0019376348
0.079327576
0.0120055545
0.010079622
0.005921881
0.0031005181
0.008718751
0.01943021
0.004724662
0.0
0.0
0.0
0.0012961018
0.0020518722
0.001516119
0.0021743055
0.00058522227
0.002299631
0.013392156
0.004090171
0.010709024
0.0095883235
0.0006218478
0.0059090084
0.0043096333
0.0043470003
0.006273657
0.0032975897
0.0014013818
0.004961987
0.0036810285
0.0015357339
0.0022106844
0.0062228143
0.0031943994
0.003722448
0



Elapsed time to compute best fit: 91.915 seconds
Cross-validation score: 0.6440142673219799
Test score: 0.6625441696113074
Best Hyperparameters: {}
0.008093859
0.043757368
0.081672266
0.17091991
0.009660478
0.005106443
0.0049671475
0.005103347
0.0053363275
0.0061321734
0.036381222
0.045860652
0.0033845422
0.0
0.019596227
0.0
0.013206808
0.006165996
0.008829612
0.00063780695
0.004426119
0.0023475422
0.012799343
0.0040019653
0.003537942
0.00022162552
0.0014782122
0.009031146
0.0031503756
0.014214525
0.017154211
0.004725584
0.0015714222
0.004740257
0.00090882
0.0
0.0
0.0
0.004819252
0.045328505
0.009064078
0.01990094
0.005703941
0.003546538
0.006245426
0.0018341998
0.0033059514
0.0
0.0
0.0
0.0020806245
0.004075024
0.0038860694
0.006126325
0.005046493
0.0044565443
0.0053711184
0.003531154
0.008932749
0.006555131
0.0028125853
0.0024734118
0.004531841
0.01021148
0.009143539
0.0029162106
0.00382655
0.002087141
0.002074495
0.002001034
0.0021220671
0.009086202
0.0007186118
0.0016118473
0.004440



Elapsed time to compute best fit: 90.279 seconds
Cross-validation score: 0.6616825499726582
Test score: 0.6861198738170347
Best Hyperparameters: {}
0.0042545297
0.06386024
0.05822674
0.20379588
0.0062180585
0.022269147
0.006670115
0.0061632334
0.01731584
0.007344767
0.027191374
0.03471643
0.009828611
0.0
0.01065497
0.0
0.0024001298
0.009410128
0.010170034
0.006950909
0.008635424
0.0053368453
0.016295949
0.002620342
0.0017166276
0.0019870682
0.0019403606
0.0029698857
0.002259199
0.006064646
0.020689053
0.001479573
0.0011178998
0.0025935385
0.0016276207
0.0
0.0
0.0
0.006016755
0.001685442
0.0016835146
0.015800713
0.002365768
0.004529716
0.0014223556
0.00018649439
0.003965901
0.0
0.0
0.0
0.0011648836
0.0029344412
0.0020903046
0.0010314658
0.004289253
0.0047694007
0.0013221549
0.0009309017
0.013913085
0.011264704
0.0032615664
0.00511789
0.005463561
0.0103822695
0.007965406
0.0022907911
0.0025284342
0.004010939
0.0011821789
0.0032351962
0.002157754
0.004657666
0.003654286
0.0019119611
0.001



Elapsed time to compute best fit: 90.833 seconds
Cross-validation score: 0.6544540891978818
Test score: 0.6006006006006006
Best Hyperparameters: {}
0.0125951385
0.03837981
0.091309085
0.15212841
0.010904423
0.020968242
0.003055709
0.014200084
0.003961478
0.013359199
0.034567293
0.03661294
0.001401337
0.0
0.017968478
0.0
0.012681422
0.0024765895
0.008595092
0.002697297
0.007826662
0.011029391
0.008400343
0.0019900694
0.0014805016
0.0015289881
0.0012847519
0.0024000455
0.0020788182
0.0021411234
0.009417837
0.00107027
0.0010472389
0.006456479
0.0014753933
0.0
0.0
0.0
0.0063258405
0.021473257
0.0032074938
0.007266786
0.005787232
0.006248958
0.0064652897
0.0
0.0039338185
0.0
0.0
0.0
0.0018366333
0.0056785946
0.0031182251
0.004727751
0.0049982993
0.00043999916
0.0013609099
0.0055074836
0.01665837
0.014347329
0.005009996
0.005630327
0.007081968
0.0009243634
0.004952906
0.0024189097
0.0024003962
0.00622196
0.0019333089
0.0018070722
0.0012261472
0.010439921
0.0016484945
0.0019615844
0.001252297



Elapsed time to compute best fit: 91.337 seconds
Cross-validation score: 0.6438325575667185
Test score: 0.6109324758842444
Best Hyperparameters: {}
0.0053505865
0.04075707
0.09235971
0.1504457
0.016804598
0.018075095
0.008551759
0.014979646
0.007236933
0.007531438
0.019526556
0.043271534
0.0042584194
0.0
0.0064267963
0.0
0.008708964
0.0043035294
0.009585008
0.0044348524
0.00832988
0.004736622
0.0067706546
0.0019910575
0.0019272701
0.004024565
0.0014970022
0.0017076682
0.0013751006
0.0017885725
0.011128691
0.0006303936
0.003126426
0.0069941822
0.0017788997
0.0
0.0
0.0
0.006132769
0.043337017
0.009001415
0.009976295
0.002569461
0.001763754
0.010171306
0.025102906
0.0036317199
0.0
0.0
0.0
0.0031729068
0.0017831045
0.0020168726
0.0034520566
0.0003170464
0.0010665384
0.004711523
0.0052324072
0.00862279
0.014374587
0.0045994595
0.007649218
0.0043537337
0.0039867125
0.0026330198
0.004264241
0.0013946738
0.010000303
0.0029601478
0.0013268377
0.0037126439
0.0058432734
0.0025562553
0.00095659797



Elapsed time to compute best fit: 92.478 seconds
Cross-validation score: 0.6440223828374443
Test score: 0.705574912891986
Best Hyperparameters: {}
0.008260873
0.054288056
0.10935521
0.064160086
0.013500042
0.001744193
0.004411779
0.009725647
0.003977532
0.016992042
0.05345043
0.05063214
0.0042947624
0.0
0.013625546
0.0
0.0085945465
0.0112084355
0.006046182
0.005984241
0.0038582494
0.013298218
0.015948283
0.0036458555
0.004444721
0.0020870515
0.0026383253
0.0013322227
0.0035342402
0.0022999432
0.026171587
0.0014715877
0.0022654121
0.010243649
0.0017237279
0.0
0.0
0.0
0.003959794
0.09292084
0.0023480747
0.0060045184
0.0030308734
0.006891442
0.0052532926
0.003511244
0.003414873
0.0
0.0
0.0
0.0012035591
0.006457799
0.0065914467
0.003507952
0.00825755
0.0023435326
0.0012246906
0.0031719215
0.014335767
0.017441386
0.005686812
0.004712483
0.0059939665
0.0005496005
0.006290136
0.0019371699
0.0012752473
0.017531196
0.0034158963
0.0033346866
0.0015349453
0.009817943
0.0022990785
0.0041621835
0.0



Elapsed time to compute best fit: 93.850 seconds
Cross-validation score: 0.6590556530991191
Test score: 0.6345565749235474
Best Hyperparameters: {}
0.013267646
0.052225307
0.08656119
0.21773185
0.010668632
0.021338053
0.008026223
0.01930028
0.008296057
0.011295916
0.030806094
0.029043024
0.0038547155
0.0
0.016277004
0.0
0.011890357
0.016113177
0.005132109
0.0030272228
0.0054878043
0.0039764093
0.004793156
0.004867435
0.0077768303
0.003911761
0.0010150038
0.0052726427
0.00077799975
0.0036401069
0.0077745505
0.0011953358
0.001537135
0.006444732
0.0013093873
0.0
0.0
0.0
0.003583758
0.0055837445
0.0045573353
0.011140187
0.0
0.0040503116
0.007715288
0.0008795673
0.0036237282
0.0
0.0
0.0
0.002111725
0.0018204171
0.0025644598
0.004903026
0.005031332
0.002850094
0.0057546175
0.0028611713
0.007095618
0.011878232
0.01944757
0.0032138084
0.009138752
0.0047000777
0.002235681
0.0022922659
0.0020955438
0.0033386156
0.001073393
0.0016407428
0.0030508447
0.011451412
0.0014456902
0.0028899848
0.0023147



Elapsed time to compute best fit: 89.454 seconds
Cross-validation score: 0.6598650690395169
Test score: 0.6268011527377523
Best Hyperparameters: {}
0.009610755
0.038151808
0.1333858
0.06667497
0.0084101185
0.019312507
0.0025413402
0.009270428
0.008769897
0.008063144
0.04129692
0.054863334
0.002630305
0.0
0.018147396
0.0
0.007850457
0.007366264
0.008081338
0.029525993
0.005679251
0.01445894
0.015509196
0.003461502
0.0026466383
0.0029348556
0.0017368592
0.0015383906
0.0031180272
0.0049888864
0.018701563
0.0015967801
0.0042121215
0.0022527215
0.0027402209
0.0
0.0
0.0
0.004069046
0.0
0.0024808368
0.017749265
0.0
0.002404442
0.012857043
2.8219405e-05
0.005016166
0.0
0.0
0.0
0.001298242
0.003699336
0.0029481123
0.00047915374
0.0013146581
0.0011278498
0.010441837
0.0041381037
0.013721916
0.0035344448
0.0085258875
0.003917112
0.002191682
0.040874373
0.0035929934
0.0011538146
0.0015673193
0.02048445
0.0009842619
0.001589946
0.003776115
0.0073328153
0.0022802223
0.0023694208
0.002704977
0.001527



Elapsed time to compute best fit: 93.381 seconds
Cross-validation score: 0.6409783604142592
Test score: 0.6542056074766355
Best Hyperparameters: {}
0.015463327
0.043690003
0.066557065
0.22853386
0.012720408
0.014827053
0.0033003855
0.01263263
0.0013465296
0.009989853
0.03550529
0.02857732
0.0020586962
0.0
0.018492918
0.0
0.010520752
0.015697807
0.0070931395
0.0074394406
0.004689925
0.00064369093
0.010700327
0.00062030734
0.0031884583
0.0021264032
0.0016552149
0.006332614
0.001636709
0.0028076496
0.012048287
0.00052112876
0.0011017856
0.0043706787
0.0012100774
0.0
0.0
0.0
0.0051257606
0.0
0.0020568552
0.006622382
0.002986559
0.0028991848
0.013791624
0.0008255234
0.0022198835
0.0
0.0
0.0
0.0010154585
0.0018303533
0.0017212181
0.002165945
0.00089021126
0.0027493865
0.002740486
0.009622968
0.016965391
0.011443465
0.00097089325
0.0062675355
0.006097711
0.009323843
0.008061661
0.0020279721
0.004240893
0.0059770225
0.003865585
0.0031248783
0.0019547974
0.008628078
0.0022102047
0.002556065
0.0



Elapsed time to compute best fit: 88.936 seconds
Cross-validation score: 0.6487636174435096
Test score: 0.6193353474320241
Best Hyperparameters: {}
0.011095247
0.041664485
0.08028096
0.16381995
0.015623197
0.010829009
0.0034985966
0.021067398
0.0023372718
0.019107543
0.02517437
0.029118184
0.0038332085
0.0
0.008035851
0.0
0.0051828222
0.0052870293
0.00611804
0.0040023285
0.006812459
0.0030169482
0.005383875
0.010359049
0.009407952
0.0005482971
0.0005507219
0.0023580904
0.0011534763
0.0049028494
0.0054698247
0.0018023662
0.0020126333
0.0051972475
0.0010735241
0.0
0.0
0.0
0.0029966894
0.04797309
0.0024853845
0.017656477
0.0
0.0043985397
0.0018302219
0.00068161276
0.004323499
0.0
0.0
0.0
0.0013671552
0.0022122017
0.0016014857
0.003795821
0.0014123132
0.0028338127
0.0018583579
0.0052316384
0.0053423312
0.0064240154
0.0034422933
0.009635694
0.003922337
0.00925301
0.007810811
0.0007450577
0.0008454445
0.055090312
0.0021291696
0.0034893702
0.0029669527
0.0034685312
0.0022620913
0.003831194
0.



Elapsed time to compute best fit: 92.425 seconds
Cross-validation score: 0.6420378040678044
Test score: 0.6587301587301587
Best Hyperparameters: {}
0.012408329
0.02111556
0.11355849
0.061448086
0.005363673
0.0069712787
0.008772034
0.008290253
0.018340942
0.016053358
0.049014967
0.051170427
0.0023172223
0.0
0.0138210375
0.0
0.009628218
0.011184892
0.005261103
0.008591473
0.0077934386
0.0044589206
0.006746661
0.0099539645
0.0033677882
0.0006054219
0.000771424
0.0021882055
0.0029575531
0.0025372256
0.022086142
0.0018268084
0.0015044161
0.0036975734
0.0021486953
0.0
0.0
0.0
0.017273752
0.07633612
0.0021482313
0.007847121
0.01176305
0.00910499
0.00870557
0.00036898252
0.0025867661
0.0
0.0
0.0
0.0020200892
0.0030332115
0.0016325851
0.0013079396
0.00051103055
0.0021680966
0.004762565
0.0049936203
0.006640958
0.0119886715
0.015226371
0.0057704994
0.0040700785
0.0042621023
0.006436419
0.0012910697
0.0016013437
0.0066861366
0.002886178
0.0036140154
0.0017859909
0.008345131
0.0016094793
0.0019817



Elapsed time to compute best fit: 89.869 seconds
Cross-validation score: 0.6352121134817315
Test score: 0.6466876971608833
Best Hyperparameters: {}
0.00644753
0.044020604
0.102949455
0.17194536
0.009899379
0.005780363
0.001783824
0.014640563
0.0056355745
0.001212454
0.028097264
0.047900762
0.0062903604
0.0
0.011234594
0.0
0.004084664
0.014073077
0.0052861758
0.007967968
0.006908175
0.0017611628
0.008072423
0.0022094562
0.0029404308
0.0020990865
0.0019106133
0.0063167927
0.0027835683
0.004252085
0.0061729345
0.0013633763
0.001984768
0.011366739
0.0008170992
0.0
0.0
0.0
0.0062064347
0.057781998
0.007133946
0.0064973077
0.0016930877
0.0015567574
0.0038870971
0.0
0.004750994
0.0
0.0
0.0
0.0014376914
0.0035451413
0.0007437489
0.0015963854
0.010568663
0.0025049297
0.0022995574
0.009767865
0.015201419
0.027154101
0.0020672043
0.010328716
0.0027086574
0.0048599388
0.0056952704
0.0037415193
0.0050002835
0.0035305445
0.0050156577
0.0009500554
0.0013043917
0.0020892539
0.001450858
0.0030070595
0.



Elapsed time to compute best fit: 88.657 seconds
Cross-validation score: 0.6317891457236059
Test score: 0.6752411575562701
Best Hyperparameters: {}
0.009214336
0.026275624
0.08380681
0.18221344
0.023577921
0.008634037
0.0012264728
0.008717503
0.009876694
0.0028042018
0.027914444
0.02975429
0.0042125443
0.0
0.0121630905
0.0
0.021568462
0.009120267
0.004339207
0.00477121
0.0056860335
0.004854411
0.0059197703
0.0013325752
0.011645273
0.005926315
0.0016415082
0.0021631217
0.0008883103
0.0029993348
0.015482524
0.00089578936
0.0015558073
0.0021297003
0.0016154444
0.0
0.0
0.0
0.018922495
0.030707093
0.0038905172
0.013875383
0.011410707
0.005794044
0.005807126
0.0020308313
0.0024372733
0.0
0.0
0.0
0.00090222474
0.0041643125
0.0016274883
0.0017506768
0.008830262
0.0038731988
0.0014676614
0.007697965
0.010075485
0.007119596
0.011258327
0.0056903646
0.0007909367
0.00738372
0.006492537
0.01752415
0.0010641537
0.009139607
0.0015091092
0.0014184689
0.0023292128
0.0063833743
0.0012502397
0.0021200469



Elapsed time to compute best fit: 90.486 seconds
Cross-validation score: 0.6507404197641069
Test score: 0.6978798586572439
Best Hyperparameters: {}
0.0065488485
0.03226309
0.10155145
0.1481658
0.01675933
0.014978229
0.001705751
0.00403408
0.002905748
0.00076801615
0.021701103
0.048411634
0.0032698906
0.0
0.0076229926
0.0
0.012127095
0.010689253
0.005513015
0.0026558568
0.008486539
0.0048356983
0.02451537
0.0038367051
0.0033680056
0.00555755
0.00052972103
0.0031285617
0.001326838
0.0025617096
0.013080693
0.0009872844
0.0007781153
0.014085358
0.0025821426
0.0
0.0
0.0
0.005708227
0.0027226934
0.005982146
0.014316475
0.0
0.0050040632
0.00663424
0.0024377482
0.0025639653
0.0
0.0
0.0
0.001442035
0.0022038925
0.0011301003
0.0020399834
0.015761197
0.0027189853
0.004396547
0.0014569308
0.020145608
0.004350678
0.0068069403
0.003278182
0.0051536094
0.01750573
0.005232578
0.04150898
0.004160495
0.003432559
0.0019914652
0.002556242
0.0008828458
0.0071227835
0.0014590292
0.0019541867
0.0021790667
0.



Elapsed time to compute best fit: 95.428 seconds
Cross-validation score: 0.6506712017778348
Test score: 0.6615384615384614
Best Hyperparameters: {}
0.007392515
0.042622175
0.07289857
0.18959694
0.011790654
0.017885286
0.0041228505
0.029138222
0.0077599953
0.016045317
0.025125584
0.019185234
0.00547051
0.0
0.015668115
0.0
0.013690584
0.0068752323
0.016736768
0.009482015
0.0038217145
0.005190025
0.017591426
0.022038773
0.0028303433
0.0017596489
0.001769151
0.0015116716
0.0014318586
0.006891419
0.009717879
0.0022293366
0.002689547
0.010485517
0.0019397123
0.0
0.0
0.0
0.010645051
0.0
0.0034252661
0.009963819
0.004239319
0.0106469365
0.013848552
0.026408762
0.0014219943
0.0
0.0
0.0
0.0028243528
0.017644543
0.001611302
0.0012034372
0.0027181574
0.001372461
0.033463076
0.007960224
0.0134421345
0.006058471
0.0021023636
0.009421805
0.007343862
0.0036591454
0.0058191256
0.0019839245
0.007958095
0.0068700514
0.0012691966
0.0005930945
0.0012211583
0.007824025
0.00081974437
0.0009265246
0.002041101



Elapsed time to compute best fit: 91.260 seconds
Cross-validation score: 0.6716985793199738
Test score: 0.6269113149847095
Best Hyperparameters: {}
0.010254397
0.041511297
0.071726866
0.18583478
0.028814957
0.007700446
0.0037069523
0.015391553
0.011819054
0.004800887
0.043136735
0.018432096
0.0016665019
0.0
0.012237415
0.0
0.008604575
0.0035748847
0.006507039
0.0008660301
0.0041592857
0.003263319
0.006672943
0.01186741
0.010660751
0.0012956548
0.0027627551
0.0041472414
0.0011860399
0.0044237277
0.015925921
0.00092099514
0.0022367656
0.0034327458
0.00097306183
0.0
0.0
0.0
0.007456009
0.04569818
0.0034413615
0.028620189
0.004916884
0.004717954
0.006937267
0.0068953247
0.0035340039
0.0
0.0
0.0
0.0010070407
0.002364166
0.0006232585
0.0067159724
0.00073309627
0.002300852
0.00239093
0.004192091
0.012299583
0.01301302
0.007090455
0.0034114553
0.0012685378
0.0022961327
0.007849133
0.00090693275
0.003502176
0.003800994
0.0016016557
0.0018887498
0.0025318586
0.008227434
0.0027681466
0.003727562




Elapsed time to compute best fit: 90.215 seconds
Cross-validation score: 0.645282764246095
Test score: 0.663430420711974
Best Hyperparameters: {}
0.0059339595
0.021894155
0.13515341
0.0648723
0.0076864213
0.0055716974
0.004895409
0.013487714
0.00431297
0.0028164359
0.025645124
0.051713105
0.0009468382
0.0
0.009430641
0.0
0.016371869
0.0033335036
0.0058634942
0.02607946
0.010737025
0.006424966
0.0077557764
0.016652934
0.0017498984
0.0017317649
0.0036334482
0.004548139
0.0016202981
0.0033340352
0.014096252
0.0026331416
0.0009905811
0.0049922657
0.0012363897
0.0
0.0
0.0
0.0038914667
0.11534247
0.0052540335
0.0041529555
0.0
0.0021916314
0.007672006
0.00048051387
0.005171293
0.0
0.0
0.0
0.000977658
0.0017269903
0.0009909039
0.00030895486
0.026450584
0.002306485
0.0040631713
0.002551298
0.008340257
0.0066969283
0.00075008697
0.003954148
0.0016059002
0.010902681
0.0045112404
0.0021146275
0.006682187
0.0013597822
0.00036849585
0.0021283554
0.0016506368
0.003016726
0.0006094535
0.0016784385
0.0



Elapsed time to compute best fit: 92.451 seconds
Cross-validation score: 0.6585765471990446
Test score: 0.6596091205211727
Best Hyperparameters: {}
0.004154681
0.05890317
0.0796988
0.14720148
0.008188882
0.011761996
0.0035744556
0.003682224
0.0017415412
0.0032939764
0.031403795
0.03531145
0.0013573304
0.0
0.0062876604
0.0
0.015761614
0.0063991128
0.007114038
0.018962543
0.0081814425
0.010905995
0.0046893177
0.0020188994
0.0041841078
0.0008172921
0.0006277532
0.0010555781
0.001554603
0.0041007255
0.014910072
0.0016848977
0.0020473788
0.0028413394
0.0006955451
0.0
0.0
0.0
0.006194614
0.08666835
0.0026706185
0.0028409497
0.0016569621
0.009559041
0.0026535515
0.011690018
0.0027551723
0.0
0.0
0.0
0.0010962405
0.0020756074
0.002306965
0.0025527657
0.0046557155
0.0030032918
0.0008285297
0.0018347779
0.01669522
0.0034705668
0.002419257
0.0060086004
0.0056099924
0.0030701656
0.007016422
0.0048762276
0.0025687532
0.015742581
0.0027638322
0.002328523
0.0022254083
0.0057498356
0.0009306158
0.00295



Elapsed time to compute best fit: 104.599 seconds
Cross-validation score: 0.6605541817262289
Test score: 0.7049180327868851
Best Hyperparameters: {}
0.006114035
0.034108877
0.07457282
0.18205021
0.0176613
0.015735028
0.003119043
0.009210391
0.0054581054
0.0083631165
0.04385079
0.028092965
0.0039898925
0.0
0.010261996
0.0
0.01914069
0.008591
0.009555547
0.0037245841
0.0068013268
0.00068835396
0.0045159617
0.0015823807
0.0055970512
0.001188757
0.0027898455
0.0011698075
0.0017600718
0.006653428
0.018544182
0.0015102279
0.0019561315
0.0034188735
0.0017180586
0.0
0.0
0.0
0.00437669
0.0
0.013361306
0.013946894
0.004716165
0.010522085
0.006150885
0.0034600992
0.0020159867
0.0
0.0
0.0
0.0012576776
0.0012043912
0.0015219158
0.00051726605
0.00057203195
0.0015931434
0.004076523
0.0124606965
0.0074379747
0.005065145
0.0058094077
0.009488056
0.0016577089
0.0012207574
0.0031685196
0.0037258952
0.0023794556
0.00054132444
0.0008456245
0.0025029588
0.0035333151
0.0076678502
0.0013494679
0.00034828743
0



Elapsed time to compute best fit: 93.762 seconds
Cross-validation score: 0.6448017398419014
Test score: 0.6756756756756757
Best Hyperparameters: {}
0.010084781
0.04870236
0.064991884
0.18404154
0.022285614
0.012078663
0.0017523563
0.0118584875
0.0065018386
0.0057145925
0.022496654
0.030922271
0.004886064
0.0
0.011634396
0.0
0.009122167
0.012723041
0.00826784
0.010568389
0.006057374
0.004205222
0.00561556
0.0023444237
0.0027740225
0.0032359508
0.0016412954
0.0019336276
0.0005730815
0.0017038628
0.009694239
0.00091167074
0.0019497818
0.01295248
0.0010438935
0.0
0.0
0.0
0.004454938
0.025406208
0.0029861617
0.00988004
0.002577164
0.0006785996
0.0035987094
0.010575402
0.0029806977
0.0
0.0
0.0
0.0018009696
0.0016626086
0.0010021267
0.008352074
0.003588765
0.0011601525
0.0128851095
0.004156124
0.010047396
0.014874128
0.008416321
0.0055782846
0.007329612
0.008425554
0.007100964
0.0012578742
0.0024644618
0.0024365277
0.0013251242
0.002348041
0.0020544312
0.006433078
0.0010829195
0.0006647059
0.



Elapsed time to compute best fit: 88.092 seconds
Cross-validation score: 0.6479586402870573
Test score: 0.6153846153846154
Best Hyperparameters: {}
0.005991357
0.04146293
0.08205144
0.17466068
0.014648049
0.00938167
0.002565965
0.01752779
0.009570332
0.008574445
0.048693288
0.022245673
0.004925241
0.0
0.0063737133
0.0
0.008778981
0.009249364
0.0023040639
0.0014865972
0.010078925
0.0055415616
0.010915456
0.004425607
0.002610775
0.0007992115
0.0012529097
0.002817651
0.0019925924
0.003948143
0.014630593
0.0020931747
0.0014430612
0.0048394245
0.0015865791
0.0
0.0
0.0
0.0028979008
0.0
0.0019297058
0.0238295
0.0050639748
0.007917222
0.007257125
0.0
0.0023343025
0.0
0.0
0.0
0.0019184921
0.0013201776
0.0016623273
0.007111666
0.01013007
0.005458471
0.018183004
0.018754674
0.00784813
0.013952036
0.0043439884
0.004940847
0.0054193693
0.002632113
0.0027499434
0.0011622244
0.0016799862
0.012226963
0.0020518256
0.0025116387
0.0016073639
0.0061129774
0.0029038857
0.0016369842
0.0027941894
0.003742747



Elapsed time to compute best fit: 93.723 seconds
Cross-validation score: 0.6400145750449078
Test score: 0.663430420711974
Best Hyperparameters: {}
0.003348031
0.05185909
0.09088186
0.13897672
0.011392645
0.008240988
0.0024452396
0.0167549
0.013198281
0.008548536
0.040269002
0.019819977
0.002199685
0.0
0.008325648
0.0
0.0072956695
0.008387623
0.008631887
0.00092683063
0.0058120885
0.013594667
0.0112510165
0.0077136536
0.0017774147
0.002211818
0.003181568
0.0013558902
0.0015877064
0.0010924146
0.018152375
0.0013438433
0.002036581
0.009597346
0.0016489179
0.0
0.0
0.0
0.010488418
0.004263933
0.004321757
0.033450406
0.0043995297
0.0066187372
0.019577371
0.0
0.0039278576
0.0
0.0
0.0
0.0012838268
0.003305181
0.00074486044
0.002410678
0.0014401021
0.003803682
0.00054399396
0.005730464
0.018077444
0.028775902
0.0039946265
0.010197373
0.004735508
0.00077744777
0.0033749957
0.002688458
0.003092514
0.004435112
0.009150821
0.0022649018
0.001594104
0.006987034
0.0024610893
0.0053289705
0.0024880404




Elapsed time to compute best fit: 92.219 seconds
Cross-validation score: 0.6427530755052009
Test score: 0.6590257879656161
Best Hyperparameters: {}
0.0070899217
0.023696769
0.12332998
0.054493967
0.003623745
0.013823977
0.0031301116
0.010054089
0.009501624
0.016549282
0.045150016
0.027469905
0.0036538544
0.0
0.014465588
0.0
0.004723375
0.017385984
0.0057888525
0.026602741
0.006856602
0.0027834768
0.016863758
0.006140741
0.0054938034
0.0037133938
0.0007040035
0.0018133387
0.0016027808
0.0053949985
0.011340614
0.004721286
0.001770492
0.008499371
0.0018945383
0.0
0.0
0.0
0.0033384566
0.09996466
0.0058438694
0.0124465125
0.0038306087
0.007956768
0.019507393
0.0021166273
0.006106013
0.0
0.0
0.0
0.0013389048
0.0025190224
0.0014801057
0.009076966
0.001054998
0.002102719
0.0006775312
0.0042251544
0.012538612
0.0048292796
0.0029278493
0.0017040891
0.0024805537
0.012390694
0.0041431193
0.0022998904
0.0022207627
0.010555076
0.0031591177
0.0020786598
0.0020765623
0.009410596
0.0026873143
0.0006036



Elapsed time to compute best fit: 90.261 seconds
Cross-validation score: 0.6435120152753425
Test score: 0.6881533101045296
Best Hyperparameters: {}
0.0065741586
0.038653944
0.10079169
0.17460862
0.0073710177
0.019608052
0.0058231982
0.01998833
0.0033375488
0.0059282808
0.030052269
0.030470574
0.0024934544
0.0
0.01094138
0.0
0.006205807
0.020610463
0.004303129
0.00076779124
0.008044412
0.0058274297
0.014090878
0.0014996849
0.0024547416
0.0025462955
0.006120712
0.0032643252
0.0034848966
0.0035519341
0.015091841
0.0011720839
0.0006647053
0.0042586573
0.0018205146
0.0
0.0
0.0
0.0063358173
0.016836096
0.00320581
0.011821207
0.0009597714
0.0007785248
0.0046698027
0.0
0.007951825
0.0
0.0
0.0
0.0010312693
0.002904712
0.0021346232
0.0006500587
0.0026576745
0.002932293
0.0017972605
0.0057492685
0.00910103
0.0178379
0.006345519
0.005998445
0.0017880966
0.0029280211
0.005137078
0.0023141298
0.0036800215
0.004065859
0.0014033857
0.0021179914
0.002556058
0.011995342
0.0018703568
0.002229207
0.002091

### 4.4.3 LightGBM

In [122]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

fiftyfifty_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    fiftyfifty_lightgbm_performance_normalized_df = pd.concat([fiftyfifty_lightgbm_performance_normalized_df, new_performance_df])
    

fiftyfifty_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/fiftyfifty_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 11.573 seconds
Cross-validation score: 0.7113322075117299
Test score: 0.8006814310051108
Best Hyperparameters: {}
1108.9611670970917
16569.272807240486
53781.95633506775
35968.37216287851
817.7277435064316
1737.569772720337
557.7845948934555
981.9974361658096
571.7290093898773
75.38642001152039
7601.337283492088
4701.134654164314
114.09622025489807
0.0
1289.7106617689133
0.0
355.00965666770935
398.4365887641907
193.59717071056366
41.46989047527313
694.1690406799316
574.3064433336258
497.6763904094696
63.71808075904846
267.9891700744629
177.4033603668213
35.870099902153015
199.4885734319687
62.64492917060852
166.92065978050232
1020.827041387558
128.54270935058594
96.77608954906464
1234.900576710701
114.67648923397064
0.0
0.0
0.0
312.2677712440491
593.4102935791016
258.02084159851074
603.380760550499
9.557729840278625
501.63649463653564
931.9587335586548
2.2435801029205322
259.8534710407257
0.0
0.0
0.0
134.94150006771088
192.6993991136551
203.61084973812



Elapsed time to compute best fit: 10.663 seconds
Cross-validation score: 0.7148026796216979
Test score: 0.793103448275862
Best Hyperparameters: {}
2049.8524140119553
20816.011796832085
50442.749661684036
34686.83184838295
1715.4797345399857
2792.493401288986
140.96754145622253
457.57757818698883
393.4008604288101
204.80873787403107
4049.3840406537056
2550.3752804994583
199.0824717283249
0.0
1131.6125655174255
0.0
1084.7472531795502
275.64681804180145
675.9692735671997
124.45666813850403
682.2007576227188
615.4949785470963
507.9502600431442
26.1626296043396
291.5008804798126
20.143969774246216
16.44225025177002
137.6755303144455
45.53889989852905
177.4220187664032
1955.2982362508774
56.25673949718475
129.83602094650269
707.6825747489929
98.37774062156677
0.0
0.0
0.0
87.98735904693604
481.17309951782227
562.588161945343
518.4457023143768
244.93666863441467
391.7012845277786
602.688807964325
9.674630165100098
366.0910601615906
0.0
0.0
0.0
132.84565901756287
131.5467791557312
163.704448819



Elapsed time to compute best fit: 10.611 seconds
Cross-validation score: 0.7222079411967096
Test score: 0.717948717948718
Best Hyperparameters: {}
1893.6666091680527
11442.75721013546
70324.26694130898
21756.37646383047
1137.0104916095734
1275.1970286369324
341.6856869459152
919.1239117383957
1215.7926586866379
244.19519859552383
7264.655363082886
5432.315036535263
171.29113841056824
0.0
1828.9265640974045
0.0
843.1030620336533
293.5478184223175
560.0480768680573
211.73759961128235
450.8877223730087
195.77138900756836
305.6233217716217
111.84850406646729
177.4068409204483
44.22406005859375
30.785760641098022
305.14410841464996
42.16319113969803
94.29113900661469
1965.9946982860565
130.8017807006836
120.56174767017365
647.2822470664978
132.36259126663208
0.0
0.0
0.0
397.9594416618347
99.89603805541992
234.97394239902496
789.0252265930176
64.24994015693665
1523.0562249422073
263.3106108903885
11.338789939880371
224.54418003559113
0.0
0.0
0.0
102.68954885005951
117.40590953826904
147.2854



Elapsed time to compute best fit: 11.459 seconds
Cross-validation score: 0.715940265224458
Test score: 0.7267950963222416
Best Hyperparameters: {}
2234.7423692941666
19394.220409989357
36847.76154136658
47693.888878941536
656.1539055109024
1389.2928440570831
117.05675065517426
619.4855482578278
179.88804745674133
222.19455790519714
6954.64342302084
3903.9255390167236
247.07235205173492
0.0
1586.4959570169449
0.0
255.23604547977448
560.4895685911179
689.5253887176514
163.30754852294922
717.1182072162628
181.82795453071594
279.9107493162155
46.79617881774902
333.7668145895004
119.90547132492065
180.05910819768906
232.62331986427307
38.485779881477356
2277.6006846427917
1279.950276017189
12.367808043956757
103.16449081897736
623.6989525556564
52.853615164756775
0.0
0.0
0.0
95.65476953983307
76.88400268554688
206.60782063007355
752.6916253566742
2.710200071334839
639.8931721448898
927.5404909849167
7.6805901527404785
456.81989681720734
0.0
0.0
0.0
124.79160153865814
156.46405243873596
202.



Elapsed time to compute best fit: 11.148 seconds
Cross-validation score: 0.7390483556742797
Test score: 0.745762711864407
Best Hyperparameters: {}
1146.2193437814713
18142.092892766
51314.1918336153
36736.00713765621
1115.7734904289246
1452.1886581778526
161.691850066185
1050.094745516777
1044.1040669679642
285.6979978084564
7806.909984767437
4326.361228466034
153.12213003635406
0.0
1260.6907513141632
0.0
482.03988683223724
490.03167057037354
1099.3011783361435
52.69474947452545
316.86607003211975
233.40615022182465
325.75257766246796
202.53573501110077
38.063169956207275
549.884992480278
92.5187097787857
178.97999739646912
19.03948986530304
212.0175679922104
1573.0936275720596
85.24565982818604
147.03667974472046
1434.214410185814
179.59317028522491
0.0
0.0
0.0
805.7555242776871
10.395099639892578
640.8033825159073
216.37744188308716
53.840988636016846
322.59556090831757
462.7033680677414
0.0
153.68649125099182
0.0
0.0
0.0
116.49950778484344
394.87204897403717
123.38160073757172
80.61



Elapsed time to compute best fit: 10.886 seconds
Cross-validation score: 0.7325110807894761
Test score: 0.7225913621262459
Best Hyperparameters: {}
2192.8607031702995
30120.59655046463
62271.43039226532
14508.436970055103
766.1007667779922
638.8747042417526
352.5196306705475
470.3918652534485
517.4101469516754
1480.187304854393
4626.445353925228
6536.068227887154
200.05383944511414
0.0
1419.8421911001205
0.0
1056.2852057218552
415.73746079206467
132.85869789123535
142.1340926885605
322.50888085365295
304.54968988895416
473.5254123210907
124.69645929336548
80.87070918083191
107.2045841217041
77.8666307926178
253.4625198841095
39.89567983150482
130.08071041107178
1116.2461915016174
233.74645900726318
91.93163013458252
1062.943949341774
55.40374767780304
0.0
0.0
0.0
261.5613007545471
482.44300079345703
349.53541922569275
752.2158100605011
7.609920144081116
396.1140556335449
516.9284588098526
0.0
349.3116898536682
0.0
0.0
0.0
197.0887097120285
452.2994393110275
76.5986498594284
18.66081035



Elapsed time to compute best fit: 11.889 seconds
Cross-validation score: 0.7256420402925963
Test score: 0.7529610829103215
Best Hyperparameters: {}
1937.4317017793655
21573.19527310133
47801.40674316883
35854.57718920708
836.475130200386
2759.7162145376205
116.16540157794952
934.3711495399475
1093.2195389270782
624.1378499269485
4234.847002506256
4616.270655274391
137.14206159114838
0.0
817.5111310482025
0.0
240.6583675146103
483.30788147449493
578.3651556968689
54.475568652153015
529.8632113933563
380.4309251308441
183.541916847229
142.16855001449585
48.42493987083435
12.181789875030518
14.543149709701538
120.0149108171463
52.07836973667145
317.4539465904236
2309.805554509163
61.961899518966675
84.96338880062103
1000.4056272506714
117.68521046638489
0.0
0.0
0.0
298.6319979429245
174.1034984588623
425.8212181329727
619.0853855609894
124.13744020462036
609.4888268709183
334.0729305744171
12.736100196838379
687.2908495664597
0.0
0.0
0.0
219.21318972110748
280.39885890483856
283.653760671



Elapsed time to compute best fit: 11.782 seconds
Cross-validation score: 0.7140248107993976
Test score: 0.782312925170068
Best Hyperparameters: {}
1849.9970343708992
15598.41131079197
55420.215818583965
39022.03396511078
1110.0650035142899
1519.9199427366257
159.24449002742767
1190.752360701561
462.4353997707367
85.23139989376068
3656.463211297989
3985.9011603593826
157.3651180267334
0.0
2258.1091824769974
0.0
575.675443649292
319.94138979911804
522.6723386049271
270.97812831401825
524.6785662174225
363.67275726795197
817.5276560783386
172.55742013454437
46.376248836517334
120.1076283454895
61.57519996166229
228.13852632045746
15.655330181121826
221.80129849910736
1072.3443039655685
101.67684054374695
233.61578845977783
670.2672637701035
119.71120941638947
0.0
0.0
0.0
110.0335898399353
439.26549530029297
252.54774963855743
1058.5027515888214
4.714240074157715
281.80224573612213
315.6325581073761
3.2125000953674316
366.05407989025116
0.0
0.0
0.0
134.43480956554413
221.4416913986206
157.



Elapsed time to compute best fit: 10.722 seconds
Cross-validation score: 0.7294699498397498
Test score: 0.7534246575342466
Best Hyperparameters: {}
1623.1346909999847
19891.351700425148
56890.60516387224
32431.530672192574
1537.7206735610962
2032.6523849964142
254.33651685714722
388.2857085466385
1216.7548820972443
598.8404351472855
6482.246711432934
1958.605426311493
176.50122517347336
0.0
678.0492078065872
0.0
546.3807648420334
495.4324113726616
272.1888107061386
35.11715018749237
676.8883814811707
219.66895020008087
445.766504406929
128.91688930988312
115.98914563655853
0.0
4.613789916038513
219.778071641922
12.04410994052887
427.58567559719086
3433.6290144324303
314.79440796375275
51.48307263851166
464.63580572605133
55.50294637680054
0.0
0.0
0.0
383.15943002700806
122.57055902481079
439.6284810900688
348.56298828125
93.91766738891602
1430.4276849031448
665.3735270500183
25.08916997909546
103.49716997146606
0.0
0.0
0.0
149.21028912067413
239.1174511909485
109.18876886367798
88.6607



Elapsed time to compute best fit: 11.292 seconds
Cross-validation score: 0.7340226983760286
Test score: 0.7360406091370558
Best Hyperparameters: {}
1344.815575838089
23504.30445933342
70127.81583303213
13495.628183960915
1420.0095257759094
542.0827037096024
300.46496176719666
977.1080323457718
396.2121649980545
645.6273137331009
4818.700614690781
4670.650526344776
329.8191065788269
0.0
1315.5104795694351
0.0
584.6564019918442
661.3447433114052
954.297040939331
85.89217925071716
478.39563179016113
361.4420176744461
506.59876108169556
146.51751112937927
74.71705043315887
88.19821059703827
88.23841893672943
209.3453322649002
117.7469584941864
394.71744680404663
2410.132154941559
75.18692982196808
83.09176063537598
767.5149657726288
163.07692551612854
0.0
0.0
0.0
273.17419934272766
608.6776885986328
268.37196123600006
638.7712100744247
0.0
471.01687800884247
1443.286003947258
7.7355499267578125
370.1259799003601
0.0
0.0
0.0
268.92904782295227
354.16004168987274
115.03308856487274
486.00416



Elapsed time to compute best fit: 11.367 seconds
Cross-validation score: 0.7263935067188823
Test score: 0.7252559726962456
Best Hyperparameters: {}
1831.301873266697
15453.937365293503
54899.589441776276
39386.00859928131
1306.308688044548
1632.119856595993
48.27983009815216
1097.495815873146
222.92497277259827
945.2831463813782
3631.006372332573
4974.088476896286
185.82160127162933
0.0
974.6507402658463
0.0
156.87803781032562
502.5884972810745
1527.5564962625504
89.36502122879028
721.6285817623138
160.8809517621994
273.51810824871063
254.83249640464783
59.303658962249756
4.366239905357361
31.641693890094757
104.69316971302032
87.1342601776123
95.80349111557007
583.4421784877777
63.00941759347916
79.04646003246307
1278.5534296035767
146.48109591007233
0.0
0.0
0.0
306.3006031513214
323.977112531662
286.54804050922394
414.0638723373413
1.6198300123214722
167.5581488609314
180.24896943569183
5.331469893455505
467.61604356765747
0.0
0.0
0.0
202.98836064338684
259.1284190416336
283.10297989



Elapsed time to compute best fit: 10.535 seconds
Cross-validation score: 0.7150973916848016
Test score: 0.7565217391304346
Best Hyperparameters: {}
2122.1335178613663
22466.53583264351
50842.25670862198
31375.955252289772
1167.3865295648575
1163.4984226226807
490.55251955986023
1078.084954738617
393.95170843601227
182.78098797798157
5786.9656311273575
6886.7722355127335
174.50857937335968
0.0
991.3281750679016
0.0
829.8159619569778
496.90288257598877
287.8754519224167
229.50563901662827
774.5329765081406
232.4582793712616
323.33544927835464
23.763049721717834
96.66674041748047
29.436780333518982
60.75372004508972
249.6015807390213
18.08424997329712
183.73764944076538
1480.2831597328186
71.9397302865982
137.54132890701294
1061.6817072629929
42.18949019908905
0.0
0.0
0.0
281.55423951148987
293.0365982055664
222.8318418264389
878.331200838089
57.15298092365265
310.57018887996674
315.03444933891296
0.0
434.6173413991928
0.0
0.0
0.0
370.5944970846176
276.13411915302277
289.0615898370743
42.



Elapsed time to compute best fit: 10.724 seconds
Cross-validation score: 0.7392840306365187
Test score: 0.7240204429301534
Best Hyperparameters: {}
1241.6949095726013
22276.641138494015
41168.59157752991
44836.3922650218
476.8529465198517
2016.6468905210495
209.80092132091522
1156.9730715751648
680.6913869380951
98.4564915895462
5280.517081260681
2428.113750219345
303.4702101945877
0.0
1154.6850452423096
0.0
155.15281021595
829.3834926486015
513.4765467643738
60.923559069633484
725.5165892839432
353.93735206127167
351.6442121267319
16.694469690322876
123.73887169361115
34.831090450286865
54.636390209198
153.2761801481247
50.78087019920349
37.359169721603394
1493.434088587761
318.0154279470444
90.61070108413696
392.8677531480789
176.45993828773499
0.0
0.0
0.0
321.15270578861237
177.0097999572754
284.4720995426178
1496.6871834993362
104.91785335540771
529.3778423070908
327.14468944072723
4.623389959335327
307.18323040008545
0.0
0.0
0.0
187.9018730521202
531.1989860534668
142.849928140640



Elapsed time to compute best fit: 11.056 seconds
Cross-validation score: 0.7254339605732673
Test score: 0.7057823129251701
Best Hyperparameters: {}
2514.1892384290695
13277.08177292347
77544.15365064144
12285.316515147686
395.0370372533798
554.85657787323
139.41542053222656
1073.6336677074432
808.1846351623535
189.6610722541809
8157.709217727184
12101.451895296574
58.73745954036713
0.0
943.6060526371002
0.0
379.2280424833298
624.5309461951256
349.0644875764847
323.3068585395813
615.5635305643082
115.90943497419357
820.6080976724625
105.2830102443695
298.9186851978302
18.822630167007446
68.65670096874237
103.22144162654877
29.548929691314697
580.3952924013138
887.3953570127487
191.74371421337128
77.60937011241913
723.5743283629417
161.43411946296692
0.0
0.0
0.0
119.35522043704987
434.91679763793945
198.2485888004303
671.4770572185516
1.6256099939346313
381.69568276405334
1465.5706502199173
1.6687099933624268
196.54153990745544
0.0
0.0
0.0
202.46507674455643
100.01976376771927
191.110499



Elapsed time to compute best fit: 12.151 seconds
Cross-validation score: 0.7292867231865345
Test score: 0.748709122203098
Best Hyperparameters: {}
1133.6365641355515
16079.03278374672
51473.37727230787
43816.010541558266
1899.9785282611847
2117.1612956523895
247.53709435462952
1427.6448378562927
81.12763905525208
212.44061470031738
2067.8561711907387
3803.0946966409683
84.45685887336731
0.0
1108.3028490543365
0.0
220.28802752494812
431.34576058387756
1324.8840582370758
45.64882040023804
820.0113226175308
164.49324202537537
539.532518863678
50.22391998767853
149.7172315120697
245.63310742378235
69.7619196176529
224.0946593284607
35.48057007789612
308.32151222229004
1316.6704038381577
135.26700007915497
136.39481925964355
1138.539246559143
34.68773031234741
0.0
0.0
0.0
201.4840705394745
486.279296875
88.31287002563477
431.501473069191
7.720279932022095
667.5114821195602
813.9066804647446
10.93589997291565
248.0749695301056
0.0
0.0
0.0
223.4781875014305
269.08733916282654
329.773360848426



Elapsed time to compute best fit: 11.106 seconds
Cross-validation score: 0.7178981738445456
Test score: 0.7347972972972971
Best Hyperparameters: {}
1057.0401697158813
14272.903890669346
62759.084968447685
33391.534328341484
1098.986296057701
1424.0290260314941
57.65412974357605
703.0522482395172
357.0572214126587
57.73793077468872
3546.7167825698853
5688.968520283699
69.48821067810059
0.0
1349.2757366895676
0.0
746.5807324647903
212.50333684682846
397.77681159973145
33.79679977893829
711.1079415082932
74.6656197309494
493.56843280792236
53.596060156822205
87.88930034637451
39.58120012283325
133.2192167043686
433.825102686882
157.75612330436707
369.08376228809357
2111.2461540699005
103.62459933757782
99.53153073787689
148.37400043010712
53.7360897064209
0.0
0.0
0.0
206.12842893600464
160.6990828514099
168.31679928302765
866.110001206398
4.640500068664551
149.21986937522888
307.6306425333023
0.0
191.10987854003906
0.0
0.0
0.0
315.1774901151657
358.8117204904556
228.11833083629608
181.834



Elapsed time to compute best fit: 11.112 seconds
Cross-validation score: 0.7436666107042915
Test score: 0.7267950963222416
Best Hyperparameters: {}
2128.406953692436
23139.809481859207
36685.1865991354
44630.27548646927
716.3184058666229
1880.0562890768051
108.2225399017334
154.99128937721252
374.8245325088501
509.400267124176
7133.741362810135
2662.872521340847
300.2161223888397
0.0
1515.9938496351242
0.0
296.3203032016754
702.8547803163528
627.955714225769
48.126060009002686
496.2379198074341
187.52570915222168
431.4161425828934
18.473959922790527
60.67326104640961
54.85408091545105
107.92828977108002
233.22636723518372
52.65692484378815
254.95930290222168
1188.5083082914352
46.712780237197876
88.17702031135559
588.7105394601822
113.52838170528412
0.0
0.0
0.0
163.7829166650772
100.59074831008911
136.14413797855377
1757.3754279613495
87.75192046165466
710.6624580621719
499.6276854276657
0.0
210.83349084854126
0.0
0.0
0.0
184.79801392555237
178.67503082752228
133.8635412454605
33.79807



Elapsed time to compute best fit: 10.659 seconds
Cross-validation score: 0.7238732416702727
Test score: 0.6993006993006994
Best Hyperparameters: {}
1345.5565011501312
18968.609617352486
48193.310784339905
38410.86540663242
1230.3890227079391
1395.2025787830353
159.86734223365784
718.6129386425018
426.6765806674957
169.6151306629181
7950.405189871788
3369.838593006134
141.45908975601196
0.0
1338.529358625412
0.0
388.7860474586487
275.13748359680176
555.8717194795609
28.456199288368225
888.171869635582
164.29890990257263
845.2846401333809
41.37930953502655
135.24491786956787
141.78212022781372
101.73437893390656
313.9095711708069
39.7885205745697
121.93922138214111
2072.611233830452
126.37271094322205
162.49030113220215
1209.68914437294
100.7382402420044
0.0
0.0
0.0
280.49551153182983
32.98776042461395
380.1126229763031
1008.2582561969757
25.36789035797119
534.9390248060226
356.9072594642639
7.3969197273254395
299.8132004737854
0.0
0.0
0.0
141.41261756420135
320.18924939632416
237.751119



Elapsed time to compute best fit: 10.613 seconds
Cross-validation score: 0.6990684207632283
Test score: 0.7487520798668885
Best Hyperparameters: {}
1530.0740922689438
15176.218529105186
61266.77670991421
32537.290947794914
1506.2111617326736
1768.9163875579834
346.1689952611923
848.783182144165
597.207763671875
267.41003143787384
4784.208731532097
5227.135758399963
141.64660036563873
0.0
1282.687227845192
0.0
188.49051916599274
306.93840074539185
870.340991973877
445.7541218996048
526.9934189319611
135.42168962955475
391.20224702358246
132.50021147727966
35.53356075286865
91.7164980173111
17.321269989013672
157.13958752155304
115.08877742290497
216.99238401651382
1581.845808148384
183.50359070301056
84.95366024971008
691.943793296814
164.63637018203735
0.0
0.0
0.0
319.2087984085083
304.4470977783203
266.16132140159607
246.37575018405914
2.2554399967193604
726.4385358095169
574.4523704051971
0.0
195.54683029651642
0.0
0.0
0.0
330.97779512405396
233.82038176059723
233.84462094306946
40.2



Elapsed time to compute best fit: 11.120 seconds
Cross-validation score: 0.725557499242922
Test score: 0.7413793103448276
Best Hyperparameters: {}
1397.011470079422
32073.655363619328
31944.821681141853
45089.93683338165
872.9522761106491
1488.262512087822
193.1503518819809
871.6905626058578
792.0642856359482
358.12660336494446
5362.435202598572
2014.97678732872
147.171400308609
0.0
1361.9049398899078
0.0
286.8861618041992
579.5572057962418
284.2733118534088
92.56723976135254
1009.8085148334503
336.12900042533875
774.925575375557
475.54684579372406
42.29206991195679
8.451760053634644
63.27267861366272
95.16419041156769
108.80295741558075
254.31182837486267
832.3664420843124
178.5492868423462
48.358800172805786
1291.6704493761063
187.73011898994446
0.0
0.0
0.0
440.1233078241348
2.863529920578003
537.3516186475754
713.1786235570908
2.003159999847412
349.6230938434601
671.930079638958
0.0
225.8266612291336
0.0
0.0
0.0
174.6582908630371
196.0182707309723
142.3894007205963
70.97089910507202



Elapsed time to compute best fit: 10.786 seconds
Cross-validation score: 0.7310890150029213
Test score: 0.7204116638078902
Best Hyperparameters: {}
2451.4005274772644
18129.75202047825
53647.42368507385
34531.56077814102
1314.0615981817245
1354.5845386981964
200.62373995780945
941.2102273702621
626.5326660871506
375.2196183204651
4674.002467513084
4969.148459911346
116.01209092140198
0.0
1301.529133796692
0.0
1252.645390033722
446.19638073444366
579.7286930084229
45.78921926021576
508.26567709445953
556.999098777771
433.10373842716217
58.30955570936203
156.32615220546722
54.64267110824585
53.31257081031799
381.4098211526871
17.86532950401306
35.38395953178406
2722.160179376602
167.49177038669586
106.13167071342468
455.6182919740677
228.35751008987427
0.0
0.0
0.0
324.6967535018921
98.2948055267334
372.95820540189743
840.7642532587051
0.0
200.18249011039734
262.84257996082306
6.769889831542969
242.3349403142929
0.0
0.0
0.0
174.62926924228668
228.55705189704895
378.96722984313965
28.89232



Elapsed time to compute best fit: 10.799 seconds
Cross-validation score: 0.7260320436600903
Test score: 0.7155025553662692
Best Hyperparameters: {}
1290.3145529031754
22541.058974444866
53853.17872548103
34164.3702802062
986.6406263113022
1754.7345762252808
141.23260056972504
257.58149087429047
124.19242918491364
211.0453199148178
2738.0630959272385
5965.151413202286
63.676448702812195
0.0
1011.7533338069916
0.0
342.77836287021637
591.7953639030457
441.5535808801651
15.19927990436554
718.7258973121643
356.85342848300934
408.8818165063858
38.60277020931244
104.4147207736969
30.93575930595398
149.49984848499298
365.9681121110916
44.94986951351166
121.89129948616028
2537.9849566221237
51.870739102363586
190.58501958847046
301.11200189590454
210.91582822799683
0.0
0.0
0.0
268.8052316904068
129.2368984222412
435.50250709056854
893.8417984247208
48.14900016784668
288.07927989959717
248.57171189785004
1.8923399448394775
280.20690274238586
0.0
0.0
0.0
368.69933235645294
225.93611228466034
293.



Elapsed time to compute best fit: 11.399 seconds
Cross-validation score: 0.7235508364889803
Test score: 0.7296849087893864
Best Hyperparameters: {}
1365.4406245946884
14599.331531107426
74394.21602606773
14105.616352915764
756.5793653726578
1599.4869140386581
152.6886500120163
575.2779241800308
1055.8494180440903
363.5055555701256
7776.722949385643
6075.03005695343
41.83263027667999
0.0
858.05053627491
0.0
407.2339823246002
397.91875898838043
539.5449234247208
665.5598523616791
479.57561683654785
369.7288194298744
496.87330508232117
19.65872025489807
319.23925018310547
27.598199605941772
55.210869908332825
415.17097640037537
81.16247081756592
210.42901146411896
1813.2839270830154
50.91035079956055
91.92147958278656
536.9621539115906
61.62764930725098
0.0
0.0
0.0
528.4396619796753
412.7642147541046
346.23763704299927
748.7668689489365
62.917600870132446
447.4701586961746
375.244166970253
0.0
258.1834725141525
0.0
0.0
0.0
226.93282961845398
232.08978152275085
153.43059968948364
1.7169799



Elapsed time to compute best fit: 10.694 seconds
Cross-validation score: 0.7258924321079123
Test score: 0.7267950963222416
Best Hyperparameters: {}
1823.8137967586517
15369.063959419727
74883.97286123037
12639.984292149544
1761.3924096822739
3971.3097344636917
142.57275938987732
611.992448925972
735.4214318990707
1856.253789126873
3655.2103900909424
5930.774025022984
162.5073620080948
0.0
1249.1584602594376
0.0
152.34100008010864
335.8065503835678
765.3733311891556
100.5371001958847
589.6326860189438
250.67604100704193
513.0104701519012
166.45973205566406
113.20288002490997
23.149409770965576
34.12822091579437
211.61353945732117
179.38846731185913
237.1324644088745
1671.2337287664413
91.57542073726654
87.75231003761292
1140.0287085175514
184.0133512020111
0.0
0.0
0.0
128.36924958229065
207.52820205688477
418.35869348049164
594.5261583328247
0.0
628.7560595273972
701.4488559961319
34.64144992828369
373.5529224872589
0.0
0.0
0.0
321.4784553050995
186.86721003055573
199.99183082580566
394



Elapsed time to compute best fit: 11.461 seconds
Cross-validation score: 0.7296240902848316
Test score: 0.7627118644067796
Best Hyperparameters: {}
1117.9896434545517
16277.309513688087
48241.55089235306
47393.18129134178
1456.6808675527573
571.0435646772385
198.26833963394165
318.4999783039093
287.96522068977356
221.7514539361
4726.446916759014
4028.1729102134705
98.6113201379776
0.0
1061.1738232374191
0.0
448.511682510376
495.86164700984955
591.8822147846222
26.031230211257935
308.9890310764313
256.286843419075
474.5250688791275
323.4394559264183
22.87582039833069
175.54630947113037
85.17455899715424
83.7679306268692
110.00637173652649
145.745210647583
2011.5201169252396
133.0195701122284
168.22475862503052
470.1514302492142
122.7443699836731
0.0
0.0
0.0
217.73092138767242
125.67099857330322
182.90488147735596
580.5894455909729
0.0
516.2268371582031
254.5800187587738
0.0
354.9631813764572
0.0
0.0
0.0
145.32853066921234
232.26086592674255
292.0691806077957
3.084019899368286
75.9323674

# 5. Modeling - Non-Normalization

In [123]:
features = processedData_dataProcessingFeatures
labels = processedData_dataProcessingLabels

## 5.1 Rebalancing Strategy - None

### 5.1.1 Random Forest

In [124]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

none_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    none_randomforest_nonnormalized_performance_df = pd.concat([none_randomforest_nonnormalized_performance_df, new_performance_df])
    
none_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/none_randomforest_nonnormalized_performance_df.csv")



### 5.1.2 XGBoost

In [125]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


none_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    none_xgboost_nonnormalized_performance_df = pd.concat([none_xgboost_nonnormalized_performance_df, new_performance_df])
    

none_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/none_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 52.574 seconds
Cross-validation score: 0.7943487905824902
Test score: 0.7587064676616916
Best Hyperparameters: {}
0.013673065
0.038827274
0.022194145
0.038925923
0.006414761
0.009344051
0.016272571
0.01011065
0.011354514
0.01299262
0.024927579
0.01959663
0.0067885793
0.0
0.0068058097
0.0
0.018527366
0.038212873
0.012633735
0.0061570755
0.012663103
0.005368493
0.0054052463
0.008540484
0.0070756273
0.0050819777
0.0068657272
0.005867497
0.0049412223
0.0087335585
0.001553941
0.0040831147
0.006716085
0.012181806
0.0040340396
0.0
0.0
0.0
0.013644274
0.007093266
0.013622502
0.009180837
0.056394484
0.0069894874
0.012951855
0.0
0.005571084
0.0
0.0
0.0
0.0030316683
0.0096761035
0.00263082
0.0066177202
0.00048010182
0.0014484135
0.0052340613
0.0031875717
0.009917059
0.01720336
0.0053173536
0.011848517
0.002811026
0.0
0.0028873545
0.008089255
0.008821128
0.007151657
0.005811154
0.0041539567
0.007005304
0.0045890366
0.0052734944
0.0058140983
0.0042420463
0.00215722



Elapsed time to compute best fit: 49.811 seconds
Cross-validation score: 0.7838847002222387
Test score: 0.776595744680851
Best Hyperparameters: {}
0.011549534
0.04041337
0.021421613
0.03414669
0.0068754815
0.021914495
0.0052481517
0.017238686
0.0073794466
0.010664693
0.022428533
0.015349091
0.007651202
0.0
0.0092774425
0.0
0.01074853
0.025450299
0.009054806
0.011097582
0.023490215
0.0057023955
0.00938348
0.0030970112
0.010997929
0.0016200732
0.010069268
0.0078100073
0.005127718
0.00652278
0.0031599714
0.004061552
0.004816493
0.0044258647
0.005002044
0.0
0.0
0.0
0.0064649405
0.0
0.010197273
0.011025544
0.058805972
0.0054097534
0.0056263017
0.0
0.004608552
0.0
0.0
0.0
0.0026462115
0.00650141
0.0031863505
0.015546669
0.002474781
0.0060798493
0.020587856
0.011211712
0.007908421
0.02178341
0.004192411
0.00885753
0.008670436
0.009323634
0.012624352
0.007367905
0.007285347
0.0076506846
0.005307299
0.0058359145
0.008246113
0.009133852
0.009599252
0.003405865
0.010457972
0.0035669098
0.00257649



Elapsed time to compute best fit: 49.731 seconds
Cross-validation score: 0.7865771165157622
Test score: 0.7777777777777777
Best Hyperparameters: {}
0.008593232
0.03831817
0.021278426
0.033878528
0.009973142
0.018004289
0.005663406
0.009489425
0.005344261
0.011956795
0.011014188
0.018398462
0.00936866
0.0
0.0068332423
0.0
0.013382602
0.03870423
0.011929326
0.0050533977
0.023735309
0.003182579
0.017719654
0.0048168083
0.0034504016
0.0011776885
0.0052511147
0.00561604
0.0054646377
0.009146831
0.004312155
0.0033270682
0.0049536065
0.007521489
0.0029657255
0.0
0.0
0.0
0.0066189505
0.007965442
0.010325753
0.016952224
0.08348459
0.0048443056
0.0026389563
0.012516444
0.005129018
0.0
0.0
0.0
0.002280306
0.010978419
0.0037604636
0.0002569781
0.005862486
0.0038641451
0.011462471
0.011862565
0.013934568
0.023965105
0.0064212843
0.008217632
0.015825085
0.0038125254
0.0093288105
0.0044783372
0.0059771533
0.00054031494
0.0044288887
0.002865569
0.0066485633
0.0016107454
0.0044627893
0.007031785
0.0033



Elapsed time to compute best fit: 52.547 seconds
Cross-validation score: 0.7958818855746616
Test score: 0.776255707762557
Best Hyperparameters: {}
0.009084873
0.0335843
0.022897614
0.03144035
0.004979528
0.012212322
0.016472213
0.011181392
0.012992586
0.012001741
0.015466572
0.018722214
0.007726316
0.0
0.0041659325
0.0
0.013436887
0.02124313
0.017925778
0.007092706
0.016906964
0.0023240128
0.013793586
0.005708853
0.0074008233
0.0010895644
0.0028711779
0.0060367384
0.0062980405
0.004911047
0.0
0.0020850925
0.0029519915
0.0010878391
0.0039518825
0.0
0.0
0.0
0.012767292
0.0017480833
0.008178089
0.023086822
0.058069233
0.0024486207
0.0050155637
0.0
0.0034988965
0.0
0.0
0.0
0.0028516275
0.0048679486
0.005109834
0.012128495
0.0082413
0.0065203593
0.035464596
0.0060713775
0.009138279
0.019134678
0.004946645
0.018197093
0.0029017727
0.016896537
0.004703365
0.008252639
0.008211042
0.012364468
0.0024840492
0.005431607
0.003956969
0.011083236
0.005393975
0.004802457
0.0040124175
0.014314453
0.006



Elapsed time to compute best fit: 48.890 seconds
Cross-validation score: 0.7774914297793345
Test score: 0.7333333333333333
Best Hyperparameters: {}
0.0062192217
0.038525954
0.022082137
0.034949526
0.009419273
0.013806917
0.00711142
0.010799585
0.016740808
0.019427244
0.011530601
0.018438913
0.0033236423
0.0
0.011728307
0.0
0.013518719
0.01076542
0.021835783
0.0031752582
0.024176735
0.0038901628
0.0146381
0.007816752
0.0062543005
0.0019171328
0.009061779
0.010189645
0.0032238502
0.0064268354
0.0045825904
0.0030350585
0.0052941423
0.008025531
0.0064927284
0.0
0.0
0.0
0.0067989607
0.0017421858
0.005824931
0.007530493
0.050756097
0.011860534
0.009694705
0.0
0.005124647
0.0
0.0
0.0
0.0020950323
0.007241777
0.004683142
0.008568936
0.005235306
0.0056670355
0.019940756
0.0068245567
0.009470696
0.025325889
0.0016696388
0.005794893
0.0034336469
0.010600872
0.0057888986
0.005858925
0.006833607
0.010178478
0.00457993
0.0056907027
0.00619922
0.0146054225
0.005330361
0.0071178535
0.0050296634
0.0038



Elapsed time to compute best fit: 51.494 seconds
Cross-validation score: 0.7378808789568843
Test score: 0.7938388625592417
Best Hyperparameters: {}
0.010386325
0.033064835
0.025022196
0.036404867
0.009176772
0.013535816
0.0076377885
0.005370149
0.008542503
0.005266632
0.01782553
0.014711135
0.0059833108
0.0
0.0077187163
0.0
0.018245194
0.027185028
0.01272178
0.007353692
0.009757439
0.0033709656
0.009733597
0.0015684479
0.005651891
0.0049294615
0.0036426685
0.009072888
0.0056543523
0.008218938
0.0036485964
0.0058394694
0.0035508673
0.0015832563
0.005967133
0.0
0.0
0.0
0.009829128
0.0154103
0.0073814257
0.018819971
0.07292144
0.008448927
0.0054686875
0.0
0.0044191955
0.0
0.0
0.0
0.003214084
0.004270449
0.0046144226
0.010281334
0.008457675
0.0015101696
0.019723827
0.015019132
0.008882029
0.01332618
0.019172305
0.0135001
0.0034040771
0.05326975
0.006299096
0.00507093
0.009718788
0.008688024
0.0033962484
0.005348523
0.00399816
0.014755097
0.0040173796
0.007778765
0.0036664915
0.0
0.00270150



Elapsed time to compute best fit: 51.553 seconds
Cross-validation score: 0.7708283755637515
Test score: 0.8296943231441049
Best Hyperparameters: {}
0.00879397
0.040952943
0.022584815
0.04281236
0.008793308
0.022098353
0.01348669
0.0070263227
0.02055933
0.013830703
0.016941877
0.016209712
0.0065838965
0.0
0.0068150065
0.0
0.013408414
0.014596866
0.013435792
0.0046028136
0.031284414
0.0031031996
0.015825696
0.020670986
0.0050794007
0.002076408
0.007277445
0.006238776
0.004635133
0.009005945
0.014070232
0.0032574479
0.006365047
0.002149021
0.0053166132
0.0
0.0
0.0
0.011104129
0.0
0.005776432
0.0063009453
0.0
0.0020220473
0.0069829533
0.0
0.0015623411
0.0
0.0
0.0
0.0044368026
0.0074066776
0.00252857
0.0036290744
0.024890922
0.0074109486
0.01380932
0.0030987686
0.0127851395
0.018334394
0.010578327
0.0089709805
0.013145108
0.012305255
0.010204198
0.004329044
0.009426919
0.0
0.0038012294
0.0037717908
0.008082655
0.010729287
0.0048294305
0.00498801
0.009043701
0.0
0.011765155
0.01006223
0.0155



Elapsed time to compute best fit: 53.347 seconds
Cross-validation score: 0.7543160260438778
Test score: 0.852017937219731
Best Hyperparameters: {}
0.008424765
0.041686293
0.026664225
0.048738357
0.005851217
0.020196721
0.005355311
0.00769248
0.01596651
0.015095571
0.010831098
0.023982063
0.004102097
0.0
0.008342378
0.0
0.0150608225
0.031977635
0.016101647
0.0039570774
0.018520545
0.0063023423
0.016681835
0.006549715
0.008128894
0.0029449142
0.011842869
0.011130747
0.0044121207
0.008647534
0.0051366133
0.0045771534
0.0036134662
0.0013175249
0.0074393856
0.0
0.0
0.0
0.006363327
0.0
0.007995807
0.017360551
0.0
0.0054103513
0.010618094
0.0
0.0022273976
0.0
0.0
0.0
0.0040744506
0.0050193695
0.008613045
0.0044071865
0.008248104
0.0037899078
0.012459162
0.010383316
0.006685667
0.0128767155
0.0060640676
0.015606696
0.0017090889
0.011706569
0.010396193
0.0071088253
0.00927884
0.0024629312
0.008274969
0.006949219
0.008924163
0.012025224
0.0068220613
0.0129518295
0.0029206527
0.0
0.0057875966
0.0



Elapsed time to compute best fit: 48.269 seconds
Cross-validation score: 0.7991026998482577
Test score: 0.7906976744186046
Best Hyperparameters: {}
0.008806972
0.041567564
0.019070966
0.034128077
0.009180924
0.014887017
0.009795197
0.0062067467
0.011424116
0.012660915
0.019876372
0.016996816
0.00795204
0.0
0.008639734
0.0
0.011672412
0.042399608
0.0090802135
0.0058247754
0.015482116
0.0059939255
0.020586949
0.007974205
0.0015896505
0.00079686847
0.0044017085
0.007593481
0.006567975
0.005343175
0.0070262128
0.0049200626
0.0047376635
0.0029570234
0.0038517523
0.0
0.0
0.0
0.014756359
0.0
0.0030096846
0.015841348
0.06989661
0.0030993922
0.011699718
0.0
0.0052136765
0.0
0.0
0.0
0.003851903
0.00936506
0.004735593
0.010084241
0.008472481
0.0028614735
0.024834642
0.0005731519
0.011465812
0.014436361
0.0030809774
0.013445288
0.002338445
0.020048877
0.003920922
0.0043357452
0.006054907
0.0024192513
0.008856792
0.0054055094
0.0083746305
0.025935296
0.0037372075
0.0019082275
0.005081039
0.00123536



Elapsed time to compute best fit: 51.738 seconds
Cross-validation score: 0.7674842370266111
Test score: 0.7960199004975124
Best Hyperparameters: {}
0.007197385
0.040781703
0.024533983
0.045098573
0.008413847
0.014975139
0.0061925007
0.013223841
0.009157376
0.004910028
0.01489232
0.016160216
0.008802473
0.0
0.010038669
0.0
0.009471123
0.038357396
0.009934578
0.0049872366
0.0127624385
0.009223605
0.0151984235
0.0031238974
0.008590533
0.003573584
0.010912368
0.007519071
0.0040484043
0.0070053493
0.0040364624
0.004412929
0.0065672128
0.0029103397
0.0024379343
0.0
0.0
0.0
0.006971974
0.0
0.009521948
0.014537194
0.03128943
0.015912341
0.0054891123
0.0
0.0070347195
0.0
0.0
0.0
0.00354173
0.010346591
0.005156599
0.00523806
0.00018234177
0.003521794
0.019479567
0.008851292
0.008712094
0.017942782
0.0048458143
0.0106743155
0.011937627
0.0022235531
0.010565989
0.0064605405
0.009705438
0.0057923985
0.0029354515
0.005139751
0.007309288
0.029046193
0.006080935
0.004272222
0.0034999303
0.0064056027
0



Elapsed time to compute best fit: 52.058 seconds
Cross-validation score: 0.7844466745574425
Test score: 0.776255707762557
Best Hyperparameters: {}
0.008161903
0.03572283
0.02591601
0.034211412
0.0073413723
0.011138076
0.0057098344
0.008695235
0.0066754166
0.009759712
0.013238437
0.021882968
0.007602126
0.0
0.0083837295
0.0
0.0292868
0.016027337
0.015602427
0.0072861
0.022933153
0.006833621
0.021605272
0.013623274
0.009368275
0.0013080233
0.0051574316
0.011393539
0.006522037
0.0058481055
0.014077268
0.0049989657
0.005786126
0.004517846
0.003943715
0.0
0.0
0.0
0.006097311
0.014584902
0.00878837
0.010711256
0.05273621
0.0062189223
0.0028601503
0.010935429
0.0026823038
0.0
0.0
0.0
0.0046244957
0.0064989193
0.0033893897
0.014115301
0.00053567404
0.0034408674
0.017594485
0.014726797
0.009201995
0.014008876
0.00855235
0.007884117
0.0020618183
0.014828427
0.011928573
0.005703885
0.0077192276
0.0076306215
0.0050862357
0.004433205
0.006432188
0.00483154
0.0055093686
0.002381058
0.0046768906
0.01



Elapsed time to compute best fit: 51.660 seconds
Cross-validation score: 0.7774300488736146
Test score: 0.796943231441048
Best Hyperparameters: {}
0.014492184
0.04048951
0.019611195
0.031050367
0.0061460207
0.0089793485
0.0061540175
0.006422314
0.014355794
0.0075864093
0.015599742
0.0117407255
0.0064035496
0.0
0.008304838
0.0
0.01492232
0.030359259
0.008007915
0.003972428
0.019646736
0.0053972895
0.008715254
0.0068869344
0.011796149
0.0037145566
0.006069297
0.009888622
0.0028540937
0.00815007
0.0035713397
0.0031625102
0.0041589807
0.0018896293
0.0035030185
0.0
0.0
0.0
0.005587821
0.0
0.004918265
0.014232446
0.06381296
0.013085733
0.008991716
0.0
0.004521279
0.0
0.0
0.0
0.0042084106
0.004756837
0.0018397159
0.0046275933
0.00019871038
0.0018776956
0.021298228
0.0057495856
0.0050471895
0.0209876
0.003569676
0.010263926
0.008236087
0.00057602505
0.0032961166
0.0030832358
0.008096944
0.072550446
0.006054159
0.006184548
0.0072992896
0.008005342
0.005114862
0.0042906627
0.0019242016
0.0
0.006



Elapsed time to compute best fit: 51.237 seconds
Cross-validation score: 0.780441296878927
Test score: 0.8179723502304148
Best Hyperparameters: {}
0.00984272
0.03736597
0.020654438
0.040969763
0.008329297
0.011824221
0.015781293
0.0054369075
0.0076781875
0.0072273086
0.020417064
0.019851796
0.008519856
0.0
0.007159644
0.0
0.011649809
0.024045842
0.011095999
0.021451717
0.018247947
0.0040492476
0.019092882
0.008039865
0.008461623
0.0019997854
0.007226131
0.005591989
0.004311013
0.0057434533
0.005361721
0.0024437755
0.0065221135
0.008524835
0.002444727
0.0
0.0
0.0
0.009186763
0.0
0.009681533
0.008262655
0.044000994
0.004340532
0.0044493093
0.0006864714
0.007835554
0.0
0.0
0.0
0.0033085626
0.0039137853
0.0047768145
0.027495636
0.0056816265
0.0049629943
0.014391295
0.014410618
0.009449751
0.018368328
0.008342131
0.0074117756
0.0023200854
0.0025334829
0.004407536
0.007396436
0.008684715
0.008505607
0.005166766
0.00648539
0.005701381
0.0077487864
0.0049478877
0.0086167995
0.008499902
0.0
0.0



Elapsed time to compute best fit: 51.259 seconds
Cross-validation score: 0.7747456163808172
Test score: 0.813953488372093
Best Hyperparameters: {}
0.008029168
0.034448523
0.022877539
0.035585638
0.01153847
0.009798312
0.007384064
0.010430042
0.016081616
0.005713904
0.01937629
0.0164545
0.004845095
0.0
0.0041016517
0.0
0.010860201
0.025311429
0.013881219
0.00790814
0.024297869
0.004072338
0.018716352
0.0019822612
0.0019132468
0.003680791
0.006695846
0.0051723626
0.0031473138
0.005158537
0.0032661895
0.004245064
0.0027543684
0.0039123795
0.0038445636
0.0
0.0
0.0
0.009352686
0.025705943
0.01137507
0.00981479
0.084783114
0.005476671
0.007414181
0.0
0.0058807014
0.0
0.0
0.0
0.0030589781
0.0058451076
0.004069308
0.008637941
0.0018465611
0.0026019155
0.021423759
0.0045746816
0.0108342655
0.018811295
0.007548653
0.0110425735
0.0018507021
0.004420403
0.0036706862
0.00586634
0.0052369563
0.027086057
0.0041422504
0.006213337
0.0062559578
0.008309256
0.005064886
0.003090211
0.006491756
0.0
0.00642



Elapsed time to compute best fit: 52.963 seconds
Cross-validation score: 0.7865735584476192
Test score: 0.8031674208144796
Best Hyperparameters: {}
0.0054327385
0.043171942
0.019284423
0.044899207
0.006586632
0.016251205
0.0056427037
0.013596846
0.013814171
0.008515269
0.011576697
0.0204174
0.008205335
0.0
0.0105625065
0.0
0.028770031
0.01583568
0.013456806
0.0068089995
0.015990883
0.0025190134
0.0135847945
0.0029720818
0.006169789
0.0034589195
0.005534856
0.008115281
0.0031525572
0.0065784683
0.011830765
0.006047164
0.0033011665
0.003123048
0.0030672986
0.0
0.0
0.0
0.005094078
0.0044147447
0.006489419
0.0027178042
0.0654721
0.0016457917
0.00737161
0.019091988
0.005311467
0.0
0.0
0.0
0.0048536644
0.0024258096
0.004114179
0.004880004
0.0017970966
0.0036041413
0.00484312
0.002512189
0.005608659
0.013847655
0.005648013
0.0096795475
0.0011581981
0.0025227007
0.007763432
0.0051030978
0.004956275
0.002552788
0.0089895325
0.0045719445
0.008218535
0.0104736
0.007179503
0.0068739736
0.005296263



Elapsed time to compute best fit: 51.261 seconds
Cross-validation score: 0.7553424232780277
Test score: 0.8255813953488371
Best Hyperparameters: {}
0.008115801
0.037458215
0.023358395
0.041816637
0.006528998
0.013294287
0.014598434
0.0046840482
0.015059688
0.008317777
0.010039455
0.019509008
0.008041203
0.0
0.005683639
0.0
0.012806635
0.025458049
0.013416219
0.005830656
0.021943275
0.008814254
0.021367423
0.00510232
0.0042932704
0.0021002626
0.0132534355
0.00722725
0.005998106
0.0039383476
0.004092798
0.0016273784
0.0067018718
0.004007574
0.004103403
0.0
0.0
0.0
0.0039214925
0.009363252
0.0052855434
0.011859308
0.083853036
0.005414245
0.005261997
0.0
0.008708156
0.0
0.0
0.0
0.003947945
0.002939463
0.0028076358
0.0057754116
0.0
0.01688695
0.03272576
0.0024589393
0.015307789
0.016756916
0.006354458
0.009233297
0.004744729
0.0019378423
0.0069450014
0.006076461
0.010804634
0.0
0.006586731
0.0037698771
0.0068124253
0.01590524
0.0054487283
0.00854826
0.003899863
0.010242911
0.0036185007
0.00



Elapsed time to compute best fit: 48.128 seconds
Cross-validation score: 0.7886938916785817
Test score: 0.7711442786069652
Best Hyperparameters: {}
0.011776216
0.039655276
0.019727895
0.035898373
0.00799946
0.007994087
0.010496023
0.010240232
0.014249214
0.008070839
0.026563311
0.019795597
0.006493156
0.0
0.0065196194
0.0
0.010982275
0.03716889
0.013410349
0.006348401
0.013038929
0.006628711
0.008342351
0.0051423656
0.0029347464
0.0024605424
0.007727272
0.0066418387
0.0053091436
0.0072330763
0.017493995
0.0046064383
0.0032653513
0.0022720501
0.002691589
0.0
0.0
0.0
0.01151612
0.0
0.0028995168
0.012469191
0.07114054
0.00650994
0.010224884
0.016900143
0.003084522
0.0
0.0
0.0
0.002846348
0.0017445077
0.0054113064
0.00690626
0.010668548
0.0022778534
0.030589335
0.0019923123
0.009077061
0.017655201
0.0015054975
0.0069940304
0.00668941
0.0029852544
0.0058435732
0.003954925
0.007743284
0.0073998286
0.0053616418
0.0077275406
0.007446352
0.00856532
0.0028114337
0.009660794
0.0023223278
0.0
0.00



Elapsed time to compute best fit: 50.317 seconds
Cross-validation score: 0.7908016823959094
Test score: 0.76036866359447
Best Hyperparameters: {}
0.008843636
0.035552163
0.027817339
0.035957173
0.011495388
0.011569991
0.010805217
0.0064803236
0.009458472
0.0124841
0.018020773
0.020272018
0.007099143
0.0
0.006913373
0.0
0.015193592
0.027236562
0.011094445
0.006638817
0.012821615
0.009428263
0.02773149
0.006898087
0.0037901173
0.0040066135
0.0071097463
0.008822847
0.0031812342
0.0068344274
0.0017438264
0.0031942055
0.006128096
0.003455125
0.0048354934
0.0
0.0
0.0
0.0034432814
0.012524542
0.00661447
0.011729602
0.03206823
0.007235016
0.0069554304
0.0010260622
0.002732622
0.0
0.0
0.0
0.0038010082
0.0036317615
0.0041042655
0.006137839
0.003466434
0.0015697979
0.0057888175
0.0062440466
0.006834272
0.013031265
0.0075125797
0.016628796
0.0007065112
0.0069202743
0.0022372028
0.01188921
0.005168599
0.0011300433
0.0045271236
0.0020452542
0.007900064
0.0117388265
0.0048967344
0.0043553226
0.006448



Elapsed time to compute best fit: 49.091 seconds
Cross-validation score: 0.771165391423877
Test score: 0.7718894009216588
Best Hyperparameters: {}
0.011552278
0.040006455
0.026013976
0.037765253
0.0036344584
0.01285091
0.009189581
0.007943627
0.0068265474
0.0114512015
0.020561723
0.016069854
0.0040474585
0.0
0.00932881
0.0
0.0106092915
0.04443886
0.021972626
0.010405289
0.017170962
0.0035823444
0.015292876
0.003605646
0.0072466065
0.0014553173
0.003403395
0.0043894076
0.008647666
0.009039656
0.0018101406
0.0048943846
0.0046507223
0.0015975537
0.003401945
0.0
0.0
0.0
0.0067834924
0.0
0.008222969
0.01706187
0.071071774
0.011734421
0.005138747
0.0
0.002124782
0.0
0.0
0.0
0.0041206516
0.007919785
0.006695512
0.008698077
0.0058005517
0.001582195
0.021111693
0.003037761
0.0074654785
0.010720088
0.00058769324
0.01215622
0.005258427
0.011580561
0.0058499486
0.003969962
0.008836686
0.005763343
0.0063025854
0.006550239
0.009934967
0.007042544
0.004262006
0.005935954
0.006450014
0.0041246144
0.00



Elapsed time to compute best fit: 51.746 seconds
Cross-validation score: 0.7448820363868645
Test score: 0.7834101382488479
Best Hyperparameters: {}
0.010280275
0.03651531
0.020293672
0.04300255
0.0057739196
0.02371659
0.003325439
0.01077572
0.011999447
0.004588374
0.0257604
0.0133623835
0.0050873756
0.0
0.0057291244
0.0
0.016323455
0.071868114
0.0058262567
0.005314111
0.017521188
0.009531667
0.014388264
0.0052664215
0.0043167225
0.0023803182
0.005338606
0.0043168045
0.003163414
0.0070449715
0.007954574
0.0037361262
0.005594454
0.0019985642
0.0033192234
0.0
0.0
0.0
0.0044341404
0.0
0.0069227912
0.013018626
0.0
0.0032546218
0.0040836465
0.0
0.0069897408
0.0
0.0
0.0
0.0043700766
0.005453768
0.0034784009
0.006567241
0.023843957
0.0035042542
0.01858597
0.030477328
0.006882882
0.018595295
0.0037161328
0.013861626
0.013826461
0.006313964
0.0077596316
0.0052573215
0.00935816
0.0051232832
0.0058172806
0.006322859
0.005561674
0.010440837
0.0047153006
0.0028738081
0.0060616815
0.014166054
0.01056



Elapsed time to compute best fit: 50.877 seconds
Cross-validation score: 0.7837744408983065
Test score: 0.7981220657276995
Best Hyperparameters: {}
0.013346521
0.03731898
0.02492649
0.034692872
0.009039849
0.014622272
0.010766867
0.008820492
0.010819749
0.007769785
0.021294111
0.013454036
0.0075243115
0.0
0.008332186
0.0
0.011902539
0.025353128
0.016139762
0.0045054103
0.01333673
0.0062061544
0.020146552
0.00917386
0.0031390188
0.004608129
0.0039814445
0.0057253866
0.0036937033
0.005202986
0.015429238
0.0040042475
0.005281496
0.0065570357
0.004273839
0.0
0.0
0.0
0.0057922862
0.0
0.0030411168
0.012425488
0.08834183
0.0036256663
0.013462778
0.0
0.0033540567
0.0
0.0
0.0
0.003163492
0.0033073504
0.0047113826
0.0049572354
0.0
0.011404081
0.012376164
0.0010046797
0.009922163
0.020697994
0.02217761
0.011009418
0.009380584
0.004954254
0.008834451
0.003968379
0.007840256
0.00059938064
0.004249658
0.0061175474
0.007173194
0.008875553
0.0039091003
0.007912637
0.006770211
0.0
0.007909315
0.0024478



Elapsed time to compute best fit: 52.528 seconds
Cross-validation score: 0.771188875873208
Test score: 0.8259911894273128
Best Hyperparameters: {}
0.009243852
0.038908154
0.019315159
0.043612037
0.004954951
0.011616047
0.006579722
0.011832722
0.010152744
0.009065675
0.018748553
0.019190868
0.006118149
0.0
0.0070401393
0.0
0.02118614
0.035185795
0.002451119
0.008774038
0.005615203
0.0064675286
0.02157499
0.011352702
0.0022095768
0.007882296
0.0036989378
0.007816773
0.0061312625
0.007542109
0.022889249
0.0033248533
0.0039010707
0.004112656
0.0029738056
0.0
0.0
0.0
0.0054295384
0.0
0.0048008137
0.008295802
0.073081546
0.006011778
0.0053063403
0.0
0.003492023
0.0
0.0
0.0
0.004870249
0.0032760734
0.0058679283
0.0072175893
0.015205934
0.0053188875
0.013508126
0.006153977
0.010000559
0.011671909
0.0060463613
0.012294875
0.0056148544
0.017342668
0.02064662
0.0050536646
0.0045786705
0.0064434004
0.00539
0.004678086
0.0058045164
0.009705654
0.005065712
0.0082010785
0.005821743
0.009165564
0.0068



Elapsed time to compute best fit: 51.790 seconds
Cross-validation score: 0.7825256313690718
Test score: 0.7805429864253394
Best Hyperparameters: {}
0.008528941
0.042684227
0.024844458
0.043180782
0.00808366
0.013555648
0.012216077
0.018249922
0.017087745
0.008460912
0.01140263
0.022303568
0.0055939155
0.0
0.011218955
0.0
0.007659506
0.024994828
0.010799097
0.008545104
0.019048475
0.0050001005
0.031948306
0.005430555
0.0
0.0017578648
0.007204109
0.00898043
0.0035934849
0.0079695415
0.0065560066
0.00351934
0.005196374
0.0053430567
0.005358307
0.0
0.0
0.0
0.0065094177
0.01697582
0.010219694
0.01067276
0.0
0.006209888
0.007639411
0.0
0.004757683
0.0
0.0
0.0
0.0044205175
0.005724893
0.0029665842
0.018883387
0.0036291273
0.005943693
0.013291193
0.004130991
0.014648087
0.015534618
0.011550189
0.018598238
0.008038765
0.0074738464
0.004304604
0.0075628706
0.0061868946
0.0035121143
0.0055398783
0.00692254
0.008888832
0.013054058
0.0056618038
0.009447827
0.0057840114
0.0
0.006885675
0.0069547375




Elapsed time to compute best fit: 48.853 seconds
Cross-validation score: 0.7956779519120877
Test score: 0.8098591549295775
Best Hyperparameters: {}
0.0074192425
0.03824348
0.022104185
0.041227873
0.0041692285
0.013604953
0.004508375
0.010414446
0.008717864
0.006773005
0.010811497
0.023428256
0.007485488
0.0
0.006303833
0.0
0.035586707
0.020837221
0.016063457
0.008564654
0.00802856
0.0036503822
0.019460142
0.009615314
0.0039785514
0.0025248476
0.003666893
0.0072789593
0.003437196
0.012307642
0.0060900236
0.0042885104
0.0034512354
0.00057328393
0.0043108496
0.0
0.0
0.0
0.009518069
0.0
0.010541301
0.016539535
0.057760466
0.0077652805
0.0055879145
0.0
0.0018100846
0.0
0.0
0.0
0.002888344
0.0056637265
0.006221456
0.002015919
0.0014923677
0.0060309274
0.019184181
0.009417094
0.009181195
0.024989279
0.0055653555
0.02518136
0.0036244818
0.004211949
0.0029696643
0.007718376
0.005041451
0.000352686
0.0073411544
0.0071272645
0.005627775
0.03380122
0.0055450154
0.0047870805
0.0063520963
0.01276202



Elapsed time to compute best fit: 51.146 seconds
Cross-validation score: 0.7925620077938232
Test score: 0.7819905213270142
Best Hyperparameters: {}
0.008613803
0.040045418
0.020018434
0.041107
0.0061157383
0.013546397
0.011338985
0.0071358616
0.014266515
0.010590964
0.022717293
0.017617328
0.0053169127
0.0
0.005586013
0.0
0.017180713
0.028797038
0.01317697
0.007018
0.0132104205
0.0054460587
0.014810969
0.0038498049
0.0022371248
0.003004274
0.0053772437
0.011255035
0.005511435
0.0087433625
0.011439115
0.002286975
0.006300562
0.0031899745
0.005320085
0.0
0.0
0.0
0.005778776
0.0066454047
0.008751745
0.022926131
0.0
0.0047243377
0.0075112707
0.0
0.0039838427
0.0
0.0
0.0
0.005119916
0.004785549
0.005381851
0.007937292
0.0064368667
0.0019205806
0.009668996
0.017688483
0.011057298
0.0223956
0.002877701
0.008306445
0.0028347205
0.0024074188
0.00823666
0.004172716
0.0046606595
0.0010958561
0.010513189
0.0053238636
0.0077434904
0.010987774
0.0058018025
0.004461736
0.0065197265
0.01622711
0.00158

### 5.1.3 LightGBM

In [126]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

none_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    none_lightgbm_performance_nonnormalized_df = pd.concat([none_lightgbm_performance_nonnormalized_df, new_performance_df])
    

none_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/none_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 8.966 seconds
Cross-validation score: 0.618763435377045
Test score: 0.669811320754717
Best Hyperparameters: {}
236.81718602776527
5086.749857366085
4806.088783323765
1716.6344927996397
79.02853347361088
218.7990177720785
477.8463351726532
117.22598275542259
66.88403397798538
178.0059562176466
531.5828012377024
1587.9948019236326
105.01305139064789
0.0
260.7926665395498
0.0
497.90744894742966
111.0339644998312
294.2510560154915
94.89365392923355
182.60693091154099
21.208560049533844
144.96122720837593
24.765694975852966
63.60423073172569
6.1161399483680725
38.93726372718811
91.11219865083694
67.45208398997784
162.00929361581802
134.11920991539955
82.04123404622078
100.43496891856194
14.995389580726624
80.44237262010574
0.0
0.0
0.0
16.75202763080597
0.0
79.6896425485611
39.060318648815155
3.7243099212646484
243.72514724731445
293.8617513179779
0.0
196.45522797107697
0.0
0.0
0.0
48.47652570903301
107.36542177200317
272.10050824284554
35.3600158393383
1.94



Elapsed time to compute best fit: 8.094 seconds
Cross-validation score: 0.60202866778262
Test score: 0.6774809160305345
Best Hyperparameters: {}
301.67398262023926
3503.9736139327288
5406.888564556837
1659.004014968872
224.92948472499847
1243.6827162653208
796.4524876773357
176.34487602114677
86.12465226650238
138.41502776741982
659.5325096100569
813.4105129241943
102.1234682649374
0.0
197.2353612780571
0.0
106.72046023607254
177.0925753712654
120.35816258192062
43.65975549817085
130.35710138082504
34.85205878317356
356.565791875124
12.053607016801834
59.885939210653305
15.084419965744019
46.0181660503149
74.04450438916683
79.37779527902603
301.88374872505665
36.95068025588989
76.24121960997581
52.56388708949089
7.588577091693878
58.821273401379585
0.0
0.0
0.0
49.491980254650116
0.3845190107822418
85.39098834991455
39.537973672151566
1.3344759941101074
529.5565130710602
64.21434000134468
0.0
78.04656720161438
0.0
0.0
0.0
242.7565257847309
28.76146075129509
199.29697793722153
45.9363592



Elapsed time to compute best fit: 8.213 seconds
Cross-validation score: 0.6122765880873011
Test score: 0.6273764258555133
Best Hyperparameters: {}
731.2953087985516
4397.968645229936
5743.019003495574
803.8696671873331
200.87328596413136
342.4119475632906
584.5465688705444
182.45156851410866
59.8464358150959
179.96587981283665
1428.3274023532867
211.35171121358871
72.15045201778412
0.0
77.42764601111412
0.0
127.97371737658978
78.76007542014122
668.4204389899969
54.76136238873005
289.57861594855785
42.64469435811043
122.78049977123737
96.34417402744293
10.556520879268646
16.307029634714127
25.7755089700222
255.89883695542812
49.05109894275665
244.53894364833832
37.88380390405655
70.98832403123379
105.8810792863369
26.02333176136017
55.852357894182205
0.0
0.0
0.0
71.18601679801941
6.090346097946167
61.83711379766464
26.627515077590942
1.5094499588012695
187.30815663933754
132.8614314943552
0.0
33.57556942105293
0.0
0.0
0.0
115.4362960755825
76.97611472010612
139.17388662695885
89.3633893



Elapsed time to compute best fit: 8.435 seconds
Cross-validation score: 0.6300838923378616
Test score: 0.6417624521072798
Best Hyperparameters: {}
1074.3751464784145
4040.5080693364143
5757.937516242266
1036.0471305251122
76.5578975379467
360.0703664124012
50.46272575855255
81.93525964021683
83.88984568417072
167.05927427113056
1286.8548194468021
588.5264898687601
52.09764476120472
0.0
218.17298144102097
0.0
726.7791884541512
97.25655800104141
947.5939475297928
199.2187423557043
190.8379280269146
88.27538105845451
135.40535229444504
94.92557549476624
48.99746897816658
25.480683356523514
54.499457120895386
244.3610004335642
74.6994546353817
154.38719102740288
18.68465781211853
56.575333297252655
40.70910179615021
33.14631789922714
92.95885643362999
0.0
0.0
0.0
30.526020139455795
10.392819911241531
44.264063596725464
16.743465945124626
21.225550413131714
254.9264786541462
52.8328201174736
0.39229699969291687
85.29358598589897
0.0
0.0
0.0
59.00827044248581
78.61087885499
158.9992508590221



Elapsed time to compute best fit: 8.474 seconds
Cross-validation score: 0.6028474685263696
Test score: 0.6153846153846153
Best Hyperparameters: {}
677.995606392622
3831.291507586837
5579.225536093116
1261.230287194252
99.34559132158756
139.4031447917223
63.12892493605614
170.76095268130302
112.31149423122406
117.5769951492548
326.278588026762
1773.2128856778145
465.2669628858566
0.0
407.22593265771866
0.0
186.89328706264496
1006.5647510439157
79.95364537835121
56.27070704102516
115.72545102238655
36.62853188812733
146.02095919847488
18.828872114419937
55.099682450294495
7.093935906887054
42.83420440554619
112.85472957789898
58.483424603939056
148.7209390103817
19.388710618019104
47.607210755348206
118.26223643124104
28.825423270463943
82.6630914658308
0.0
0.0
0.0
10.487734764814377
10.373773038387299
116.48790819942951
40.18940603733063
0.0
28.101247400045395
44.56902000308037
0.0
372.92489832639694
0.0
0.0
0.0
82.44318148493767
48.883464485406876
84.94220209121704
85.49442884325981
8.



Elapsed time to compute best fit: 11.396 seconds
Cross-validation score: 0.5913672792776328
Test score: 0.6766917293233081
Best Hyperparameters: {}
360.414502248168
5216.080850943923
4166.635012313724
1460.0847129374743
199.72051894664764
313.16826559603214
42.20658677816391
111.91412222385406
121.50218069553375
61.99418170750141
475.9064276367426
1560.8634651452303
57.3719337284565
0.0
267.7040456533432
0.0
181.1407727599144
938.3450070619583
506.2663908600807
55.912646144628525
109.93129894137383
20.528121948242188
92.79269142448902
31.742142617702484
98.55869115889072
3.801326036453247
55.65882124006748
128.645138412714
146.64659376442432
36.791096687316895
41.486051082611084
90.97201105952263
286.82278087735176
3.890860080718994
189.56104916334152
0.0
0.0
0.0
33.48029500246048
1.1233690083026886
107.48267602920532
358.8346267938614
30.87689971923828
69.50816145539284
73.70524248480797
0.0
280.6058230996132
0.0
0.0
0.0
117.51564881205559
99.2348031103611
116.0267773270607
73.2312710



Elapsed time to compute best fit: 8.282 seconds
Cross-validation score: 0.6318520930850876
Test score: 0.6333973128598848
Best Hyperparameters: {}
349.41455352306366
4863.799665138125
4252.391612753272
1902.7537911087275
182.30111289024353
160.04007621109486
715.6356597691774
90.27814641594887
42.99130895733833
155.88545405864716
546.0636392235756
1894.0780540555716
108.46386553347111
0.0
172.6263406276703
0.0
64.81505477428436
113.09436124563217
94.3659098893404
106.06234887242317
577.7168335467577
34.051037937402725
151.39650303125381
30.780348420143127
55.9973064661026
19.023466020822525
110.90139946341515
200.78496704995632
147.80479066073895
140.22579136490822
38.010817766189575
80.20410610735416
152.54156236350536
5.598551034927368
88.42435416579247
0.0
0.0
0.0
52.195885837078094
0.8077679872512817
133.2927483022213
479.08808909356594
0.0
23.123805567622185
283.9404867142439
1.1603200435638428
63.256810039281845
0.0
0.0
0.0
107.53062656521797
89.64596223831177
81.19762322306633
2



Elapsed time to compute best fit: 8.641 seconds
Cross-validation score: 0.6330739509168225
Test score: 0.5973025048169557
Best Hyperparameters: {}
616.8535190820694
3754.5645439326763
5356.676074579358
1597.6488440334797
151.3804227411747
151.7512945830822
29.83666391670704
86.6097402125597
34.725024819374084
116.0450222492218
1700.885854959488
410.5709300339222
68.33129447698593
0.0
372.44296081364155
0.0
142.65303948521614
1272.5912483632565
393.4618661403656
41.712744146585464
182.40928438305855
15.702812016010284
101.86629298329353
21.40607511997223
89.94908046722412
16.899736911058426
58.1308773458004
91.91811500489712
53.896865636110306
297.83719001710415
38.97633922100067
42.17751029133797
84.93635475635529
39.77402997016907
63.410155951976776
0.0
0.0
0.0
48.76078975200653
20.683171927928925
209.70653942227364
13.398919016122818
66.17069214582443
34.157407850027084
71.53043243288994
0.0
109.57357215881348
0.0
0.0
0.0
74.35527557134628
61.72319906949997
141.94086062908173
17.7083



Elapsed time to compute best fit: 8.154 seconds
Cross-validation score: 0.6068018830885822
Test score: 0.6551059730250481
Best Hyperparameters: {}
203.88621239364147
5027.564640328288
4656.371179521084
788.6562306135893
87.0681121647358
379.1061022877693
59.33028319478035
238.61565014719963
91.07241508364677
181.0716128796339
504.95777001976967
1369.9865719079971
129.4464180022478
0.0
232.80321517586708
0.0
236.08789485692978
35.81314504146576
643.9242567867041
61.81406432390213
1191.3143828511238
46.24860927462578
90.35860252380371
27.435396432876587
12.77844688296318
158.5350306034088
22.924918621778488
107.37241812050343
27.18907180428505
133.11747831106186
16.1965209543705
61.23798069357872
291.1786618232727
9.026393949985504
129.04236111044884
0.0
0.0
0.0
22.598286285996437
0.4846799969673157
158.3719933629036
64.146508872509
1.1015199422836304
66.63600239157677
40.70548112690449
0.0
272.6311220228672
0.0
0.0
0.0
77.87707817554474
107.41220837831497
113.7175524532795
63.3444559574



Elapsed time to compute best fit: 8.863 seconds
Cross-validation score: 0.6123533354529141
Test score: 0.674373795761079
Best Hyperparameters: {}
272.63102862238884
4612.888325870037
4965.957887887955
1357.1306707561016
155.75785574316978
193.88963836431503
59.51186816394329
254.85216888785362
159.01167613267899
121.73135191202164
467.34823720157146
1323.1044998019934
267.3678183555603
0.0
206.6881452947855
0.0
1194.2709303349257
152.93538376688957
76.51837468147278
32.84802949428558
145.63027849793434
43.06656491756439
173.5739123225212
11.354339361190796
24.99391669034958
11.897299766540527
43.60598812997341
119.15902426838875
59.97109255194664
148.68567994236946
15.06747704744339
31.966894924640656
106.4173135459423
10.346572041511536
67.23832556605339
0.0
0.0
0.0
17.99247509241104
7.619401156902313
205.66660580039024
22.904273837804794
12.407710075378418
99.55742034316063
44.43862220644951
0.43791499733924866
23.591155767440796
0.0
0.0
0.0
91.51868169009686
63.55752310156822
129.41



Elapsed time to compute best fit: 8.824 seconds
Cross-validation score: 0.6168337429068864
Test score: 0.6978967495219885
Best Hyperparameters: {}
276.47998654842377
5805.53736743331
2849.4795857667923
2283.5275245904922
181.6988553404808
361.2548297941685
61.15360698103905
103.97992724180222
84.63756704330444
178.50559391081333
514.0872870534658
1463.603046387434
49.37064227461815
0.0
191.24646891653538
0.0
162.0608387887478
1076.4721654057503
126.49392196536064
11.024034976959229
104.19130513072014
84.32024013996124
151.8531112074852
68.97244925796986
43.17336317896843
20.04866275191307
90.34790733456612
104.23313075304031
107.24807712435722
230.1785948574543
31.47722578048706
74.7417971342802
64.37611481547356
23.58027893304825
55.84447717666626
0.0
0.0
0.0
54.54851746559143
14.985559940338135
150.54459369182587
488.67807602882385
0.0
70.22898489236832
24.966027945280075
5.172510147094727
50.04874438047409
0.0
0.0
0.0
87.39634710550308
71.01864790916443
157.87202732264996
23.8359801



Elapsed time to compute best fit: 8.972 seconds
Cross-validation score: 0.6025058682333381
Test score: 0.6952380952380953
Best Hyperparameters: {}
393.7018696665764
5491.238663524389
3888.6165546029806
884.9533642083406
317.6525005698204
180.0266016870737
59.808383882045746
108.0059066414833
39.17365288734436
167.45879274606705
193.7862132191658
1763.813303425908
75.54220174252987
0.0
181.11725270748138
0.0
130.73268631100655
1432.0972109884024
95.65249451994896
91.34931737184525
176.96115896105766
38.48049320280552
169.3329720199108
22.590639978647232
7.11418491601944
35.57852813601494
89.3252075612545
74.6369881182909
52.20557005703449
100.8743564337492
31.63283336162567
46.42426073551178
325.7577235400677
10.568388044834137
72.0352280586958
0.0
0.0
0.0
97.64874196052551
0.0
93.02126786112785
48.79612571001053
63.310001373291016
58.52590489387512
88.71655884385109
0.9690009951591492
31.148310005664825
0.0
0.0
0.0
123.2404962927103
82.6026486903429
227.54889941215515
50.45906288921833



Elapsed time to compute best fit: 9.226 seconds
Cross-validation score: 0.6299930170749743
Test score: 0.6883365200764817
Best Hyperparameters: {}
466.0761782824993
5252.819257214665
4069.1421276032925
1654.2610243856907
64.25542625784874
459.9853920638561
649.5598819553852
93.68848024308681
91.35932487249374
133.1169178187847
457.2975061684847
1542.9380558878183
69.40296070277691
0.0
206.1086989045143
0.0
59.40297666192055
567.5261755436659
33.771580934524536
44.753071278333664
215.1199410557747
56.72624748945236
158.22385519742966
18.144661784172058
9.114113211631775
49.216891050338745
98.03447495400906
158.87335693836212
62.17063209414482
236.87413284182549
7.226905196905136
69.4295908510685
131.5149658769369
22.76252508163452
58.84708493947983
0.0
0.0
0.0
60.71281045675278
0.0
87.09616865217686
34.16379788517952
1.8016599416732788
66.15949967503548
436.83809354901314
12.546579599380493
24.34414768218994
0.0
0.0
0.0
96.3742741048336
48.2133312523365
164.80737951397896
114.8343423604



Elapsed time to compute best fit: 8.663 seconds
Cross-validation score: 0.6243172682282165
Test score: 0.609284332688588
Best Hyperparameters: {}
369.95186975598335
3841.144299477339
5117.541533008218
1655.4596875607967
247.02007740736008
1599.4010574668646
45.09791308641434
56.82756206393242
32.36422052979469
91.5913515985012
696.2392307817936
771.2740694433451
73.6024272441864
0.0
147.67678299546242
0.0
190.40328985452652
1239.792845994234
132.99909935891628
69.5722973048687
143.90729784965515
56.55488084256649
145.47546130418777
23.095721036195755
14.932369768619537
35.50547295808792
54.53089068830013
81.78625932335854
36.69766165316105
335.5431143641472
23.625922113656998
80.46002322435379
175.20089727640152
11.132309138774872
56.85276240110397
0.0
0.0
0.0
32.54120683670044
4.086826980113983
73.68890663981438
61.7032188475132
2.289139986038208
409.9556996226311
52.01121082901955
0.0
203.34342962503433
0.0
0.0
0.0
139.4790177643299
47.94602698087692
87.65235191583633
23.980337008833



Elapsed time to compute best fit: 8.161 seconds
Cross-validation score: 0.6156544983153648
Test score: 0.669811320754717
Best Hyperparameters: {}
830.0519628375769
5649.707980200648
3851.877077817917
1178.3070507496595
310.0339452326298
319.98233965039253
42.50820855796337
102.91205148398876
56.2464237511158
242.31982311606407
545.4406355023384
1325.7685444206
26.665332913398743
0.0
172.678834348917
0.0
77.28901049494743
1300.9909149855375
132.49688801169395
50.317828834056854
189.72575888037682
81.91478681564331
239.70316594839096
6.50813490152359
16.729125291109085
25.603035748004913
65.56087398529053
142.91442677378654
35.987294018268585
133.54597079753876
11.208387106657028
63.43678396940231
165.86414754390717
21.712908446788788
45.43411669135094
0.0
0.0
0.0
15.13335120677948
23.509825974702835
189.7145862877369
13.904413998126984
8.487541794776917
50.27306681871414
67.08619682490826
6.382359981536865
90.75982949137688
0.0
0.0
0.0
105.53250385820866
31.848705023527145
112.594656527



Elapsed time to compute best fit: 8.749 seconds
Cross-validation score: 0.6361691525445868
Test score: 0.660377358490566
Best Hyperparameters: {}
647.323083281517
6574.677807852626
3088.542843773961
1489.639909029007
104.27850480377674
199.8286745995283
20.861951768398285
69.87268282473087
218.6385756134987
139.47157771885395
1712.1658414304256
412.93958409130573
66.39395987987518
0.0
245.02341693639755
0.0
128.91825798153877
535.1996234059334
553.180594265461
12.708037823438644
114.4554395377636
32.500585705041885
124.07223181426525
25.24833568930626
15.335286915302277
15.862835124135017
112.30083583295345
239.58160918951035
109.58661060035229
330.7164399474859
72.66262024641037
68.57871153950691
176.24295887351036
8.839666187763214
106.10958041250706
0.0
0.0
0.0
38.95414572954178
1.9220330119132996
100.51494997739792
51.80163127183914
8.301801085472107
59.81402686238289
162.74596720933914
0.0
265.8654035329819
0.0
0.0
0.0
49.80083703994751
44.07525749504566
128.7233991920948
33.33576



Elapsed time to compute best fit: 11.080 seconds
Cross-validation score: 0.6234453422444028
Test score: 0.6427221172022684
Best Hyperparameters: {}
748.4386533498764
7261.525697261095
1424.1611806303263
2658.9114255458117
54.568602591753006
138.57721637189388
72.58585691452026
131.42858678102493
121.72797551751137
240.26730707287788
484.02711306512356
1547.4172868430614
449.6332732886076
0.0
205.74467206001282
0.0
74.60747191309929
841.53950548172
165.43114411830902
120.18821388483047
80.20289286971092
155.1442994773388
175.9391142576933
51.546044796705246
7.595850914716721
7.325210094451904
34.482555866241455
280.0267698466778
44.191073790192604
272.00567451119423
22.012929648160934
26.755962014198303
48.03057888150215
24.604148149490356
62.56792148947716
0.0
0.0
0.0
53.63625279068947
25.84569078683853
145.74801275134087
79.92269715666771
8.255309820175171
55.499559700489044
27.31271356344223
0.0
25.705049991607666
0.0
0.0
0.0
101.25470545887947
143.85930681228638
157.27559180557728
3



Elapsed time to compute best fit: 8.568 seconds
Cross-validation score: 0.6010017353849124
Test score: 0.6576402321083171
Best Hyperparameters: {}
603.328220397234
4941.898043200374
4245.960856869817
911.0966204851866
238.67287212610245
279.550581112504
39.74008724093437
96.17803275585175
147.62494058907032
233.27208298444748
285.81888088583946
1581.7664353102446
93.69847330451012
0.0
312.656652957201
0.0
157.81469333171844
1060.267334535718
475.45434352755547
90.49367055296898
243.7636348605156
72.48245944082737
235.2626084536314
19.365526005625725
33.62589621543884
8.970550060272217
157.1121824979782
81.33425916731358
40.10754582285881
296.4405789524317
2.0952540040016174
75.5324715077877
61.28120270371437
15.897780299186707
85.38099190592766
0.0
0.0
0.0
26.010690927505493
11.65658986568451
307.77752363681793
65.03688883781433
0.0
129.73351487517357
73.28710989654064
0.0
266.9569102227688
0.0
0.0
0.0
66.69821690022945
199.30906262993813
562.3406022191048
51.15201997756958
21.60396778



Elapsed time to compute best fit: 8.273 seconds
Cross-validation score: 0.6405968252470022
Test score: 0.6504854368932039
Best Hyperparameters: {}
726.625086247921
5034.888682439923
4179.1119834929705
1689.2815611809492
70.55754232406616
372.1162849664688
24.798648476600647
111.83172865211964
44.2508530318737
99.4387856721878
350.10834005475044
1544.7987786531448
96.15699011087418
0.0
207.50471945106983
0.0
171.75904288887978
740.3389211297035
74.64177396893501
112.56250056624413
667.7848724722862
41.35834676027298
114.57158482074738
52.101155787706375
44.929983764886856
0.0
37.25730957090855
77.75674617290497
49.59398376941681
198.74282932281494
10.888370156288147
61.79837939143181
139.32600191235542
7.633894115686417
105.58734473586082
0.0
0.0
0.0
43.36187407374382
20.488985061645508
180.74050962924957
72.62786966562271
15.338970243930817
45.07122024893761
88.12153734266758
0.0
35.56412163376808
0.0
0.0
0.0
87.30556789040565
54.43738575279713
539.705147087574
105.29364615678787
9.169



Elapsed time to compute best fit: 8.238 seconds
Cross-validation score: 0.619506131565627
Test score: 0.6262042389210021
Best Hyperparameters: {}
383.93249271810055
5274.083665207028
3782.5303759872913
1954.901929050684
93.10468405485153
388.3582643568516
30.528989553451538
107.47540584206581
72.4789209663868
180.6105374097824
370.06446512043476
1790.5130575150251
66.19100511074066
0.0
129.49997335672379
0.0
104.00111755728722
798.066584199667
85.37951624393463
148.35078233480453
488.12524822354317
58.22408252954483
174.0521116256714
34.73188692331314
2.8110529482364655
15.52189415693283
38.638307094573975
125.08131992816925
207.95935426652431
178.8441792279482
94.13783299922943
50.23835200071335
395.7481545507908
26.765473127365112
117.41169048845768
0.0
0.0
0.0
31.275204807519913
3.949585109949112
56.253294944763184
37.28965815901756
56.271653056144714
68.3420539200306
50.99050296843052
0.0
72.38300368189812
0.0
0.0
0.0
118.55106797814369
78.15345740318298
86.23324328660965
86.759294



Elapsed time to compute best fit: 8.483 seconds
Cross-validation score: 0.6122996530892907
Test score: 0.6525911708253359
Best Hyperparameters: {}
845.8763016909361
3993.524365246296
5261.380523175001
1355.8838939368725
71.37059880793095
259.6872690320015
44.938432827591896
34.903305143117905
27.10482393205166
167.77380776405334
582.6368335783482
1768.449501246214
77.59695920348167
0.0
286.466643884778
0.0
152.40048836171627
1151.1766200512648
441.41546380519867
31.134824380278587
84.12949734926224
31.28113777935505
113.04260438680649
1.690187007188797
14.751931339502335
3.315239906311035
35.65407782793045
79.28115801513195
80.62461307644844
444.39423429965973
6.269718945026398
41.70019552111626
129.51546594500542
12.079959124326706
108.19154267013073
0.0
0.0
0.0
32.85922899842262
8.485548853874207
252.8283749818802
35.67819771170616
0.0
32.37511873245239
382.2234131991863
1.4199899435043335
281.38717302680016
0.0
0.0
0.0
81.31230814754963
83.96591830253601
101.6738511621952
86.6797903



Elapsed time to compute best fit: 8.623 seconds
Cross-validation score: 0.6227698908597797
Test score: 0.6285714285714284
Best Hyperparameters: {}
451.27142012119293
4757.384876132011
4845.550682783127
911.2887695431709
749.7821979224682
170.96539679169655
104.93999636173248
90.84894193708897
119.22228416800499
51.10991179943085
1247.090426594019
415.1080854833126
66.50592195987701
0.0
281.73333574831486
0.0
121.80132946372032
297.64683985710144
1047.1518893539906
101.94568431377411
160.2554016262293
101.27301007509232
334.72185480594635
11.408115059137344
68.27615714073181
6.514196217060089
27.055678457021713
112.55368509888649
48.658122062683105
117.61390389502048
38.06928217411041
201.59612022340298
67.55584129691124
45.27747315168381
109.70805476605892
0.0
0.0
0.0
331.1963452398777
4.126697093248367
126.06111335754395
32.85888063907623
0.0
39.542321264743805
40.522422164678574
0.0
63.19222342967987
0.0
0.0
0.0
106.0546506345272
219.886941075325
95.68668445944786
21.909611344337463




Elapsed time to compute best fit: 8.695 seconds
Cross-validation score: 0.619223164717299
Test score: 0.6634615384615384
Best Hyperparameters: {}
236.55860601365566
6604.265230670571
3705.672010421753
828.4162772297859
73.62321001291275
226.75248393416405
384.77520973980427
135.16622632741928
71.84624046087265
122.62074440717697
492.1132995635271
1514.9076753556728
102.84186564385891
0.0
258.76287293434143
0.0
337.3767586052418
1053.9656388759613
218.24329762160778
70.31689223647118
176.25270465016365
17.847881883382797
135.97816687822342
45.87411254644394
32.56686571240425
20.37209990620613
47.67302557826042
86.56871543824673
52.06269985437393
128.84303402900696
25.357601583003998
88.54030612111092
53.11287200450897
21.17236092686653
130.60410007834435
0.0
0.0
0.0
62.2843354344368
32.24898570775986
75.00197505950928
46.36262220144272
88.41419792175293
57.47762385010719
311.0878777652979
0.0
202.28254371881485
0.0
0.0
0.0
95.42488208413124
105.59040503203869
59.5446712821722
54.1588041



Elapsed time to compute best fit: 8.285 seconds
Cross-validation score: 0.6376987042091952
Test score: 0.5697445972495088
Best Hyperparameters: {}
318.5286400318146
5646.988034456968
3891.4503009468317
1380.006118953228
114.8118938356638
281.79751202464104
109.46483188867569
114.91259931027889
43.79066416621208
163.2238631248474
481.5278813838959
1824.6028325110674
57.46236616373062
0.0
123.59221589565277
0.0
322.7930211722851
998.4824509471655
428.0892706513405
43.280224204063416
91.55675083398819
84.3042571246624
156.14274165034294
21.496972888708115
5.07604905962944
1.9392919540405273
109.44733446836472
296.8245312124491
56.69528332352638
151.46888291835785
25.351554721593857
87.527040168643
77.84787839651108
7.012681037187576
93.14253570139408
0.0
0.0
0.0
49.6953327357769
5.248962968587875
252.69817689061165
33.30941531062126
14.6289102435112
62.0941841006279
197.35187529027462
0.0
349.0758992433548
0.0
0.0
0.0
81.4062702357769
78.71445274353027
122.29131910204887
42.78689396381378



Elapsed time to compute best fit: 8.230 seconds
Cross-validation score: 0.6160937003523307
Test score: 0.6641366223908919
Best Hyperparameters: {}
462.15792202949524
4010.6286092996597
5451.535548776388
1401.2947278022766
124.70618537068367
131.6644412726164
414.79819808900356
154.1063518077135
58.3311852067709
132.01888391375542
316.5247438400984
1906.31139652431
63.12201350927353
0.0
186.93671776354313
0.0
71.07565993070602
1126.9016907215118
84.30614361166954
25.209879964590073
199.52487522363663
99.1984845995903
120.22989603877068
42.084785372018814
28.30312642455101
20.508500158786774
85.55805534124374
118.26705834269524
67.60411888360977
133.4564081132412
16.185610830783844
68.02134653925896
136.1793885231018
9.404029041528702
156.62221121788025
0.0
0.0
0.0
21.825744807720184
5.993275940418243
83.26985596120358
103.82246959209442
1.1377400159835815
23.121607929468155
45.64872832596302
2.8099400997161865
367.78271064162254
0.0
0.0
0.0
74.76820680499077
66.67617286741734
389.423738

## 5.2 Rebalancing Strategy - SMOTE

### 5.2.1 Random Forests

In [127]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

smote_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [['smote', SMOTE()],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    smote_randomforest_nonnormalized_performance_df = pd.concat([smote_randomforest_nonnormalized_performance_df, new_performance_df])
    
smote_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/smote_randomforest_nonnormalized_performance_df.csv")



### 5.2.2 XGBoost

In [128]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


smote_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    smote_xgboost_nonnormalized_performance_df = pd.concat([smote_xgboost_nonnormalized_performance_df, new_performance_df])
    

smote_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/smote_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 193.780 seconds
Cross-validation score: 0.7524878247596201
Test score: 0.6563706563706564
Best Hyperparameters: {}
0.00525769
0.05061125
0.040986404
0.23542185
0.024630431
0.0082406495
0.0016243821
0.0053882664
0.01421211
0.007907729
0.04186708
0.010887819
0.0026738609
0.0
0.029493619
0.0
0.004960153
0.009383938
0.005050887
0.0024130365
0.008432649
0.0025162585
0.002291012
0.004723724
0.0010612314
0.0010231949
0.0011729628
0.0012204946
0.0005451432
0.004605183
0.040863555
0.003359318
0.0016725222
0.0022050191
0.00092671
0.0
0.0
0.0
0.0046040895
0.017749868
0.005173797
0.0024194457
0.0056858715
0.002133358
0.0071200407
0.011168241
0.0017220662
0.0
0.0
0.0
0.0014005539
0.023588601
0.0011521432
0.0015595371
0.001992373
0.0025724825
0.00051562994
0.0023965838
0.005778449
0.025121655
0.008286
0.003313666
0.0030314738
0.0105748065
0.007262591
0.000300138
0.0004041523
0.0076658013
0.0010506199
0.0019977693
0.0009914518
0.011459172
0.0027809334
0.00089006027
0



Elapsed time to compute best fit: 189.679 seconds
Cross-validation score: 0.7261552187954183
Test score: 0.7843137254901961
Best Hyperparameters: {}
0.0075841215
0.0692853
0.03967269
0.27755046
0.010879404
0.005807276
0.001300435
0.017757451
0.01159755
0.0057496107
0.038929142
0.010848034
0.003927154
0.0
0.009776107
0.0
0.008999016
0.010781177
0.015828677
0.0014584579
0.007851445
0.0014808631
0.004398926
0.0026938764
0.0013300683
0.0006989454
0.00069248735
0.003044154
0.0011303254
0.0075227483
0.023930129
0.0023013821
0.0012166927
0.010105777
0.0010684029
0.0
0.0
0.0
0.0042644134
0.03238827
0.0033450185
0.0049074935
0.012008121
0.0014265097
0.00859473
0.00013502719
0.0015345446
0.0
0.0
0.0
0.0014096788
0.00751527
0.0008630003
0.0032609848
0.00032072453
0.0021538471
0.0030855788
0.0011181344
0.005305999
0.007378133
0.003152871
0.0024366044
0.007770586
0.008954816
0.009219096
0.0012366978
0.0010038083
0.008702426
0.0023021349
0.0009458509
0.0012371934
0.009205975
0.0013285251
0.001264256



Elapsed time to compute best fit: 193.034 seconds
Cross-validation score: 0.7628064616321473
Test score: 0.7228915662650603
Best Hyperparameters: {}
0.013352857
0.04763574
0.03549767
0.23377036
0.016264267
0.0076887286
0.0013227928
0.0059303595
0.013004995
0.00936545
0.048296213
0.009834621
0.0083810715
0.0
0.017727135
0.0
0.003930939
0.008795356
0.0073352596
0.0016189242
0.010751478
0.0028564795
0.0116946995
0.008875253
0.0044908393
0.0005834526
0.0012576901
0.01145308
0.00039679653
0.004369553
0.038223807
0.0023682837
0.0009180374
0.0017791763
0.00055122393
0.0
0.0
0.0
0.006393147
0.015387443
0.0032867314
0.005157799
0.0051952945
0.0044207666
0.0038432598
0.018661013
0.0036162517
0.0
0.0
0.0
0.0006837658
0.006640376
0.0029884437
0.0007188591
0.003120712
0.0014695828
0.0016604008
0.0020491423
0.007619279
0.022323322
0.0006225157
0.0036797328
0.012711776
0.009902396
0.0055645723
0.00024862902
0.0008558235
0.0107830595
0.0009566459
0.0023462486
0.000970802
0.007066786
0.0009940268
0.000



Elapsed time to compute best fit: 186.652 seconds
Cross-validation score: 0.7278643517538368
Test score: 0.8108108108108107
Best Hyperparameters: {}
0.010534716
0.0436171
0.043454338
0.23807512
0.015344344
0.0024869875
0.0005478936
0.01469249
0.005617764
0.008936913
0.053102706
0.012173135
0.005149382
0.0
0.016660314
0.0
0.011338773
0.011926131
0.00802424
0.001209757
0.0041634734
0.0013347211
0.008669645
0.006717091
0.0019263027
0.004622298
0.0025033716
0.016650857
0.0023097321
0.0026469547
0.030942336
0.001653911
0.00132902
0.010406737
0.0006108249
0.0
0.0
0.0
0.00707094
0.02927047
0.004831397
0.0033860914
0.009290041
0.0020694956
0.0111700995
0.0024242606
0.0019784463
0.0
0.0
0.0
0.00058227166
0.006396961
0.0020839614
0.00035408884
0.003739629
0.0041336827
0.001518795
0.0005061136
0.007523146
0.012369185
0.00089561474
0.0030151121
0.003685485
0.0017667988
0.0031433413
0.0006088977
0.0004209196
0.009958334
0.0013288632
0.0013694123
0.0017323649
0.012463614
0.0009913223
0.0017641955
0.



Elapsed time to compute best fit: 195.640 seconds
Cross-validation score: 0.7466283871562728
Test score: 0.7843137254901961
Best Hyperparameters: {}
0.007503551
0.068051
0.026391895
0.21354376
0.014620676
0.002175601
0.0040044044
0.009614108
0.012479649
0.0014820709
0.04291525
0.013339425
0.014164132
0.0
0.0062695583
0.0
0.01463307
0.016077729
0.016468583
0.0019457703
0.014348515
0.0018687932
0.0045785033
0.0051830937
0.0016809626
0.00031521654
0.0009804448
0.003369955
0.0010589692
0.0038504116
0.024451183
0.0007134816
0.0012634781
0.007191418
0.00059688516
0.0
0.0
0.0
0.0035139546
0.037853874
0.0024436763
0.0046462384
0.008018663
0.002037598
0.008370615
0.0
0.0027254312
0.0
0.0
0.0
0.0010382928
0.022609336
0.002012366
0.00043636697
0.011581035
0.0012690634
0.0013475323
0.0012052126
0.00547057
0.015132114
0.0031758405
0.0012494338
0.0029013692
0.004523926
0.00790725
0.002504749
0.0012271273
0.00878967
0.0002440168
0.0027152505
0.001796079
0.012471258
0.001617746
0.00039375792
0.0017314



Elapsed time to compute best fit: 190.956 seconds
Cross-validation score: 0.7358300053354428
Test score: 0.7510288065843621
Best Hyperparameters: {}
0.009950824
0.047721855
0.037075542
0.23122421
0.009882256
0.0061902124
0.00569001
0.01022858
0.008082047
0.009553972
0.04726807
0.012756389
0.0029086904
0.0
0.014520553
0.0
0.015748499
0.00975008
0.006949476
0.0018496688
0.0035498107
0.0036051876
0.0035414684
0.0061738486
0.011504603
0.0020248014
0.0030107365
0.007026071
0.00045173947
0.0010578021
0.05659193
0.0009886824
0.0011070943
0.008718665
0.0004600502
0.0
0.0
0.0
0.013636146
0.0011734575
0.003832819
0.00406316
0.010396344
0.00036224938
0.0057163
0.0061424756
0.0016903796
0.0
0.0
0.0
0.00068991736
0.0012466238
0.0045614233
0.0011278188
0.008815819
0.0054118517
0.0015562114
0.0068833847
0.007438447
0.020282643
0.0052657495
0.003008724
0.004680754
0.015392989
0.00049611187
0.0007622472
0.0018072814
0.010232589
0.0008436321
0.0007858581
0.0011887134
0.00546034
0.001013123
0.0014913696




Elapsed time to compute best fit: 193.071 seconds
Cross-validation score: 0.745618615463365
Test score: 0.7509505703422052
Best Hyperparameters: {}
0.011592356
0.06334278
0.032487437
0.24425392
0.012449554
0.0021418715
0.0023661244
0.017260715
0.007856908
0.010773137
0.054027587
0.010113113
0.008297807
0.0
0.014128479
0.0
0.009951019
0.009463989
0.011449506
0.001060152
0.003438414
0.0034968173
0.006319425
0.0049007726
0.0029038263
0.0003302264
0.00051800336
0.014187456
0.0013232859
0.0043321447
0.03185768
0.0006429823
0.0016467132
0.0075607826
0.0004252693
0.0
0.0
0.0
0.005895712
0.036380418
0.003652819
0.0029219417
0.013535761
0.0016127336
0.0042520477
0.0
0.0010571978
0.0
0.0
0.0
0.0010642327
0.0019248242
0.0019478837
0.00070918136
0.0022354228
0.0012785103
0.0015989413
0.0022074026
0.005900862
0.011230497
0.0019944066
0.0024972176
0.006266907
0.0030406094
0.0032042526
0.00042978616
0.0012801297
0.010977428
0.0011683676
0.0022394396
0.0010013534
0.008476204
0.001217693
0.002153582
0.



Elapsed time to compute best fit: 189.768 seconds
Cross-validation score: 0.7584926757071828
Test score: 0.6903765690376569
Best Hyperparameters: {}
0.011161687
0.04569251
0.038338855
0.23753868
0.012434962
0.004130986
0.0009809488
0.0076407716
0.009108021
0.011142881
0.048115227
0.016477758
0.0037457899
0.0
0.023080252
0.0
0.0036132617
0.012944969
0.01247448
0.0006934728
0.0026192307
0.0057979957
0.009859637
0.0075023933
0.0010500229
4.3793152e-05
0.00064733194
0.015322598
0.001268728
0.0043478073
0.030665196
0.00045307327
0.0021717923
0.010789718
0.00060503185
0.0
0.0
0.0
0.0032439418
0.013745634
0.0048600226
0.0070530265
0.0019088539
0.0033491573
0.004545556
0.007865454
0.0009966143
0.0
0.0
0.0
0.0009635654
0.0010327394
0.0018937388
0.008141905
0.0011001149
0.00024292282
0.009164298
0.0026853066
0.0067795413
0.014727468
0.0015499607
0.0028925538
0.0037770728
0.0072096763
0.007634264
0.0001080021
0.0014043838
0.008081908
0.0004168971
0.0010661496
0.001848096
0.013319619
0.001209574
0



Elapsed time to compute best fit: 196.738 seconds
Cross-validation score: 0.7265589809137698
Test score: 0.7959183673469389
Best Hyperparameters: {}
0.015337742
0.064874984
0.04246613
0.23282926
0.014032687
0.011238061
0.0014530201
0.00056709966
0.010137229
0.006217873
0.04375459
0.01269084
0.0019706176
0.0
0.021687584
0.0
0.006861218
0.026878914
0.009828961
0.00029214032
0.004125878
0.0044421186
0.005554931
0.0010839193
0.0066912975
0.0006894812
0.0023456987
0.018144544
0.0016741959
0.004296917
0.023995861
0.0010530641
0.0010035858
0.004352878
0.0008338553
0.0
0.0
0.0
0.0037382238
0.010865347
0.0022619637
0.0008339561
0.008427297
0.007813069
0.011721294
0.0014196441
0.0029810963
0.0
0.0
0.0
0.0004778139
0.004856848
0.0013011581
0.0035883798
0.0025349
0.002669599
0.0014486738
0.0019422309
0.004904996
0.022739667
0.0050095394
0.0053483793
0.0020819607
0.0035105872
0.0019098435
0.0012081616
0.0015822593
0.009468644
0.0012574082
0.001437888
0.0017579624
0.006753747
0.0020237525
0.00100933



Elapsed time to compute best fit: 192.883 seconds
Cross-validation score: 0.7442229460338051
Test score: 0.7591093117408907
Best Hyperparameters: {}
0.009863241
0.06622937
0.030524962
0.22327533
0.011133535
0.0025716645
0.004331254
0.0077013117
0.0068128044
0.0059584943
0.046866603
0.0109172445
0.0058751297
0.0
0.017302597
0.0
0.0254538
0.0077015124
0.008634338
0.0006001958
0.0100147715
0.0023390432
0.009120167
0.0059062247
0.005433014
0.0013414923
0.0014957734
0.008287787
0.0014883092
0.0019629933
0.025906455
0.0030428488
0.0011850719
0.01007338
0.00080650277
0.0
0.0
0.0
0.012342573
0.014939425
0.0027390947
0.0050077797
0.008302334
0.0032308374
0.010589529
0.00038192514
0.0015473102
0.0
0.0
0.0
0.0012598243
0.024779448
0.001536979
0.0069271256
0.004195295
0.0006042413
0.00163582
0.0012495426
0.00690386
0.009940304
0.0020301684
0.0047502345
0.014436739
0.003783363
0.0013778529
0.00046371904
0.0010944669
0.012313581
0.00046991103
0.0009953822
0.0020532547
0.009914654
0.0011008302
0.0006



Elapsed time to compute best fit: 194.493 seconds
Cross-validation score: 0.7302633885050731
Test score: 0.7448979591836735
Best Hyperparameters: {}
0.01452943
0.060634833
0.03243454
0.22196701
0.015554966
0.008212931
0.0016433399
0.0050664684
0.012923694
0.0042771045
0.05582707
0.012634951
0.008137341
0.0
0.020427689
0.0
0.007676212
0.013559728
0.015385887
0.0087585775
0.008150714
0.0014398975
0.005801264
0.0034558105
0.00037730698
0.0005016626
0.0010603688
0.0050573163
0.0010087522
0.0036148648
0.028950408
0.003036177
0.00051254826
0.011205907
0.0011303779
0.0
0.0
0.0
0.0059124567
0.03153623
0.003950676
0.0011731577
0.027292315
0.002738474
0.006987045
0.0018414534
0.002039039
0.0
0.0
0.0
0.0009917336
0.017249594
0.0016658341
0.0023963465
0.004094516
0.0050321715
0.001078874
0.0025032342
0.009624381
0.012556102
0.0013230811
0.0025540134
0.003728406
0.0025088885
0.0024942954
0.0003371557
0.0008641499
0.0075641684
0.0005999304
0.0008585119
0.0014446758
0.018175784
0.0005538503
0.0017894



Elapsed time to compute best fit: 195.429 seconds
Cross-validation score: 0.7373622402459127
Test score: 0.7181818181818181
Best Hyperparameters: {}
0.01298186
0.04951579
0.031112565
0.2512025
0.005376065
0.00269838
0.005720179
0.014406105
0.011479841
0.002381277
0.0448862
0.009687088
0.0016110853
0.0
0.015725749
0.0
0.0060330057
0.017227279
0.011837762
0.00095198594
0.007834732
0.0026410571
0.0030699263
0.0028708437
0.008635196
0.0064606457
0.0008101919
0.009496331
0.002329641
0.0030501976
0.04006551
0.0017130441
0.0007550712
0.0029902316
0.0024816336
0.0
0.0
0.0
0.0051952302
0.021189677
0.004069138
0.0054711103
0.008485032
0.0024092158
0.001942785
0.0006961803
0.0013029723
0.0
0.0
0.0
0.0016912638
0.030072883
0.0018590936
0.0002551308
0.0006576082
0.001485017
0.0023942664
0.0020436458
0.0067629963
0.033364512
0.0048138797
0.001887416
0.0065989187
0.0045861467
0.002380549
0.0012172756
0.0008765449
0.00588593
0.002436857
0.0009921631
0.0013383406
0.012377316
0.0007681066
0.0045307362
0



Elapsed time to compute best fit: 194.801 seconds
Cross-validation score: 0.7417159800737257
Test score: 0.7470119521912351
Best Hyperparameters: {}
0.010135213
0.065448694
0.04262299
0.23536798
0.016904576
0.007835024
0.0015698097
0.008530185
0.009528007
0.001089776
0.047073014
0.011817823
0.0036969474
0.0
0.014258699
0.0
0.01045044
0.013318448
0.013262366
0.00070427166
0.0076220045
0.015063915
0.009301526
0.004243534
0.0038323503
0.0010600002
0.0011180816
0.0011176494
0.0006862744
0.0041190656
0.032054808
0.0008420281
0.0013969593
0.010500012
0.0006226125
0.0
0.0
0.0
0.006688107
0.0
0.0021750806
0.0022262973
0.00285018
0.0040605213
0.007422623
0.0047208383
0.004536089
0.0
0.0
0.0
0.0014599775
0.019883055
0.0017364497
0.0028746026
0.0015867173
0.0036477284
0.00515966
0.002123679
0.007031573
0.013180545
0.006372732
0.002673729
0.004751454
0.008346144
0.0026855895
0.0013883716
0.0009842451
0.007172615
0.00042529203
0.0020050586
0.0018813332
0.01390444
0.0015021242
0.0035994234
0.0011959



Elapsed time to compute best fit: 191.012 seconds
Cross-validation score: 0.74105042276971
Test score: 0.7684824902723735
Best Hyperparameters: {}
0.014902942
0.058677256
0.033002377
0.26130557
0.012684692
0.007129107
0.0067658992
0.012930236
0.015331833
0.010669155
0.031015662
0.00818798
0.0035443548
0.0
0.013139381
0.0
0.006518826
0.009644905
0.013810458
0.0036784878
0.0047065113
0.002766371
0.008642338
0.001920933
0.0015673814
0.0009454911
0.00074051955
0.00088390993
0.000801702
0.009853811
0.02949808
0.0009992145
0.0014413776
0.0038251553
0.0013652093
0.0
0.0
0.0
0.0019402464
0.0030841238
0.0020223141
0.0067236484
0.0070146085
0.0021626097
0.0016620818
5.5211327e-05
0.0010758598
0.0
0.0
0.0
0.0012514932
0.0007459476
0.002516691
0.0060548717
0.0039941417
0.0017705865
0.0061251367
0.0025364386
0.009800222
0.023721058
0.006092092
0.0026239273
0.004409605
0.0048532374
0.002600887
0.00068225333
0.0050901817
0.008270011
0.0004283699
0.0013856255
0.00056414097
0.011562614
0.0015607305
0.0



Elapsed time to compute best fit: 188.762 seconds
Cross-validation score: 0.7429430779935569
Test score: 0.7472324723247232
Best Hyperparameters: {}
0.014075778
0.051633168
0.042642836
0.25310645
0.023160072
0.0075060953
0.0026175152
0.010341648
0.013444189
0.012795836
0.045344777
0.008868825
0.0019292604
0.0
0.015521284
0.0
0.010211035
0.014739571
0.008514141
0.0016076468
0.0021434743
0.0038497425
0.0063609513
0.0013247193
0.00085584575
0.0005971063
0.0027498086
0.010055837
0.0019416664
0.002298349
0.025251362
0.0011336955
0.0006998395
0.009509113
0.0004567105
0.0
0.0
0.0
0.0088840965
0.03306239
0.0016354556
0.0020394765
0.0035915277
0.0036884323
0.0018037054
0.00019376406
0.0010305829
0.0
0.0
0.0
0.0011008716
0.0020299219
0.002955354
0.0034499716
0.002523089
0.00048462677
0.0017050052
0.005299857
0.010026896
0.018540286
0.0016535391
0.004264861
0.001216592
0.005296701
0.00076627935
0.0028854525
0.0009928265
0.006850217
0.0032258283
0.0009265022
0.0014225551
0.007982301
0.0012812363
0



Elapsed time to compute best fit: 192.596 seconds
Cross-validation score: 0.7420700342050581
Test score: 0.7127659574468086
Best Hyperparameters: {}
0.007053599
0.038195413
0.051469963
0.21604481
0.022225881
0.0021362521
0.005635032
0.014133386
0.0041141026
0.007345026
0.047442812
0.009713037
0.005053296
0.0
0.014404693
0.0
0.005748643
0.010448841
0.015708635
0.0014464249
0.00783418
0.0011198793
0.011831103
0.003829902
0.022909759
0.0043267603
0.00037576447
0.005192068
0.0003415678
0.0027633898
0.0237161
0.0013797459
0.0004987575
0.009022734
0.0006271836
0.0
0.0
0.0
0.0018097856
0.05623
0.003372628
0.0015748704
0.002888868
0.016703721
0.008800322
0.034264017
0.0016886254
0.0
0.0
0.0
0.0014770562
0.0018848408
0.0016806944
0.0028153213
0.0031805295
0.00072958483
0.0018486159
0.0033291362
0.0031575165
0.005173935
0.0005356773
0.0025287773
0.003308436
0.00433941
0.0021604777
0.0028751888
0.0010562115
0.008317952
0.0018030672
0.00082051323
0.0011298062
0.010973002
0.0011102265
0.0022406313




Elapsed time to compute best fit: 206.548 seconds
Cross-validation score: 0.7113630563111
Test score: 0.8016877637130801
Best Hyperparameters: {}
0.009975026
0.042742364
0.04515406
0.23606683
0.015372389
0.0033357781
0.0017122757
0.0066222753
0.01303444
0.011314544
0.052756764
0.0093622245
0.0024216902
0.0
0.015203258
0.0
0.0076223
0.010713442
0.011357888
0.0005652881
0.010844307
0.0025477142
0.0027210738
0.00504915
0.0032240523
0.00037054042
0.0013247969
0.019384507
0.0012206162
0.0028196808
0.049801826
0.0011045357
0.0009955642
0.008402316
0.00051246973
0.0
0.0
0.0
0.0030017183
0.018015472
0.0022544104
0.00499109
0.021347184
0.002338948
0.008078804
0.0031229844
0.0016041576
0.0
0.0
0.0
0.00064153277
0.0026498267
0.0018738104
0.0007355063
0.00026650965
0.0030516856
0.0005708808
0.0021497835
0.0060741715
0.010868699
0.005773968
0.0016997244
0.022988133
0.0035847565
0.0063955225
0.00043911228
0.0005967807
0.006276224
0.00075392943
0.00209505
0.0012904155
0.012170404
0.0017884485
0.00061



Elapsed time to compute best fit: 192.163 seconds
Cross-validation score: 0.7447761391422887
Test score: 0.7224334600760456
Best Hyperparameters: {}
0.011439118
0.070180826
0.030836405
0.21471755
0.00715636
0.00396769
0.0022935788
0.011059534
0.0051942123
0.0010841283
0.044460043
0.021463046
0.0017970717
0.0
0.01997199
0.0
0.004942789
0.010332603
0.020295395
0.0016543383
0.010467868
0.0032250728
0.009080526
0.0019246177
0.0004989302
0.005546985
0.0013559747
0.00601043
0.005061484
0.0043794494
0.043118328
0.0005013994
0.00078217435
0.00998367
0.00053694827
0.0
0.0
0.0
0.011747198
0.013879262
0.003779592
0.0042748884
0.036225125
0.0032915603
0.008278776
0.0056238575
0.0029612258
0.0
0.0
0.0
0.00092586974
0.001312804
0.00052646495
0.0043043867
0.0026286375
0.0007886924
0.0036334025
0.00039375422
0.007521037
0.017843524
0.0036355986
0.0014155676
0.009276854
0.00472618
0.0013970231
0.0013281782
0.0018577578
0.013893916
0.00062164775
0.0023355463
0.0025673814
0.01375254
0.0009987059
0.001126



Elapsed time to compute best fit: 192.142 seconds
Cross-validation score: 0.7456623801822685
Test score: 0.7296137339055794
Best Hyperparameters: {}
0.0084994035
0.06490964
0.03882697
0.24808282
0.012703417
0.0041271914
0.00562011
0.016802445
0.0111519275
0.008137742
0.036594596
0.020317124
0.00069123105
0.0
0.011898559
0.0
0.006861441
0.015287104
0.017663153
0.0012198829
0.0027979873
0.0012253819
0.015960524
0.001587984
0.00078199233
0.0009108721
0.0005434398
0.0030273728
0.0011116149
0.005515338
0.021920891
0.0010255753
0.00070701964
0.009650025
0.00064919976
0.0
0.0
0.0
0.007306139
0.0
0.0030906536
0.00156569
0.0012255028
0.0039160205
0.017684381
0.0035258504
0.0006047358
0.0
0.0
0.0
0.000635243
0.00085753686
0.0014484151
0.0010571573
0.009766416
0.00095940236
0.0033671798
0.0026306608
0.0046141543
0.032639783
0.019371102
0.001916008
0.0053916797
0.0070661157
0.003181018
0.00062973186
0.0017902913
0.011796635
0.004065613
0.0005627524
0.0014638887
0.006172667
0.0010809577
0.000608107



Elapsed time to compute best fit: 191.580 seconds
Cross-validation score: 0.7530484027337192
Test score: 0.7707509881422926
Best Hyperparameters: {}
0.0074342326
0.06759253
0.036916196
0.25417086
0.007798965
0.006965168
0.0017938041
0.030935442
0.014127007
0.0039069178
0.03796627
0.01701538
0.0014194321
0.0
0.013398366
0.0
0.0072205756
0.010460918
0.02496018
0.00074803387
0.0032792762
0.0042072283
0.011365861
0.0018177435
0.00077169546
0.00028751552
0.00075814227
0.0036406969
0.0011693516
0.00025343566
0.029706517
0.0020445818
0.0010466414
0.010462987
0.0007601464
0.0
0.0
0.0
0.0020829672
0.014236301
0.0026069237
0.00575392
0.0024267847
0.0033093656
0.006113207
0.0
0.001083373
0.0
0.0
0.0
0.001343398
0.0015062129
0.0015592013
0.006238821
0.0005903724
0.00063429726
0.0027558706
0.00043284343
0.010660393
0.024083797
0.0017236831
0.001223172
0.0031466538
0.0036933066
0.0012596467
0.0010302686
0.003315441
0.008020268
0.0021877314
0.0036044999
0.0015496919
0.0068798605
0.0019058737
0.001315



Elapsed time to compute best fit: 194.532 seconds
Cross-validation score: 0.7513813333187993
Test score: 0.7509505703422052
Best Hyperparameters: {}
0.009960299
0.04385435
0.045583017
0.21489792
0.013752539
0.003857227
0.0049784677
0.00859041
0.0050049564
0.0118162315
0.05757815
0.00904649
0.0014210201
0.0
0.026398828
0.0
0.006533885
0.013428433
0.010237938
0.0006295537
0.0033134327
0.0033560686
0.0021561757
0.004388219
0.00722305
0.0013825265
0.0035772638
0.010041157
0.00063889776
0.0039545763
0.04750438
0.008304942
0.00139237
0.008747506
0.00034487972
0.0
0.0
0.0
0.003602925
0.019378385
0.005073526
0.008028374
0.00016853391
0.0033160904
0.0087980395
0.00030214057
0.0017521441
0.0
0.0
0.0
0.0007808598
0.014964277
0.001781765
0.0023222165
0.002757688
0.00651979
0.0016118856
0.0009911904
0.0060680574
0.024479995
0.0032261328
0.002459873
0.007128329
0.007897484
0.013700624
0.0007821069
0.0005042993
0.008238176
0.0042967345
0.0015528776
0.001620716
0.012512021
0.0010877752
0.002732406
0.0



Elapsed time to compute best fit: 190.623 seconds
Cross-validation score: 0.7258858411410879
Test score: 0.8137254901960783
Best Hyperparameters: {}
0.0052918857
0.06652971
0.032544322
0.20644632
0.01541526
0.0035210717
0.003568153
0.011118191
0.012222709
0.0042522894
0.04057616
0.017365579
0.0022396576
0.0
0.023161134
0.0
0.014657663
0.0068711746
0.011423267
0.00038116044
0.0061192764
0.0015574171
0.0049945237
0.00047037657
0.0009931354
0.0004517287
0.0015757105
0.004051593
0.0010794308
0.0058543915
0.03175762
0.0034530787
0.0007018035
0.0052276235
0.00047369683
0.0
0.0
0.0
0.0014107988
0.07389981
0.0057652076
0.008144194
0.010217833
0.003429709
0.004556418
0.0
0.0026743957
0.0
0.0
0.0
0.0011835353
0.004531771
0.0023158602
0.0007234531
0.0037032831
0.018043201
0.0017074895
0.0015640869
0.0074255364
0.010485555
0.0044720666
0.0021066032
0.011020428
0.01176073
0.0053490675
0.00027446795
0.0008249712
0.011452936
0.004516542
0.0015001763
0.0010348626
0.00793998
0.0006647014
0.0008584166
0



Elapsed time to compute best fit: 193.721 seconds
Cross-validation score: 0.7428274040744502
Test score: 0.7722007722007722
Best Hyperparameters: {}
0.011666604
0.051721785
0.036039323
0.22951534
0.011857374
0.0043295487
0.0009223951
0.0071902657
0.009334827
0.015264731
0.05185328
0.011261412
0.014931522
0.0
0.016180903
0.0
0.007353923
0.007841704
0.015133268
0.0010341965
0.0042816033
0.002566973
0.0031093447
0.0045434656
0.00723055
0.002393125
0.0010835767
0.011887208
0.0013497443
0.0017655584
0.04292096
0.0015835911
0.00049802044
0.008263151
0.0006211426
0.0
0.0
0.0
0.015459277
0.003441338
0.0029329972
0.0028904919
0.000542004
0.0034548184
0.001964425
0.0019661433
0.000898951
0.0
0.0
0.0
0.0014477455
0.004281284
0.0027964565
0.00042773527
0.0018455597
0.0007335098
0.000465388
0.0005313482
0.011655682
0.013259074
0.003257083
0.00665227
0.0032930956
0.005268615
0.0031503704
0.00070112554
0.0004577077
0.006628084
0.0007822642
0.0010501267
0.0023324478
0.011420417
0.00088423013
0.0013389



Elapsed time to compute best fit: 191.979 seconds
Cross-validation score: 0.7359724762955734
Test score: 0.7432432432432433
Best Hyperparameters: {}
0.01088599
0.07088621
0.02780458
0.21921939
0.0076626516
0.0052301018
0.004639827
0.0061404426
0.014207691
0.0092589455
0.042307146
0.012784339
0.006086051
0.0
0.014239264
0.0
0.011579559
0.010843325
0.008351326
0.000773669
0.0035809027
0.00363144
0.0051770555
0.004214562
0.004729261
0.0016838791
0.0004742866
0.011376346
0.0010500781
0.0032351767
0.04026354
0.0026871313
0.0009336258
0.006939491
0.0003687095
0.0
0.0
0.0
0.010185592
0.0101526445
0.0007416387
0.004017608
0.028743802
0.0037826267
0.007295189
0.0047163935
0.00078896055
0.0
0.0
0.0
0.0008021473
0.0016666945
0.0017011747
0.0047531296
0.0006480366
0.0006991751
0.00062244263
0.0018713312
0.011115595
0.023534402
0.0006551878
0.0034814172
0.0010655342
0.007814791
0.0005589558
0.001745178
0.0005295135
0.011937023
0.005457194
0.0017308631
0.0014445158
0.008954869
0.0018430216
0.0018828



Elapsed time to compute best fit: 192.927 seconds
Cross-validation score: 0.7233840039015755
Test score: 0.7868525896414343
Best Hyperparameters: {}
0.0082736425
0.05840512
0.034573864
0.22182721
0.014908007
0.0033735249
0.0025830285
0.009588895
0.01069895
0.0074327122
0.053530123
0.011858174
0.0039835447
0.0
0.013558029
0.0
0.0030369058
0.010116766
0.016233096
0.0026437035
0.0073260977
0.004090056
0.005612213
0.0038419312
0.004107092
0.00011470387
0.0006719409
0.0045202305
0.0029766487
0.0045538726
0.016872186
0.0018355353
0.0011384322
0.00926833
0.0008136784
0.0
0.0
0.0
0.0054138615
0.00218683
0.0030471992
0.008734292
0.04522098
0.006472061
0.0025958035
0.0029430615
0.00087044376
0.0
0.0
0.0
0.0015499749
0.039169498
0.00087750336
0.0005256625
0.0019245613
0.0026039034
0.0025652663
0.0030478602
0.0035048136
0.012847153
0.00015109948
0.0020191583
0.007479355
0.004115293
0.0052674864
0.0005975995
0.0010972869
0.0070092157
0.0006785144
0.00057784095
0.0010163003
0.01201169
0.0009314929
0

### 5.2.3 LightGBM

In [129]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

smote_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    smote_lightgbm_performance_nonnormalized_df = pd.concat([smote_lightgbm_performance_nonnormalized_df, new_performance_df])
    

smote_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/smote_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 16.902 seconds
Cross-validation score: 0.7099480727357724
Test score: 0.7194899817850636
Best Hyperparameters: {}
3900.371079325676
59152.480063438416
47898.93631482124
119280.19019961357
918.4079179763794
1950.923248410225
382.5296812057495
1198.4395923614502
2159.477682828903
160.21924304962158
10377.836953401566
2881.9588836431503
345.5498995780945
0.0
2143.817976474762
0.0
2204.613297700882
1061.5529433488846
1708.2432408332825
92.01044178009033
1002.0787310600281
399.30009841918945
821.9259569644928
137.27825284004211
100.85436916351318
127.28158903121948
71.50426983833313
818.1807438135147
114.91595149040222
580.8271074295044
3659.515256881714
138.22069096565247
183.26189839839935
3772.8329498767853
179.62153959274292
0.0
0.0
0.0
1155.4487628936768
578.0998954772949
617.3042297363281
664.0184555053711
413.9515018463135
460.5337951183319
430.3351626396179
9.358880043029785
233.86254978179932
0.0
0.0
0.0
115.83571982383728
3105.93168759346
165.3714



Elapsed time to compute best fit: 16.756 seconds
Cross-validation score: 0.7004816482907344
Test score: 0.7103825136612022
Best Hyperparameters: {}
5921.1541056633
64704.876873373985
37428.13244473934
99283.07229375839
987.7654886245728
749.3597033023834
348.72208857536316
1113.637427330017
834.6570088863373
733.76757645607
17213.983033776283
3531.718520760536
461.79728531837463
0.0
4730.241250753403
0.0
2892.705738544464
1032.440937757492
1062.6090532541275
63.467450857162476
965.6584670543671
225.89367127418518
1383.7612627744675
240.08264899253845
331.40453374385834
118.68348026275635
30.005320072174072
2011.2386627197266
104.44186973571777
632.1194410324097
6063.466977119446
253.3857854604721
199.0261504650116
3563.904029607773
70.14544034004211
0.0
0.0
0.0
714.8501334190369
222.2967610359192
491.6956355571747
786.4361951351166
303.4112503528595
535.8357267379761
604.620735168457
76.3834981918335
207.7481451034546
0.0
0.0
0.0
194.23107981681824
100.64150106906891
586.3448268175125




Elapsed time to compute best fit: 17.030 seconds
Cross-validation score: 0.7048233502172461
Test score: 0.661764705882353
Best Hyperparameters: {}
4253.689149737358
64810.426946759224
48876.38107788563
91054.64364004135
2145.5788773298264
1399.472220301628
188.6774706840515
1252.160834312439
1646.7380571365356
1003.5140264034271
17408.59711587429
3990.0660350322723
508.59022521972656
0.0
3669.759666085243
0.0
1431.0446178913116
1466.3476121425629
928.3371880054474
135.6100800037384
1080.000557899475
162.55825328826904
1304.7078601121902
228.1510692834854
203.65428805351257
40.351290225982666
32.64784049987793
860.9927945137024
89.11492025852203
1318.980866909027
8674.557468771935
85.50037050247192
60.910759925842285
2105.818926692009
80.5784205198288
0.0
0.0
0.0
1105.479700088501
323.03660583496094
497.99244928359985
256.43770456314087
0.0
674.6111776828766
775.9673562049866
0.0
185.52555990219116
0.0
0.0
0.0
154.90470016002655
492.6213322877884
167.56502223014832
30.724519729614258
65



Elapsed time to compute best fit: 17.307 seconds
Cross-validation score: 0.6958832994492389
Test score: 0.7180851063829787
Best Hyperparameters: {}
5584.057245969772
52428.80007278919
51930.729150891304
100077.33393144608
587.0601027011871
712.1595286130905
386.5489149093628
1534.4165720939636
1659.1428365707397
826.9153261184692
17522.156497716904
3493.895288825035
562.8525497913361
0.0
3306.2345373630524
0.0
922.5627007484436
696.3782210350037
1865.69144821167
74.87291026115417
1151.805932044983
434.7825071811676
1000.583712220192
452.0565791130066
68.21100902557373
74.72109079360962
19.283349990844727
2605.4364144802094
56.59523034095764
547.1920696496964
6545.062749624252
130.6324818134308
82.92585039138794
2099.131036043167
117.52260982990265
0.0
0.0
0.0
845.283730506897
333.5713562965393
328.24190962314606
584.0664148330688
96.33778095245361
429.01048946380615
559.2136135101318
7.935429811477661
458.2133409976959
0.0
0.0
0.0
201.0962918996811
179.18516993522644
173.83373320102692



Elapsed time to compute best fit: 19.511 seconds
Cross-validation score: 0.6864876212473476
Test score: 0.7950530035335689
Best Hyperparameters: {}
4360.5238468647
79199.21832585335
34547.10519635677
84861.94523882866
733.3141877651215
776.6282389163971
269.64181637763977
1046.975064754486
1371.6753120422363
796.8243169784546
17043.30936217308
5623.307381868362
952.6898713111877
0.0
4399.754866600037
0.0
1359.5385792255402
1486.1311156749725
1817.377918958664
20.396469831466675
1237.0654733181
128.30990314483643
1741.571153998375
232.7331416606903
188.71411037445068
51.56296968460083
0.0
1007.4288265705109
74.12237858772278
579.1902277469635
7998.410750389099
180.21805143356323
126.0276985168457
2670.1652863025665
23.24150013923645
0.0
0.0
0.0
986.9479491710663
315.36440658569336
1035.0085921287537
680.8907723426819
429.683251619339
675.464019536972
809.2985415458679
73.1920485496521
200.44631171226501
0.0
0.0
0.0
148.58325016498566
1728.6391999721527
80.83479022979736
79.9190292358398



Elapsed time to compute best fit: 17.075 seconds
Cross-validation score: 0.7009424530346003
Test score: 0.6818181818181818
Best Hyperparameters: {}
5365.485716819763
59330.346202373505
53033.435784101486
102790.85757684708
1831.6529049873352
212.0233016014099
376.45624566078186
1353.59010720253
839.1369721889496
934.2990074157715
17288.849063158035
2437.5660145282745
150.79161071777344
0.0
3191.9865748882294
0.0
544.6616404056549
1412.8247725963593
1413.5193132162094
63.95264005661011
555.880108833313
90.20675921440125
1536.1600127220154
217.32614946365356
143.46976947784424
0.0
68.38348150253296
986.2023416757584
42.05174970626831
732.5972738265991
5466.515163660049
224.1536122560501
254.39324855804443
640.7785782814026
70.97331011295319
0.0
0.0
0.0
138.3231234550476
183.6395025253296
1293.5506483316422
1152.3609416484833
142.92960596084595
434.1139512062073
607.8113121986389
2.5827999114990234
191.12813997268677
0.0
0.0
0.0
221.5562014579773
183.70542764663696
212.55408096313477
79.2



Elapsed time to compute best fit: 17.462 seconds
Cross-validation score: 0.695666764828615
Test score: 0.7245080500894455
Best Hyperparameters: {}
4423.402437448502
58217.290619015694
47843.594529271126
103612.07035827637
875.4086136817932
795.8040311336517
358.36251044273376
1749.00887966156
1911.652407169342
666.2333455085754
15847.288912177086
2436.1547837257385
68.87265086174011
0.0
5655.8478145599365
0.0
1091.4134347438812
1601.3441487550735
1598.515738248825
140.202211022377
370.87511444091797
211.3896267414093
2106.332120656967
429.8359606266022
28.400089740753174
14.398280143737793
44.60483932495117
1429.8575339317322
38.64323925971985
130.63491213321686
9111.356292963028
139.2824010848999
204.7732696533203
1560.6883219480515
87.24836111068726
0.0
0.0
0.0
1378.2735168933868
64.53520202636719
571.5114464759827
366.3952853679657
24.104660034179688
371.78457164764404
575.9777574539185
1.9247499704360962
338.2874093055725
0.0
0.0
0.0
186.1050500869751
289.801709651947
103.136889457



Elapsed time to compute best fit: 17.055 seconds
Cross-validation score: 0.6891064672817951
Test score: 0.6756756756756757
Best Hyperparameters: {}
6364.724728703499
52934.31886649132
43707.896997094154
109769.13946866989
882.4838098287582
1394.0443661212921
416.62125873565674
648.3889889717102
1442.3797702789307
464.1550419330597
17120.99944472313
4125.480103135109
495.92449033260345
0.0
3951.3993582725525
0.0
1422.030938744545
1599.4048433303833
2536.0411982536316
21.03367030620575
1979.19225025177
417.95466327667236
546.4939086437225
163.47466003894806
67.73640072345734
87.70284676551819
122.05619978904724
2660.7023535966873
45.04804980754852
592.5758061408997
6004.892676830292
151.80984115600586
66.48695135116577
3579.0572600364685
33.013349413871765
0.0
0.0
0.0
881.296968460083
465.81849575042725
961.7234590053558
167.2384696006775
151.3861961364746
490.85032534599304
1083.466349363327
5.896059989929199
164.2886004447937
0.0
0.0
0.0
99.10374021530151
50.507890462875366
142.8812105



Elapsed time to compute best fit: 17.032 seconds
Cross-validation score: 0.6835461520701174
Test score: 0.6915887850467289
Best Hyperparameters: {}
6341.147221803665
51165.559864997864
44059.26327443123
99711.94759070873
907.7189404964447
447.48654341697693
176.12437748908997
846.4393117427826
1669.7799618244171
1700.9761744737625
22457.86635518074
3394.651347875595
110.70072889328003
0.0
2993.2634630203247
0.0
1745.8897223472595
1952.3394969701767
1047.189204454422
19.407750368118286
630.877475976944
193.9202231168747
1015.7619023323059
272.20684480667114
110.96452808380127
32.98349952697754
72.19404029846191
2134.7716207504272
74.42241990566254
214.7369725704193
4964.458826780319
366.2470566034317
291.24991822242737
2311.3677830696106
118.14626038074493
0.0
0.0
0.0
778.2367329597473
77.00263047218323
758.762721657753
165.2511796951294
394.3265368938446
394.06511783599854
850.0492767095566
6.405940055847168
480.3472851514816
0.0
0.0
0.0
206.8156008720398
1149.433336019516
174.26534080



Elapsed time to compute best fit: 16.684 seconds
Cross-validation score: 0.7063115989504831
Test score: 0.6813996316758747
Best Hyperparameters: {}
2317.257621407509
76744.96936249733
45614.95752429962
95320.15180623531
674.733540058136
1475.6508885622025
212.15796887874603
382.79167914390564
1492.324018239975
374.6949141025543
15247.843441009521
5632.639905333519
844.2722593545914
0.0
4554.608289480209
0.0
3620.3098771572113
1348.8827517032623
2214.9079483747482
34.23909020423889
811.672848701477
126.43710923194885
751.7734730243683
371.9204980134964
86.1565899848938
8.659149885177612
31.12574076652527
884.3981385231018
15.733379602432251
149.08787977695465
5441.393172979355
173.9891586303711
65.59780180454254
3149.484027028084
104.34264063835144
0.0
0.0
0.0
1579.1143908500671
368.2761001586914
423.546648979187
633.9115591049194
5.350879907608032
1368.8714740276337
869.423154592514
20.264349699020386
135.4606202840805
0.0
0.0
0.0
138.75409817695618
113.03049063682556
66.02536964416504



Elapsed time to compute best fit: 17.051 seconds
Cross-validation score: 0.6797299719829798
Test score: 0.7545454545454545
Best Hyperparameters: {}
5450.037558078766
66658.94178140163
41125.87833225727
102102.02326798439
2000.7556464672089
600.4897202253342
364.01049184799194
250.49528312683105
1244.17607152462
253.40871596336365
15693.76724243164
2852.0655539035797
619.0121238231659
0.0
2713.3889710903168
0.0
881.3800331354141
987.7573355436325
1668.8072772026062
54.33037889003754
925.9873576164246
889.4981260299683
1384.715472817421
419.69801020622253
17.038729667663574
76.55370140075684
18.465850114822388
3712.0129569768906
18.22350025177002
671.113819360733
5914.753729224205
921.2338500022888
52.0251202583313
1245.544173836708
98.69453954696655
0.0
0.0
0.0
1258.951583623886
159.38908874988556
492.64445984363556
835.489753484726
180.13903617858887
581.6246502399445
340.55415546894073
102.9342589378357
601.495986700058
0.0
0.0
0.0
134.6643022298813
437.9777433872223
440.512717962265




Elapsed time to compute best fit: 16.911 seconds
Cross-validation score: 0.7044253858217627
Test score: 0.7117117117117118
Best Hyperparameters: {}
5070.678148031235
58627.438447117805
47843.005447268486
106573.50450825691
1282.5070937871933
960.5769815444946
109.51798915863037
1077.6039788722992
1567.445995092392
623.9884878396988
14748.54658794403
3283.077088713646
977.6462063789368
0.0
3400.9418720006943
0.0
708.8542740345001
1221.1198327541351
1663.6864159107208
188.4601833820343
277.115322470665
193.73023533821106
642.3060544729233
36.68454098701477
73.39288902282715
138.00566983222961
20.43535017967224
1618.3025860786438
151.20470070838928
998.5261756181717
6635.217361211777
247.51022219657898
118.88394975662231
3259.7507903575897
21.473549962043762
0.0
0.0
0.0
1354.2646255493164
186.18240356445312
473.5630741119385
678.3036720752716
45.90279150009155
430.97787976264954
281.5268484354019
0.0
349.65213215351105
0.0
0.0
0.0
183.12520599365234
339.66756761074066
310.83056819438934
2



Elapsed time to compute best fit: 17.495 seconds
Cross-validation score: 0.7159744062470057
Test score: 0.6727272727272727
Best Hyperparameters: {}
2887.5930891036987
64419.017167449
50736.3287191391
106227.69407105446
1808.0900859832764
2581.8402428627014
158.03208017349243
1228.1250159740448
811.2812922000885
88.49734854698181
10721.626001119614
4155.831585764885
1268.7261880636215
0.0
2316.0940551757812
0.0
2668.4575204849243
1860.6989885568619
1509.6506628990173
154.2381227016449
674.1084980964661
501.71077966690063
1172.5184519290924
320.5591526031494
87.16282200813293
73.98167848587036
42.01310896873474
1010.1571619510651
39.85316014289856
1238.4719619750977
4214.878399372101
460.17177867889404
74.64867973327637
3279.697033762932
307.03272223472595
0.0
0.0
0.0
1270.8494815826416
1086.4768867492676
365.637283205986
323.5845847129822
440.5092077255249
665.109870672226
1266.3431956768036
3.3139901161193848
142.0708498954773
0.0
0.0
0.0
104.98857951164246
205.14618754386902
171.01484



Elapsed time to compute best fit: 16.867 seconds
Cross-validation score: 0.6988495378721563
Test score: 0.6818181818181818
Best Hyperparameters: {}
1151.6057641506195
62164.91126704216
49664.26794719696
112489.3818860054
1596.2001498937607
571.7415001392365
639.393632888794
676.097428560257
1642.1224168539047
356.0229196548462
10067.589273452759
3605.120437502861
672.2443342208862
0.0
4175.738835334778
0.0
648.2717124223709
1943.8000729084015
1660.8647377490997
35.09843039512634
688.4359314441681
773.09605717659
850.4431916475296
316.8343303203583
625.9476313591003
118.75129127502441
26.698830127716064
1293.0654760599136
62.46346950531006
305.7698675394058
5125.485845565796
217.68992948532104
277.10284781455994
1714.6014506816864
108.6585705280304
0.0
0.0
0.0
1112.120080947876
111.34830093383789
797.5791573524475
537.2015166282654
1.6921700239181519
264.8538485765457
1224.1525559425354
32.96403980255127
406.1606891155243
0.0
0.0
0.0
351.663006067276
850.7439875602722
52.02891027927399




Elapsed time to compute best fit: 17.456 seconds
Cross-validation score: 0.6914281958359133
Test score: 0.6721915285451199
Best Hyperparameters: {}
5636.789045453072
51814.676302075386
54684.20792388916
103045.71916532516
2629.4013674259186
614.5678403377533
161.2435553073883
1031.8838157653809
1374.4630188941956
425.9916262626648
14108.074934244156
3974.566601037979
958.1912882328033
0.0
3622.586737513542
0.0
895.8344111442566
1248.7242622375488
2353.2175619602203
60.82170867919922
540.0358881950378
394.7691240310669
1036.0990334749222
284.4923782348633
28.575809717178345
179.31520080566406
18.71028971672058
920.0943031311035
127.02317023277283
774.4165418148041
4713.351580858231
51.354140520095825
152.82904267311096
2303.0212807655334
93.15503180027008
0.0
0.0
0.0
819.323573589325
343.02185237407684
410.8984808921814
958.6273822784424
21.231879949569702
424.6488026380539
648.2226289510727
48.05204153060913
56.82294034957886
0.0
0.0
0.0
236.50651001930237
545.740574836731
462.39825212



Elapsed time to compute best fit: 18.059 seconds
Cross-validation score: 0.7150267911896393
Test score: 0.6739526411657558
Best Hyperparameters: {}
3496.1049315929413
65508.40929162502
48086.121334671974
102903.45266032219
1995.6091253757477
843.8316069841385
325.09323382377625
2207.2642438411713
786.9386010169983
173.00263094902039
15640.644904494286
4349.165652632713
691.7651458978653
0.0
3038.218677997589
0.0
605.54261469841
1714.0712637901306
1238.630331993103
160.85786843299866
393.9669539928436
275.65134716033936
401.97906959056854
309.1804482936859
45.70301103591919
6.684500217437744
23.72492003440857
1876.1996648311615
53.315189599990845
880.9706737995148
5213.573926925659
139.53811693191528
67.90099954605103
1384.267379283905
83.02758979797363
0.0
0.0
0.0
1134.7924635410309
565.4887924194336
308.95736253261566
720.1162909269333
336.9098151922226
973.0972998142242
354.37929034233093
28.498850345611572
295.6906088590622
0.0
0.0
0.0
184.52223825454712
254.93363094329834
454.14613



Elapsed time to compute best fit: 18.069 seconds
Cross-validation score: 0.7006158507996171
Test score: 0.6517690875232776
Best Hyperparameters: {}
4394.758103251457
53604.2725135088
51502.69929218292
108728.92433273792
1812.4411475658417
1832.4622688293457
241.63729882240295
787.2965620756149
2179.628791332245
818.5391023159027
12113.489886879921
4537.763682484627
627.738214969635
0.0
2460.271353125572
0.0
1232.7868649959564
1831.0157265663147
1589.3345412015915
103.51403188705444
1586.8124318122864
662.0959460735321
724.6690872907639
304.0807888507843
298.77174139022827
56.075910806655884
23.24141001701355
1458.9223954677582
34.44155979156494
176.50939083099365
4773.9395896196365
145.41628229618073
194.40350663661957
2475.3957625627518
84.48839044570923
0.0
0.0
0.0
1218.4612078666687
270.42809295654297
993.0496417284012
427.21726155281067
62.26111888885498
476.5430737733841
641.2157797813416
45.93310022354126
203.95007848739624
0.0
0.0
0.0
157.60970962047577
249.6763722896576
128.980



Elapsed time to compute best fit: 17.360 seconds
Cross-validation score: 0.6748607483118271
Test score: 0.732838589981447
Best Hyperparameters: {}
2285.2293860912323
60430.40183210373
46897.10546827316
111162.34578096867
1149.0131480693817
650.8606204986572
159.39170241355896
2132.9140627384186
1133.8755853176117
29.716899871826172
11394.922811865807
4183.899742364883
316.31976795196533
0.0
4265.348390340805
0.0
1292.6543459892273
1561.0284719467163
2382.558998823166
161.8838791847229
907.332902431488
503.4614005088806
1397.119749903679
331.2338397502899
220.1469349861145
287.94746112823486
26.314669847488403
193.27531337738037
111.75680947303772
638.988388299942
5309.741581201553
232.93769884109497
94.64172983169556
3495.7700526714325
109.63077020645142
0.0
0.0
0.0
666.8008937835693
204.03550052642822
392.68667101860046
250.74112224578857
135.37423992156982
565.4498245716095
832.3973290920258
11.265800476074219
367.7507700920105
0.0
0.0
0.0
202.0403699874878
815.2449145317078
337.9883



Elapsed time to compute best fit: 16.796 seconds
Cross-validation score: 0.6925648824227754
Test score: 0.7001795332136445
Best Hyperparameters: {}
4200.5484136343
54990.069106936455
48868.11638855934
106272.16088795662
1573.3436233997345
1608.2637667655945
201.82959485054016
1297.5782659053802
1303.9266382455826
1022.3981997966766
14479.123188734055
5209.525472044945
816.0582540035248
0.0
2379.5912133455276
0.0
1038.6041071414948
1350.7875797748566
2101.3877716064453
69.66319119930267
780.4492893218994
567.3338840007782
1155.6314058303833
136.02455139160156
234.53991603851318
95.81460189819336
0.0
1162.7164001464844
59.62551999092102
326.7717092037201
8456.391734600067
355.48847806453705
69.52778005599976
1906.8277679681778
75.68315088748932
0.0
0.0
0.0
1706.5925614833832
530.4349627494812
342.6210424900055
169.4365484714508
515.893298625946
391.580090880394
552.02718770504
9.69359016418457
188.9149296283722
0.0
0.0
0.0
125.86858904361725
1166.78520321846
191.52959179878235
80.5146410



Elapsed time to compute best fit: 20.431 seconds
Cross-validation score: 0.6919917271649478
Test score: 0.6377079482439926
Best Hyperparameters: {}
4741.087108373642
50787.52330803871
53690.538999676704
97726.70022034645
1786.2649748325348
230.4459149837494
451.7165286540985
164.05966067314148
1099.3830020427704
488.4079577922821
16920.743730306625
3050.810169816017
1071.0073614120483
0.0
6115.542775392532
0.0
1208.1179953813553
849.0756287574768
1876.549264550209
36.33867025375366
755.4099454879761
160.86012315750122
1488.0898607969284
186.19173908233643
142.48500776290894
61.9643497467041
9.765389919281006
2623.5080976486206
141.8056719303131
755.4763736724854
8368.593299388885
226.93559956550598
47.34602904319763
3531.654639005661
83.05471980571747
0.0
0.0
0.0
685.7478893995285
0.0
297.3415507078171
59.876739501953125
327.83061707019806
1195.376078248024
635.5125379562378
10.366990089416504
142.55193758010864
0.0
0.0
0.0
92.94564962387085
544.7006126642227
230.73020946979523
136.520



Elapsed time to compute best fit: 17.825 seconds
Cross-validation score: 0.672369908814175
Test score: 0.7259528130671505
Best Hyperparameters: {}
3666.3835299015045
56760.24871253967
51277.265004873276
97143.41354894638
1979.9668999910355
789.4861688613892
464.39043164253235
1108.4815620183945
2523.0794513225555
254.6485414505005
16426.726028203964
2434.7035896778107
944.9889554977417
0.0
3781.25071144104
0.0
1046.7299653291702
1427.517252445221
1832.3340191841125
9.337120056152344
466.51395630836487
185.65921926498413
997.5378683805466
165.62167358398438
193.37548208236694
208.9482500553131
11.486319541931152
2272.5283093452454
72.52489018440247
588.5198802947998
5166.327541947365
159.5756494998932
86.18494093418121
4657.290601491928
177.77621912956238
0.0
0.0
0.0
1632.105840921402
52.76850128173828
478.9585461616516
141.49236154556274
152.18311142921448
487.83879828453064
598.1909437179565
3.1599600315093994
146.41356205940247
0.0
0.0
0.0
84.89999961853027
197.09407138824463
364.097



Elapsed time to compute best fit: 17.892 seconds
Cross-validation score: 0.7116443252204484
Test score: 0.7464028776978417
Best Hyperparameters: {}
5531.958305954933
63248.269982934
49726.68783259392
96366.05734610558
1351.542448759079
491.4131462574005
912.6091589927673
158.9367083311081
2679.40390253067
486.6381833553314
17750.85640501976
4342.840272784233
634.8704867362976
0.0
2303.744786262512
0.0
789.5950906276703
1007.0636987686157
1530.988467335701
41.223140001297
820.3543229103088
251.93578791618347
876.9959584474564
486.31188130378723
22.877599716186523
87.79714941978455
24.146519660949707
1310.9602267742157
36.498509883880615
1302.6057963371277
4399.330453753471
310.06325674057007
264.48297691345215
1776.0479373931885
23.825629949569702
0.0
0.0
0.0
1667.253180027008
1082.8203434944153
484.670068025589
89.58136034011841
109.36141777038574
1098.0884788036346
441.7266821861267
33.64736986160278
313.7306385040283
0.0
0.0
0.0
171.82280015945435
329.0022574663162
232.2337280511856




Elapsed time to compute best fit: 18.467 seconds
Cross-validation score: 0.7202395979136291
Test score: 0.692167577413479
Best Hyperparameters: {}
6997.63779091835
50252.03070604801
47409.7155585289
110406.40503406525
2125.2770874500275
215.27081298828125
135.88258147239685
838.9180858135223
554.1680235862732
863.8564283847809
17858.186023950577
2481.8483053445816
257.9123589992523
0.0
3169.0269606113434
0.0
747.5431718826294
2105.9832549095154
1335.8438764810562
30.220829963684082
1869.3208196163177
338.6058645248413
741.2292827367783
123.93777799606323
327.28197169303894
144.93162858486176
40.634870290756226
1048.0537610054016
64.6801598072052
72.92905116081238
2157.566005706787
173.57450127601624
35.156309604644775
5308.216179728508
77.59574031829834
0.0
0.0
0.0
253.68869590759277
110.47259902954102
749.41361784935
330.51696133613586
0.0
380.41802072525024
1128.8845269680023
67.58632111549377
161.79440903663635
0.0
0.0
0.0
285.4004417657852
153.6404619216919
800.5262686014175
14.942



Elapsed time to compute best fit: 17.995 seconds
Cross-validation score: 0.6985173877478379
Test score: 0.6813996316758747
Best Hyperparameters: {}
3543.3429757356644
57392.65256035328
59221.123529434204
103903.24221515656
1345.8192344903946
1329.203527212143
215.18648648262024
1139.4237592220306
2357.8077044487
680.5423212051392
11927.999646544456
2877.806119084358
674.1130938529968
0.0
2815.1778180599213
0.0
690.045458316803
1572.3097256422043
1581.7540900707245
179.80820035934448
454.2009754180908
389.70332300662994
484.3234131336212
189.27947795391083
269.3044602870941
65.44298887252808
107.93204641342163
275.725053191185
127.36660826206207
1159.2873351573944
4938.088877916336
163.77329015731812
148.47062754631042
1170.402629852295
145.78987956047058
0.0
0.0
0.0
897.0124835968018
711.8808994293213
653.2097233533859
1730.5172216892242
9.059100151062012
971.009659409523
1022.007071018219
6.6336798667907715
211.70837140083313
0.0
0.0
0.0
148.67568182945251
547.6650027036667
63.8091899



Elapsed time to compute best fit: 18.448 seconds
Cross-validation score: 0.704586677012954
Test score: 0.7169117647058824
Best Hyperparameters: {}
4300.055571079254
55022.31045615673
49819.01075053215
100099.87506818771
2245.847046971321
802.9349844455719
271.41444849967957
1481.5863947868347
1175.2291429042816
869.0530073642731
18235.07115828991
3550.468329191208
673.2486928701401
0.0
4166.862241268158
0.0
956.7923665046692
1440.3449432849884
1063.377096414566
125.32465648651123
1687.7815891504288
321.9295175075531
1604.4366900920868
275.78368377685547
100.51816844940186
10.570430040359497
69.3440911769867
1947.7833168506622
120.60043907165527
420.5415108203888
3920.7907407283783
164.91348934173584
202.78461861610413
3186.9361525774
85.60099053382874
0.0
0.0
0.0
810.395201921463
513.7961230278015
795.2659628391266
156.89549779891968
436.11900329589844
344.3380877971649
954.1574367284775
21.48721981048584
269.7064998149872
0.0
0.0
0.0
250.15655040740967
3732.6708142757416
185.917096853

## 5.3 Rebalancing Strategy - UNDER

### 5.3.1 Random Forest

In [130]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

under_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [#['smote', SMOTE()],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    under_randomforest_nonnormalized_performance_df = pd.concat([under_randomforest_nonnormalized_performance_df, new_performance_df])
    
under_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/under_randomforest_nonnormalized_performance_df.csv")



### 5.3.2 XGBoost

In [131]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


under_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    under_xgboost_nonnormalized_performance_df = pd.concat([under_xgboost_nonnormalized_performance_df, new_performance_df])
    

under_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/under_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 3.204 seconds
Cross-validation score: 0.236367501511174
Test score: 0.24124881740775783
Best Hyperparameters: {}
0.008932898
0.105446465
0.012237301
0.038926322
0.01035187
0.016863307
0.0125013795
0.025453655
0.0
0.037332963
0.008407761
0.07036311
0.0019039809
0.0
0.0213366
0.0
0.081243925
0.012032536
0.004944778
0.0010522313
0.008906705
0.0026579578
0.04153524
0.008759077
0.032173283
0.0
0.00030380042
0.0141612915
0.0
0.0011050543
0.0
0.0017085117
0.0037101787
0.0
0.0040199184
0.0
0.0
0.0
0.021656994
0.0
0.0005065724
0.0074417
0.0
0.008106202
0.0026315218
0.01223846
0.0006026797
0.0
0.0
0.0
0.0022273285
0.0014744141
0.003000947
0.0034027568
0.002651608
0.0
0.0
0.000985439
0.017292911
0.0024766088
0.018173683
0.0033696804
0.0035160067
0.0
0.0028879512
0.0013491049
0.0063736755
0.0
0.003135796
0.0044567985
0.008918232
0.004992852
0.006269652
0.003932522
0.00076918217
0.0
0.0
0.0035418884
0.013877351
0.0
0.0054494943
0.0029602686
0.006390042
0.0
0.000261



Elapsed time to compute best fit: 2.984 seconds
Cross-validation score: 0.23701334712464228
Test score: 0.23403324584426946
Best Hyperparameters: {}
0.015359615
0.10441049
0.009565899
0.06586302
0.018814975
0.019670164
0.012344416
0.010826367
0.009032095
0.032030832
0.004038565
0.07526053
0.0021001166
0.0
0.0021731227
0.0
0.040141348
0.02324748
0.010462643
0.030263893
0.0069285566
0.00069524144
0.011504449
0.01357932
0.017488157
0.029296393
0.0035615924
0.0076974705
0.0
0.0044439156
0.0
0.008859777
0.0012286699
0.0070205117
0.0050433953
0.0
0.0
0.0
0.009725811
0.0
0.0003988545
0.008042502
0.0
0.014502089
0.014898926
0.0
0.0009217674
0.0
0.0
0.0
0.002177653
0.0063820756
0.0030329167
0.022947213
0.0
0.0
0.00071279705
0.0044079567
0.010628175
0.0039033631
0.0016424822
0.003064805
0.0035592124
0.0
0.009966595
0.0016301975
0.009096656
0.0
0.0021439288
0.0096171005
0.004623651
0.0016435846
0.003583682
0.0035720041
0.001098424
0.0
0.0012475294
0.0042674253
0.005813111
0.0
0.007270923
0.016613



Elapsed time to compute best fit: 3.174 seconds
Cross-validation score: 0.23248489961423138
Test score: 0.24839006439742414
Best Hyperparameters: {}
0.009187599
0.09520037
0.010690296
0.051722717
0.02179799
0.044974893
0.05086818
0.010421977
0.0
0.00756252
0.037349835
0.03799007
0.0
0.0
0.012701637
0.0
0.030247593
0.020173289
0.0014277335
0.0051819026
0.013856829
0.0058165034
0.02094537
0.004177021
0.009866873
0.00053789705
0.0062247952
0.009081209
0.0055483123
0.00538304
0.0027776395
0.00026694004
0.0034820708
0.0
0.001836381
0.0
0.0
0.0
0.00778004
0.013203323
0.0035404332
0.004941285
0.0
0.0012017807
0.0033910673
0.001560811
0.005963138
0.0
0.0
0.0
0.004870277
0.0032727725
0.006085293
0.0013114406
0.038796965
0.0057838485
0.008200856
0.012648222
0.005257177
0.014491724
0.0068894885
0.012376036
0.008975209
0.008815257
0.0040763975
0.0011276677
0.0053534205
0.0
0.018056404
0.0038068914
0.003016665
0.0009384169
0.008409429
0.0024655217
0.0030086746
0.0
0.00065949274
0.0038577714
0.01018



Elapsed time to compute best fit: 3.016 seconds
Cross-validation score: 0.2416278164409878
Test score: 0.24495677233429397
Best Hyperparameters: {}
0.005606349
0.12349808
0.010715228
0.022746276
0.008519645
0.0104564335
0.008054382
0.01203345
0.00782626
0.019749139
0.041653156
0.06286441
0.00596213
0.0
0.009769093
0.0
0.0128719555
0.044771593
0.0015418112
0.00037613983
0.010436407
0.014231927
0.022252394
0.012513835
0.012754105
0.0
0.0
0.0055301525
0.0029747603
0.0028908644
0.006109099
0.003145314
0.008981449
0.0028512243
0.0022288638
0.0
0.0
0.0
0.014917332
0.084297694
0.0032011895
0.006392535
0.0
0.0021491942
0.007424178
0.01714973
0.0013604555
0.0
0.0
0.0
0.0097935535
0.0025605953
0.0040834253
0.008762936
0.0
0.01599868
0.007430732
0.006232735
0.00837877
0.012980249
0.002010158
0.00094534905
0.0071033197
0.0
0.0008158806
0.005028837
0.008055427
0.0
0.03404782
0.010412309
0.0022879061
0.005300104
0.005102847
0.012171094
0.00062902435
0.0
0.0
0.00921635
0.0043322127
0.0
0.0023281914
0



Elapsed time to compute best fit: 2.955 seconds
Cross-validation score: 0.2422204283798516
Test score: 0.22129710780017528
Best Hyperparameters: {}
0.008659482
0.09496852
0.025860732
0.02838529
0.035871062
0.015007537
0.024100157
0.035345063
0.0033391018
0.0015181297
0.019800462
0.102182835
0.003801707
0.0
0.0068155746
0.0
0.041760076
0.027528
0.0043172124
0.040050752
0.0044160197
0.00515578
0.040905584
0.0053766374
0.0024128344
0.0
0.0072435834
0.0054027005
0.002800491
0.0019107821
0.006223211
0.004426444
0.002374803
0.003591367
0.00071896164
0.0
0.0
0.0
0.010500882
0.0
0.0147222495
0.0060319225
0.0
0.0023768952
0.0040187105
0.0
0.0013936458
0.0
0.0
0.0
0.005061704
0.0055022817
0.0011881039
0.022590853
0.0010367797
0.0016621352
0.0013833115
0.0009579692
0.009886032
0.013052088
0.0008201682
0.00494091
0.006301048
0.00036689168
0.0071015265
0.0006631149
0.0057965377
0.0
0.002098891
0.0023633572
0.0031056583
0.0097782435
0.004714634
0.00097628403
0.00069791806
0.0
0.0
0.0052915835
0.0153



Elapsed time to compute best fit: 3.075 seconds
Cross-validation score: 0.2346206822083749
Test score: 0.24178403755868544
Best Hyperparameters: {}
0.016575601
0.07520435
0.0318284
0.035277963
0.013069217
0.030581819
0.008176452
0.0069598886
0.01495137
0.0059331553
0.0140246125
0.053049356
0.0017531394
0.0
0.0056524975
0.0
0.056940462
0.049487535
0.006266879
0.01162178
0.009079324
0.007828072
0.03455432
0.007914094
0.005727046
0.0005227763
0.0028847384
0.0041298326
0.0007763231
0.006983744
0.0
0.003881475
0.006227
0.00041626318
0.0051838206
0.0
0.0
0.0
0.0069121155
0.0
0.008771559
0.005957102
0.0
0.0037427992
0.026802152
0.0
0.0017517473
0.0
0.0
0.0
0.002454214
0.0013729181
0.00288807
0.0
0.0
0.0007429168
0.014013302
0.0030975528
0.004698747
0.00971881
0.024315257
0.010713371
0.0027856939
0.00029125554
0.009794612
0.00038692713
0.008877992
0.0
0.0036998335
0.007399538
0.004961698
0.02823064
0.003667505
0.0035849465
0.007614729
0.0017329111
0.002355969
0.0012282478
0.012975477
0.0
0.0
0



Elapsed time to compute best fit: 2.876 seconds
Cross-validation score: 0.22769603261907193
Test score: 0.23103748910200522
Best Hyperparameters: {}
0.014668474
0.110050015
0.025224868
0.031191537
0.019102281
0.0148929665
0.015389689
0.0021873356
0.04872795
0.00306355
0.0
0.08193568
0.00823969
0.0
0.031174483
0.0
0.034506515
0.04236083
0.0351591
0.0015662021
0.019798828
0.005242732
0.020843139
0.004741165
0.004814168
0.0
0.0046094013
0.0009684614
0.000970716
0.007940982
0.0032930041
0.00039675296
0.00987021
0.013333982
0.00583097
0.0
0.0
0.0
0.00231027
0.012909243
0.003471301
0.0025564884
0.0
0.0025749828
0.01694644
0.0
0.0031334918
0.0
0.0
0.0
0.0033898416
0.0012437137
0.00826813
0.00743629
0.00055471004
0.018827327
0.013441529
0.0047394307
0.002293
0.027087182
0.0
0.008272943
0.0
0.0
0.001785808
0.0005594795
0.011296251
0.0005109621
0.0007895924
0.00424416
0.0043818695
0.0057175714
0.01052554
0.003215841
0.002964072
0.00042043245
0.0
0.0020354546
0.007749234
0.0014977586
0.0024912916



Elapsed time to compute best fit: 3.119 seconds
Cross-validation score: 0.2416575621365326
Test score: 0.24223894637817495
Best Hyperparameters: {}
0.015650153
0.10082425
0.013315312
0.05033514
0.010781446
0.024178373
0.0065672826
0.012665335
0.015992904
0.0072037843
0.024752157
0.061058488
0.0030040285
0.0
0.012661457
0.0
0.014694503
0.05162008
0.0022495866
0.0
0.014791872
0.010577604
0.017203027
0.010794744
0.006834306
0.00065112865
0.0
0.011080776
0.0003428626
0.0009534912
0.009498437
0.005356831
0.0033579175
0.0
0.0019123596
0.0
0.0
0.0
0.016486537
0.0
0.020677261
0.0014672178
0.0062497286
0.0015654416
0.0028262269
0.0
0.0028849896
0.0
0.0
0.0
0.002415746
0.0025750678
0.0092785675
0.0010316208
0.0
0.0
0.006243865
0.009471633
0.010129859
0.0164461
0.0018338959
0.008177525
0.00042334144
0.0
0.009342004
0.0
0.0045976127
0.0
0.011032478
0.0053129257
0.0050242045
0.0
0.00948403
0.0021235421
0.01156055
0.0006339203
0.0005792269
0.0009062166
0.012259226
0.0
0.0022148986
0.0127694905
0.004



Elapsed time to compute best fit: 3.044 seconds
Cross-validation score: 0.23959890554216826
Test score: 0.22911051212938008
Best Hyperparameters: {}
0.0053811558
0.08609034
0.00951254
0.038527276
0.054801103
0.017488725
0.0037621672
0.011365029
0.022965262
0.01316813
0.08874938
0.007987569
0.0021692056
0.0
0.005346834
0.0
0.0055431784
0.042013608
0.027481982
0.0
0.0058256895
0.0034730344
0.028145067
0.00812827
0.007480574
0.0
0.0021970887
0.0027791685
0.0009930518
0.0022734609
0.0024992062
0.0063266307
0.0033433405
0.0040815393
0.0011661183
0.0
0.0
0.0
0.00782268
0.0
0.005540849
0.009015526
0.0
0.0031908965
0.010631748
0.0
0.0015750438
0.0
0.0
0.0
0.0039753094
0.0011560147
0.0034752102
0.0
0.0
0.0014334855
0.009944891
0.00038292044
0.0019592617
0.010736616
0.09590403
0.0038783136
0.0
0.0
0.0092981635
0.023293024
0.0059707244
0.014534897
0.0028507302
0.003170997
0.001870512
0.004049585
0.0089148665
0.0036071632
0.024319408
0.0
0.0
0.0020088889
0.013590873
0.0
0.0
0.00043189974
0.0072926



Elapsed time to compute best fit: 3.074 seconds
Cross-validation score: 0.22595378329696789
Test score: 0.24245196706312902
Best Hyperparameters: {}
0.010940769
0.0504091
0.034565523
0.10824634
0.04061524
0.019437438
0.00046500456
0.03903264
0.015278333
0.0052950657
0.010952687
0.04389112
0.0016323926
0.0
0.016150651
0.0
0.027510637
0.04855138
0.022532532
0.0006738298
0.0031759494
0.008550211
0.070313774
0.009633237
0.008655913
0.0050422344
0.0
0.0047511538
0.0005944735
0.0028789095
0.0049766097
0.009883065
0.0056405333
0.0015431728
0.005055231
0.0
0.0
0.0
0.0016823135
0.0
0.0050833616
0.0012276925
0.018188952
0.002322653
0.012751832
0.0
0.0022467142
0.0
0.0
0.0
0.0036337646
0.0014164923
0.00215448
0.0
0.0
0.002195862
0.0044627236
0.0063345064
0.0017349023
0.0067025926
0.0
0.0030705682
0.0027772908
0.0
0.017851999
0.00844727
0.021254253
0.0
0.010939738
0.0016661144
0.0011344741
0.00045159747
0.009765352
0.0027525376
0.0026741195
0.0
0.0008929741
0.0012934303
0.006447889
0.0
0.012544658



Elapsed time to compute best fit: 3.073 seconds
Cross-validation score: 0.2491012380339707
Test score: 0.2347310847766636
Best Hyperparameters: {}
0.015268898
0.087177634
0.021328198
0.037583742
0.042663906
0.01753485
0.0047584525
0.04835543
0.0020926145
0.032638267
0.07550017
0.017231733
0.0037065505
0.0
0.016801711
0.0
0.0023025398
0.02242533
0.0033522993
0.0008520771
0.0011285623
0.0047331722
0.01597433
0.030673444
0.0
0.0053083226
0.0047171456
0.0052755005
0.009793072
0.0025431588
0.0013779612
0.009578523
0.013425644
0.00092648686
0.002327284
0.0
0.0
0.0
0.011971206
0.0
0.0018888226
0.025413657
0.0
0.012089107
0.0016440008
0.0008740863
0.0
0.0
0.0
0.0
0.0014619908
0.0029418871
0.009899188
0.00142569
0.005747009
0.0004932238
0.0017742292
0.0
0.0034856973
0.0044051907
0.016188454
0.02023975
0.01076974
0.0016235436
0.02017205
0.002494456
0.017483812
0.0
0.0012477
0.009790009
0.0060962453
0.005905147
0.0058688927
0.008931804
0.005678414
0.00010719679
0.0035945394
0.031060489
0.00491508



Elapsed time to compute best fit: 3.071 seconds
Cross-validation score: 0.2303002118776774
Test score: 0.24085637823371986
Best Hyperparameters: {}
0.017722312
0.07780785
0.055312097
0.04133689
0.030144447
0.026409581
0.026658298
0.0016614752
0.010449814
0.013759478
0.0068169753
0.05402556
0.005156748
0.0
0.0024015962
0.0
0.014922427
0.016172478
0.00413293
0.0
0.015183091
0.0048656794
0.047688793
0.0027495804
0.0019362207
0.0
0.0
0.0035036423
0.0034730842
0.0020984355
0.0010206297
0.004138987
0.00942949
0.0022117128
0.0040504374
0.0
0.0
0.0
0.0061922814
0.018666228
0.0117695965
0.01058463
0.009935504
0.0073306146
0.00042169308
0.0
0.0041721324
0.0
0.0
0.0
0.0036419684
0.006427005
0.0054470417
0.0044572824
0.0031156898
0.0026613947
0.008510577
0.0
0.005181109
0.011734774
0.0086487355
0.014963488
0.0010498554
0.0035832573
0.012927875
0.0032116314
0.013576912
0.0
0.003469847
0.00705992
0.01168211
0.021295343
0.0066180103
0.0007586297
0.0017396027
0.0
0.0
0.009212887
0.006349228
0.0
0.0
0.



Elapsed time to compute best fit: 3.207 seconds
Cross-validation score: 0.24313350108381337
Test score: 0.24178403755868544
Best Hyperparameters: {}
0.011115514
0.11586232
0.02619204
0.046048626
0.01782126
0.03170186
0.016360015
0.01714591
0.012577127
0.07690697
0.00089645834
0.08092329
0.003956052
0.0
0.011704922
0.0
0.0034452835
0.024837652
0.0053941505
0.01633561
0.0071688206
0.011081513
0.030645704
0.020367932
0.0006834314
0.0028279007
0.0031869777
0.0019024098
0.0014114669
0.0026561075
0.003047381
0.009570053
0.0012049695
0.004471029
0.0015402266
0.0
0.0
0.0
0.003819547
0.0
0.0054161525
0.0026187196
0.0
0.013975685
0.007912493
0.0
0.0012060745
0.0
0.0
0.0
0.0012542405
0.0052145864
0.008201235
0.0
0.00021369026
0.010749887
0.010539878
0.0
0.011560098
0.012672114
0.006924865
0.005030841
0.0030146558
0.0045544617
0.010479975
0.0017223081
0.009848218
0.00011570309
0.003807105
0.006859491
0.0022037572
0.0
0.003320246
0.0
0.0072004353
0.0
0.007390603
0.002661017
0.020481473
0.0
0.000378



Elapsed time to compute best fit: 3.121 seconds
Cross-validation score: 0.231281840122676
Test score: 0.22727272727272724
Best Hyperparameters: {}
0.0050092675
0.08319835
0.013744527
0.053641606
0.018706264
0.014776204
0.017065365
0.0070932303
0.009225316
0.0
0.010930549
0.09138782
0.0041671535
0.0
0.013687574
0.0
0.017441522
0.01759798
0.009966194
0.022740463
0.0065732757
0.008658643
0.032113302
0.0021195034
0.009913069
0.0
0.0
0.009397855
0.020635258
0.009239248
0.0065843337
0.0016515796
0.004769093
0.0016264945
0.0052363276
0.0
0.0
0.0
0.004251572
0.013428171
0.004912865
0.012236615
0.0
0.001997339
0.015886217
0.0
0.0022125035
0.0
0.0
0.0
0.0046969927
0.01218821
0.0025473125
0.0016177
0.02777238
0.0042310604
0.004314817
0.007224737
0.00878135
0.0032658994
0.00030079164
0.0062405136
0.006829737
0.005718351
0.0022561317
0.0036683544
0.007981447
0.05272587
0.013259286
0.0043442883
0.002722143
0.0030222475
0.0032469623
0.0006175569
0.0
0.0
0.0
0.0138349775
0.018825497
0.0
0.0013781214
0



Elapsed time to compute best fit: 2.897 seconds
Cross-validation score: 0.2439922403356129
Test score: 0.23669972948602344
Best Hyperparameters: {}
0.009152232
0.104879245
0.011623736
0.030203832
0.015366937
0.0021095932
0.03555579
0.028411731
0.024922768
0.0037686785
0.015435827
0.047935568
0.0043359855
0.0
0.0012618386
0.0
0.020990636
0.028679268
0.016657557
0.009162596
0.017130004
0.0063480437
0.042170133
0.08622811
0.011564818
0.016211024
0.004559145
0.009876052
0.004559179
0.0019059944
0.002169657
0.009598585
0.0029899175
0.0013577736
0.0005450811
0.0
0.0
0.0
0.021329233
0.0
0.0015465282
0.0020710987
0.0
0.0040190336
0.001472094
0.0
0.0061910576
0.0
0.0
0.0
0.0015523368
0.0023877
0.0034335486
0.00032370506
0.0028045275
0.0006095009
0.021632018
0.0022199368
0.013107153
0.017263817
0.006736717
0.0017262573
0.0042624003
0.0
0.0060037724
0.012227422
0.0060543916
0.04976114
0.004595481
0.0043119835
0.0034804537
0.009101981
0.0022776544
0.0026697542
0.0029351392
0.0013978702
0.003509804



Elapsed time to compute best fit: 3.190 seconds
Cross-validation score: 0.22555222159705654
Test score: 0.22727272727272727
Best Hyperparameters: {}
0.009072816
0.086541615
0.022259519
0.040883478
0.032266017
0.0072571193
0.002563241
0.0
0.0016116786
0.006142021
0.008077963
0.042172708
0.010350218
0.0
0.011767402
0.0
0.012398298
0.016056823
0.00095066614
0.0033168022
0.026189744
0.0071175653
0.11894309
0.027047267
0.005113537
0.0
0.00887016
0.00066175027
0.0010776327
0.0039880304
0.009232881
0.00090929534
0.0016278402
0.0
0.0064722807
0.0
0.0
0.0
0.010579535
0.039832767
0.006454774
0.009636131
0.0
0.0034534766
0.004655589
0.0
0.0036003701
0.0
0.0
0.0
0.0033730294
0.0028378186
0.0016435266
0.000703683
0.0
0.0027597034
0.002581798
0.0
0.008035278
0.009125813
0.0138197215
0.010939615
0.0017690427
0.00021120088
0.0024506794
0.0029496734
0.008340931
0.0007531664
0.0067722797
0.004650097
0.009914553
0.0022977544
0.0059112096
0.01921545
0.0010301737
0.0072389906
0.0013101324
0.0053149727
0.00



Elapsed time to compute best fit: 2.902 seconds
Cross-validation score: 0.23409371834527337
Test score: 0.26884422110552764
Best Hyperparameters: {}
0.007679014
0.06471868
0.014198986
0.07780646
0.055156443
0.019945603
0.008066162
0.009805793
0.0
0.003076493
0.10449295
0.0373435
0.0013585351
0.0
0.007854826
0.0
0.015469203
0.015668802
0.011677306
0.0
0.02504026
0.003993275
0.037239715
0.00021993343
0.011519118
0.00065015384
0.0
0.014997016
0.0010956008
0.0010031565
0.0011382694
0.0038774933
0.0022362021
0.0
0.0017472535
0.0
0.0
0.0
0.009985336
0.053896736
0.0010900222
0.00034380396
0.0
0.0044903057
0.0143385045
0.0
0.0053266087
0.0
0.0
0.0
0.0019172634
0.0025961262
0.0048236093
0.0
0.028824195
0.002267343
0.004380859
0.0
0.0021195738
0.0010256973
0.0041978452
0.015536256
0.0007598459
0.029907746
0.013289984
0.0063794577
0.0055454667
0.0005298986
0.006006545
0.0036859193
0.005723782
0.00149034
0.008521046
0.01904978
0.004449971
0.018171249
0.0
0.0030130805
0.007184961
0.0028045771
0.001



Elapsed time to compute best fit: 3.030 seconds
Cross-validation score: 0.22535136950520157
Test score: 0.2507374631268437
Best Hyperparameters: {}
0.019229662
0.1083551
0.015964886
0.0527541
0.055427298
0.026909562
0.016401751
0.01546679
0.007917006
0.004281721
0.008582177
0.06825699
0.001742528
0.0
0.01521969
0.0
0.017054537
0.001476817
0.0022735433
0.008875964
0.011300976
0.008357923
0.016563162
0.0110417465
0.0027496258
0.0008273035
0.003625515
0.0017517967
0.0024914097
0.0020827649
0.0044287904
0.0002925083
0.0051681804
0.0010889265
0.005211635
0.0
0.0
0.0
0.009191006
0.0
0.0022967388
0.0040761693
0.0
0.0008455878
0.019656723
0.0
0.0038838808
0.0
0.0
0.0
0.0065331683
0.004834581
0.0036856404
0.0054725027
0.0027720442
0.0012097592
0.007771784
0.006322273
0.00034012677
0.046358712
0.0063735973
0.006021186
0.0
0.0
0.0045660725
0.013113395
0.003271799
0.004062006
0.0065152426
0.008060349
0.0019176025
0.0056768428
0.0103755025
0.006070126
0.001685138
0.0
0.00803764
0.0
0.0039940896
0.0



Elapsed time to compute best fit: 3.119 seconds
Cross-validation score: 0.23127542619959068
Test score: 0.2315227070347284
Best Hyperparameters: {}
0.014169849
0.13354701
0.01924396
0.04201268
0.018845117
0.03807828
0.00047675468
0.0068696346
0.0021502452
0.0
0.028039142
0.029940315
0.0032597077
0.0
0.00897395
0.0
0.021487588
0.0456519
0.004423682
0.004081475
0.007595136
0.00060211756
0.01440296
0.0014067806
0.00043622687
0.0052090865
0.013883904
0.006378677
0.0015439431
0.005300833
0.006539849
0.006132208
0.0059985765
0.0037972305
0.0024427446
0.0
0.0
0.0
0.002689412
0.0
0.0062550963
0.007823047
0.02213419
0.011581293
0.0022056182
0.0
0.0027697878
0.0
0.0
0.0
0.006362857
0.00039628014
0.004343917
0.0005884318
0.00066802534
0.0
0.00079449825
0.0069048023
0.009513087
0.02061657
0.0013731404
0.0037764264
0.004353784
0.007865037
0.019132633
0.004953255
0.007973382
0.0071981545
0.002285609
0.004751173
0.00200716
0.0
0.0063237464
0.0
0.0037051192
0.0
0.00027570044
0.003911343
0.008233547
0.



Elapsed time to compute best fit: 2.791 seconds
Cross-validation score: 0.24288749491131179
Test score: 0.2347417840375587
Best Hyperparameters: {}
0.008580804
0.10812471
0.01145645
0.037739493
0.0077964473
0.030895166
0.0070527475
0.010701266
0.04411713
0.030914828
0.016745906
0.05408603
0.0036621857
0.0
0.0026152376
0.0
0.005535185
0.024766788
0.013220678
0.016996857
0.011126892
0.0067720376
0.003228472
0.0017019762
0.00087602105
0.007797957
0.00024236205
0.008968025
0.0031730644
0.008393186
0.0026203354
0.0020830347
0.002094022
0.00870889
0.0021220786
0.0
0.0
0.0
0.0069855284
0.0
0.0071041095
0.0037699575
0.0
0.0072180405
0.007386708
0.0
0.0011449096
0.0
0.0
0.0
0.0062346933
0.0077236057
0.0036098748
0.0012009317
0.0
0.0
0.010771378
0.009845704
0.056946468
0.015938848
0.004608443
0.014468603
0.008853599
0.0
0.0059614074
0.006006075
0.004486967
0.0
0.0
0.0073081725
0.005950282
0.0
0.0032419679
0.0005287822
0.0
0.002099482
0.009409075
0.008566041
0.003766198
0.0
0.0016707109
0.0199315



Elapsed time to compute best fit: 3.155 seconds
Cross-validation score: 0.23036290120248068
Test score: 0.2804746494066882
Best Hyperparameters: {}
0.0068285232
0.061481457
0.029042987
0.10172449
0.02737624
0.0258813
0.004987246
0.013070973
0.0033210337
0.008459207
0.030581858
0.046300486
0.0060717515
0.0
0.014549932
0.0
0.01507383
0.013215396
0.009347969
0.0019891139
0.03219245
0.007800964
0.034181207
0.023652649
0.00046139833
0.0
0.00049106963
0.0057232725
0.0012625484
0.004555834
0.0
0.004390813
0.0015824357
0.0
0.0016763152
0.0
0.0
0.0
0.010985913
0.0
0.0045080795
0.007213044
0.0
0.00121488
0.016642623
0.0
0.0027317656
0.0
0.0
0.0
0.0035906823
0.012573623
0.0030034885
0.00039081703
0.0023833613
0.00032947268
0.00082536094
0.0
0.01194453
0.026420396
0.017994659
0.009806122
0.007275315
4.1594045e-05
0.0046160365
0.0041263266
0.007726192
0.0011491298
0.008399877
0.00432414
0.0035811178
0.0074738446
0.006813224
0.0076480154
0.0018907419
0.0
0.0011957199
0.011235136
0.0069027003
0.00096



Elapsed time to compute best fit: 2.999 seconds
Cross-validation score: 0.24203752454964458
Test score: 0.23430178069353327
Best Hyperparameters: {}
0.010003602
0.10885615
0.01835491
0.038405824
0.055899482
0.016484745
0.0021224527
0.012645372
0.03411032
0.0021475616
0.01687575
0.07864407
0.0048584905
0.0
0.009392061
0.0
0.008985777
0.0
0.005922246
0.0038007298
0.011095146
0.0013374669
0.0040609166
0.009192284
0.0045512314
0.021977454
0.009912506
0.0060301283
0.0007359039
0.004174276
0.008282864
0.0021035909
0.007827392
0.00285409
0.004125184
0.0
0.0
0.0
0.003934848
0.030018464
0.003531108
0.0037392539
0.0
0.01835759
0.052459285
0.0
0.0005698688
0.0
0.0
0.0
0.007454303
0.0049455706
0.0022572451
0.0020185516
0.001689057
0.001226799
0.004885088
0.0
0.0007152516
0.0075354353
0.015370773
0.0026319895
0.00793651
0.0015274229
0.0032262886
0.004992631
0.00060792186
0.00017059177
0.005305721
0.009215803
0.001868084
0.0065034674
0.0059846095
0.0009201094
0.009462461
0.0
0.0039060083
0.006730549



Elapsed time to compute best fit: 3.007 seconds
Cross-validation score: 0.21549003605622877
Test score: 0.22114574557708505
Best Hyperparameters: {}
0.012795819
0.11135613
0.01904132
0.05489642
0.017554956
0.008154875
0.006581836
0.010025983
0.00085245183
0.0
0.015707782
0.06411354
0.0065945196
0.0
0.006891063
0.0
0.009359587
0.019199647
0.008977883
0.0007625498
0.014305347
0.010070258
0.032334037
0.005292172
0.0
0.0
0.0015986316
0.006397624
0.00066271314
0.0027400476
0.01434547
0.007564424
0.0057163592
0.00088665605
0.0043348577
0.0
0.0
0.0
0.012592952
0.0
0.007509344
0.009119156
0.0
0.009796657
0.008414662
0.0
0.0054185246
0.0
0.0
0.0
0.0032196841
0.005119823
0.005728638
0.00044651388
0.008874167
0.006139816
0.0035031247
0.0044800625
0.017698124
0.010912635
0.0
0.007155764
0.0013679392
0.0
0.006467508
0.015452899
0.0052562477
0.0
0.0
0.005615196
0.016466215
0.014454806
0.006961315
0.0043010875
0.010429999
0.0
0.0016951882
0.0017430484
0.006265932
0.0
0.0
0.0024725983
0.0033477992
0.0



Elapsed time to compute best fit: 3.218 seconds
Cross-validation score: 0.23415835996520618
Test score: 0.2619532044760936
Best Hyperparameters: {}
0.011625928
0.085117854
0.013739719
0.023870947
0.0035404763
0.0246345
0.020316582
0.020760588
0.011501614
0.005972808
0.030234434
0.08820448
0.0011761235
0.0
0.013613508
0.0
0.015353894
0.0033308887
0.044477124
0.016065607
0.009081809
0.0137655595
0.03384059
0.0053942227
0.0022677537
0.00075684476
0.0060520484
0.022124112
0.0037656508
0.0035174033
0.005708033
0.0059889564
0.0042631426
0.0025387546
0.005122807
0.0
0.0
0.0
0.007289718
0.0
0.0011408475
0.006446529
0.0
0.002030459
0.010886976
0.0
0.00094759057
0.0
0.0
0.0
0.005205764
0.013553513
0.0050169155
0.0031328986
0.0
0.0051359865
0.02282454
0.0
0.008270662
0.007945857
0.0001906471
0.005103622
0.006565177
0.0016888641
0.009641523
0.0006721684
0.0019802034
0.0008785375
0.008871
0.0065169376
0.004319596
0.0018320949
0.004868975
0.0013933834
0.0009325358
0.0011299028
0.0043003582
0.0016379



Elapsed time to compute best fit: 2.779 seconds
Cross-validation score: 0.24682557211924488
Test score: 0.24418604651162787
Best Hyperparameters: {}
0.01602115
0.12103926
0.012850132
0.033533283
0.016195746
0.032529995
0.011973028
0.03069947
0.021291718
0.004480376
0.017807104
0.05155085
0.0054286057
0.0
0.030786473
0.0
0.014470752
0.0029554572
0.002948709
0.0023671857
0.012019133
0.021149946
0.033649523
0.00054600596
0.0020028858
0.0019062263
0.00018086037
0.0072763306
0.0033652666
0.011766444
0.00573401
0.0058680153
0.004546205
0.012587584
0.0018990512
0.0
0.0
0.0
0.0075978176
0.018983187
0.003407018
0.0044756215
0.0
0.00066557457
0.016023524
0.0
0.0027878613
0.0
0.0
0.0
0.004188456
0.00790893
0.00157258
0.0
0.0
0.006057332
0.014808511
0.0037350433
0.012341575
0.0023045875
0.005109695
0.011656374
0.005569311
0.002369188
0.005343778
0.0028352668
0.005712649
0.0
0.0011244102
0.0033879846
0.0028033832
0.0016773576
0.004606196
0.0015287973
0.008034304
0.003647622
0.0065791323
0.001581448

### 5.3.3 LightGBM

In [132]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

under_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    under_lightgbm_performance_nonnormalized_df = pd.concat([under_lightgbm_performance_nonnormalized_df, new_performance_df])
    

under_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/under_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 2.654 seconds
Cross-validation score: 0.5442983547468272
Test score: 0.5457380457380457
Best Hyperparameters: {}
102.56482813711528
2899.76322573139
463.66483466626505
503.710488024556
97.85973849899767
26.617625543022427
19.77260872721672
33.62322353411582
1.914439294487238
5.617603927850723
157.34156470421937
509.9597844036473
0.48840099573135376
0.0
9.223153146682762
0.0
71.7888124741944
127.8402286388482
24.666146193095805
0.4404008385317866
21.453447363788428
16.88864783977624
81.95686902850866
10.905291831120849
14.361005205024448
2.0664402171969414
3.5962906446438865
48.84102214441191
18.302681906307818
3.8857120160505474
2.2555190482817125
3.445211237271451
9.637089700912156
12.770899466471747
27.988893614707195
0.0
0.0
0.0
39.578473768646745
3.811746120452881
9.143721015951542
3.5921980251059438
0.0
10.906555224881458
11.590786973956256
0.0
24.251966911629097
0.0
0.0
0.0
15.827853435694124
20.1582634113729
23.374203259484318
5.968284328087748




Elapsed time to compute best fit: 2.185 seconds
Cross-validation score: 0.5398857010208227
Test score: 0.5274043433298863
Best Hyperparameters: {}
49.464144393193195
3296.0707585178147
315.69418557174504
408.648957844445
81.93083470537107
143.8082627371991
3.005401916586834
7.442603515269866
3.9112019156455062
0.23303818731801584
69.84524070213956
475.3873577627055
5.151126345692319
0.0
37.84287174039861
0.0
77.91622367167957
55.601354531272364
16.656397734859183
19.132957033136336
7.227243445385966
13.630447527449178
18.972647119527334
13.789026819169521
13.141938929798926
2.354512097314
15.942798369344018
46.60331336782792
7.505173727771762
14.978067878398633
2.4028760332173533
8.376281185192056
8.604425292003725
0.7162689554461394
15.727438549947692
0.0
0.0
0.0
4.772855860663867
5.959715008735657
11.16776104505125
1.3742517961072735
0.9525889754295349
22.719014227899606
23.94313447176205
0.00432544993236661
8.654190015797212
0.0
0.0
0.0
6.751498925310443
1.3804755110267024
8.0459831



Elapsed time to compute best fit: 2.482 seconds
Cross-validation score: 0.5423201136510053
Test score: 0.4960707269155207
Best Hyperparameters: {}
44.19894664166783
3126.1568743302278
561.2321672748201
335.3842000156754
65.11983299146937
214.08341911726214
5.321028790669516
37.09584405273199
1.079820904880762
13.500881283485796
42.13645897908441
364.990769128548
1.865151233971119
0.0
19.695449105161245
0.0
37.66435448973871
72.3310030131322
10.65090514962202
0.07515269976072858
72.70802915214472
12.699052354234937
22.44722521904623
8.708961578690833
2.91535033636319
5.087618799114921
11.550496432672844
23.576672824725392
8.232635524589568
10.718804277437815
14.354725643293932
11.031435265802202
25.09341128640017
5.12853636786167
21.528122881951276
0.0
0.0
0.0
3.7694525171525584
5.204100131988525
4.4193763199218665
1.0925054578355287
0.0
11.899837645611115
12.666930553023121
0.0
16.551230040204246
0.0
0.0
0.0
21.75740234459056
2.480340861454877
8.90866019722587
5.838140605035505
9.12768



Elapsed time to compute best fit: 2.189 seconds
Cross-validation score: 0.5387857630343362
Test score: 0.557341907824223
Best Hyperparameters: {}
89.17968952540099
2777.726568408053
578.5273414684091
460.28315428576025
4.155071501637565
248.35145966025925
10.736406956944847
17.014083581883597
5.1792863458395
3.031088158255443
47.54537227781748
530.4467359789159
34.45768417138606
0.0
2.35067508717475
0.0
65.83147399034812
17.058228188794146
64.78323835440995
0.6898995221126825
29.052733827224074
11.093225196382264
39.75690858839266
34.224944948754455
14.628384385403479
1.702178981155157
0.4491728898137808
35.129205804812386
7.6503852950409055
10.024576001131209
4.542332114192277
9.217176847289238
71.96935018237616
4.055087244138122
24.789080830176772
0.0
0.0
0.0
5.218142086145235
0.0
8.511110679048556
8.613879863834882
0.7764779925346375
8.014188802115086
33.61552504763313
0.0
8.64789965638488
0.0
0.0
0.0
10.619274249681297
13.109853873317661
25.544434081401704
8.11183441416506
0.937945



Elapsed time to compute best fit: 2.362 seconds
Cross-validation score: 0.5227160970700008
Test score: 0.5515088449531738
Best Hyperparameters: {}
72.23335481725564
2033.0110973995686
1489.8937458604182
388.86775793864047
16.11688728032472
435.40764240668176
14.932524732430466
8.794210320184902
7.576622102409601
6.796068510848272
58.72542051768319
424.1251738059341
0.21578259766101837
0.0
14.487380945720517
0.0
14.798277541378766
48.0669533118159
18.035669219550982
72.01330457636504
84.3454163344577
29.96136160187463
18.798376032099895
0.33661509697344627
7.414350334816845
1.2358835619920683
7.628967605664002
9.592199228424732
19.67329677298403
8.926574016663835
1.774920301697648
11.62243832283329
6.233991343994063
1.1010535343375523
17.04519859522884
0.0
0.0
0.0
3.733471187072837
0.9746899232268333
5.7216528954279084
4.58603541367313
4.394930147810783e-05
6.377903709032751
32.34870268204571
0.05710120126605034
8.12674393504858
0.0
0.0
0.0
4.788523350485946
9.503669014062552
7.48013771



Elapsed time to compute best fit: 2.259 seconds
Cross-validation score: 0.5355286841583617
Test score: 0.5299999999999999
Best Hyperparameters: {}
31.752182488249176
3190.586048655782
458.01496717012344
431.35981229907327
141.39693082001202
76.58345598026754
1.4464689652954803
70.68643465939022
0.2793419025838375
14.66345442731108
94.8030988061887
385.33780389559234
6.713173594442196
0.0
41.23605094623511
0.0
28.663667709249978
60.29421242987371
6.068351432473719
7.47387281912961
52.509985353150114
8.738626166114045
26.71453645036013
58.7732061798266
13.917691484559327
0.9976371802040376
1.0100609667570097
5.60382734855142
3.6556706685223617
8.85614066844073
1.211951189005049
27.616537450781237
5.5928601213527145
0.15463606321281986
11.960437638614515
0.0
0.0
0.0
10.436888135711605
0.3130979984998703
7.911929529625922
11.567533566918428
4.43625020980835
7.0671603854279965
21.334431277262013
0.0
17.688467288261563
0.0
0.0
0.0
13.21044465249679
7.768375657602178
10.945736344588568
0.8923



Elapsed time to compute best fit: 2.278 seconds
Cross-validation score: 0.5339541026251999
Test score: 0.5611111111111111
Best Hyperparameters: {}
65.77692700835934
2546.9256612718
527.0631387830249
490.35826527520067
276.00813552781324
154.98239579004468
12.085122109227086
14.29869367275387
2.965938509912121
17.5701023475267
344.0763085137038
45.274891378236134
5.631445914506912
0.0
17.01239965746936
0.0
150.07114564535755
20.690935208212295
48.48703049662163
3.135717892429284
32.59365975565743
18.14534000839467
35.05140786607626
9.406553321903715
5.479603808373213
5.13739326709765
2.881620348081924
12.909257695398736
5.940309159582512
16.56932975522767
2.6667795459507033
15.56365213069148
4.692087288713253
2.476773185422644
3.093804532806473
0.0
0.0
0.0
19.447273115338632
0.0
5.848510623117761
3.4858609085600065
0.0
25.04010777519568
19.01199515403823
0.0
1.7950164858048083
0.0
0.0
0.0
1.8202164042231743
12.86341738415831
13.063520071284984
6.314867121167481
0.6171630173921585
3.3129



Elapsed time to compute best fit: 2.417 seconds
Cross-validation score: 0.5379034994002639
Test score: 0.5780022446689114
Best Hyperparameters: {}
31.007976988372995
2220.0824348705473
770.4046548075733
877.0617627543716
42.16278033281148
149.09729341510138
1.4473418019632982
31.23482552714995
12.679420321258796
32.6808557679542
128.97162453983765
379.2732174052326
1.8043351140804589
0.0
10.68814196837775
0.0
101.11281981336049
27.494560113320148
124.52666955685709
5.367918482262658
30.852811589687917
23.455425571548403
87.23409892195642
4.2005707193603286
7.0437484961003065
1.0138647545609274
0.90777238458395
13.803414130270582
2.4907379658997826
16.860084383115463
4.161925439548213
14.890321455258345
10.57540290587474
5.862268734113968
5.498333331430331
0.0
0.0
0.0
11.439966428093612
0.0
27.240172692016642
26.756983083396335
1.70441193816373e-05
32.00674985148362
31.56112406812781
0.0
27.087710537382577
0.0
0.0
0.0
43.473330710666914
9.095587750023697
19.037881680536145
2.32467469014



Elapsed time to compute best fit: 2.401 seconds
Cross-validation score: 0.5202374842710066
Test score: 0.5514316012725345
Best Hyperparameters: {}
56.981848474559655
3025.6448017035714
594.6497975312059
500.6797086909114
16.415604439932395
22.554384535185363
30.870453984680353
17.238823451752978
6.443593125382904
22.823452488759358
517.7945091667832
81.72501488917624
2.9963906817138195
0.0
15.047387667000294
0.0
2.4877035594223003
11.72300206964428
2.0515251016295224
2.250995303153104
18.094329393959924
6.462731330189854
71.49239160595755
40.17504464412923
48.46050561483207
5.068765770374739
5.6584627944394015
14.368708554381556
4.854416014257538
12.263953477341985
0.9661772707477212
27.765240911911064
22.80507027502972
0.8200218757119728
11.764985079256235
0.0
0.0
0.0
8.373546055052827
4.450422421097755
14.419423856566937
18.298825506220282
0.0
14.78030611759391
12.031944460637419
0.0
15.701569028897211
0.0
0.0
0.0
17.93153672268636
6.347880227782298
6.995571577284693
3.88560330728068



Elapsed time to compute best fit: 2.209 seconds
Cross-validation score: 0.5371075606357174
Test score: 0.5081300813008129
Best Hyperparameters: {}
109.87438368639155
3535.6934972552776
192.7288155987218
439.66746406899495
54.3557036279244
44.56575156795694
2.7666173800826073
10.673129249364138
10.693342373357154
7.492631269211415
159.194491457967
343.01369789889867
32.731136437505484
0.0
6.483525777093746
0.0
24.78102806989631
68.03725040549014
15.319721655812828
1.4321966172574294
102.03832246310003
0.5996279286037236
45.7350065955834
42.143483005814346
2.1732225851119438
0.47125159065763
5.498143630800769
48.26286583833121
0.7398527401906456
3.5614025386262895
25.117381842478807
5.031061307594107
9.036398846671204
24.975913768630562
13.227945545529092
0.0
0.0
0.0
14.594459848004526
0.0
51.72391331775211
4.07149193514562
7.430739788105711e-05
42.35184586973037
12.062792301958325
0.2568329870700836
9.421728660803637
0.0
0.0
0.0
20.030396017155226
4.66713163490931
18.68620406580729
0.25



Elapsed time to compute best fit: 2.286 seconds
Cross-validation score: 0.543089141556408
Test score: 0.5656350053361793
Best Hyperparameters: {}
34.1310770931196
2964.294341878318
436.5547123797168
396.09331059333067
51.43139313685081
130.9561904271975
5.4253709010626565
35.605676154604225
1.3753254264179304
10.714628325775266
171.5961244951669
362.7372412510044
5.326249536126852
0.0
6.207591122506528
0.0
68.47499609673328
103.48684628821258
30.76503398847882
6.774042928314884
29.804424365734782
23.375485083794047
35.29051572798926
10.808527936062319
6.113921589743626
2.477566088145977
3.762516199214261
105.68357719766553
8.877628833055496
29.50527695677738
5.503480582688894
10.717921446878108
9.032969993991912
6.893288663100975
34.99121793258061
0.0
0.0
0.0
20.356444248496285
0.7920625768601894
5.486089050024292
1.735723231735136
0.33003598515642807
5.595249730297212
9.362406593696505
5.004240036010742
0.6346594436909072
0.0
0.0
0.0
14.839975303683218
11.292093905317842
17.9145098373



Elapsed time to compute best fit: 2.170 seconds
Cross-validation score: 0.5389494492324987
Test score: 0.526579739217653
Best Hyperparameters: {}
65.24091322111973
3163.7166960884924
357.69738827677935
352.36977757850104
30.566516426800035
66.8445574641155
9.72599315829575
20.7499501697161
4.476313934762018
13.579456366598606
76.77936575355126
623.2109280535103
9.829893827438354
0.0
12.831657027825713
0.0
21.97739957622798
45.226931146162656
52.1726123562521
3.915753499413384
8.397878295436982
68.71507381749507
62.58065779894241
16.714280192651586
2.1809917237551417
8.62042998051038e-06
4.208912969574158
6.297122277679591
4.929286984851387
21.320960333735457
43.93414057104201
10.433696673187569
4.609240097999361
2.988191299472703
14.847885427161486
0.0
0.0
0.0
16.941557178710354
5.902095079421997
72.83189734355335
3.409317199585672
0.0
44.96630899716749
9.366139701138309
0.0
11.47353616973487
0.0
0.0
0.0
8.221327003724582
5.999484283290258
12.301847999125584
4.542698854733885
0.0
4.015



Elapsed time to compute best fit: 2.233 seconds
Cross-validation score: 0.5423400347372309
Test score: 0.5393743257820928
Best Hyperparameters: {}
55.00220162000666
3295.9802549830097
445.28757355731113
413.7368640341822
182.32291557546992
143.28226269126085
0.3405570089817047
0.32034439221024513
0.5303293839097023
10.735522023096564
53.000416095320475
397.74375817008905
0.026310670655220747
0.0
41.016432836135465
0.0
97.41887498572069
84.27711386634113
75.34236799124608
22.319122659885608
7.403203394263983
17.34002698247508
36.583967028606395
3.3075047999009257
7.525578334927559
1.2372366142965916
0.5018840039405177
1.1630670515778547
0.6788266177882178
15.926354631619471
0.07651808403932137
11.387995500015677
25.274134174217807
0.3130892602202948
4.011940493597649
0.0
0.0
0.0
12.923834710312164
0.0
6.23039573332062
6.6021844624725645
0.0001573350018588826
6.5462182575808185
58.01706761340567
0.07386469841003418
20.916418227760005
0.0
0.0
0.0
8.367530812344981
1.8993389748943628
25.55



Elapsed time to compute best fit: 2.407 seconds
Cross-validation score: 0.5237485462005076
Test score: 0.593607305936073
Best Hyperparameters: {}
46.767284725119055
2125.9868807385933
1202.2236781946942
401.0147748896875
97.06610637600716
329.3617071455716
6.201453008805402
15.096836006712692
6.617796903476119
10.624143664246365
61.546365251285664
449.17145122446726
16.43732983316727
0.0
23.486873264954966
0.0
83.26537975491762
47.43201687599728
4.158169498712084
3.068927725946196
64.97426177274338
46.39437856565928
34.87823208400732
1.5962382719153538
13.250186238685167
17.8810838181073
2.2447778029365537
11.374790664965417
9.505886783937967
43.40775752647062
8.167246407829225
1.2907758711658062
40.323959016698836
4.453071694260871
17.720631960855826
0.0
0.0
0.0
13.323709187706754
0.0
16.09458417481983
10.535540045399102
0.0
34.81594526226297
16.807841304398607
0.0
11.671116278467935
0.0
0.0
0.0
11.233419185317246
9.28346929697372
20.13776064979665
3.002219472080469
3.8828339874744415



Elapsed time to compute best fit: 2.138 seconds
Cross-validation score: 0.5187796978192922
Test score: 0.5502136752136753
Best Hyperparameters: {}
68.84791323459883
3192.808033749028
502.2132031613137
548.046016467422
60.20975814133401
108.64694361380333
1.9995654188096523
27.78233564714901
2.0501970052719116
4.0613719522661995
24.46336502645441
461.61871877350296
1.619578382529724
0.0
15.627762742951745
0.0
48.73386197940272
36.69076879378054
28.761123870528536
2.9562855515779063
34.344447248975484
19.58807156223338
38.9852581473441
52.45029054549261
2.796346839517355
4.5676151043735445
3.4241340360888444
15.857051065956057
3.9051434260477436
12.568856168772271
1.6567078525869192
30.63951285350049
4.234915186909664
10.913494761334732
12.308900100360631
0.0
0.0
0.0
2.8733547873380942
0.0
8.83107513233007
7.633161673817085
0.0
4.216720837775938
25.04313220973927
0.0
9.662326116114855
0.0
0.0
0.0
15.467915347606777
5.954924079563398
22.02099655781967
1.069676546845585
5.622220123768784e-



Elapsed time to compute best fit: 2.349 seconds
Cross-validation score: 0.5215992289670159
Test score: 0.5472103004291846
Best Hyperparameters: {}
38.53045481482495
2843.290194553614
473.2490658090376
488.60537170986754
207.254493918776
121.02135928960485
4.978251277469099
19.126590155212398
8.442620672285557
3.5049268899019808
38.05457089292963
409.4340351082046
25.052119225591014
0.0
33.91264438876169
0.0
88.3989041806069
22.597542871317273
19.680382308979162
4.676061601872789
34.92930962704122
0.9353010105421617
20.767924145853613
16.53206845678733
28.507349509221967
0.21920470520853996
4.148891940764088
29.570619970851112
7.567027510542175
9.310777809722367
5.5615637130104005
25.017125077312812
8.86075049973806
0.9584585332559072
24.806302431068616
0.0
0.0
0.0
16.41594420458341
0.13887259596958756
15.524616212445835
33.626165472255195
0.0
10.249111197343154
18.344526860203132
0.03276750072836876
17.53537531742768
0.0
0.0
0.0
29.549697214240155
7.79342081809591
35.25668861109466
0.1



Elapsed time to compute best fit: 2.276 seconds
Cross-validation score: 0.5295709611300274
Test score: 0.5344295991778005
Best Hyperparameters: {}
73.29582027337987
3273.483310255116
388.0456833380087
401.17976446263776
110.70114771350828
107.01415113725024
27.877657889272797
34.3966183609308
16.087173461914062
4.503367752582562
25.490987778939598
304.23150172900716
9.160246446728706
0.0
37.32407706420054
0.0
75.50683311540686
13.421269219610338
22.54328445106421
5.236804274063616
103.99153898290706
38.23968165093629
42.77385926958249
29.89979320857674
14.597883453235227
5.914697530809008
11.55381664520246
37.090368759708554
7.118626064620912
24.41009820317413
28.694348602439277
2.222790122234187
12.419523095712066
0.04982378762906592
9.258127312651965
0.0
0.0
0.0
50.05024251658665
0.0
10.326432818127387
3.8208945301180393
0.5806959867477417
15.801119637695876
4.282983593869176
0.8103846148878802
3.786886612419039
0.0
0.0
0.0
30.581055217124245
11.23089450280122
10.852926412788861
26.5



Elapsed time to compute best fit: 2.374 seconds
Cross-validation score: 0.5377910977697504
Test score: 0.5395299145299146
Best Hyperparameters: {}
39.36138512068285
3036.2722035218267
360.1831238786225
432.80521175180706
131.09477291631964
63.3195287212175
13.654749888926744
36.323790585869574
3.682681331563799
9.78458906468586
178.24605144107045
403.05603049289857
3.641858679481004
0.0
17.60661328808419
0.0
110.76777067100011
42.96408786492269
50.988400063473904
14.510274764055595
33.080134814197535
12.454531762748957
63.0739110772854
13.34955386351794
5.713350068384898
4.872278864693726
2.2006280877226345
40.81469526025103
15.155646278930362
26.824409632777318
1.3468056804502453
10.24632693419833
27.440338590771816
5.77636749483645
26.872984156453498
0.0
0.0
0.0
5.08103490769372
8.635020094516221e-06
9.489694641088363
19.917952502542903
0.0
10.357698332626569
25.163734030909836
1.7639180123806
11.559594089449092
0.0
0.0
0.0
2.611138911517088
18.20012006835219
34.181555355407696
7.845



Elapsed time to compute best fit: 2.191 seconds
Cross-validation score: 0.5388346777470564
Test score: 0.5425963488843814
Best Hyperparameters: {}
84.3273459790434
2876.794990496251
484.50687024176375
237.76038811797844
27.29561046923027
332.55805372197574
8.864815502187412
80.43057473950347
4.429993949830532
5.913261078298092
5.321963201462367
492.86191766766603
2.5669200774282217
0.0
25.107627687225545
0.0
12.071078994999823
38.73223010216043
41.339122591004184
19.380134145496413
12.06488372408785
15.64442999730818
52.937833670692555
3.386918817177502
16.460041131824255
8.778419486247003
8.381743703968823
38.77529557361606
14.82483579750442
9.376286812985526
10.184475508343894
28.999865082325414
13.278108394190099
2.1435062984648425
12.358048478214187
0.0
0.0
0.0
14.660107908998157
1.9578700065612793
5.764495329349302
14.59329091587415
0.0
21.791178585146554
19.42900969384482
2.0832200050354004
5.694955495831678
0.0
0.0
0.0
27.653739055800997
40.18960633880124
33.16176129462838
7.474



Elapsed time to compute best fit: 2.349 seconds
Cross-validation score: 0.5400726359448178
Test score: 0.5234297108673979
Best Hyperparameters: {}
77.78812639804148
3038.0774801773
470.9217525737665
375.97442971659416
105.09283641297473
111.62920810780446
2.241411845825496
28.067364348660703
6.640656000003219
2.1023841947317123
100.69695252809834
428.1723494454756
3.9074206724762917
0.0
6.52421747148037
0.0
31.388610636176878
12.574977177784604
49.60822173588349
1.0853619634581264
19.40179396364192
0.3576719038246665
34.5633155182004
18.309540688393554
12.970909064635634
0.9576003588736057
4.713341362890787
46.40314299756483
18.664174882877887
18.924514717356487
1.8701634042081565
7.4297393917287025
10.13864161293224
3.9479818902909756
15.474936719176185
0.0
0.0
0.0
16.62462291237898
0.0
18.40590640157461
17.413496252413097
0.0
6.327272751082916
7.445581695790224
0.0
15.569249305757694
0.0
0.0
0.0
19.829532622582718
31.502245687829827
36.08738437093484
1.0079308785498142
2.656289555132



Elapsed time to compute best fit: 2.239 seconds
Cross-validation score: 0.5210554129151046
Test score: 0.5246913580246914
Best Hyperparameters: {}
92.53009137423237
2969.9270782347603
613.6495885689372
588.2664067042087
109.13448237215862
92.81629816977897
8.5402730187634
0.06644151691921252
2.0511634880676866
25.739374180573122
161.83653992003735
330.96547836903505
3.1551406170474365
0.0
31.374151418221402
0.0
50.896216409674594
75.18898684118093
12.66531430518694
2.180169484300933
47.91912229699426
7.829404098450482
49.65078104650249
6.66219566448674
37.15854210712011
6.33577024514193
18.23026255156651
0.7194867017394415
2.008541933409589
20.902151810813393
0.04037636606960149
7.440188094375884
16.58811354579302
5.515121024567634
8.7136174500489
0.0
0.0
0.0
1.9160337082289587
2.3958047994965455
2.9022431823245824
31.823651720341473
0.15017060190439224
20.565434284063258
9.750584672001995
0.00012519900337792933
5.221306880937391
0.0
0.0
0.0
9.902121420006779
3.0307344471875695
8.78605



Elapsed time to compute best fit: 2.345 seconds
Cross-validation score: 0.506732580265246
Test score: 0.5190989226248776
Best Hyperparameters: {}
65.04081013632563
3101.5716475374575
385.67311573413724
467.98175030930634
70.67045697824437
107.77603479536674
10.248673351481557
39.306545681132775
0.03256748432886525
0.33706479519605637
13.802105727964744
328.9151163223985
3.0159418146000263
0.0
51.23864354247344
0.0
123.00044346577062
44.25007156241898
10.35884616678527
9.131510299479714
10.751475419182498
3.5748416485361325
92.61929336289239
30.97473007370536
3.135855639586225
2.5064861681312323
8.862373443174874
32.535824307657094
4.706349606356525
10.929445802795442
0.7149760806541039
0.9788213857640216
12.739267964685325
1.4534908644855022
4.542983962046549
0.0
0.0
0.0
11.539142829045886
13.538561251014471
20.888014271760937
16.737612990909838
8.468239684589207e-06
30.995555614610566
29.93425153366843
0.000192620005691424
1.3091201579372864
0.0
0.0
0.0
51.32073179418694
12.2081059621



Elapsed time to compute best fit: 2.347 seconds
Cross-validation score: 0.5353469588852185
Test score: 0.5447070914696814
Best Hyperparameters: {}
81.49471981446192
3204.9800112840057
501.6174447014645
449.3413446761796
157.96571706353734
118.06675240803827
2.0716085004387423
39.76167410652852
8.606276366859674
12.66800568625331
115.76217904858754
199.05955105498288
2.1441048878025413
0.0
8.343042542458079
0.0
20.09229356265263
36.76208960947514
35.25137168202434
0.7108851913362741
14.31907830156706
2.923968033352139
103.25381409056718
15.80237943866669
1.0603624526229396
1.7115498171187937
4.022790860642743
27.558130416487074
20.100405253004283
21.928672264879054
10.235810224231955
4.020425440452527
4.057682690630827
4.982625780970196
13.688731819316274
0.0
0.0
0.0
3.61853642146491
0.0476284995675087
6.009511536194623
16.0444556317525
0.006152814097731607
33.312454517215656
38.46575932983069
0.0
4.290781612024148
0.0
0.0
0.0
13.88511330098845
22.442503351252526
25.53472733687741
1.056



Elapsed time to compute best fit: 2.288 seconds
Cross-validation score: 0.5412310334398738
Test score: 0.5300713557594292
Best Hyperparameters: {}
32.24391107231282
3258.2072890591394
421.2379174691591
279.53100416251255
123.98453285163617
36.603707993964974
34.09808364843002
63.044032994657755
1.864998115284834
1.8636094760149717
362.6395894642472
296.18557230778765
1.2196860969997942
0.0
25.489194432710065
0.0
50.08907380188674
7.668536750027442
17.898272937502725
10.7908897816111
1.9275389268816525
12.507942931530124
56.563305170243666
5.844762339256704
3.3420275403477717
0.22422005503904074
1.87928154040128
27.458742158618406
9.432076198136201
13.242433915641051
1.4897078041976783
4.437632228370376
6.2796941580636485
1.329681886572999
32.35734822073073
0.0
0.0
0.0
7.552961395533657
0.0
4.905364290905709
47.95953017769165
0.19017000496387482
34.179749146509494
29.765522813686403
0.0
41.80549700085248
0.0
0.0
0.0
19.050004737941862
8.220375525311567
13.615346695116209
4.5247975679667



Elapsed time to compute best fit: 2.227 seconds
Cross-validation score: 0.5277027516329135
Test score: 0.5095862764883955
Best Hyperparameters: {}
31.66681096764055
2398.3362206859847
614.1752077037914
1033.0021035371078
64.08829087509301
234.11496627286897
9.143185894412454
3.894314505159855
0.7347820345325999
19.645659803922513
87.92397004706726
356.1600552664961
0.4041850163266645
0.0
8.526244523398262
0.0
12.876513675642158
86.51886935936932
28.792884643761738
1.3855789264853229
31.28195055949888
0.644041719744564
32.777255353977125
55.98275843862211
4.574107850785367
7.75751944935746
1.223975851200521
39.18724925282993
1.8124287254177034
41.56145381454826
27.86780588788679
7.517277292208746
18.874152487667743
9.133181361443818
6.092394601592716
0.0
0.0
0.0
16.507612023762874
0.0811960007995367
3.2826836574822664
29.069812733199
0.0
12.338398031018869
64.3945583673825
0.5007780194282532
6.655448436977167
0.0
0.0
0.0
10.017302311664501
11.269923258919846
50.48990054495147
7.02579465

## 5.1 Rebalancing Strategy - 5050

### 5.4.1 Random Forest

In [133]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

fiftyfifty_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    fiftyfifty_randomforest_nonnormalized_performance_df = pd.concat([fiftyfifty_randomforest_nonnormalized_performance_df, new_performance_df])
    
fiftyfifty_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_randomforest_nonnormalized_performance_df.csv")



### 5.4.2 XGBoost

In [134]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


fiftyfifty_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    fiftyfifty_xgboost_nonnormalized_performance_df = pd.concat([fiftyfifty_xgboost_nonnormalized_performance_df, new_performance_df])
    

fiftyfifty_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 92.312 seconds
Cross-validation score: 0.6760191317819937
Test score: 0.6722054380664653
Best Hyperparameters: {}
0.0081501175
0.04905513
0.028433606
0.20326464
0.009379097
0.0011324158
0.0011870879
0.010790196
0.0061279796
0.015786929
0.035957385
0.006708408
0.008868326
0.0
0.015594186
0.0
0.011370933
0.0073874993
0.008646797
0.0006726379
0.0021194378
0.0066869846
0.007832975
0.0038732507
0.0018924982
0.0022234037
0.0012421788
0.0056073866
0.001026288
0.002970361
0.046129126
0.0031219986
0.0010500895
0.0031760386
0.0007148296
0.0
0.0
0.0
0.016474567
0.03266005
0.0016634926
0.0027798924
0.010465873
0.0022153186
0.0030635404
0.0143805025
0.0008609286
0.0
0.0
0.0
0.001158754
0.00066274346
0.0021650237
0.01029535
0.04402843
0.0011097456
0.0024022525
0.0015791374
0.0059442907
0.010380088
0.0031167057
0.0055916607
0.0029146676
0.005285771
0.0036390016
0.0013892184
0.0020441315
0.013207752
0.0021397693
0.00056756986
0.0013985684
0.004668467
0.00070564577
0.0



Elapsed time to compute best fit: 89.641 seconds
Cross-validation score: 0.6882657119783505
Test score: 0.6734006734006734
Best Hyperparameters: {}
0.0162025
0.05287125
0.031853613
0.23494603
0.013390728
0.0032966197
0.005348916
0.0067421836
0.015026701
0.011765713
0.0505621
0.01183876
0.0009923917
0.0
0.011838401
0.0
0.0057939226
0.016933687
0.013451464
0.0030006857
0.0071000783
0.004320387
0.0055216406
0.0035694973
0.0020491772
0.00042785614
0.0010258186
0.014678711
0.00093147095
0.0043365867
0.012762124
0.004510801
0.0018703003
0.007934822
0.0020231118
0.0
0.0
0.0
0.029072136
0.016899437
0.0032770669
0.006932318
0.009061309
0.0075513693
0.0062426557
0.00050122634
0.002576009
0.0
0.0
0.0
0.0012700367
0.0021522422
0.0045265057
0.0
0.015237834
0.0021319625
0.0014698036
0.0021611084
0.015291347
0.014176129
0.0039997124
0.0015160885
0.0067499364
0.001181722
0.0003859189
0.0007184825
0.0014786791
0.007369983
0.002274106
0.00091276114
0.0019931088
0.006221762
0.0016050085
0.00055781135
0.0



Elapsed time to compute best fit: 92.163 seconds
Cross-validation score: 0.6952421073703638
Test score: 0.6190476190476191
Best Hyperparameters: {}
0.0069220006
0.07370866
0.034693107
0.26571235
0.016117517
0.0027468482
0.0025270733
0.015422547
0.004423105
0.0045449045
0.04196188
0.010252737
0.0033191484
0.0
0.0119303595
0.0
0.004902903
0.010443149
0.012193374
0.0014788836
0.004818989
0.003083939
0.00845693
0.003314496
0.0016794289
0.0024857486
0.00033221467
0.003725334
0.0016767716
0.0010464974
0.01645757
0.0031065096
0.0009935886
0.0077273455
0.0007542478
0.0
0.0
0.0
0.0186263
0.041911483
0.002174508
0.004133823
0.019671915
0.005517735
0.013051932
0.008303716
0.00057643396
0.0
0.0
0.0
0.000859713
0.0029905338
0.0006807794
0.00028787227
0.008665671
0.0056829387
0.0012294966
0.0020913798
0.011157288
0.032797173
0.0053551123
0.0026057595
0.0025753793
0.0037094927
0.006058638
0.00032820337
0.0018841012
0.006979974
0.0031384
0.0020272373
0.0012053187
0.006597162
0.0012680136
0.0005692513




Elapsed time to compute best fit: 91.520 seconds
Cross-validation score: 0.6757300708818927
Test score: 0.6859205776173285
Best Hyperparameters: {}
0.01416864
0.05506486
0.040614467
0.19699445
0.022639759
0.0016902683
0.0010799681
0.014791323
0.005253446
0.0076192496
0.054951545
0.0131291095
0.0024591135
0.0
0.028881049
0.0
0.010523515
0.012670322
0.010963937
0.0012844991
0.0060198638
0.0027303111
0.0033295234
0.0063471827
0.0039415667
0.0023901535
0.00037474302
0.006772218
0.0015926242
0.008584233
0.04090985
0.0008000616
0.0024049093
0.014220649
0.0013805715
0.0
0.0
0.0
0.00851447
0.009656026
0.0016123803
0.0033611024
0.01962611
0.0039637983
0.0051777065
0.01120666
0.0009142852
0.0
0.0
0.0
0.0011510258
0.0020213856
0.0028023147
0.00033085133
0.002290038
0.0007486273
0.0064439345
0.0023084355
0.0074368357
0.015375378
0.0018839084
0.0067029153
0.012888429
0.0028775637
0.006835198
0.0015852464
0.0015786666
0.009762
0.0039511374
0.00052553305
0.0008244179
0.010562256
0.00092273246
0.00165



Elapsed time to compute best fit: 87.456 seconds
Cross-validation score: 0.652840686102021
Test score: 0.7363636363636363
Best Hyperparameters: {}
0.0089646075
0.07105948
0.03302081
0.23188053
0.011230025
0.006145111
0.003677257
0.012781655
0.0109477695
0.008284993
0.060099766
0.014046025
0.0063166344
0.0
0.012780632
0.0
0.0038332064
0.015832422
0.018266754
0.0008578609
0.010786788
0.003609695
0.006461671
0.012840685
0.00068617956
0.0042592203
0.0010162987
0.003677768
0.0015691733
0.0021455528
0.025229434
0.0016570989
0.0012832951
0.0073970743
0.0007991298
0.0
0.0
0.0
0.0167584
0.0
0.0025874746
0.009811034
0.00089598284
0.0024839789
0.002425813
0.0033192777
0.0014414511
0.0
0.0
0.0
0.0009190532
0.0029860807
0.0018756929
0.005178581
0.0025682675
0.0008360495
0.002229996
0.0028383739
0.008263081
0.026073344
0.0043282863
0.0025460136
0.0018488349
0.013862549
0.015242048
0.0003239495
0.0016585679
0.009044935
0.0022187354
0.0016242831
0.0009682028
0.0067757764
0.00075742207
0.0028660586
0.0



Elapsed time to compute best fit: 90.092 seconds
Cross-validation score: 0.6634139316737948
Test score: 0.6309904153354634
Best Hyperparameters: {}
0.008645994
0.07423789
0.03729754
0.21857278
0.011585913
0.008964909
0.0012736218
0.0031149357
0.0076399413
0.0020646532
0.025680084
0.016537726
0.003001164
0.0
0.015701653
0.0
0.010869268
0.007639021
0.014109331
0.0033772388
0.003719644
0.002001671
0.007223137
0.0019112716
0.010480662
0.0010825485
0.000283144
0.0032972486
0.0014066966
0.00064865977
0.016168972
0.0010280965
0.0005160582
0.007917812
0.0011438946
0.0
0.0
0.0
0.014520758
0.06249125
0.0017857951
0.0009246063
0.008800373
0.002722312
0.0028359985
0.0039045196
0.0015693941
0.0
0.0
0.0
0.00042567673
0.005214204
0.000977094
0.003691569
0.0022999013
0.0014576577
0.0003008698
0.003458855
0.0122972755
0.04662614
0.0011754703
0.003155161
0.004246756
0.0070833857
0.000986033
0.012330217
0.0011173962
0.0120167155
0.004485326
0.0015422491
0.0018730644
0.006001067
0.0009377364
0.0016621455




Elapsed time to compute best fit: 93.121 seconds
Cross-validation score: 0.6625657358504285
Test score: 0.622895622895623
Best Hyperparameters: {}
0.010952207
0.054521915
0.039857287
0.25530347
0.0124817025
0.003952981
0.0028042798
0.0042203898
0.008210425
0.016977856
0.044739213
0.008525637
0.001740346
0.0
0.016222911
0.0
0.0063965092
0.010029298
0.012606411
0.0014594339
0.0042459504
0.004540773
0.005695022
0.00040634628
0.0019275423
0.0008375429
0.0003821733
0.012055101
0.0009215965
0.0044110655
0.0414241
0.000852712
0.0007307623
0.011805576
0.0006157464
0.0
0.0
0.0
0.006831049
0.0
0.0009742094
0.0035217756
0.0018754343
0.00301258
0.016158657
0.005372025
0.0015161284
0.0
0.0
0.0
0.0016004184
0.0026072874
0.0014483648
0.00047004526
0.0060086614
0.001778977
0.0034909407
0.0051388876
0.009350324
0.022791104
0.0034122462
0.0021002698
0.0034322024
0.005314956
0.0033531033
0.0025365779
0.0023785883
0.007589391
0.0007322647
0.001044032
0.0022069952
0.010139491
0.001122358
0.0008296143
0.000



Elapsed time to compute best fit: 94.643 seconds
Cross-validation score: 0.6667682480471969
Test score: 0.6911262798634813
Best Hyperparameters: {}
0.011623226
0.06327886
0.030357435
0.23649168
0.005710146
0.005757638
0.0014752796
0.010436266
0.007014499
0.019305259
0.05203579
0.011539478
0.004988776
0.0
0.01733929
0.0
0.016421128
0.011274523
0.018011134
0.000519568
0.006761614
0.0016759705
0.0058824876
0.003749387
0.0005646715
0.0012372966
0.0022103833
0.005177276
0.0011369538
0.0050007342
0.028308218
0.00084650074
0.001251643
0.0071661873
0.0010056007
0.0
0.0
0.0
0.0058743623
0.0
0.0072308443
0.0075551365
0.015320675
0.009162595
0.0073156953
0.0
0.001160628
0.0
0.0
0.0
0.0012933826
0.019474108
0.0010258601
0.009402599
0.002461136
0.0021373353
0.005333223
0.0023284082
0.002045039
0.0124513535
0.002735734
0.0024645827
0.0034185327
0.0015927684
0.0068515325
0.0016184563
0.0012883004
0.011884802
0.0011882692
0.0014165188
0.0027021556
0.0064867656
0.0012111353
0.0014683964
0.0016763309
0.



Elapsed time to compute best fit: 90.034 seconds
Cross-validation score: 0.6883656366039227
Test score: 0.6929824561403509
Best Hyperparameters: {}
0.0131750135
0.04910712
0.033318173
0.2468619
0.01491638
0.0051865876
0.0019287101
0.019027552
0.010090532
0.008075882
0.044054054
0.0137658445
0.012777618
0.0
0.010446176
0.0
0.0068803662
0.017031623
0.012791762
0.0017074747
0.004461366
0.005823013
0.007840696
0.0023212626
0.0022195051
0.00032877005
0.00095229544
0.014119135
0.0007182405
0.0009230958
0.026808038
0.0009244501
0.0015520524
0.0030337572
0.0009827156
0.0
0.0
0.0
0.008703727
0.026227187
0.0037985125
0.015804205
0.0
0.0027517793
0.0073651504
0.017216776
0.0006911734
0.0
0.0
0.0
0.0006158775
0.0031650236
0.00049762626
0.0026580228
0.010009226
0.005438257
0.0025913527
0.002309034
0.0021233233
0.012709896
0.0025798224
0.002165936
0.0056783296
0.009179205
0.001958326
0.0025007257
0.0017426037
0.009874804
0.00049471244
0.0027718127
0.0012892272
0.004655706
0.0009417877
0.003161568
0.



Elapsed time to compute best fit: 92.105 seconds
Cross-validation score: 0.6791482520987522
Test score: 0.6967213114754098
Best Hyperparameters: {}
0.009932557
0.054290358
0.039693836
0.22877237
0.021598427
0.0077813463
0.0016508464
0.0069996286
0.027840111
0.011083575
0.046279527
0.015671967
0.006092503
0.0
0.019461995
0.0
0.0035750354
0.015946737
0.006443558
0.002882156
0.0058683273
0.00205014
0.00627666
0.0031894452
0.0024585729
0.00022210904
0.0037393644
0.0009275777
0.0018207871
0.0074310363
0.040765256
0.0009828198
0.0016424039
0.007951162
0.0013229086
0.0
0.0
0.0
0.011397324
0.0076198517
0.003096099
0.0009979088
0.0012206623
0.008930806
0.0030916526
0.0
0.001018889
0.0
0.0
0.0
0.00075173326
0.006429047
0.002533964
0.0011928139
0.0009869282
0.004329555
0.015420835
0.0048325993
0.0050134175
0.017922254
0.0007740461
0.003615814
0.0013119073
0.01041592
0.0058172555
0.0015229059
0.0013315382
0.009804885
0.0010803478
0.0019996872
0.0019719482
0.016710002
0.00084233645
0.00101533
0.001



Elapsed time to compute best fit: 88.241 seconds
Cross-validation score: 0.6784135262657649
Test score: 0.7176656151419559
Best Hyperparameters: {}
0.010519151
0.065933235
0.02809555
0.22410402
0.007955239
0.0026858659
0.0010805575
0.01035223
0.01051458
0.0100518735
0.037316237
0.010387713
0.01484947
0.0
0.012283388
0.0
0.00775196
0.013771944
0.0071800724
0.0023932634
0.015470248
0.003956964
0.0023762295
0.003336799
0.0031951552
0.00082203903
0.00097382313
0.005257611
0.0012124042
0.0019166311
0.021973599
0.0014844417
0.00073697773
0.00493991
0.00063063885
0.0
0.0
0.0
0.007301387
0.058564626
0.0024723257
0.0017552368
0.0041209445
0.0038694602
0.01155216
0.0
0.0040773996
0.0
0.0
0.0
0.0008344727
0.012451778
0.002200546
0.000388086
0.000534615
0.0010284646
0.0014700523
0.006271896
0.006094065
0.009933671
0.0025030575
0.0031999438
0.008103135
0.006543833
0.007681346
0.0011408925
0.001492094
0.005794593
0.0031448826
0.0028528504
0.0015555436
0.012236016
0.0016293068
0.00087540457
0.0005777



Elapsed time to compute best fit: 94.066 seconds
Cross-validation score: 0.6696731720743716
Test score: 0.6677524429967426
Best Hyperparameters: {}
0.0060879565
0.059422396
0.035922766
0.24692526
0.01802768
0.004762051
0.002423186
0.018696256
0.004994926
0.015761232
0.03150674
0.01892424
0.017122876
0.0
0.025619391
0.0
0.006419954
0.014041024
0.009672006
0.0047468483
0.0024626139
0.005372323
0.012656393
0.004505201
0.0042162477
0.0017518507
0.0011801373
0.005830485
0.0025923874
0.0021827805
0.016774861
0.001547707
0.00091210194
0.0061915335
0.0009051765
0.0
0.0
0.0
0.004714852
0.013722386
0.0018446482
0.0028099914
0.0011304704
0.003582946
0.007769405
0.0
0.0014246549
0.0
0.0
0.0
0.0009886561
0.0026883336
0.0014126059
0.0039089234
0.015071313
0.0006194266
0.002025491
0.0031323938
0.014510371
0.012761835
0.0023998641
0.004041466
0.0016171443
0.00998739
0.0013769593
0.0008580322
0.0012945934
0.010491594
0.0012320109
0.0008341941
0.0021357343
0.0047953613
0.002020195
0.0016693948
0.0008780



Elapsed time to compute best fit: 88.635 seconds
Cross-validation score: 0.6747783088027547
Test score: 0.6747404844290656
Best Hyperparameters: {}
0.010495106
0.06714171
0.040295094
0.2022441
0.0063626217
0.0034193418
0.012144202
0.007054449
0.011412122
0.022802463
0.06819936
0.017663425
0.009193117
0.0
0.011603133
0.0
0.016263098
0.007895199
0.011521151
0.0009410517
0.0069098785
0.0043866783
0.00744249
0.011274566
0.00041964874
0.0008515503
0.0011067634
0.0044918675
0.0015512212
0.004015668
0.018271104
0.00072511844
0.001121153
0.0076912553
0.00056036696
0.0
0.0
0.0
0.015019481
0.011722024
0.0024183434
0.005517984
0.0009975419
0.007920304
0.0030546882
0.0028198964
0.0020349154
0.0
0.0
0.0
0.0011249367
0.00143709
0.0031137757
0.0016234936
0.0012357812
0.0015344606
0.00014294493
0.0020044066
0.008343045
0.013556284
0.013115712
0.004164428
0.0055959476
0.004180673
0.01611636
0.00033085374
0.001549805
0.014009286
0.0017185546
0.0017693647
0.0012114352
0.009296353
0.00061072886
0.00237929



Elapsed time to compute best fit: 90.655 seconds
Cross-validation score: 0.6570109945882008
Test score: 0.7095709570957096
Best Hyperparameters: {}
0.009462804
0.047670797
0.033089243
0.23356989
0.011038712
0.0043112948
0.0016073447
0.007928661
0.016048603
0.0042543435
0.026805239
0.009872173
0.0041431803
0.0
0.019146796
0.0
0.005388627
0.026294608
0.006860277
0.0016092189
0.004532327
0.0019890952
0.0062332526
0.0023719128
0.0032820497
0.0010300004
0.00049744954
0.0011666513
0.00237174
0.0069770217
0.04058383
0.0015197659
0.0021051615
0.015969133
0.00066930003
0.0
0.0
0.0
0.030792205
0.035280116
0.00086219003
0.0073446725
0.029610492
0.0027202445
0.006901012
0.00070618355
0.00054115825
0.0
0.0
0.0
0.00082996994
0.001244001
0.0036301683
0.00082178443
0.01091208
0.0010706594
0.005305691
0.0003469789
0.016654579
0.011365746
0.005623691
0.0029666466
0.0029802553
0.0069351844
0.0016891938
0.001116052
0.0023013381
0.0077343336
0.0013399996
0.0013479568
0.0011661098
0.0041158083
0.0006901574




Elapsed time to compute best fit: 91.663 seconds
Cross-validation score: 0.6479921643760294
Test score: 0.7120743034055727
Best Hyperparameters: {}
0.010086045
0.054530274
0.03612423
0.24403124
0.0015591295
0.01651712
0.002837496
0.00688143
0.0068276506
0.0067163263
0.030549485
0.01403784
0.0016128822
0.0
0.010005277
0.0
0.004847622
0.008933141
0.004652516
0.0030495161
0.012007014
0.003385807
0.004420913
0.0021261352
0.0011086537
0.00080269645
0.0005944867
0.0040328773
0.0010185281
0.0017322828
0.027713308
0.001456324
0.0016272131
0.003226271
0.0012344599
0.0
0.0
0.0
0.009273644
0.021348359
0.0030710516
0.006760841
0.06472614
0.0034578317
0.0018185205
0.0
0.0014416705
0.0
0.0
0.0
0.0010026353
0.0013964497
0.0016520564
0.0036988182
0.012776507
0.001736063
0.0029426224
0.0017066185
0.010624349
0.014887502
0.0024414442
0.0036377425
0.007270607
0.0075246603
0.00257995
0.0017222185
0.0018820394
0.012485641
0.0027728053
0.00063824095
0.0013817453
0.0025255936
0.0007035307
0.004879514
0.00135



Elapsed time to compute best fit: 90.913 seconds
Cross-validation score: 0.6704166109977634
Test score: 0.6721311475409836
Best Hyperparameters: {}
0.01184789
0.06306057
0.035506867
0.22250238
0.014104141
0.0057336385
0.0013701976
0.013926011
0.008387795
0.014334788
0.04521316
0.010770273
0.0021564374
0.0
0.012807876
0.0
0.005789719
0.012516356
0.014442908
0.0025871147
0.0049986374
0.0023081403
0.004367014
0.010018592
0.0026950068
0.0008571006
0.0008781926
0.018934285
0.0011910299
0.0067231995
0.03045416
0.0026894086
0.0014912924
0.0153872315
0.00110764
0.0
0.0
0.0
0.0056457953
0.030213568
0.0035204664
0.008165349
0.00027846708
0.003470097
0.0065566916
0.0
0.00087001343
0.0
0.0
0.0
0.0011377796
0.002608718
0.0026102532
0.0005166656
0.0034326264
0.0027405294
0.002725017
0.0005919853
0.0029388317
0.013145968
0.005156516
0.0029851394
0.0030976543
0.0059424327
0.004387481
0.0008312006
0.0019063567
0.010383092
0.0034457943
0.0022834684
0.0016810687
0.0074511277
0.0019175952
0.0018635354
0.0



Elapsed time to compute best fit: 89.350 seconds
Cross-validation score: 0.6936875099165493
Test score: 0.6629392971246006
Best Hyperparameters: {}
0.011843529
0.053967975
0.04038055
0.20910622
0.010051059
0.005313914
0.0016071978
0.00332251
0.011036201
0.006238185
0.03645683
0.01422784
0.006358323
0.0
0.015879052
0.0
0.011126726
0.015153353
0.009094201
0.0010087072
0.0033619343
0.0034990546
0.009765551
0.004229365
0.0013041025
0.008629612
0.00089082477
0.031999003
0.001997569
0.0043868446
0.021126289
0.0029272728
0.0016173808
0.0051365755
0.001145541
0.0
0.0
0.0
0.0034814484
0.005806618
0.004515873
0.0050224047
0.008176642
0.00508494
0.0027818875
0.019314336
0.0014981168
0.0
0.0
0.0
0.0012481108
0.0046298034
0.0038946385
0.004693858
0.0019899148
0.0026099787
0.00031747975
0.002390159
0.00913925
0.013190294
0.005848559
0.0025847238
0.024633456
0.00382844
0.008921045
0.0010831571
0.0017950671
0.009636855
0.0017266597
0.0008642977
0.0025350277
0.01317588
0.0016570222
0.0021454631
0.00102



Elapsed time to compute best fit: 91.989 seconds
Cross-validation score: 0.6674164793620079
Test score: 0.658307210031348
Best Hyperparameters: {}
0.011286633
0.061069842
0.022651702
0.2530361
0.013937977
0.0040124576
0.0012946258
0.018506423
0.017122878
0.0034700786
0.05140999
0.016848797
0.0038317663
0.0
0.011923399
0.0
0.008631713
0.022128558
0.003003153
0.0013712475
0.0029562588
0.008852245
0.0057443525
0.0053857407
0.0005759924
0.0117654465
0.0005249169
0.014050178
0.004244289
0.012494995
0.020273264
0.0017861181
0.0005810647
0.008692572
0.00060559146
0.0
0.0
0.0
0.0083309645
0.016575964
0.0024258245
0.007968123
0.016105935
0.0013022829
0.0017108846
0.0006976412
0.00067117176
0.0
0.0
0.0
0.0018442143
0.0011986758
0.0013379382
0.0028900285
0.0070160953
0.0016184612
0.00055369694
0.0002262115
0.011224418
0.007834684
0.00055259606
0.0032362922
0.0047239363
0.0021607368
0.0014768135
0.00080412236
0.0021371588
0.0047551636
0.0015762787
0.0020173856
0.0018227973
0.0037928305
0.001089740



Elapsed time to compute best fit: 91.202 seconds
Cross-validation score: 0.6786309615231996
Test score: 0.6481481481481481
Best Hyperparameters: {}
0.0127758365
0.05573117
0.036718376
0.21512346
0.009992383
0.0028927492
0.0019121869
0.006039719
0.00318754
0.010066345
0.08480888
0.010467173
0.005441925
0.0
0.016015982
0.0
0.0022547052
0.014671224
0.011207925
0.0016612484
0.009661274
0.00686833
0.008204052
0.0003091816
0.0024822121
0.00071523944
0.0017397427
0.01809817
0.0012652772
0.0021867773
0.026179181
0.0005409538
0.0008261959
0.008279643
0.000824153
0.0
0.0
0.0
0.0072911205
0.005607662
0.002436405
0.004983607
0.0023817238
0.00655607
0.0038033128
0.0
0.0007803544
0.0
0.0
0.0
0.00096148776
0.0033746758
0.0027452095
0.0004087317
0.0013492074
0.0027802049
0.002816115
0.0038368914
0.008925868
0.0046077496
0.006297226
0.006184217
0.0029886109
0.0021129402
0.0028017585
0.00192846
0.0016887978
0.011204089
0.00091001554
0.0011801296
0.001983635
0.00949565
0.0010587182
0.0012382192
0.0005733



Elapsed time to compute best fit: 91.372 seconds
Cross-validation score: 0.6435261670924441
Test score: 0.6451612903225806
Best Hyperparameters: {}
0.009733754
0.07430217
0.03771535
0.24302523
0.003838476
0.014536416
0.002316338
0.012902144
0.011775883
0.0010831705
0.036490653
0.012896443
0.012728915
0.0
0.012272591
0.0
0.02444302
0.011498985
0.0075791636
0.0006124338
0.0018929008
0.003621735
0.008838487
0.0021882935
0.011265547
0.000713529
0.0009993301
0.0040774257
0.0017087517
0.0057164733
0.022512255
0.0018193675
0.0015952619
0.010271919
0.0009886195
0.0
0.0
0.0
0.019383559
0.0
0.003866326
0.00662729
0.0005689484
0.0023787932
0.0020675631
0.00057157374
0.00083133625
0.0
0.0
0.0
0.0014809957
0.0018707702
0.0033167591
0.001220566
0.0019999056
0.0023031526
0.0062374426
0.0018506749
0.012044635
0.009848063
0.0017493962
0.003911967
0.007969252
0.0042096316
0.0026382317
0.0041086823
0.0012921253
0.007843148
0.0023176472
0.0018236525
0.0012792695
0.009108824
0.00067290355
0.0028210285
0.00



Elapsed time to compute best fit: 91.973 seconds
Cross-validation score: 0.6793579722655809
Test score: 0.6677524429967426
Best Hyperparameters: {}
0.009730943
0.05346848
0.03815321
0.22666278
0.019164238
0.0053138267
0.0009079917
0.029176388
0.020902723
0.011965388
0.046992343
0.012050819
0.0028766647
0.0
0.018726645
0.0
0.009461995
0.008724267
0.012345964
0.003202854
0.008945326
0.0023613076
0.0069405944
0.0040140445
0.0030156246
0.0040148497
0.00030573038
0.0024720398
0.0015591542
0.0023615607
0.025549531
0.0018369211
0.0008399398
0.0075988346
0.001457897
0.0
0.0
0.0
0.003943827
0.020180233
0.0028338544
0.0063694995
0.0013782318
0.0073188986
0.005805492
0.014342782
0.0026104928
0.0
0.0
0.0
0.0014501358
0.0042762803
0.001992148
0.0011848187
0.0022079905
0.0012052424
0.002026354
0.0031996716
0.005740565
0.01572223
0.0035042954
0.0069497554
0.0012631875
0.0010477334
0.0045516784
0.0012121982
0.0007773586
0.0126110315
0.00514662
0.0011321201
0.0016405189
0.0077254437
0.0041553658
0.0021



Elapsed time to compute best fit: 93.992 seconds
Cross-validation score: 0.6619112319359022
Test score: 0.6347962382445141
Best Hyperparameters: {}
0.012110809
0.047624186
0.038769133
0.26107082
0.013297274
0.0036559065
0.0017904874
0.013324207
0.010163112
0.01983508
0.048364967
0.011854925
0.001958573
0.0
0.024764745
0.0
0.016944468
0.010869234
0.0046855537
0.0017521764
0.0053380765
0.008489947
0.0065209633
0.0011428357
0.0067262053
0.001668917
0.0020583814
0.02633861
0.0014431211
0.006257671
0.038282223
0.0014408928
0.0028768335
0.003608357
0.0009795182
0.0
0.0
0.0
0.0016951298
0.007901135
0.0014129442
0.0022671535
0.0014595548
0.0032395031
0.0057943068
0.0
0.0020901752
0.0
0.0
0.0
0.0010900627
0.0031880764
0.0024963976
0.0029490236
0.00016734887
0.0040204227
0.0020727585
0.0021802338
0.0065337764
0.0132559
0.0051233326
0.0045762206
0.0010720554
0.0041928324
0.0003802976
0.00014537718
0.0013493112
0.008745029
0.00093158515
0.0014672829
0.0014708149
0.016466787
0.0020342988
0.00093668



Elapsed time to compute best fit: 90.903 seconds
Cross-validation score: 0.6595141784313573
Test score: 0.7302405498281787
Best Hyperparameters: {}
0.014465218
0.05509644
0.041490175
0.18886158
0.011666928
0.0074848025
0.0062824353
0.0046096668
0.019267429
0.014032374
0.035344895
0.013105454
0.002778185
0.0
0.012651582
0.0
0.01880325
0.03669078
0.010245084
0.0005629666
0.0021928286
0.0010970195
0.005837545
0.0008555514
0.010301706
0.0035219726
0.0026563033
0.006813393
0.0009921803
0.0027313456
0.059634924
0.003724833
0.00095430337
0.008046864
0.00070115336
0.0
0.0
0.0
0.0014451584
0.010470536
0.0032985024
0.0046213055
0.020701641
0.0018530089
0.004046703
0.02499465
0.0015179599
0.0
0.0
0.0
0.0013548838
0.0021573994
0.0005963227
0.0018389868
0.008180602
0.0026201517
0.0014437991
0.002373245
0.005170826
0.015377457
0.0003855999
0.003062848
0.012926005
0.002867625
0.011069111
0.002030338
0.001459322
0.005993655
0.0014787299
0.0013083875
0.0015754984
0.006191881
0.002254901
0.0023635582
0.



Elapsed time to compute best fit: 90.607 seconds
Cross-validation score: 0.6653386046807055
Test score: 0.6885245901639344
Best Hyperparameters: {}
0.0071604643
0.077289484
0.025258606
0.20321642
0.0076089883
0.0022816807
0.0022831252
0.015208979
0.009276026
0.016975705
0.041613854
0.014112776
0.023593782
0.0
0.019642059
0.0
0.020158082
0.013857489
0.011124733
0.0015593481
0.00446841
0.0029786928
0.013773623
0.0014122954
0.008872825
0.0042586434
0.0018539289
0.009813769
0.00057177513
0.0064546536
0.030042222
0.0029662098
0.001262303
0.0064346697
0.0006913097
0.0
0.0
0.0
0.0042769634
0.026518581
0.0013007899
0.0030566675
0.0042949715
0.0051247557
0.01275746
0.0013098046
0.0023227546
0.0
0.0
0.0
0.0011581152
0.0029305401
0.00069628865
0.0046846094
0.009018839
0.0014991565
0.003133094
0.0027584217
0.012248433
0.011581092
0.0011226705
0.0023006627
0.001298759
0.0038912112
0.0047147837
0.011560101
0.0025864206
0.008997285
0.0024559756
0.0031680441
0.0014073899
0.009583998
0.0015316812
0.003



Elapsed time to compute best fit: 93.865 seconds
Cross-validation score: 0.6631100543922955
Test score: 0.6746031746031746
Best Hyperparameters: {}
0.0049625644
0.062390257
0.02480225
0.20874663
0.02256563
0.0092202695
0.0023210468
0.016424915
0.00589688
0.0018445267
0.036658302
0.01436634
0.0037303523
0.0
0.007768112
0.0
0.0025218946
0.013261594
0.01671255
0.0031197832
0.010679849
0.0016750211
0.004599283
0.0009290213
0.0032719662
0.00044148692
0.0015343373
0.0028663545
0.00039273547
0.0029867978
0.033524156
0.0029288048
0.0021478534
0.012005829
0.001027703
0.0
0.0
0.0
0.0056160176
0.11224135
0.00264151
0.0032159179
0.010822269
0.004268588
0.002690505
0.001358477
0.0015890887
0.0
0.0
0.0
0.0007970855
0.009566003
0.0012155613
0.002518292
0.0073172534
0.00036703688
0.0019943884
0.0049917395
0.007206744
0.0066790767
0.001477006
0.004307633
0.009580595
0.0034027856
0.004929021
0.0004830783
0.0006700245
0.015165768
0.00076951843
0.000660604
0.0013525165
0.007349222
0.0012647348
0.001793495

### 5.4.3 LightGBM

In [135]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

fiftyfifty_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    fiftyfifty_lightgbm_performance_nonnormalized_df = pd.concat([fiftyfifty_lightgbm_performance_nonnormalized_df, new_performance_df])
    

fiftyfifty_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/fiftyfifty_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 11.839 seconds
Cross-validation score: 0.686098053502568
Test score: 0.7724957555178268
Best Hyperparameters: {}
2713.0820776224136
28186.295781314373
23192.28696733713
53004.99878567457
714.6532052755356
283.7997786998749
307.7703982591629
255.59583830833435
662.0060307979584
253.83510875701904
7782.475315570831
2591.544456601143
236.9591212272644
0.0
1889.4860954284668
0.0
1618.1315678954124
301.733073592186
374.1625806093216
71.61814707517624
576.0079275965691
82.40565967559814
468.04527592658997
156.78730261325836
72.3555418252945
58.77864122390747
24.80512022972107
1055.5604150295258
55.75811946392059
53.66819977760315
2826.010975122452
155.84505188465118
52.58396100997925
1180.771995127201
64.68340027332306
0.0
0.0
0.0
477.0647008419037
0.0
311.3925123810768
421.4757294654846
119.73720169067383
307.8331506252289
476.4514572620392
8.744400143623352
208.25824117660522
0.0
0.0
0.0
101.70560050010681
983.3300784230232
73.42904216051102
19.27777016162



Elapsed time to compute best fit: 10.918 seconds
Cross-validation score: 0.7225386158169557
Test score: 0.7731958762886598
Best Hyperparameters: {}
3082.7518585920334
24461.399200558662
24368.297617971897
54010.26204818487
1138.06025493145
445.16840797662735
167.79064011573792
372.705673456192
626.4366991519928
602.2425202131271
9287.524209439754
1822.4782764911652
474.29910242557526
0.0
1246.6257297992706
0.0
536.1140177249908
1121.280467391014
715.5743116140366
254.5394960641861
803.3996052742004
133.861869931221
600.573134303093
210.56121063232422
247.16720539331436
13.46813976764679
20.46085011959076
1356.5939804315567
54.84024143218994
262.3039981126785
870.3284620046616
147.63369941711426
100.70779991149902
3149.944223046303
49.8368194103241
0.0
0.0
0.0
386.2423597574234
133.99451208114624
201.72246432304382
163.91559064388275
107.40770268440247
345.1097871661186
372.87768417596817
9.683529734611511
151.88544130325317
0.0
0.0
0.0
89.3510205745697
77.17288088798523
336.75212728977



Elapsed time to compute best fit: 10.741 seconds
Cross-validation score: 0.7108330028784418
Test score: 0.7155322862129144
Best Hyperparameters: {}
1885.3508021831512
35799.89565438032
25771.167616784573
49594.198817670345
945.0707072615623
800.1014988422394
286.11611062288284
218.62459152936935
418.7147432565689
49.4574590921402
5923.0771496891975
2291.6927646398544
151.62625408172607
0.0
1541.4389364123344
0.0
423.2716889977455
704.7391964197159
1294.626911520958
69.31699073314667
297.60460567474365
95.20419019460678
731.5039517879486
107.88419032096863
272.7264041900635
174.17906975746155
32.73573017120361
711.7327826023102
41.192949295043945
254.5738410949707
2737.6003139019012
119.74366360902786
100.73355984687805
944.9669023752213
33.742153227329254
0.0
0.0
0.0
772.8733515739441
84.35611200332642
220.11528825759888
134.32364988327026
21.664679527282715
385.920857667923
340.02859729528427
4.39001989364624
178.7343612909317
0.0
0.0
0.0
111.14652460813522
429.88053637742996
148.9471



Elapsed time to compute best fit: 11.589 seconds
Cross-validation score: 0.7505649893828645
Test score: 0.6494661921708186
Best Hyperparameters: {}
2150.8830069303513
34841.24555236101
19525.63734537363
52956.12299913168
446.01528906822205
544.929081082344
243.78474116325378
370.55264258384705
1159.0099215507507
321.4885137081146
6425.669618904591
3304.913136601448
279.63687205314636
0.0
1390.6094212532043
0.0
432.84536719322205
682.60824406147
827.0486154556274
93.28663873672485
1007.1085917949677
40.123799443244934
319.87559682130814
75.92806112766266
66.11503028869629
15.651130318641663
16.151410341262817
205.39688450098038
28.590099930763245
129.99753832817078
2147.577943921089
161.91977834701538
134.252971470356
871.6008920669556
191.2521380186081
0.0
0.0
0.0
633.2629240751266
104.16299819946289
260.94767022132874
346.56264531612396
141.19037079811096
252.09681642055511
582.7723053693771
39.60960006713867
73.29526007175446
0.0
0.0
0.0
151.4909496307373
446.140343606472
127.4733756



Elapsed time to compute best fit: 10.966 seconds
Cross-validation score: 0.7238709173275323
Test score: 0.6878306878306878
Best Hyperparameters: {}
1484.541998744011
31997.356930553913
25414.919642686844
49622.52619796991
767.9255145788193
270.43592071533203
104.86541140079498
317.8419814109802
619.5810108184814
371.77407371997833
7164.8774959445
2275.8229085206985
256.765404522419
0.0
1934.4715613126755
0.0
349.6174622774124
671.5961622595787
932.9333663582802
109.42589110136032
408.1380290389061
141.23043978214264
1120.7231147885323
127.99123120307922
63.02263033390045
24.526739716529846
45.023279905319214
735.5779272317886
34.00653040409088
130.48372185230255
1990.1327498555183
75.94007992744446
78.79476022720337
766.2575670480728
99.56092458963394
0.0
0.0
0.0
178.81408154964447
164.9734605550766
305.7507687807083
146.2651882171631
49.10677993297577
1107.698292016983
87.63040089607239
1.0813499689102173
56.86222553253174
0.0
0.0
0.0
87.03831899166107
272.572314620018
242.20941978693



Elapsed time to compute best fit: 11.137 seconds
Cross-validation score: 0.7195379110370748
Test score: 0.7342657342657343
Best Hyperparameters: {}
2822.639280498028
35303.45212072134
20700.439581871033
45291.46363610029
61.53020656108856
387.8237138390541
161.5602903366089
291.61961138248444
569.7595579624176
539.9791350364685
9818.525637865067
2550.240182995796
107.77567887306213
0.0
1708.744691491127
0.0
435.5896602869034
1285.962396979332
499.7488840818405
93.14598953723907
1081.2621575593948
77.68761968612671
464.97210985422134
80.65431940555573
146.87207746505737
18.575549840927124
25.060019493103027
1022.6766096353531
53.39901089668274
235.92724478244781
2035.5033029317856
177.5933393239975
126.3435235619545
1216.4049290418625
76.48842084407806
0.0
0.0
0.0
131.8682188987732
201.10979843139648
460.8549964427948
62.4162802696228
140.32410049438477
383.391574382782
664.1709792017937
202.24763178825378
175.9156218767166
0.0
0.0
0.0
174.3226928114891
1306.2495996952057
81.94935870170



Elapsed time to compute best fit: 11.185 seconds
Cross-validation score: 0.7264195946235972
Test score: 0.7180851063829787
Best Hyperparameters: {}
1855.6670579314232
32356.786748230457
26015.912788033485
51499.36219853163
1281.3347930312157
402.8410686850548
140.8302903175354
145.62993681430817
1441.7332410216331
162.57311284542084
4446.852216780186
2267.3229169249535
519.3798761367798
0.0
959.9126551151276
0.0
1288.034997344017
1040.4241118431091
1108.802033483982
161.13124430179596
565.3875824213028
295.05737125873566
788.7425402402878
64.66840600967407
190.25784742832184
27.238359451293945
20.768049716949463
857.676148891449
29.864529967308044
502.9758902788162
1738.8843268156052
67.15804052352905
174.53846979141235
593.3151420354843
67.87726068496704
0.0
0.0
0.0
1292.4273750782013
39.759520053863525
136.6848304271698
410.2883394956589
3.407509922981262
627.2140986323357
209.03056955337524
1.5671800374984741
44.38390654325485
0.0
0.0
0.0
136.43719065189362
101.36633306741714
311.91



Elapsed time to compute best fit: 11.228 seconds
Cross-validation score: 0.7296854216057685
Test score: 0.74486301369863
Best Hyperparameters: {}
1659.6804305315018
36393.18102669716
21347.418275773525
49409.4675449729
1372.4737226366997
957.5033187866211
104.35420978069305
758.8465659618378
695.8786507248878
27.555400371551514
8497.207942426205
2158.3743262887
553.8213281035423
0.0
1126.7888656258583
0.0
862.4819177985191
816.1081299781799
800.2224918007851
122.89498901367188
458.09441262483597
207.9358867406845
405.1861125230789
141.95349860191345
46.33479976654053
52.9204705953598
8.737610220909119
615.7378372550011
20.204795956611633
331.3486332297325
961.2587516307831
123.65411007404327
95.01724207401276
995.3583751916885
131.66554111242294
0.0
0.0
0.0
609.2956666946411
124.33313757181168
316.75310826301575
237.90118217468262
89.9741393327713
268.7786490917206
508.1624643802643
0.0
128.772121489048
0.0
0.0
0.0
88.83563041687012
145.60302019119263
127.84110003709793
85.481523513793



Elapsed time to compute best fit: 11.184 seconds
Cross-validation score: 0.7414876039814726
Test score: 0.6896551724137931
Best Hyperparameters: {}
3947.003767848015
29940.286727249622
24928.34360575676
45423.12796843052
959.6589119434357
580.9512971639633
44.51771020889282
139.65641635656357
646.5070708990097
269.20217698812485
8093.24488055706
2106.0312935113907
189.89661824703217
0.0
1547.401527106762
0.0
541.9800852537155
644.2055907249451
784.5577179193497
142.5228869318962
437.7512890100479
303.38707160949707
581.0304806828499
145.54081881046295
170.3603938817978
50.225669860839844
14.999650120735168
1083.4775884151459
10.252723038196564
203.47301971912384
2644.3519894480705
62.566100001335144
126.57400059700012
1146.5429213643074
70.45232075452805
0.0
0.0
0.0
439.7007106542587
260.9155297279358
479.2177883386612
274.9381254911423
169.08090209960938
69.44671070575714
297.0248489379883
0.0
189.32485830783844
0.0
0.0
0.0
174.19458478689194
192.71831411123276
82.68939971923828
184.5



Elapsed time to compute best fit: 10.999 seconds
Cross-validation score: 0.717015420554088
Test score: 0.7368421052631579
Best Hyperparameters: {}
2273.077926814556
28676.218424856663
26379.368080735207
50216.82010084391
764.3479714393616
423.21945118904114
209.45524215698242
532.5200691223145
706.0498169660568
72.81031954288483
7070.989808082581
2142.7465214133263
403.91424906253815
0.0
1700.5072993636131
0.0
243.6446060538292
866.603665292263
1006.403419137001
60.03980886936188
502.02202385663986
299.6151418685913
370.31646633148193
161.41810750961304
327.068727016449
12.745699882507324
27.326189637184143
1815.6125270724297
90.9985898733139
53.348039627075195
1986.273278415203
103.63960111141205
41.70696973800659
885.5855745077133
113.10611057281494
0.0
0.0
0.0
427.14587330818176
282.6906397342682
334.87545454502106
371.3024480342865
85.96194839477539
411.83775424957275
316.3592099547386
6.425340175628662
180.27558851242065
0.0
0.0
0.0
261.9439247250557
232.43863636255264
149.9577524



Elapsed time to compute best fit: 11.421 seconds
Cross-validation score: 0.7264730007351655
Test score: 0.7365684575389949
Best Hyperparameters: {}
619.3112403154373
28292.861267209053
26134.629810273647
53644.338056623936
914.600620508194
480.3057725429535
90.03144001960754
715.0300096273422
400.8823603987694
311.5700249671936
6509.62463003397
1792.8632648587227
198.31086766719818
0.0
1313.745562195778
0.0
616.3048757314682
1376.9331446886063
320.52449357509613
114.02071809768677
623.3267129659653
228.3487777709961
635.4158746600151
51.341930747032166
175.64514219760895
38.08680987358093
25.963029265403748
1270.2566541433334
43.951893746852875
266.0150864124298
1876.6356016397476
61.66924387216568
86.16354197263718
179.9958508014679
49.21900671720505
0.0
0.0
0.0
620.7932863235474
39.535301208496094
488.86194598674774
82.22329068183899
375.25658988952637
341.1005538702011
569.6008502840996
0.0
218.6752791404724
0.0
0.0
0.0
163.29533529281616
1286.840621471405
44.341620206832886
59.8749



Elapsed time to compute best fit: 11.680 seconds
Cross-validation score: 0.7403270560168173
Test score: 0.7719298245614037
Best Hyperparameters: {}
2593.1650987267494
28740.542661964893
24103.54890215397
48507.69401115179
996.0965765714645
410.6533380150795
130.37970912456512
880.9392113685608
411.7094213962555
382.8261381983757
7468.496638178825
2407.558035135269
205.2737033367157
0.0
2763.000216603279
0.0
434.53842079639435
610.2492753267288
825.4299609661102
53.45919930934906
390.62448275089264
123.01653122901917
400.6561706662178
83.6766699552536
42.052258253097534
33.098610639572144
66.80346941947937
1143.5642330646515
37.01643979549408
313.94807279109955
3213.7658244371414
61.43622261285782
82.37742948532104
1230.6117226481438
46.74356007575989
0.0
0.0
0.0
372.37647449970245
182.34296321868896
253.2739313840866
477.73014867305756
0.0
223.90826231241226
419.4771020412445
0.0
224.75087022781372
0.0
0.0
0.0
81.27270019054413
1403.841089963913
311.44962978363037
24.72200983762741
54.



Elapsed time to compute best fit: 10.891 seconds
Cross-validation score: 0.7006247678385467
Test score: 0.771869639794168
Best Hyperparameters: {}
1799.46119171381
33340.492054104805
21453.89679789543
50561.00505465269
1529.843933045864
350.69178599119186
62.15278434753418
471.0385627746582
495.3661150932312
170.19781970977783
7360.489504575729
2914.071601688862
1386.5992601513863
0.0
1145.0561434030533
0.0
689.0941125154495
1048.3363507986069
457.81747072935104
81.438059091568
408.32631182670593
257.7500879764557
627.8986262083054
187.3211920261383
1.2446800470352173
26.29379940032959
28.27524983882904
664.6603423953056
27.74995267391205
239.20199424028397
2074.5482526421547
54.340797781944275
39.757670283317566
1504.8479936122894
85.64819371700287
0.0
0.0
0.0
855.8541642427444
381.02112650871277
166.06157088279724
440.40873992443085
13.09060001373291
366.37856936454773
452.5514943599701
18.682429671287537
81.42114877700806
0.0
0.0
0.0
195.39019000530243
164.01845276355743
91.19313049



Elapsed time to compute best fit: 10.749 seconds
Cross-validation score: 0.7154411596169364
Test score: 0.7417102966841187
Best Hyperparameters: {}
2067.0542547106743
31439.685899615288
23688.252748668194
48707.036152780056
647.0336765050888
453.83142244815826
32.902561485767365
375.69910168647766
471.10845017433167
423.13366663455963
8225.78537517786
1784.5715234279633
380.46164643764496
0.0
2615.4470472335815
0.0
877.153667151928
697.0167750120163
529.680938243866
45.70869201421738
347.3197903037071
286.55700039863586
452.76023185253143
310.33078622817993
93.28465962409973
37.539700508117676
7.316989898681641
856.3249350786209
8.61486005783081
440.2180563211441
3126.7171437740326
102.58329117298126
23.221999883651733
809.3632145524025
118.76569497585297
0.0
0.0
0.0
801.3221797943115
57.8494987487793
427.3354790210724
600.9986507892609
84.97824668884277
391.3556891679764
181.4134896993637
0.0
141.98709154129028
0.0
0.0
0.0
43.82490885257721
143.85977351665497
78.18258780241013
28.7881



Elapsed time to compute best fit: 11.996 seconds
Cross-validation score: 0.7387243474887514
Test score: 0.6939501779359432
Best Hyperparameters: {}
1319.4454182982445
28643.789994239807
29524.495974361897
48834.82989060879
619.3571212291718
724.7440813779831
322.3431714773178
296.3014158010483
632.6483600139618
527.6157269477844
7500.8416558504105
1407.495777130127
709.8644165992737
0.0
1328.4453929066658
0.0
224.88975846767426
747.9692635536194
733.4200474023819
33.356574058532715
632.978259563446
408.0719264745712
625.7571312785149
214.38057124614716
48.452991008758545
9.635689854621887
18.548449397087097
783.7282845377922
14.487839698791504
344.91682744026184
1799.2570068836212
75.84898054599762
154.546919465065
1443.8994599580765
78.84055942296982
0.0
0.0
0.0
1195.7693946361542
396.05329990386963
273.6320996284485
287.01703548431396
289.6036092042923
345.0746336579323
329.62751257419586
3.5265700817108154
139.94186902046204
0.0
0.0
0.0
157.8770791888237
1640.4931427240372
127.12886



Elapsed time to compute best fit: 11.172 seconds
Cross-validation score: 0.7125092389609418
Test score: 0.7766143106457244
Best Hyperparameters: {}
1811.974987745285
34013.613943099976
25778.423403799534
48580.370637476444
1149.2564829587936
792.1788454055786
233.10210424661636
148.04899954795837
307.1439378261566
263.90860986709595
6337.779694318771
2004.6648281216621
325.5562039613724
0.0
1693.6377317905426
0.0
670.2379044294357
762.4476529359818
815.629389166832
31.262240290641785
528.1469447612762
516.6947194337845
410.0433613061905
98.6131581068039
158.38440835475922
25.844979524612427
37.943349957466125
58.63461410999298
49.30133819580078
1441.28032040596
1354.6014963388443
35.56922996044159
105.65429949760437
1273.4946163892746
100.66856646537781
0.0
0.0
0.0
597.1907963752747
262.87823247909546
411.50062358379364
416.800388276577
72.53845179080963
262.5277473926544
445.6569913625717
0.0
111.50063800811768
0.0
0.0
0.0
105.41116952896118
1557.8191635608673
91.2767698764801
31.9814



Elapsed time to compute best fit: 10.965 seconds
Cross-validation score: 0.713992477914106
Test score: 0.747422680412371
Best Hyperparameters: {}
2155.18047952652
31841.544031500816
21107.88250052929
52855.42498075962
1306.3929438591003
257.9294202327728
163.7278609275818
580.2959249019623
486.5411846637726
422.9178365468979
6437.564265668392
1823.967701971531
298.9888868331909
0.0
1856.3539026975632
0.0
430.5241686105728
2273.995482504368
1430.0044211149216
62.53629320859909
1193.761756658554
67.67100048065186
611.2025084495544
276.6280870437622
65.66406011581421
229.0897798538208
20.057680010795593
472.3259776830673
26.630404829978943
170.31601977348328
1909.5050012469292
117.47237014770508
107.97109258174896
1700.5864783525467
126.2619104385376
0.0
0.0
0.0
1048.3445001840591
79.83249855041504
291.92284601926804
454.55721724033356
90.9536485671997
593.3159391880035
380.9160863161087
4.032159805297852
146.03480005264282
0.0
0.0
0.0
175.18999046087265
99.29679107666016
120.262811481952



Elapsed time to compute best fit: 11.405 seconds
Cross-validation score: 0.726669490010923
Test score: 0.703125
Best Hyperparameters: {}
1315.4317073225975
28169.974255681038
26419.641486883163
52388.84180480242
638.1767529845238
503.0011875629425
147.12400019168854
142.32854044437408
326.89155673980713
294.2411821484566
7152.198612511158
1961.4999104738235
245.78178644180298
0.0
2677.684445142746
0.0
396.5321179628372
688.3265557289124
1286.8658882379532
79.37663853168488
440.9031641483307
76.07957053184509
581.9514924287796
136.82282829284668
55.44464957714081
7.396080017089844
26.25383108854294
1054.6018469929695
71.77252584695816
345.8618412017822
1724.4560956954956
86.80120915174484
38.51366949081421
962.2114510536194
67.67304784059525
0.0
0.0
0.0
800.6311190128326
62.77302050590515
277.3867870569229
15.159409761428833
29.219699382781982
625.5787976980209
507.5532658100128
1.3523900508880615
112.72682178020477
0.0
0.0
0.0
113.46429079771042
2516.2963163256645
94.8260173201561
15.4



Elapsed time to compute best fit: 11.490 seconds
Cross-validation score: 0.726055460799128
Test score: 0.7192982456140351
Best Hyperparameters: {}
3870.196885585785
23037.06669008732
26310.99406993389
54699.98037695885
676.3532787561417
596.464220225811
121.70463120937347
485.7893057465553
320.0345607995987
631.7721471786499
8007.827359974384
1705.023496389389
214.22170042991638
0.0
1886.9393859505653
0.0
218.9928629398346
583.0294063091278
959.0449206829071
8.170699954032898
597.6761230230331
14.028135478496552
284.7257559299469
216.15634381771088
20.97634983062744
52.739981174468994
35.401281893253326
2003.7159224748611
51.05738008022308
240.116281747818
2917.5175796747208
170.23133552074432
32.80792021751404
1436.245523095131
121.13490056991577
0.0
0.0
0.0
410.1693753004074
82.46419906616211
388.85330188274384
492.98048090934753
0.0
117.67142134904861
655.8694493174553
9.519590139389038
252.95115172863007
0.0
0.0
0.0
105.54876559972763
147.60107570886612
28.86882108449936
23.6745893



Elapsed time to compute best fit: 10.984 seconds
Cross-validation score: 0.718809722531452
Test score: 0.7586206896551725
Best Hyperparameters: {}
2759.096473097801
26104.335066080093
26401.266460061073
50494.34986370802
1006.525821685791
521.7734734416008
96.92293882369995
467.1009398698807
662.5336771011353
389.2480833530426
8081.293443977833
1478.9076423048973
27.294410347938538
0.0
1243.2434976696968
0.0
955.6115155220032
1384.9309705495834
277.3100446462631
78.24062490463257
465.3544212579727
495.4661250114441
332.10436165332794
95.87450015544891
168.4048409461975
108.4260424375534
4.793677926063538
1126.7927135825157
11.947869777679443
288.79664528369904
2431.670438170433
214.50913679599762
65.78578019142151
1159.3406176567078
49.18488001823425
0.0
0.0
0.0
736.2821125984192
131.0716495513916
549.7704275846481
423.8001357913017
25.909230709075928
452.43099308013916
428.29159128665924
63.000200629234314
196.502791762352
0.0
0.0
0.0
199.8895781636238
371.9322910308838
190.6323499679



Elapsed time to compute best fit: 11.610 seconds
Cross-validation score: 0.7397685938434282
Test score: 0.7118055555555556
Best Hyperparameters: {}
2395.6881081461906
25975.108159661293
26196.554028213024
52521.379501998425
574.1634229421616
647.1056019663811
114.67035984992981
768.8032691478729
531.5385384559631
666.5536889433861
6729.624281048775
2244.0909271240234
467.97403037548065
0.0
2141.588350057602
0.0
581.9671227931976
353.07069659233093
1044.3821200728416
94.47151982784271
662.0629670023918
368.22648096084595
463.6838832497597
44.23503029346466
128.6948685646057
34.632779598236084
62.411590337753296
199.88403165340424
66.553417801857
451.84322088956833
1831.6136012673378
81.24477052688599
51.13781034946442
1691.788783788681
91.9915526509285
0.0
0.0
0.0
501.52693951129913
171.06566047668457
149.8521773815155
182.39354729652405
466.9267957210541
480.34197521209717
425.9218077659607
35.891550064086914
170.02860748767853
0.0
0.0
0.0
148.55884093046188
763.8342089653015
75.828978



Elapsed time to compute best fit: 13.485 seconds
Cross-validation score: 0.7112274351061477
Test score: 0.7291666666666666
Best Hyperparameters: {}
3228.6141145825386
25631.231363117695
22747.075864613056
53632.16073209047
854.7182922363281
445.1494872570038
168.10824793577194
624.7841340899467
716.5270230770111
550.0148034095764
9099.746050655842
2347.6691987514496
212.50593030452728
0.0
2703.5003193616867
0.0
563.9958579540253
441.78540301322937
826.4098109602928
17.64648985862732
424.2270185947418
155.10116922855377
419.2042171359062
113.18671369552612
53.78146994113922
10.165020227432251
21.354379892349243
1204.3599531054497
52.72375988960266
268.9741688966751
2381.7536911964417
84.79250144958496
98.44983947277069
987.1709952354431
135.2075406908989
0.0
0.0
0.0
457.5110945701599
64.27579045295715
205.0667468905449
359.6414680480957
226.06710052490234
227.22417831420898
446.915936589241
1.3929500579833984
121.29058462381363
0.0
0.0
0.0
190.61253190040588
639.3083476424217
193.669509



Elapsed time to compute best fit: 11.486 seconds
Cross-validation score: 0.7194809039227474
Test score: 0.7042253521126761
Best Hyperparameters: {}
2763.828211069107
29535.569608986378
26227.981631219387
50938.36254787445
1343.6728961467743
686.6088818311691
225.6598019003868
436.25170373916626
482.74567997455597
259.4298677444458
8019.584329128265
1346.5393983125687
312.72245144844055
0.0
1219.0013508796692
0.0
484.50557965040207
682.2607460618019
490.7197493314743
34.004968881607056
559.5181099176407
275.31603038311005
987.0648941397667
146.73198795318604
98.66048711538315
118.29863286018372
23.527379989624023
178.37529873847961
55.46856129169464
293.26810693740845
3422.8299614191055
98.34773993492126
127.21776819229126
787.18370449543
79.59234774112701
0.0
0.0
0.0
589.9897178411484
80.50146853923798
405.38940089941025
186.55656599998474
60.27681887149811
619.0122386217117
493.7472493648529
98.77224063873291
118.4266499876976
0.0
0.0
0.0
175.73118537664413
102.98659133911133
153.6003



Elapsed time to compute best fit: 11.165 seconds
Cross-validation score: 0.7157210497811152
Test score: 0.7671957671957672
Best Hyperparameters: {}
860.123194038868
30869.656377851963
22191.919522166252
49235.883601009846
1019.8486217260361
548.1333195567131
242.44620788097382
631.73377430439
1416.518818795681
732.8282999992371
11421.577134728432
1728.0690656900406
113.16580241918564
0.0
2234.5195496678352
0.0
541.3349496126175
967.2636430263519
517.16858959198
45.54165667295456
407.02233254909515
126.66793036460876
643.1427642703056
45.56235885620117
51.2317008972168
63.46979796886444
8.158219814300537
1232.7440551519394
44.823259711265564
457.63341903686523
1366.1928758621216
52.30568051338196
22.08266019821167
1387.3713561296463
116.5545101761818
0.0
0.0
0.0
523.2925879955292
140.03672671318054
538.1677515506744
97.96885991096497
118.63788080215454
575.0332200527191
354.15211725234985
33.82419049739838
147.3879668712616
0.0
0.0
0.0
104.2118861079216
155.23472160100937
311.4353883862



Elapsed time to compute best fit: 12.498 seconds
Cross-validation score: 0.7265958485287252
Test score: 0.740418118466899
Best Hyperparameters: {}
2765.889974951744
26196.84827041626
23995.300331294537
52031.752422332764
368.8762024641037
689.8842178583145
9.326490044593811
402.8961670398712
416.992857336998
1193.7171831130981
9990.067598700523
1230.6666783690453
123.45068991184235
0.0
1624.0430541038513
0.0
899.22008061409
474.771452665329
1257.8204209804535
47.40305972099304
390.3604998588562
326.96939039230347
443.5062004327774
202.61687421798706
21.094930171966553
17.703369975090027
32.01532995700836
1514.7536928653717
23.064614951610565
402.03521740436554
1646.6342442035675
129.93926882743835
61.31110072135925
632.5765134096146
132.02733993530273
0.0
0.0
0.0
268.46072006225586
32.7712607383728
389.4758517742157
59.91828942298889
152.7292491197586
216.62895792722702
239.27962815761566
57.087568521499634
146.20758092403412
0.0
0.0
0.0
129.76862233877182
171.22369801998138
167.018689