In [1]:
#Import Python Libraries
import pandas as pd
import pickle
import time
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')

#Import Self-written Functions
import os
import sys
src_dir = os.path.join(os.getcwd(), '..', 'src')
sys.path.append(src_dir)

from d00_utils.calculateTimeDifference import calculateTimeDifference #Function to calc time difference
from d01_data.loadCommits import loadCommits #Function to load SVN data
from d02_intermediate.cleanCommitData import cleanCommitData #Function to clean commit data
from d02_intermediate.cleanJiraData import cleanJiraData #Function to clean JIRA data

from d03_processing.createFittedTF_IDF import createFittedTF_IDF #Function to see if a trace is valid
from d03_processing.createCorpusFromDocumentList import createCorpusFromDocumentList #Function to create a corpus
from d03_processing.checkValidityTrace import checkValidityTrace #Function to see if a trace is valid
from d03_processing.calculateTimeDif import calculateTimeDif #Calculate the time difference between 2 dates in seconds
from d03_processing.checkFullnameEqualsEmail import checkFullnameEqualsEmail #Check if fullName is equal to the email
from d03_processing.calculateCosineSimilarity import calculateCosineSimilarity #Calculate the cos similarity
from d03_processing.calculateDocumentStatistics import calculateUniqueWordCount
from d03_processing.calculateDocumentStatistics import calculateTotalWordCount
from d03_processing.calculateDocumentStatistics import calculateOverlapBetweenDocuments

from d04_modelling.summariseClassDistribution import summariseClassDistribution #Visualize the class distribution
from d04_modelling.showModelPerformance import showModelPerformance # Show several performance measures

#Display full value of a column
pd.set_option('display.max_colwidth', None)

#Display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Load Raw Data

In [2]:
#Set dataset

datasetDirectory = ""

In [3]:
#Import raw JIRA dataset
rawData_JIRA_dealService = pd.read_csv('../data/01_raw/deal-service.csv')

#import
rawData_SVN_dealService = loadCommits('../data/01_raw/dealservice-dump.txt')

# 2. Clean Raw Data
## 2.1 Clean Raw Data - SVN Data
Clean the raw data of the SVN files

In [4]:
from datetime import datetime
import re
import pandas as pd
import string

#nltk for NLP 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import ngrams

#Function to transform natural text into unigram tokens
def preprocessNaturalLanguage(text, porterStemmer, cachedStopWords):
    string_text = str(text)
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_interpunction)
    
    #remove stopwords
    tokens_without_sw = [word for word in tokens if not word in cachedStopWords]
    
    #Stem the tokens
    stemmedToken = list(map(porterStemmer.stem, tokens_without_sw))

    return(stemmedToken)

#Function to transform natural text into n-gram tokens
def preprocessNGrams(text, porterStemmer, cachedStopWords, nGramSize):
    string_text = str(text)
    
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_interpunction)
    
    #Create the ngrams
    ngrams = list(nltk.ngrams(tokens, nGramSize))
    
    #remove all the n-grams containing a stopword
    cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
    #Stem the tokens
    stemmedNGrams = []
    for ngram in cleanNGrams:
        stemmed = list(map(porterStemmer.stem, ngram))
        stemmedNGrams.append(stemmed)
    return(stemmedNGrams)

#Function to transform date into a date object
def preprocessCommitDate(date_string):
    date_time_obj = datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')  
    return(date_time_obj)
    
#Remove the found Issue key from the log
def removeIssueKey(log_message):
    issue_keys = re.findall(r"LRN+.[0-9]+|AFM+.[0-9]+|MA+.[0-9]+|AFI+.[0-9]+|EM+.[0-9]+|OE+.[0-9]+|EM+.[0-9]+", log_message)
    log_message_without_key = log_message
    for issue_key in issue_keys:
        log_message_without_key = log_message_without_key.replace(issue_key, "")
    return(log_message_without_key)

def unitNamesLambdaFunc(unitName, stemmer):
    #Lower case
    unitNameLowered = unitName.lower()
    
    #Remove interpunction
    noInterpunction = unitNameLowered.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    noNumbers = ''.join([i for i in noInterpunction if not i.isdigit()])
    
    stemmendUnitName = stemmer.stem(noInterpunction)
    
    
    return(stemmendUnitName)
    

def preprocessUnitNames(unitName, porterStemmer, cachedStopWords):
    if (isinstance(unitName, str)):
        #Split camelCasing
        unitNameSplitList = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', unitName)).split()
        
        porterStemmer = PorterStemmer() #create an object of class PorterStemmer
        
        #Preprocess each split found.
        unitNameLowered = list(map(lambda unitName: unitNamesLambdaFunc(unitName, porterStemmer), 
                                   unitNameSplitList))
        
        #Check for stopwords
        tokensWithoutSW = [word for word in unitNameLowered if not word in cachedStopWords]

        return(tokensWithoutSW)

def preprocessNGramsUnitNames(unitName, porterStemmer, cachedStopWords, nGramSize):
    if (isinstance(unitName, str)):
        #Split camelCasing
        unitNameSplitList = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', unitName)).split()
        
        cleanedUnitNames = []
        for unitNameSplit in unitNameSplitList:
            #Lower case unit names
            lowerCased = unitNameSplit.lower()

            #Remove interpunction
            removedInterpunction = lowerCased.translate(str.maketrans('','',string.punctuation))
            cleanedUnitNames.append(removedInterpunction)
            
        #Transform to string (needed for tokenizer
        unitNameString = ' '.join(cleanedUnitNames)

        #Tokenzize words
        tokenized = word_tokenize(unitNameString)
        
        #Create the ngrams
        ngrams = list(nltk.ngrams(tokenized, nGramSize))
        
        porterStemmer = PorterStemmer() #create an object of class PorterStemmer
        
        #remove all the n-grams containing a stopword
        cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
        #Stem the tokens
        stemmedNGrams = []
        for ngram in cleanNGrams:
            stemmed = list(map(porterStemmer.stem, ngram))
            stemmedNGrams.append(stemmed)
            
        return(stemmedNGrams)

#Method to clean all columns of the provided data
def cleanCommitData(rawCommitData): 
    #create an object of class PorterStemmer
    porterStemmer = PorterStemmer()
    
    #Find all stopwords
    cachedStopWords = stopwords.words("english")
    
    #Remove all revisions without an issue key in the log message
    commit_df = rawCommitData[rawCommitData["related_issue_key"].notna()]

    #Execute cleaning methods on dataset
    cleaned_commit_logs = commit_df['log'].apply(lambda x: removeIssueKey(x))
    processed_commit_logs = cleaned_commit_logs.apply(lambda x: preprocessNaturalLanguage(x, porterStemmer, cachedStopWords))
    processed_commit_logs_2grams = cleaned_commit_logs.apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    processed_commit_logs_3grams = cleaned_commit_logs.apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 3))
    processed_date_times = commit_df['date'].apply(lambda x: preprocessCommitDate(x))
    processed_unit_names = commit_df['impacted_unit_names'].apply(lambda x: preprocessUnitNames(x, porterStemmer, cachedStopWords))
    processed_unit_names_2grams = commit_df['impacted_unit_names'].apply(lambda x: preprocessNGramsUnitNames(x, porterStemmer, cachedStopWords, 2))
    processed_unit_names_3grams = commit_df['impacted_unit_names'].apply(lambda x: preprocessNGramsUnitNames(x, porterStemmer, cachedStopWords, 3))
    

    #Put all data together into a new dataframe
    commit_data = {'Revision': commit_df["revision"],
               'Email' : commit_df["email"],
               'Commit_date': processed_date_times,
               "Issue_key_commit": commit_df["related_issue_key"],
               'Logs': processed_commit_logs, 
               'Logs_2grams': processed_commit_logs_2grams, 
               'Logs_3grams': processed_commit_logs_3grams, 
               'Unit_names': processed_unit_names,
               'Unit_names_2grams': processed_unit_names_2grams,
               'Unit_names_3grams': processed_unit_names_3grams,
               'Commit_natural_text': processed_commit_logs + processed_unit_names,
               'Commit_natural_text_2grams': processed_commit_logs_2grams + processed_unit_names_2grams,
               'Commit_natural_text_3grams': processed_commit_logs_3grams + processed_unit_names_3grams
               }
               
    commit_processed_df = pd.DataFrame(data=commit_data)

    return(commit_processed_df)

In [5]:
#Start timer
startTime = time.time() 

intermediateData_SVN_dealService = cleanCommitData(rawData_SVN_dealService)

#Create a temp XLSX file for all intermediate datasets
intermediateData_SVN_dealService.to_excel(excel_writer = "../data/02_intermediate/intermediateData_SVN_dealService.xlsx", index = False)

#Create a pickle file for all intermediate datasets
intermediateData_SVN_dealService.to_pickle(path= "../data/02_intermediate/intermediateData_SVN_dealService.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished cleaning after " + timeDifference)

Finished cleaning after 0 minutes and 8.482605934143066 seconds


In [7]:
intermediateData_SVN_dealService['Issue_key_commit']
nofissues = intermediateData_SVN_dealService['Issue_key_commit'].apply(lambda x: len(x))

nofissues.value_counts()

1    1462
2      29
3       4
Name: Issue_key_commit, dtype: int64

In [6]:
import re

import string
#nltk for NLP 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag  import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from datetime import datetime
import numpy as np
import pandas as pd
import time
nltk.download('averaged_perceptron_tagger')


#Function to clean the comments
def clean_comments(comment):
    try:
        commentDates = re.findall(r"[0-9]{2} [A-Z][a-z]{2} [0-9]{4} [0-9]{2}:[0-9]{2};[a-zA-Z0-9_]{24};", comment)
        accountIds = re.findall(r"\[~accountid:[a-zA-Z0-9]{24}\]", comment)
               
        
        cleanedComment = comment.replace("nan",'')
        for commentDate in commentDates:
            cleanedComment = cleanedComment.replace(commentDate,'')
        
        for accountId in accountIds: 
            cleanedComment = cleanedComment.replace(accountId,'')
        
        return(cleanedComment)
    except:
        return("")

def preprocess(text, porterStemmer, cachedStopwords):
    string_text = str(text)
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_numbers)
    
    #remove stopwords
    tokens_without_sw = [word for word in tokens if not word in cachedStopwords]
    
    #Stem the tokens
    stemmedToken = list(map(porterStemmer.stem, tokens_without_sw))

    return(stemmedToken)

def preprocessNGrams(text, porterStemmer, cachedStopWords, nGramSize):
    string_text = str(text)
    
    #lowercase the string
    lower_case_string = string_text.lower()
    
    #Remove interpunction
    no_interpunction = lower_case_string.translate(str.maketrans('','',string.punctuation))
    
    #Remove numbers
    no_numbers = ''.join([i for i in no_interpunction if not i.isdigit()])
    
    #tokenize string
    tokens = word_tokenize(no_numbers)
    
    #Create the ngrams
    ngrams = list(nltk.ngrams(tokens, nGramSize))
    
    #remove all the n-grams containing a stopword
    cleanNGrams = [ngram for ngram in ngrams if not any(stop in ngram for stop in cachedStopWords)]
    
    #Stem the tokens
    stemmedNGrams = []
    for ngram in cleanNGrams:
        stemmed = list(map(porterStemmer.stem, ngram))
        stemmedNGrams.append(stemmed)
    return(stemmedNGrams)

#Function to transform date into a date object
def preprocess_jira_date(date_string):
    if(isinstance(date_string, str)):
        try:
            date_time_obj = datetime.strptime(date_string, '%d %b %Y %H:%M')
        except:
            date_time_obj = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S:%f')
        return(date_time_obj)
    elif(isinstance(date_string, datetime)): 
        return(date_string)
    else:
        return(np.nan)
    
    
def findVerbs(tokenList):
    posTags = pos_tag(tokenList)
    verbAbrList = ['VBP', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS']
    verbList = []
    for posTag in posTags:
        if posTag[1] in verbAbrList:
            verbList.append(posTag[0])
    return(verbList)

#Preprocess all the features and transform to the format needed for further processing.
def preprocessJiraData(cleanDataFrame, preprocessComments, porterStemmer, cachedStopWords, startTime):
    if (preprocessComments == True):
        nOfSteps = '4'
    else:
        nOfSteps = '3'

    #preprocess Summaries
    jira_summaries = cleanDataFrame['Summary'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
    jira_summaries_2grams = cleanDataFrame['Summary'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    jira_summaries_3grams = cleanDataFrame['Summary'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 3))
    
    endTimeCleaningSummaries = time.time() - startTime
    print("1/" + nOfSteps + ") Finished Cleaning Summaries after " + str(endTimeCleaningSummaries) + " sec")

    #preprocess Descriptions
    jira_descriptions = cleanDataFrame['Description'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
    jira_descriptions_2grams = cleanDataFrame['Description'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    jira_descriptions_3grams = cleanDataFrame['Description'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
    
    endTimeCleaningDescriptions = time.time() - startTime
    print("2/" + nOfSteps + ") Finished Cleaning Description after " + str(endTimeCleaningDescriptions) + " sec")

    #preprocess Dates
    jira_creation = cleanDataFrame['Created'].apply(lambda x: preprocess_jira_date(x))
    jira_updated = cleanDataFrame['Updated'].apply(lambda x: preprocess_jira_date(x))
    jira_resolved = cleanDataFrame['Resolved'].apply(lambda x: preprocess_jira_date(x))
    endTimeCleaningDates = time.time() - startTime
    print("3/" + nOfSteps + ") Finished Cleaning Dates after " + str(endTimeCleaningDates) + " sec")

    #Comments take too long for a test run.
    if (preprocessComments == True):
        jira_comments = cleanDataFrame['Comments'].apply(lambda x: preprocess(x, porterStemmer, cachedStopWords))
        jira_comments_2grams = cleanDataFrame['Comments'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
        jira_comments_3grams = cleanDataFrame['Comments'].apply(lambda x: preprocessNGrams(x, porterStemmer, cachedStopWords, 2))
        endTimeCleaningComments = time.time() - startTime
        print("4/" + nOfSteps + ") Finished Cleaning Comments after " + str(endTimeCleaningComments) + " sec")

         #create JIRA corpus by merging Summary and Description
        jira_data = {'Issue_key_jira': cleanDataFrame['Issue key'], 
             'Assignee': cleanDataFrame['Assignee'],
             'Jira_created_date': jira_creation, 
             'Jira_updated_date': jira_updated, 
             'Jira_resolved_date': jira_resolved, 
             'Summary': jira_summaries, 
             'Summary_2grams': jira_summaries_2grams,
             'Summary_3grams': jira_summaries_3grams, 
             'Description': jira_descriptions,
             'Description_2grams': jira_descriptions_2grams,
             'Description_3grams': jira_descriptions_3grams,
             'Comments': jira_comments,
             'Comments_2grams': jira_comments_2grams,
             'Comments_3grams': jira_comments_3grams,
             'Jira_natural_text': jira_summaries +  jira_descriptions + jira_comments,
             'Jira_natural_text_2grams': jira_summaries_2grams +  jira_descriptions_2grams + jira_comments_2grams,
             'Jira_natural_text_3grams': jira_summaries_3grams +  jira_descriptions_3grams + jira_comments_3grams}
    else:
         #create JIRA corpus by merging Summary and Description
        jira_data = {'Issue_key_jira': cleanDataFrame['Issue key'], 
             'Assignee': cleanDataFrame['Assignee'],
             'Jira_created_date': jira_creation, 
             'Jira_updated_date': jira_updated, 
             'Jira_resolved_date': jira_resolved, 
             'Summary': jira_summaries,
             'Summary_2grams': jira_summaries_2grams,
             'Summary_3grams': jira_summaries_3grams,
             'Description': jira_descriptions,
             'Description_2grams': jira_descriptions_2grams,
             'Description_3grams': jira_descriptions_3grams,
             'Jira_natural_text': jira_summaries +  jira_descriptions,
             'Jira_natural_text_2grams': jira_summaries_2grams +  jira_descriptions_2grams,
             'Jira_natural_text_3grams': jira_summaries_3grams +  jira_descriptions_3grams}

    jira_processed_df = pd.DataFrame(data=jira_data)
    
    #Find verbs
    jira_processed_df['verbs'] = jira_processed_df['Jira_natural_text'].apply(lambda x: findVerbs(x))
    
    return(jira_processed_df)

#Input dataframe and num of_comments, and bool to determine if comments need to be cleaned
def cleanJiraData(dataFrame, cleanComments, commentAmount):
    startTime = time.time()

    #create an object of class PorterStemmer
    porterStemmer = PorterStemmer()
    
    #Find all stopwords
    cachedStopWords = stopwords.words("english")

    if (cleanComments == True):
        #Subset only all comments 
        loc_first_comment = dataFrame.columns.get_loc('Comment') # Variable storing the col location of the 1st comment
    
        dataFrame["Comments"] = dataFrame.iloc[:,loc_first_comment:loc_first_comment+commentAmount].apply(
            lambda x: " ".join(x.astype(str)), axis=1)
    
        #First remove the date and comment string from the comments
        dataFrame["Comments"] = dataFrame["Comments"].apply(lambda x: clean_comments(x))

        #Subset JIRA ID, Summary, Description, comments
        jira_issues_subset = dataFrame[["Issue key", "Assignee", "Summary", "Description", "Comments", "Created", "Resolved", "Updated"]]
        cleanedAndProcessedJiraData = preprocessJiraData(jira_issues_subset, preprocessComments = True, porterStemmer = porterStemmer, cachedStopWords = cachedStopWords, startTime = startTime)
        return(cleanedAndProcessedJiraData)
    else: 
        jira_issues_subset = dataFrame[["Issue key", "Assignee", "Summary", "Description", "Created", "Resolved", "Updated"]]
        cleanedAndProcessedJiraData = preprocessJiraData(jira_issues_subset, preprocessComments = False, porterStemmer = porterStemmer, cachedStopWords = cachedStopWords, startTime = startTime)
        return(cleanedAndProcessedJiraData)


[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


In [7]:
#Rename key to Issue key
rawData_JIRA_dealService = rawData_JIRA_dealService.rename({'Key': 'Issue key'}, axis=1)

#Clean Data sets
intermediateData_JIRA_dealService = cleanJiraData(dataFrame = rawData_JIRA_dealService, cleanComments = False, commentAmount = 39)

#Create a temp XLSX file for all intermediate datasets
intermediateData_JIRA_dealService.to_excel(excel_writer = "../data/02_intermediate/intermediateData_JIRA_dealService.xlsx", index = False)

#Create a pickle file for all intermediate datasets
intermediateData_JIRA_dealService.to_pickle(path= "../data/02_intermediate/intermediateData_JIRA_dealService.pkl")

1/3) Finished Cleaning Summaries after 0.16937685012817383 sec
2/3) Finished Cleaning Description after 0.7384934425354004 sec
3/3) Finished Cleaning Dates after 0.7541210651397705 sec


## 2.4 Clean Raw Data - Create JIRA Corpora
Create the corpora for JIRA UNIGRAM

In [8]:
def createCorpusFromDocumentList(token_column):
    token_list = token_column.tolist()
    corpus_list = []
    
    for document in token_list:
        #Only join to the string when a list. When it is not a list, then it is np.NaN, thus no changes
        if(isinstance(document, list)):
            #Transform list to a string for SKLEARN to accept the input.
            token_string = ' '.join(document)
        
            #Add string to the corpus list
            corpus_list.append(token_string)
    return(corpus_list)

In [9]:
#Create JIRA corpus for dealService dataset
intermediateData_JIRA_dealServiceCorpusSummary = createCorpusFromDocumentList(intermediateData_JIRA_dealService.Summary)
intermediateData_JIRA_dealServiceCorpusDescription = createCorpusFromDocumentList(intermediateData_JIRA_dealService.Description)

#Merge all JIRA Corpora into 1 corpus
intermediateData_JIRA_dealServiceCorpus = [i+" "+j for i,j in zip(intermediateData_JIRA_dealServiceCorpusSummary,
                                                                             intermediateData_JIRA_dealServiceCorpusDescription
                                                                            )]

#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_JIRA_dealServiceCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_JIRA_dealServiceCorpus, f)

Bigram corpora

In [10]:
def createCorpusNGrams(tokenColumn):
    tokenList = tokenColumn.tolist()
    corpusList = []
    
    #Transform to strings
    for document in tokenList:
        if(isinstance(document, list)):
            for ngram in document:
                ngramString = ' '.join(ngram)
                corpusList.append(ngramString)         
    return(corpusList)

In [11]:
#Create JIRA corpus for dealService dataset
intermediateData_JIRA_dealServiceCorpusSummary_2grams = createCorpusNGrams(intermediateData_JIRA_dealService.Summary_2grams)
intermediateData_JIRA_dealServiceCorpusDescription_2grams = createCorpusNGrams(intermediateData_JIRA_dealService.Description_2grams)

#Merge all JIRA Corpora into 1 corpus
intermediateData_JIRA_dealServiceCorpus_2gram = [i+" "+j for i,j in zip(intermediateData_JIRA_dealServiceCorpusSummary_2grams,
                                                                             intermediateData_JIRA_dealServiceCorpusDescription_2grams
                                                                             )]


#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_JIRA_dealServiceCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_JIRA_dealServiceCorpus_2gram, f)

## 2.4 Clean Raw Data - Create SVN Corpora
Create the corpora for SVN

In [12]:
intermediateData_SVN_dealService = pd.read_pickle("../data/02_intermediate/intermediateData_SVN_dealService.pkl")

In [13]:
#Create corpus for log messages
intermediateData_SVNLogs_dealServiceCorpus = createCorpusFromDocumentList(intermediateData_SVN_dealService.Logs)

#Create corpus for unit names
intermediateData_SVNUnitNames_dealServiceCorpus = createCorpusFromDocumentList(intermediateData_SVN_dealService.Unit_names)

#Create corpus for entire commit (log message + model)
intermediateData_SVN_dealServiceCorpus = createCorpusFromDocumentList(intermediateData_SVN_dealService.Logs + intermediateData_SVN_dealService.Unit_names)
intermediateData_SVN_dealServiceCorpusAll = createCorpusFromDocumentList(intermediateData_SVN_dealService.Logs + intermediateData_SVN_dealService.Unit_names)
#Save intermediate pickles
with open('../data/02_intermediate/intermediateData_SVNLogs_dealServiceCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNLogs_dealServiceCorpus, f)

with open('../data/02_intermediate/intermediateData_SVNUnitNames_dealServiceCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNUnitNames_dealServiceCorpus, f)

with open('../data/02_intermediate/intermediateData_SVN_dealServiceCorpus.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVN_dealServiceCorpus, f)
    
with open('../data/02_intermediate/intermediateData_SVN_dealServiceCorpusAll.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVN_dealServiceCorpusAll, f)

bigram corpora

In [14]:
intermediateData_SVNLogs_dealServiceCorpus_2gram = createCorpusNGrams(intermediateData_SVN_dealService.Logs_2grams)
intermediateData_SVNUnitNames_dealServiceCorpus_2gram = createCorpusNGrams(intermediateData_SVN_dealService.Unit_names_2grams)
with open('../data/02_intermediate/intermediateData_SVNLogs_dealServiceCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNLogs_dealServiceCorpus_2gram, f)
    
    
with open('../data/02_intermediate/intermediateData_SVNUnitNames_dealServiceCorpus_2gram.pkl', 'wb') as f:
    pickle.dump(intermediateData_SVNUnitNames_dealServiceCorpus_2gram, f)

# 3. Preprocess Data

In [15]:
#Run this code block when you've restarted the kernel, and want to use previously gained results.
intermediateData_JIRA_dealService = pd.read_pickle("../data/02_intermediate/intermediateData_JIRA_dealService.pkl")

intermediateData_SVN_dealService = pd.read_pickle("../data/02_intermediate/intermediateData_SVN_dealService.pkl")

intermediateData_JIRA_dealServiceCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_JIRA_dealServiceCorpus.pkl')
intermediateData_JIRA_dealServiceCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_JIRA_dealServiceCorpus.pkl')
#intermediateData_SVN_dealServiceCorpusAll = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_dealServiceCorpusAll.pkl')
#intermediateData_SVN_dealServiceCorpusModel = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_dealServiceCorpusModel.pkl')
intermediateData_SVN_dealServiceCorpus = pd.read_pickle(r'../data/02_intermediate/intermediateData_SVN_dealServiceCorpus.pkl')

############# Bigrams


############# Trigrams

## 3.0 Preprocess Data - Create cartesian product JIRA x Commits

In [16]:
#Create cartesian products JIRA x Commits
processedData_dealServiceCartesian = intermediateData_JIRA_dealService.merge(intermediateData_SVN_dealService, how='cross')

processedData_dealServiceCartesian = processedData_dealServiceCartesian.drop(processedData_dealServiceCartesian[processedData_dealServiceCartesian.Jira_created_date > processedData_dealServiceCartesian.Commit_date].index)

#Create a pickle file for all intermediate datasets
processedData_dealServiceCartesian.to_pickle(path= "../data/03_processed/processedData_dealServiceCartesian.pkl")


## 3.1 Preprocess Data - Create Labels

In [17]:
#Create new dataFrames for the time features
processedData_dealServiceLabels = pd.DataFrame() 


#Create a column, which indicates which traces are valid.
processedData_dealServiceLabels["is_valid"] = processedData_dealServiceCartesian.apply(lambda x: checkValidityTrace(x.Issue_key_jira, x.Issue_key_commit), axis=1)
print("Finished creating labels for dealService")

#Save intermediate results
processedData_dealServiceLabels.to_pickle(path= "../data/03_processed/processedData_dealServiceLabels.pkl")

processedData_dealServiceLabels.info()

Finished creating labels for dealService
<class 'pandas.core.frame.DataFrame'>
Int64Index: 89233 entries, 1487 to 258634
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   is_valid  89233 non-null  bool 
dtypes: bool(1)
memory usage: 784.3 KB


In [18]:
#processedData_dealServiceLabels[processedData_dealServiceLabels.is_valid == True].count()
processedData_dealServiceLabels[processedData_dealServiceLabels.is_valid == True].count()

is_valid    420
dtype: int64

## 3.2 Preprocess Data - Create Time-Related Features

In [19]:
#Create new dataFrames for the time features
processedData_dealServiceFeaturesTime = pd.DataFrame() 


#Calculate the time features for data Processing Dataset
processedData_dealServiceFeaturesTime['Creation_commit_date_dif'] = processedData_dealServiceCartesian.apply(lambda x: calculateTimeDif(x.Jira_created_date, x.Commit_date), axis=1)
processedData_dealServiceFeaturesTime['Updated_commit_date_dif'] = processedData_dealServiceCartesian.apply(lambda x: calculateTimeDif(x.Jira_updated_date, x.Commit_date), axis=1)
processedData_dealServiceFeaturesTime['Resolved_commit_date_dif'] = processedData_dealServiceCartesian.apply(lambda x: calculateTimeDif(x.Jira_resolved_date, x.Commit_date), axis=1)
print("Finished data Processing")

#Create a pickle file for all intermediate datasets
processedData_dealServiceFeaturesTime.to_pickle(path= "../data/03_processed/processedData_dealServiceFeaturesTime.pkl")

Finished data Processing


## 3.3 Preprocess Data - Create Stakeholder-Related Features

In [20]:
#Create new dataFrames for the Stakeholder features
processedData_dealServiceFeaturesStakeholder = pd.DataFrame() 

processedData_dealServiceFeaturesStakeholder['Assignee_is_commiter'] = processedData_dealServiceCartesian.apply(lambda x: checkFullnameEqualsEmail(x.Assignee, x.Email), axis=1)
print("Finished dealService")

#Create a pickle file for all intermediate datasets
processedData_dealServiceFeaturesStakeholder.to_pickle(path= "../data/03_processed/processedData_dealServiceFeaturesStakeholder.pkl")


Finished dealService


## 3.4 Preprocess Data - Create Cosine Similarity Features
### 3.4.1 dealService - Cosine Similarity UniGrams

In [21]:
from scipy import spatial
import pandas as pd

def calc_vector_representation(document, cv, fittedTF_IDF):        
    #Transform document type to a string
    documentString = document
    
    #Calculate the Term Frequency of the document
    inputDocs = [documentString] 

    # count matrix 
    count_vector = cv.transform(inputDocs) 
 
    #tf-idf scores 
    tf_idf_vector = fittedTF_IDF.transform(count_vector)

    feature_names = cv.get_feature_names() 
 
    #get tfidf vector for first document 
    document_vector=tf_idf_vector[0] 
 
    #print the scores 
    
    # place tf-idf values in a pandas data frame 
    df = pd.DataFrame(document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
    df.sort_values(by=["tfidf"],ascending=False)

    return(document_vector.T.todense())

def calculateCosineSimilarity(document1, document2, cv, fittedTF_IDF):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    return(result)

def calculateCosineSimilarityNGrams(document1, document2, cv, fittedTF_IDF):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    return(result)


def calculateCosineSimilarityWithPOSPruning(document1, document2, cv, fittedTF_IDF, verbList):

    #If both doc1 and doc2 are lists
    if (isinstance(document1, list) & isinstance(document2, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ' '.join(document2)

    #Only document1 is a list
    elif(isinstance(document1, list)):
        #Transform document to string type
        document1String = ' '.join(document1)
        document2String = ''

    #Only document2 is a list
    elif(isinstance(document2, list)):
        #Transform document to string type
        document1String = ''
        document2String = ' '.join(document2)
        
    else:
        document1String = ''
        document2String = ''

    vector1 = calc_vector_representation(document1String, cv, fittedTF_IDF)
    vector2 = calc_vector_representation(document2String, cv, fittedTF_IDF)
    
    #The cosine similarity. Produces NaN if no terms are found in the corpus.
    result = 1 - spatial.distance.cosine(vector1, vector2)
    
    verbCounter = 0
    if(isinstance(document2, list)):
        for token in document2:
            if token in verbList:
                verbCounter = verbCounter + 1
    
    if verbCounter > 0:
        result = result * (1 + (0.1 * verbCounter))
    else:
        result = 0
    
    return(result)

In [22]:
#Instantiate the count vectorizer and tfidf for the corpus
from sklearn.feature_extraction.text import CountVectorizer 

######################################################
#                       dealService              #
######################################################

################# Unigrams ###############
#instantiate CountVectorizer() for SVN
processedData_SVN_dealServiceCountVectorizer = CountVectorizer()
processedData_SVN_dealServiceCountTF_IDF = createFittedTF_IDF(processedData_SVN_dealServiceCountVectorizer, intermediateData_SVN_dealServiceCorpus)

processedData_SVNLogs_dealServiceCountVectorizer = CountVectorizer()
processedData_SVNLogs_dealServiceCountTF_IDF = createFittedTF_IDF(processedData_SVNLogs_dealServiceCountVectorizer, intermediateData_SVNLogs_dealServiceCorpus)

processedData_SVNUnitNames_dealServiceCountVectorizer = CountVectorizer()
processedData_SVNUnitNames_dealServiceCountTF_IDF = createFittedTF_IDF(processedData_SVNUnitNames_dealServiceCountVectorizer, intermediateData_SVNUnitNames_dealServiceCorpus)

#instantiate CountVectorizer() for JIRA - unigram
processedData_JIRA_dealServiceCountVectorizer = CountVectorizer()
processedData_JIRA_dealServiceCountTF_IDF = createFittedTF_IDF(processedData_JIRA_dealServiceCountVectorizer, intermediateData_JIRA_dealServiceCorpus)

processedData_JIRASummaries_dealServiceCountVectorizer = CountVectorizer()
processedData_JIRASummaries_dealServiceCountTF_IDF = createFittedTF_IDF(processedData_JIRASummaries_dealServiceCountVectorizer, intermediateData_JIRA_dealServiceCorpusSummary)

processedData_JIRADescriptions_dealServiceCountVectorizer = CountVectorizer()
processedData_JIRADescriptions_dealServiceCountTF_IDF = createFittedTF_IDF(processedData_JIRADescriptions_dealServiceCountVectorizer, intermediateData_JIRA_dealServiceCorpusDescription)

#processedData_JIRAComments_dealServiceCountVectorizer = CountVectorizer()
#processedData_JIRAComments_dealServiceCountTF_IDF = createFittedTF_IDF(processedData_JIRAComments_dealServiceCountVectorizer, intermediateData_JIRA_dealServiceCorpusComments)


################# Bigrams ###############
#instantiate CountVectorizer() for SVN - bigrams
processedData_SVNLogs_dealServiceCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_SVNLogs_dealServiceCountTF_IDF_2gram = createFittedTF_IDF(processedData_SVNLogs_dealServiceCountVectorizer_2gram, intermediateData_SVNLogs_dealServiceCorpus_2gram)

processedData_SVNUnitNames_dealServiceCountVectorizer_2gram = CountVectorizer()
processedData_SVNUnitNames_dealServiceCountTF_IDF_2gram = createFittedTF_IDF(processedData_SVNUnitNames_dealServiceCountVectorizer_2gram, intermediateData_SVNUnitNames_dealServiceCorpus_2gram)


#instantiate CountVectorizer() for JIRA - biigram
processedData_JIRA_dealServiceCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRA_dealServiceCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRA_dealServiceCountVectorizer_2gram, intermediateData_JIRA_dealServiceCorpus_2gram)

processedData_JIRASummaries_dealServiceCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRASummaries_dealServiceCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRASummaries_dealServiceCountVectorizer_2gram, intermediateData_JIRA_dealServiceCorpusSummary_2grams)

processedData_JIRADescriptions_dealServiceCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
processedData_JIRADescriptions_dealServiceCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRADescriptions_dealServiceCountVectorizer_2gram, intermediateData_JIRA_dealServiceCorpusDescription_2grams)

#processedData_JIRAComments_dealServiceCountVectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
#processedData_JIRAComments_dealServiceCountTF_IDF_2gram = createFittedTF_IDF(processedData_JIRAComments_dealServiceCountVectorizer_2gram, intermediateData_JIRA_dealServiceCorpusComments_2grams)




#### 3.4.1 [VSM unigram] Similarity between JIRA issue and Commit Log - Jira As Query

In [23]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmLogsJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmLogsJiraAsQuery["vsm_logs_jira_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_JIRA_dealServiceCountVectorizer, processedData_JIRA_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmLogsJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmLogsJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

  dist = 1.0 - uv / np.sqrt(uu * vv)


Finished creating 'VSM Logs Jira as query' after 5 minutes and 40.33592462539673 seconds


#### 3.4.2 [VSM unigram] Similarity between JIRA issue and Commit Log - Log As Query

In [24]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmLogsLogAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmLogsLogAsQuery["vsm_logs_log_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_SVNLogs_dealServiceCountVectorizer, processedData_SVNLogs_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmLogsLogAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmLogsLogAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 6 minutes and 39.3549439907074 seconds


#### 3.4.3 [VSM unigram] Similarity between JIRA issue and Unit Names - JIRA As Query

In [25]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmUnitNamesJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmUnitNamesJiraAsQuery["vsm_unit_names_jira_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_JIRA_dealServiceCountVectorizer, processedData_JIRA_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmUnitNamesJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmUnitNamesJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 6 minutes and 45.37402677536011 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and Commit Log - Jira As Query

In [26]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery["vsm_summary_logs_summary_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, processedData_JIRASummaries_dealServiceCountVectorizer, processedData_JIRASummaries_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSummaryLogsSummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 5 minutes and 52.34311246871948 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and Commit Log - Log As Query

In [27]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSummaryLogsLogsAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSummaryLogsLogsAsQuery["vsm_summary_logs_logs_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, processedData_SVNLogs_dealServiceCountVectorizer, processedData_SVNLogs_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSummaryLogsLogsAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSummaryLogsLogsAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 6 minutes and 58.24149942398071 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and UnitNames - Summary As Query

In [28]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery["vsm_summary_unitNames_summary_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Unit_names, processedData_JIRASummaries_dealServiceCountVectorizer, processedData_JIRASummaries_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 5 minutes and 5.077953815460205 seconds


#### 3.4.1 [VSM unigram] Similarity between JIRA Summary and UnitNames - UnitNames As Query

In [29]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery["vsm_summary_unitNames_unitNames_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Unit_names, processedData_SVNUnitNames_dealServiceCountVectorizer, processedData_SVNUnitNames_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 45 minutes and 1.2660377025604248 seconds


#### 3.4.3 [VSM unigram - verb pruning] Similarity between JIRA issue and Unit Names - JIRA As Query

In [30]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery["vsm_verb_pruning_unit_names_jira_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarityWithPOSPruning(x.Jira_natural_text, x.Unit_names, processedData_JIRA_dealServiceCountVectorizer, processedData_JIRA_dealServiceCountTF_IDF, x.verbs), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query and verb pruning' after " + timeDifference)

Finished creating 'VSM Logs Jira as query and verb pruning' after 5 minutes and 23.351595640182495 seconds


#### 3.4.4 [VSM unigram] Similarity between JIRA issue and Unit Names  - Unit Names As Query

In [31]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery["vsm_unit_names_log_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_dealServiceCountVectorizer, processedData_SVNUnitNames_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 5 minutes and 50.075788736343384 seconds


#### 3.4.5 [VSM unigram] Similarity between JIRA description and commit log - Description as query

In [32]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmDescriptionDescriptionAsQuery["vsm_description_description_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, processedData_JIRADescriptions_dealServiceCountVectorizer, processedData_JIRADescriptions_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 5 minutes and 21.839947938919067 seconds


#### 3.4.5 [VSM unigram Silarity between JIRA description and commit log - Log as descrintion

In [33]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmDescriptionLogsAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmDescriptionLogsAsQuery["vsm_description_log_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_dealServiceCountVectorizer, processedData_SVNUnitNames_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmDescriptionLogsAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmDescriptionLogsAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 6 minutes and 32.12603807449341 seconds


#### 3.4.5 [VSM unigram Silarity between JIRA Comment and unitnames - Comment as query

#### 3.4.5 [VSM unigram Silarity between JIRA Comment and unitnames - Comment as query

#### 3.4.5 [VSM unigram Silarity between JIRA Comment and commit log - Comment as description

#### 3.4.5 [VSM unigram Silarity between JIRA description and commit log - Log as description

#### [VSM bigram] Similarity between JIRA comments and Commit Logs - Logs as query

#### 3.4.5 [VSM bigram] Silarity between JIRA Comment and commit log - Comment as query

#### [VSM Unigram] Similarity between Unit Names and Description - Unit Names as query

In [34]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery["vsm_unitnames_description_unitnames_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_dealServiceCountVectorizer, processedData_SVNUnitNames_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 8 minutes and 20.21900486946106 seconds


#### [VSM Unigram] Similarity between Unit Names and Description - Description as query

In [35]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery["vsm_unitnames_description_description_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_JIRADescriptions_dealServiceCountVectorizer, processedData_JIRADescriptions_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 6 minutes and 2.416954755783081 seconds


#### [VSM Unigram] Similarity between Unit Names and Comments - Unit Names as query

#### [VSM Unigram] Similarity between Unit Names and Comments - Comments as query

In [36]:
#### [VSM Unigram] Similarity between SVN (entirely) and JIRA (entirely)- JIRA as query

In [37]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSvnJiraJiraAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSvnJiraJiraAsQuery["vsm_svn_jira_jira_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Commit_natural_text, processedData_JIRA_dealServiceCountVectorizer, processedData_JIRA_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSvnJiraJiraAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSvnJiraJiraAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 6 minutes and 10.86130690574646 seconds


In [38]:
#### [VSM Unigram] Similarity between SVN (entirely) and JIRA (entirely) - SVN as query

In [39]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSvnJiraSvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSvnJiraSvnAsQuery["vsm_svn_jira_svn_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Commit_natural_text, processedData_SVN_dealServiceCountVectorizer, processedData_SVN_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSvnJiraSvnAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSvnJiraSvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 8 minutes and 16.416340112686157 seconds


In [40]:
#### [VSM Unigram] Similarity between SVN (entirely) and Summary - SVN as query

In [41]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSvnSummarySvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSvnSummarySvnAsQuery["vsm_svn_summary_svn_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Summary, processedData_SVN_dealServiceCountVectorizer, processedData_SVN_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSvnSummarySvnAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSvnSummarySvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 6 minutes and 16.02962374687195 seconds


In [42]:
#### [VSM Unigram] Similarity between SVN (entirely) and Summary - Summary as query

In [43]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSvnSummarySummaryAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSvnSummarySummaryAsQuery["vsm_svn_summary_summary_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Summary, processedData_JIRASummaries_dealServiceCountVectorizer, processedData_JIRASummaries_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSvnSummarySummaryAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSvnSummarySummaryAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 5 minutes and 55.526084423065186 seconds


In [44]:
#### [VSM Unigram] Similarity between SVN (entirely) and Description - SVN as query

In [45]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSvnDescriptionSvnAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSvnDescriptionSvnAsQuery["vsm_svn_description_svn_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Description, processedData_SVN_dealServiceCountVectorizer, processedData_SVN_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSvnDescriptionSvnAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSvnDescriptionSvnAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 6 minutes and 0.9930412769317627 seconds


In [46]:
#### [VSM Unigram] Similarity between SVN (entirely) and Description - Description as query

In [47]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery["vsm_svn_description_description_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Commit_natural_text, x.Description, processedData_JIRADescriptions_dealServiceCountVectorizer, processedData_JIRADescriptions_dealServiceCountTF_IDF), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 5 minutes and 55.10972571372986 seconds


In [48]:
#### [VSM Unigram] Similarity between SVN (entirely) and Comments - SVN as query

In [49]:
#### [VSM Unigram] Similarity between SVN (entirely) and Comments - Comments as query

#### 3.4.3 [VSM unigram - verb pruning] Similarity between JIRA issue and Unit Names and verb pruning - Unit Names As Query

In [50]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery["vsm_verb_pruning_unit_names_log_as_query"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarityWithPOSPruning(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_dealServiceCountVectorizer, processedData_SVNUnitNames_dealServiceCountTF_IDF, x.verbs), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 5 minutes and 56.54522657394409 seconds


#### 3.4.5 [VSM bigram] Similarity between JIRA issue and Commit Log - Jira As Query

In [51]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmLogsJiraAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmLogsJiraAsQuery_2gram["vsm_logs_jira_as_query_2gram"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_JIRA_dealServiceCountVectorizer_2gram, processedData_JIRA_dealServiceCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmLogsJiraAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmLogsJiraAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 5 minutes and 24.822927236557007 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Commit Log - Logs As Query

In [52]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmLogsLogAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmLogsLogAsQuery_2gram["vsm_logs_log_as_query_2gram"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, processedData_SVNLogs_dealServiceCountVectorizer_2gram, processedData_SVNLogs_dealServiceCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmLogsLogAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmLogsLogAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 7 minutes and 24.35269832611084 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Unit Names - Jira As Query

In [53]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram["vsm_unit_names_jira_as_query_2gram"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_JIRA_dealServiceCountVectorizer_2gram, processedData_JIRA_dealServiceCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 6 minutes and 16.505225896835327 seconds


#### 3.4.6 [VSM bigram] Similarity between JIRA issue and Unit Names - UnitNames As Query

In [54]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram["vsm_unit_names_log_as_query_2gram"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Unit_names, processedData_SVNUnitNames_dealServiceCountVectorizer_2gram, processedData_SVNUnitNames_dealServiceCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 5 minutes and 4.8398237228393555 seconds


#### [VSM bigram] Similarity between Logs and Description - Logs as Query

In [55]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmDescriptionLogsAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmDescriptionLogsAsQuery_2gram["vsm_description_log_as_query_2gram"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Unit_names, processedData_SVNUnitNames_dealServiceCountVectorizer_2gram, processedData_SVNUnitNames_dealServiceCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmDescriptionLogsAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmDescriptionLogsAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM UnitNames Unit Names as query' after " + timeDifference)

Finished creating 'VSM UnitNames Unit Names as query' after 40 minutes and 28.009114742279053 seconds


#### [VSM bigram] Similarity between Logs and Description - Description as Query

In [56]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmDescriptionDescriptionAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmDescriptionDescriptionAsQuery_2gram["vsm_description_description_as_query_2gram"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, processedData_JIRADescriptions_dealServiceCountVectorizer_2gram, processedData_JIRADescriptions_dealServiceCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmDescriptionDescriptionAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmDescriptionDescriptionAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Bigrams' after " + timeDifference)

Finished creating 'VSM Bigrams' after 8 minutes and 49.65316104888916 seconds


#### [VSM bigram] Similarity between Logs and Summary - Logs as Query

#### [VSM bigram] Similarity between Logs and Summary - Summary as Query

In [57]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery_2gram = pd.DataFrame() 

#Calculate cosine similarity for each trace
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery_2gram["vsm_summary_logs_summary_as_query_2gram"] = processedData_dealServiceCartesian.apply(lambda x: calculateCosineSimilarityNGrams(x.Summary, x.Logs, processedData_JIRASummaries_dealServiceCountVectorizer_2gram, processedData_JIRASummaries_dealServiceCountTF_IDF_2gram), 
                                                            axis=1)

#Save results in pickle
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery_2gram.to_pickle(path= "../data/03_processed/processedData_dealService_features_VsmSummaryLogsSummaryAsQuery_2gram.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating 'VSM Logs Jira as query' after " + timeDifference)

Finished creating 'VSM Logs Jira as query' after 5 minutes and 2.825479745864868 seconds


## 3.6 Document Statistics

### dealService

In [58]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesUniqueWordCount = pd.DataFrame() 
processedData_SVN_dealServiceFeaturesUniqueWordCount = pd.DataFrame() 
processedData_JIRA_dealServiceFeaturesTotalWordCount = pd.DataFrame() 
processedData_SVN_dealServiceFeaturesTotalWordCount = pd.DataFrame()

processedData_JIRA_dealServiceFeaturesOverlapPercentage = pd.DataFrame()
processedData_SVN_dealServiceFeaturesOverlapPercentage = pd.DataFrame()
processedData_UNION_dealServiceFeaturesOverlapPercentage = pd.DataFrame()

#Calculate unique terms JIRA for each trace
processedData_JIRA_dealServiceFeaturesUniqueWordCount["unique_term_count_jira"] = processedData_dealServiceCartesian.apply(lambda x: calculateUniqueWordCount(x.Jira_natural_text), 
                                                            axis=1)
#Calculate unique terms JIRA for each trace
processedData_SVN_dealServiceFeaturesUniqueWordCount["unique_term_count_svn"] = processedData_dealServiceCartesian.apply(lambda x: calculateUniqueWordCount(x.Commit_natural_text), 
                                                            axis=1)

#Calculate total terms JIRA for each trace
processedData_JIRA_dealServiceFeaturesTotalWordCount["total_term_count_jira"] = processedData_dealServiceCartesian.apply(lambda x: calculateTotalWordCount(x.Jira_natural_text), 
                                                            axis=1)
#Calculate total terms JIRA for each trace
processedData_SVN_dealServiceFeaturesTotalWordCount["total_term_count_svn"] = processedData_dealServiceCartesian.apply(lambda x: calculateTotalWordCount(x.Commit_natural_text), 
                                                            axis=1)

processedData_JIRA_dealServiceFeaturesOverlapPercentage["overlap_percentage_compared_to_jira"] = processedData_dealServiceCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list1'),
                                                            axis=1)
processedData_SVN_dealServiceFeaturesOverlapPercentage["overlap_percentage_compared_to_svn"] = processedData_dealServiceCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list2'),
                                                            axis=1)
processedData_UNION_dealServiceFeaturesOverlapPercentage["overlap_percentage_compared_to_union"] = processedData_dealServiceCartesian.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'union'),
                                                            axis=1)





#Save results in pickle
processedData_JIRA_dealServiceFeaturesUniqueWordCount.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesUniqueWordCount.pkl")
processedData_SVN_dealServiceFeaturesUniqueWordCount.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesUniqueWordCount.pkl")
processedData_JIRA_dealServiceFeaturesTotalWordCount.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesTotalWordCount.pkl")
processedData_SVN_dealServiceFeaturesTotalWordCount.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesTotalWordCount.pkl")

processedData_JIRA_dealServiceFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesOverlapPercentage.pkl")
processedData_SVN_dealServiceFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesOverlapPercentage.pkl")
processedData_UNION_dealServiceFeaturesOverlapPercentage.to_pickle(path= "../data/03_processed/processedData_UNION_dealServiceFeaturesOverlapPercentage.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating document statistics in " + timeDifference)

Finished creating document statistics in 0 minutes and 17.38437795639038 seconds


## 3.7 Query Quality

In [59]:
#Instantiate the count vectorizer and tfidf for the corpus
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from statistics import mean, median, mode, stdev, variance
from math import log, sqrt
import itertools

#Function calculating the IDFs of all query terms. Returns a list containing all IDFs
def calcIDFList(document, cv, tfidf_transformer):
    idfScoreList=[]
    if isinstance(document, list):
        termCount = len(document)
        for term in document:
            try:
                indexOfWord = cv.get_feature_names().index(term)
                idfScore = tfidf_transformer.idf_[indexOfWord]
                idfScoreList.append(idfScore)
            except:
                idfScoreList.append(0)
    else:
        termCount = 0
    return(idfScoreList)


def calcAvgIDF(IDFList):
    termCount = len(IDFList)
    if(termCount != 0):
        avgIdf = sum(IDFList) / termCount
    else:
        avgIdf = 0
    return(avgIdf)

def calcMaxIDF(IDFList): 
    termCount = len(IDFList)
    if(termCount != 0):
        maxIdf = np.amax(IDFList)
    else: 
        maxIdf = 0
    return(maxIdf)

def calcDevIDF(IDFList):
    termCount = len(IDFList)
    if(termCount > 1):
        stdevIdf = stdev(IDFList)
    else: 
        stdevIdf = 0
    return(stdevIdf)

#Function calculating the ICTF of all query terms. Returns a list containing all IDFs
def calcICTFList(document, cv, documentCount):
    ICTFList = []
        #For all terms in query, find how often they occur in the Corpus
    if isinstance(document, list):
        for term in document:
            try:
            #Find out how often the term occurs in the corpus
                termFrequency = (cv.vocabulary_[term])
                
                #Compute the log
                ictF = log(documentCount/termFrequency)
            except:
                ictF = 0
            
            ICTFList.append(ictF)
    return(ICTFList)

def calcAvgICTF(ICTFList, documentCount):
    avgICTF = sum(ICTFList) / documentCount
    return(avgICTF)


def calcMaxICTF(ICTFList): 
    termCount = len(ICTFList)
    if(termCount != 0):
        maxICTF = np.amax(ICTFList)
    else: 
        maxICTF = 0
    return(maxICTF)

def calcDevICTF(ICTFList):
    termCount = len(ICTFList)
    if(termCount > 1):
        stdevICTF = stdev(ICTFList)
    else: 
        stdevICTF = 0
    return(stdevICTF)


def calcEntropyList(query, cv, documentCount, docCollection):
    #entropy(t) = ∑ (d∈Dt)  ( tf(t,d) / tf(t, D) ) * log |D|(tf(t,d) / tf(t, D) )
        
    entropyValueList = []
    #for each term in the query, calculate the entropy of the query
    if isinstance(query, list):
        for queryTerm in query:
            #For each d ∈ D
            
            partialEntropyList = []
            
            for d in docCollection:
                #Check if queryTerm occurs in D (i.e/ d∈Dt)
                if (isinstance(d, list)):
                    if queryTerm in d:
                        try:
                            #Calculate the frequency of the term occurs in the document (i.e tf(t,d))
                            queryTermFrequencyInDocument = d.count(queryTerm)
                            
                            #calculate the frequency the term occurs in the query corpus (i.e tf(t,D))
                            queryTermFrequencyInCorpus = (cv.vocabulary_[queryTerm])
                             
                            # This part of the calculation tf(t,d) / tf(t, D)  * log |D|(tf(t,d) / tf(t, D))
                            partialEntropy1stHalf = queryTermFrequencyInDocument / queryTermFrequencyInCorpus
                            partialEntropy2ndHalf = log((queryTermFrequencyInDocument / queryTermFrequencyInCorpus), documentCount)
                            partialEntropy = partialEntropy1stHalf
                            partialEntropyList.append(partialEntropy)
                        except:
                            partialEntropyList.append(0) #If term not found entropy is 0
            #this part of the calculation ∑ (d∈Dt)
            entropyValueOfQueryTerm = sum(partialEntropyList)
            entropyValueList.append(entropyValueOfQueryTerm)
    
    return(entropyValueList)


def calcAvgEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        #Calculate the average of all the entropies
        avgEntropy = sum(entropyValueList) / len(entropyValueList)
    else:
        avgEntropy = 0
    return(avgEntropy)

    
def calcMedEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        #Calculate the average of all the entropies
        medEntropy = median(entropyValueList)
    else:
        medEntropy = 0
    return(medEntropy)
    
def calcMaxEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount != 0):
        maxEntropy = np.amax(entropyValueList)
    else: 
        maxEntropy = 0
    return(maxEntropy)
    
def calcDevEntropy(entropyValueList):
    termCount = len(entropyValueList)
    if(termCount > 1):
        #Calculate the average of all the entropies
        devEntropy = stdev(entropyValueList)
    else:
        devEntropy = 0
    return(devEntropy)

#The percentage of documents in the collection containing at least one of the query terms
def calcQueryScope(query, docCollection): 
    counter = 0
    if isinstance(query, list):
        for document in docCollection:
            #check if query occurs in term. 
            if(isinstance(document, list)):
                for queryTerm in query:
                    if queryTerm in document:
                        counter = counter + 1
                        break
    queryScope = counter / len(docCollection)
    return(queryScope)

#The Kullback-Leiber divergence of the query language model from the collection language model
def calcSCS(query, cv, docCount):
    divergenceList = []
    if isinstance(query, list):
        for queryTerm in query:
            try:
                #frequency of term in query - tf(q, Q)/|Q|
                pqQ = query.count(queryTerm) / len(query)
                
                #frequency of term in documentlist - tf(q, D)/|D|
                pqD = cv.vocabulary_[queryTerm]
                
                divergence = pqQ * log(pqQ / pqD)
                divergenceList.append(divergence)
            except:
                continue
    SCS = sum(divergenceList)
    return(SCS)

#The average of the collection-query similarity (SCQ) over all query terms
def calcSCQList(query, docCollection, cv, fittedTF_IDF, documentCount):
    SCQList = []
    if isinstance(query, list):
        documentString = ' '.join(query)
        
        #Calculate the Term Frequency of the document
        inputDocs = [documentString] 
        
        # count matrix 
        count_vector = cv.transform(inputDocs) 
 
        #tf-idf scores 
        tf_idf_vector = fittedTF_IDF.transform(count_vector)
        
        feature_names = cv.get_feature_names() 
        # place tf-idf values in a pandas data frame 
        df = pd.DataFrame(tf_idf_vector.T.todense(), 
                          index=feature_names, columns=["tfidf"])
    
        
        #Find the tfidf of the term
        for queryTerm in query:    
            try:
                tfidf = df["tfidf"][queryTerm]
                SCQ = (1 + log(tfidf))
                SCQList.append(SCQ)
            except:
                continue
        
    avgSCQ = sum(SCQList) / documentCount
    return(SCQList)

#The average of the collection-query similarity (SCQ) over all query terms
def calcAvgSCQ(SCQList, documentCount):
    avgSCQ = sum(SCQList) / documentCount
    return(avgSCQ)
    
#The average of the collection-query similarity (SCQ) over all query terms
def calcMaxSCQ(SCQList):
    termCount = len(SCQList)
    if(termCount != 0):
        maxSCQ = np.amax(SCQList)
    else:
        maxSCQ = np.NaN
    return(maxSCQ)

#The average of the collection-query similarity (SCQ) over all query terms
def calcSumSCQ(SCQList):
    sumSCQ = sum(SCQList)
    return(sumSCQ)

def createTermPairs(cv):
    terms = list(cv.vocabulary_.keys())
    #Create all possible pair combinations from the terms in the query 
    pairCombinationList = list(itertools.combinations(terms, 2))
    return(pairCombinationList)

#Method to find out how often a term occurs in a document
def findTermFrequencies(cv, docCollection):
    terms = list(cv.vocabulary_.keys())
    termFrequencies = {}
    for term in terms:
        termCounter = 0
        for document in docCollection:
            if isinstance(document, list):
                if term in document: 
                    termCounter = termCounter + 1
        termFrequencies[term] = termCounter
    return(termFrequencies)

#Method to find out how often both terms occur in a document. 
def findTermPairFrequencies(termPairs, docCollection):
    termPairFrequencies = {}
    for termPair in termPairs:
        termPairCount = 0
        for document in docCollection:
            if (isinstance(document, list)):
                if all(i in document for i in termPair):
                    termPairCount = termPairCount + 1
        termPairFrequencies[termPair] = termPairCount
    return(termPairFrequencies)   

def calcPMIList(query, termFrequencies, termPairFrequencies, docCollection):
    if isinstance(query, list):
    #Find the frequencies of the individual terms and the pairs
        pairCombinationList = list(itertools.combinations(query, 2))
        termOccurances = []
        for pair in pairCombinationList:
            try:
                q1Freq = termFrequencies[pair[0]]
            except:
                q1Freq = 0
            try:
                q2Freq = termFrequencies[pair[1]]
            except:
                q2Freq = 0
            try:
                q1q2Freq = termPairFrequencies[pair]
            except:
                q1q2Freq = 0
                    
            termOccurances.append({'q1Freq': q1Freq, 
                                   'q2Freq': q2Freq, 
                                   'q1q2Freq': q1q2Freq})
    
        docCount = len(docCollection)
        pmiList = []
        for term in termOccurances:
            pq1 = term['q1Freq'] / docCount
            pq2 = term['q2Freq'] / docCount
            pq1q2 = term['q1q2Freq'] / docCount

            try:
                pmi = log(pq1q2 /(pq1 * pq2))
            except:
                pmi = np.nan
            pmiList.append(pmi)
        return(pmiList)
    else:
        return(np.nan)

def calcAvgPMI(pmiList):
    if(isinstance(pmiList, list)):
        pairCount = len(pmiList)
        if(pairCount != 0):
            #Calculate the average of all the entropies
            avgPMI= np.nansum(pmiList) / pairCount
        else:
            avgPMI = 0
        return(avgPMI)
    return(np.nan)

def calcMaxPMI(pmiList): 
    if(isinstance(pmiList, list)):
        pairCount = len(pmiList)
        if(pairCount != 0):
            maxPMI = np.nanmax(pmiList)
        else: 
            maxPMI = np.nan
        return(maxPMI)
    return(np.nan)

In [60]:
#Read datasets from disk
processedData_dealServiceCartesian = pd.read_pickle(r"../data/03_processed/processedData_dealServiceCartesian.pkl")

#instantiate CountVectorizer() for SVN
processedData_SVN_dealServiceCountVectorizer = CountVectorizer()
processedData_SVN_dealServiceTF_IDF = createFittedTF_IDF(processedData_SVN_dealServiceCountVectorizer, intermediateData_SVN_dealServiceCorpusAll)

#instantiate CountVectorizer() for JIRA
processedData_JIRA_dealServiceCountVectorizer = CountVectorizer()
processedData_JIRA_dealServiceTF_IDF = createFittedTF_IDF(processedData_JIRA_dealServiceCountVectorizer, intermediateData_JIRA_dealServiceCorpus)

#Determine document counts
intermediateData_JIRA_dealService_documentCount = len(intermediateData_JIRA_dealService.index)
intermediateData_SVN_dealService_documentCount = len(intermediateData_SVN_dealService.index)



#### IDF Scores (SVN as Query)

In [61]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dealServiceFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dealServiceFeaturesIDF["SvnAsQuery_IDF"] = processedData_dealServiceCartesian.apply(lambda x: calcIDFList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dealServiceCountVectorizer, 
                                                                                                                processedData_SVN_dealServiceTF_IDF),axis=1)

processedData_SVN_dealServiceFeaturesIDF["SvnAsQuery_avgIDF"] = processedData_SVN_dealServiceFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnAsQuery_IDF), axis=1)
processedData_SVN_dealServiceFeaturesIDF["SvnAsQuery_maxIDF"] = processedData_SVN_dealServiceFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnAsQuery_IDF), axis=1)
processedData_SVN_dealServiceFeaturesIDF["SvnAsQuery_devIDF"] = processedData_SVN_dealServiceFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVN_dealServiceFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 102 minutes and 5.8844568729400635 seconds


#### IDF Scores (SVNLogs as Query)

In [62]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dealServiceFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dealServiceFeaturesIDF["SvnLogsAsQuery_IDF"] = processedData_dealServiceCartesian.apply(lambda x: calcIDFList(x.Logs, 
                                                                                                                processedData_SVNLogs_dealServiceCountVectorizer, 
                                                                                                                processedData_SVNLogs_dealServiceCountTF_IDF),axis=1)

processedData_SVNLogs_dealServiceFeaturesIDF["SvnLogsAsQuery_avgIDF"] = processedData_SVNLogs_dealServiceFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnLogsAsQuery_IDF), axis=1)
processedData_SVNLogs_dealServiceFeaturesIDF["SvnLogsAsQuery_maxIDF"] = processedData_SVNLogs_dealServiceFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnLogsAsQuery_IDF), axis=1)
processedData_SVNLogs_dealServiceFeaturesIDF["SvnLogsAsQuery_devIDF"] = processedData_SVNLogs_dealServiceFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnLogsAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVNLogs_dealServiceFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dealServiceFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 4 minutes and 23.578344106674194 seconds


#### IDF Scores (SVNUnitNames as Query

In [63]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dealServiceFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dealServiceFeaturesIDF["SvnUnitNamesAsQuery_IDF"] = processedData_dealServiceCartesian.apply(lambda x: calcIDFList(x.Logs, 
                                                                                                                processedData_SVNLogs_dealServiceCountVectorizer, 
                                                                                                                processedData_SVNLogs_dealServiceCountTF_IDF),axis=1)

processedData_SVNUnitNames_dealServiceFeaturesIDF["SvnUnitNamesAsQuery_avgIDF"] = processedData_SVNUnitNames_dealServiceFeaturesIDF.apply(lambda x: calcAvgIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesIDF["SvnUnitNamesAsQuery_maxIDF"] = processedData_SVNUnitNames_dealServiceFeaturesIDF.apply(lambda x: calcMaxIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesIDF["SvnUnitNamesAsQuery_devIDF"] = processedData_SVNUnitNames_dealServiceFeaturesIDF.apply(lambda x: calcDevIDF(x.SvnUnitNamesAsQuery_IDF), axis=1)

#Save results in pickle
processedData_SVNUnitNames_dealServiceFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 5 minutes and 36.74569392204285 seconds


##### IDF Scores (JIRA as Query)

In [64]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dealServiceFeaturesIDF["JiraAsQuery_IDF"] = processedData_dealServiceCartesian.apply(lambda x: calcIDFList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dealServiceCountVectorizer, 
                                                                                                                processedData_JIRA_dealServiceTF_IDF),axis=1)

processedData_JIRA_dealServiceFeaturesIDF["JiraAsQuery_avgIDF"] = processedData_JIRA_dealServiceFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraAsQuery_IDF), axis=1)
processedData_JIRA_dealServiceFeaturesIDF["JiraAsQuery_maxIDF"] = processedData_JIRA_dealServiceFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraAsQuery_IDF), axis=1)
processedData_JIRA_dealServiceFeaturesIDF["JiraAsQuery_devIDF"] = processedData_JIRA_dealServiceFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRA_dealServiceFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 22 minutes and 42.92076897621155 seconds


##### IDF Scores (JIRA Summaries as Query)

In [65]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dealServiceFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dealServiceFeaturesIDF["JiraSummariesAsQuery_IDF"] = processedData_dealServiceCartesian.apply(lambda x: calcIDFList(x.Summary, 
                                                                                                                processedData_JIRASummaries_dealServiceCountVectorizer, 
                                                                                                                processedData_JIRASummaries_dealServiceCountTF_IDF),axis=1)

processedData_JIRASummaries_dealServiceFeaturesIDF["JiraSummariesAsQuery_avgIDF"] = processedData_JIRASummaries_dealServiceFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraSummariesAsQuery_IDF), axis=1)
processedData_JIRASummaries_dealServiceFeaturesIDF["JiraSummariesAsQuery_maxIDF"] = processedData_JIRASummaries_dealServiceFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraSummariesAsQuery_IDF), axis=1)
processedData_JIRASummaries_dealServiceFeaturesIDF["JiraSummariesAsQuery_devIDF"] = processedData_JIRASummaries_dealServiceFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraSummariesAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRASummaries_dealServiceFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 3 minutes and 37.09285593032837 seconds


##### IDF Scores (JIRA Descriptions as Query)

In [66]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dealServiceFeaturesIDF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dealServiceFeaturesIDF["JiraDescriptionsAsQuery_IDF"] = processedData_dealServiceCartesian.apply(lambda x: calcIDFList(x.Description, 
                                                                                                                processedData_JIRADescriptions_dealServiceCountVectorizer, 
                                                                                                                processedData_JIRADescriptions_dealServiceCountTF_IDF),axis=1)

processedData_JIRADescriptions_dealServiceFeaturesIDF["JiraDescriptionsAsQuery_avgIDF"] = processedData_JIRADescriptions_dealServiceFeaturesIDF.apply(lambda x: calcAvgIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesIDF["JiraDescriptionsAsQuery_maxIDF"] = processedData_JIRADescriptions_dealServiceFeaturesIDF.apply(lambda x: calcMaxIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesIDF["JiraDescriptionsAsQuery_devIDF"] = processedData_JIRADescriptions_dealServiceFeaturesIDF.apply(lambda x: calcDevIDF(x.JiraDescriptionsAsQuery_IDF), axis=1)

#Save results in pickle
processedData_JIRADescriptions_dealServiceFeaturesIDF.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesIDF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 16 minutes and 54.49449443817139 seconds


##### IDF Scores (JIRA Comments as Query)

#### ICTF Scores (SVN as query)

In [67]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dealServiceFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dealServiceFeaturesICTF["SvnAsQuery_ICTF"] = processedData_dealServiceCartesian.apply(lambda x: calcIDFList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount),axis=1)

processedData_SVN_dealServiceFeaturesICTF["SvnAsQuery_avgICTF"] = processedData_SVN_dealServiceFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnAsQuery_ICTF, intermediateData_SVN_dealService_documentCount), axis=1)
processedData_SVN_dealServiceFeaturesICTF["SvnAsQuery_maxICTF"] = processedData_SVN_dealServiceFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnAsQuery_ICTF), axis=1)
processedData_SVN_dealServiceFeaturesICTF["SvnAsQuery_devICTF"] = processedData_SVN_dealServiceFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVN_dealServiceFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesICTF.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 19 minutes and 18.083749294281006 seconds


#### ICTF Scores (SVNLogs as query)

In [68]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dealServiceFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dealServiceFeaturesICTF["SvnLogsAsQuery_ICTF"] = processedData_dealServiceCartesian.apply(lambda x: calcICTFList(x.Logs, 
                                                                                                                processedData_SVNLogs_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount),axis=1)
##
processedData_SVNLogs_dealServiceFeaturesICTF["SvnLogsAsQuery_avgICTF"] = processedData_SVNLogs_dealServiceFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnLogsAsQuery_ICTF, intermediateData_SVN_dealService_documentCount), axis=1)
processedData_SVNLogs_dealServiceFeaturesICTF["SvnLogsAsQuery_maxICTF"] = processedData_SVNLogs_dealServiceFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnLogsAsQuery_ICTF), axis=1)
processedData_SVNLogs_dealServiceFeaturesICTF["SvnLogsAsQuery_devICTF"] = processedData_SVNLogs_dealServiceFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnLogsAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVNLogs_dealServiceFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dealServiceFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 13.914036989212036 seconds


#### ICTF Scores (SVNUnitNames as query)

In [69]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dealServiceFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dealServiceFeaturesICTF["SvnUnitNamesAsQuery_ICTF"] = processedData_dealServiceCartesian.apply(lambda x: calcICTFList(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount),axis=1)
##
processedData_SVNUnitNames_dealServiceFeaturesICTF["SvnUnitNamesAsQuery_avgICTF"] = processedData_SVNUnitNames_dealServiceFeaturesICTF.apply(lambda x: calcAvgICTF(x.SvnUnitNamesAsQuery_ICTF, intermediateData_SVN_dealService_documentCount), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesICTF["SvnUnitNamesAsQuery_maxICTF"] = processedData_SVNUnitNames_dealServiceFeaturesICTF.apply(lambda x: calcMaxICTF(x.SvnUnitNamesAsQuery_ICTF), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesICTF["SvnUnitNamesAsQuery_devICTF"] = processedData_SVNUnitNames_dealServiceFeaturesICTF.apply(lambda x: calcDevICTF(x.SvnUnitNamesAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_SVNUnitNames_dealServiceFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 18.21299457550049 seconds


#### ICTF Scores (JIRA as query)

In [70]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dealServiceFeaturesICTF["JiraAsQuery_ICTF"] = processedData_dealServiceCartesian.apply(lambda x: calcICTFList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount),axis=1)
##
processedData_JIRA_dealServiceFeaturesICTF["JiraAsQuery_avgICTF"] = processedData_JIRA_dealServiceFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraAsQuery_ICTF, intermediateData_JIRA_dealService_documentCount), axis=1)
processedData_JIRA_dealServiceFeaturesICTF["JiraAsQuery_maxICTF"] = processedData_JIRA_dealServiceFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraAsQuery_ICTF), axis=1)
processedData_JIRA_dealServiceFeaturesICTF["JiraAsQuery_devICTF"] = processedData_JIRA_dealServiceFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRA_dealServiceFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 25.223266124725342 seconds


#### ICTF Scores (JIRA Summaries as query)

In [71]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dealServiceFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dealServiceFeaturesICTF["JiraSummariesAsQuery_ICTF"] = processedData_dealServiceCartesian.apply(lambda x: calcICTFList(x.Summary, 
                                                                                                                processedData_JIRASummaries_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount),axis=1)
##
processedData_JIRASummaries_dealServiceFeaturesICTF["JiraSummariesAsQuery_avgICTF"] = processedData_JIRASummaries_dealServiceFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraSummariesAsQuery_ICTF, intermediateData_JIRA_dealService_documentCount), axis=1)
processedData_JIRASummaries_dealServiceFeaturesICTF["JiraSummariesAsQuery_maxICTF"] = processedData_JIRASummaries_dealServiceFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraSummariesAsQuery_ICTF), axis=1)
processedData_JIRASummaries_dealServiceFeaturesICTF["JiraSummariesAsQuery_devICTF"] = processedData_JIRASummaries_dealServiceFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraSummariesAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRASummaries_dealServiceFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 15.806091070175171 seconds


#### ICTF Scores (JIRA Descriptions as query)

In [72]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dealServiceFeaturesICTF = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dealServiceFeaturesICTF["JiraDescriptionsAsQuery_ICTF"] = processedData_dealServiceCartesian.apply(lambda x: calcICTFList(x.Description, 
                                                                                                                processedData_JIRADescriptions_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount),axis=1)
##
processedData_JIRADescriptions_dealServiceFeaturesICTF["JiraDescriptionsAsQuery_avgICTF"] = processedData_JIRADescriptions_dealServiceFeaturesICTF.apply(lambda x: calcAvgICTF(x.JiraDescriptionsAsQuery_ICTF, intermediateData_JIRA_dealService_documentCount), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesICTF["JiraDescriptionsAsQuery_maxICTF"] = processedData_JIRADescriptions_dealServiceFeaturesICTF.apply(lambda x: calcMaxICTF(x.JiraDescriptionsAsQuery_ICTF), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesICTF["JiraDescriptionsAsQuery_devICTF"] = processedData_JIRADescriptions_dealServiceFeaturesICTF.apply(lambda x: calcDevICTF(x.JiraDescriptionsAsQuery_ICTF), axis=1)

#Save results in pickle
processedData_JIRADescriptions_dealServiceFeaturesICTF.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesICTF.pkl")



endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 0 minutes and 21.15519118309021 seconds


#### ICTF Scores (JIRA Comments as query)

#### Entropy (SVN as query)

In [73]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dealServiceFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_Entropy"] = processedData_dealServiceCartesian.apply(lambda x: calcEntropyList(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount,
                                                                                                                intermediateData_SVN_dealService.Commit_natural_text),axis=1)

processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_avgEntropy"] = processedData_SVN_dealServiceFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_medEntropy"] = processedData_SVN_dealServiceFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_maxEntropy"] = processedData_SVN_dealServiceFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnAsQuery_Entropy), axis=1)
processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_devEntropy"] = processedData_SVN_dealServiceFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnAsQuery_Entropy), axis=1)

#Save results in pickle
processedData_SVN_dealServiceFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesEntropy.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 65 minutes and 8.259272575378418 seconds


#### Entropy (SVNLogs as query)

In [74]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dealServiceFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_Entropy"] = processedData_dealServiceCartesian.apply(lambda x: calcEntropyList(x.Logs, 
                                                                                                                processedData_SVNLogs_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount,
                                                                                                                intermediateData_SVN_dealService.Logs),axis=1)
##
processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_avgEntropy"] = processedData_SVNLogs_dealServiceFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_medEntropy"] = processedData_SVNLogs_dealServiceFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_maxEntropy"] = processedData_SVNLogs_dealServiceFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnLogsAsQuery_Entropy), axis=1)
processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_devEntropy"] = processedData_SVNLogs_dealServiceFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnLogsAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_SVNLogs_dealServiceFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dealServiceFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 4 minutes and 57.35947299003601 seconds


#### Entropy (SVNUnitNames as query)

In [75]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dealServiceFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_Entropy"] = processedData_dealServiceCartesian.apply(lambda x: calcEntropyList(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount,
                                                                                                                intermediateData_SVN_dealService.Unit_names),axis=1)
##
processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_avgEntropy"] = processedData_SVNUnitNames_dealServiceFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_medEntropy"] = processedData_SVNUnitNames_dealServiceFeaturesEntropy.apply(lambda x: calcMedEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_maxEntropy"] = processedData_SVNUnitNames_dealServiceFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_devEntropy"] = processedData_SVNUnitNames_dealServiceFeaturesEntropy.apply(lambda x: calcDevEntropy(x.SvnUnitNamesAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_SVNUnitNames_dealServiceFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 46 minutes and 37.08889198303223 seconds


#### Entropy (JIRA as query)

In [76]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_Entropy"] = processedData_dealServiceCartesian.apply(lambda x: calcEntropyList(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount,
                                                                                                                intermediateData_JIRA_dealService.Jira_natural_text),axis=1)
##
processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_avgEntropy"] = processedData_JIRA_dealServiceFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_medEntropy"] = processedData_JIRA_dealServiceFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_maxEntropy"] = processedData_JIRA_dealServiceFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraAsQuery_Entropy), axis=1)
processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_devEntropy"] = processedData_JIRA_dealServiceFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRA_dealServiceFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 8 minutes and 27.53518295288086 seconds


#### Entropy (JIRA Summaries as query)

In [77]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dealServiceFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_Entropy"] = processedData_dealServiceCartesian.apply(lambda x: calcEntropyList(x.Summary, 
                                                                                                                processedData_JIRASummaries_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount,
                                                                                                                intermediateData_JIRA_dealService.Summary),axis=1)
##
processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_avgEntropy"] = processedData_JIRASummaries_dealServiceFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_medEntropy"] = processedData_JIRASummaries_dealServiceFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_maxEntropy"] = processedData_JIRASummaries_dealServiceFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)
processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_devEntropy"] = processedData_JIRASummaries_dealServiceFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraSummariesAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRASummaries_dealServiceFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 1 minutes and 55.679604291915894 seconds


#### Entropy (JIRA Descriptions as query)

In [78]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dealServiceFeaturesEntropy = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_Entropy"] = processedData_dealServiceCartesian.apply(lambda x: calcEntropyList(x.Description, 
                                                                                                                processedData_JIRADescriptions_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount,
                                                                                                                intermediateData_JIRA_dealService.Description),axis=1)
##
processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_avgEntropy"] = processedData_JIRADescriptions_dealServiceFeaturesEntropy.apply(lambda x: calcAvgEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_medEntropy"] = processedData_JIRADescriptions_dealServiceFeaturesEntropy.apply(lambda x: calcMedEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_maxEntropy"] = processedData_JIRADescriptions_dealServiceFeaturesEntropy.apply(lambda x: calcMaxEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_devEntropy"] = processedData_JIRADescriptions_dealServiceFeaturesEntropy.apply(lambda x: calcDevEntropy(x.JiraDescriptionsAsQuery_Entropy), axis=1)


#Save results in pickle
processedData_JIRADescriptions_dealServiceFeaturesEntropy.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesEntropy.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

Finished creating query quality features in 5 minutes and 15.807366609573364 seconds


#### Entropy (JIRA Comments as query)

##### Query Scope (SVN as query)

In [79]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dealServiceFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dealServiceFeaturesQueryScope["SvnAsQuery_QueryScope"] = processedData_dealServiceCartesian.apply(lambda x: calcQueryScope(x.Commit_natural_text, 
                                                                                                                intermediateData_SVN_dealService.Commit_natural_text),axis=1)

#Save results in pickle
processedData_SVN_dealServiceFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 5 minutes and 54.49874782562256 seconds


##### Query Scope (SVNLogs as query)

In [80]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dealServiceFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dealServiceFeaturesQueryScope["SvnLogsAsQuery_QueryScope"] = processedData_dealServiceCartesian.apply(lambda x: calcQueryScope(x.Logs, 
                                                                                                                intermediateData_SVN_dealService.Logs),axis=1)

#Save results in pickle
processedData_SVNLogs_dealServiceFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dealServiceFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 36.839460611343384 seconds


##### Query Scope (SVNUnitNames as query)

In [81]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dealServiceFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dealServiceFeaturesQueryScope["SvnUnitNamesAsQuery_QueryScope"] = processedData_dealServiceCartesian.apply(lambda x: calcQueryScope(x.Unit_names, 
                                                                                                                intermediateData_SVN_dealService.Unit_names),axis=1)

#Save results in pickle
processedData_SVNUnitNames_dealServiceFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 45.09112215042114 seconds


##### Query Scope (JIRA as query)

In [82]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dealServiceFeaturesQueryScope["JiraAsQuery_QueryScope"] = processedData_dealServiceCartesian.apply(lambda x: calcQueryScope(x.Jira_natural_text, 
                                                                                                                intermediateData_JIRA_dealService.Jira_natural_text),axis=1)

#Save results in pickle
processedData_JIRA_dealServiceFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 56.1243360042572 seconds


##### Query Scope (JIRA Summaries as query)

In [83]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dealServiceFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dealServiceFeaturesQueryScope["JiraSummariesAsQuery_QueryScope"] = processedData_dealServiceCartesian.apply(lambda x: calcQueryScope(x.Summary, 
                                                                                                                intermediateData_JIRA_dealService.Summary),axis=1)

#Save results in pickle
processedData_JIRASummaries_dealServiceFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 14.92852234840393 seconds


##### Query Scope (JIRA Descriptions as query)

In [84]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dealServiceFeaturesQueryScope = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dealServiceFeaturesQueryScope["JiraDescriptionsAsQuery_QueryScope"] = processedData_dealServiceCartesian.apply(lambda x: calcQueryScope(x.Description, 
                                                                                                                intermediateData_JIRA_dealService.Description),axis=1)

#Save results in pickle
processedData_JIRADescriptions_dealServiceFeaturesQueryScope.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesQueryScope.pkl")


endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 1 minutes and 54.25439167022705 seconds


##### Query Scope (JIRA Comments as query)

#### Kullback-Leiber divergence (SVN as query)

In [85]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dealServiceFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dealServiceFeaturesSCS["SvnAsQuery_SCS"] = processedData_dealServiceCartesian.apply(lambda x: calcSCS(x.Commit_natural_text, 
                                                                                                                processedData_SVN_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount),axis=1)

#Save results in pickle
processedData_SVN_dealServiceFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 19.071340084075928 seconds


#### Kullback-Leiber divergence (SVNLogs as query)

In [86]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dealServiceFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dealServiceFeaturesSCS["SvnLogsAsQuery_SCS"] = processedData_dealServiceCartesian.apply(lambda x: calcSCS(x.Logs, 
                                                                                                                processedData_SVNLogs_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount),axis=1)

#Save results in pickle
processedData_SVNLogs_dealServiceFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dealServiceFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 2.019867420196533 seconds


#### Kullback-Leiber divergence (SVNUnitNames as query)

In [87]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dealServiceFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dealServiceFeaturesSCS["SvnUnitNamesAsQuery_SCS"] = processedData_dealServiceCartesian.apply(lambda x: calcSCS(x.Unit_names, 
                                                                                                                processedData_SVNUnitNames_dealServiceCountVectorizer, 
                                                                                                                intermediateData_SVN_dealService_documentCount),axis=1)

#Save results in pickle
processedData_SVNUnitNames_dealServiceFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 18.570611238479614 seconds


#### Kullback-Leiber divergence (JIRA as query)

In [88]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dealServiceFeaturesSCS["JiraAsQuery_SCS"] = processedData_dealServiceCartesian.apply(lambda x: calcSCS(x.Jira_natural_text, 
                                                                                                                processedData_JIRA_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount),axis=1)

#Save results in pickle
processedData_JIRA_dealServiceFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 7.896151542663574 seconds


#### Kullback-Leiber divergence (JIRA Summaries as query)

In [89]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dealServiceFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dealServiceFeaturesSCS["JiraSummariesAsQuery_SCS"] = processedData_dealServiceCartesian.apply(lambda x: calcSCS(x.Summary, 
                                                                                                                processedData_JIRASummaries_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount),axis=1)

#Save results in pickle
processedData_JIRASummaries_dealServiceFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 2.607619524002075 seconds


In [90]:
##### Kullback-Leiber divergence (JIRA Description as query)

In [91]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dealServiceFeaturesSCS = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dealServiceFeaturesSCS["JiraDescriptionsAsQuery_SCS"] = processedData_dealServiceCartesian.apply(lambda x: calcSCS(x.Description, 
                                                                                                                processedData_JIRADescriptions_dealServiceCountVectorizer, 
                                                                                                                intermediateData_JIRA_dealService_documentCount),axis=1)

#Save results in pickle
processedData_JIRADescriptions_dealServiceFeaturesSCS.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesSCS.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 0 minutes and 7.222806453704834 seconds


In [92]:
##### Kullback-Leiber divergence (JIRA Comments as query)

#### SCQ (SVN as Query)

In [93]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVN_dealServiceFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dealServiceFeaturesSCQ["SvnAsQuery_SCQ"] = processedData_dealServiceCartesian.apply(lambda x: calcSCQList(x.Commit_natural_text, intermediateData_SVN_dealService.Commit_natural_text,
                                                                                                                                         processedData_SVN_dealServiceCountVectorizer,
                                                                                                                                         processedData_SVN_dealServiceCountTF_IDF,
                                                                                                                                         intermediateData_SVN_dealService_documentCount),axis=1)

processedData_SVN_dealServiceFeaturesSCQ["SvnAsQuery_avgSCQ"] = processedData_SVN_dealServiceFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnAsQuery_SCQ, intermediateData_SVN_dealService_documentCount), axis=1)
processedData_SVN_dealServiceFeaturesSCQ["SvnAsQuery_maxSCQ"] = processedData_SVN_dealServiceFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnAsQuery_SCQ), axis=1)
processedData_SVN_dealServiceFeaturesSCQ["SvnAsQuery_sumSCQ"] = processedData_SVN_dealServiceFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVN_dealServiceFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 8.745262861251831 seconds


#### SCQ (SVNLogs as Query)

In [94]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNLogs_dealServiceFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dealServiceFeaturesSCQ["SvnLogsAsQuery_SCQ"] = processedData_dealServiceCartesian.apply(lambda x: calcSCQList(x.Logs, intermediateData_SVN_dealService.Logs,
                                                                                                                                         processedData_SVNLogs_dealServiceCountVectorizer,
                                                                                                                                         processedData_SVNLogs_dealServiceCountTF_IDF,
                                                                                                                                         intermediateData_SVN_dealService_documentCount),axis=1)

processedData_SVNLogs_dealServiceFeaturesSCQ["SvnLogsAsQuery_avgSCQ"] = processedData_SVNLogs_dealServiceFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnLogsAsQuery_SCQ, intermediateData_SVN_dealService_documentCount), axis=1)
processedData_SVNLogs_dealServiceFeaturesSCQ["SvnLogsAsQuery_maxSCQ"] = processedData_SVNLogs_dealServiceFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnLogsAsQuery_SCQ), axis=1)
processedData_SVNLogs_dealServiceFeaturesSCQ["SvnLogsAsQuery_sumSCQ"] = processedData_SVNLogs_dealServiceFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnLogsAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVNLogs_dealServiceFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dealServiceFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 0.13643503189086914 seconds


#### SCQ (SVNUnitNames as Query)

In [95]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_SVNUnitNames_dealServiceFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dealServiceFeaturesSCQ["SvnUnitNamesAsQuery_SCQ"] = processedData_dealServiceCartesian.apply(lambda x: calcSCQList(x.Unit_names, intermediateData_SVN_dealService.Unit_names,
                                                                                                                                         processedData_SVNUnitNames_dealServiceCountVectorizer,
                                                                                                                                         processedData_SVNUnitNames_dealServiceCountTF_IDF,
                                                                                                                                         intermediateData_SVN_dealService_documentCount),axis=1)

processedData_SVNUnitNames_dealServiceFeaturesSCQ["SvnUnitNamesAsQuery_avgSCQ"] = processedData_SVNUnitNames_dealServiceFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.SvnUnitNamesAsQuery_SCQ, intermediateData_SVN_dealService_documentCount), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesSCQ["SvnUnitNamesAsQuery_maxSCQ"] = processedData_SVNUnitNames_dealServiceFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.SvnUnitNamesAsQuery_SCQ), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesSCQ["SvnUnitNamesAsQuery_sumSCQ"] = processedData_SVNUnitNames_dealServiceFeaturesSCQ.apply(lambda x: calcSumSCQ(x.SvnUnitNamesAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_SVNUnitNames_dealServiceFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 40.09870982170105 seconds


#### SCQ (JIRA as Query)

In [96]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dealServiceFeaturesSCQ["JiraAsQuery_SCQ"] = processedData_dealServiceCartesian.apply(lambda x: calcSCQList(x.Jira_natural_text, intermediateData_JIRA_dealService.Jira_natural_text,
                                                                                                                                         processedData_JIRA_dealServiceCountVectorizer,
                                                                                                                                         processedData_JIRA_dealServiceTF_IDF,
                                                                                                                                         intermediateData_JIRA_dealService_documentCount),axis=1)

processedData_JIRA_dealServiceFeaturesSCQ["JiraAsQuery_avgSCQ"] = processedData_JIRA_dealServiceFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraAsQuery_SCQ, intermediateData_JIRA_dealService_documentCount), axis=1)
processedData_JIRA_dealServiceFeaturesSCQ["JiraAsQuery_maxSCQ"] = processedData_JIRA_dealServiceFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraAsQuery_SCQ), axis=1)
processedData_JIRA_dealServiceFeaturesSCQ["JiraAsQuery_sumSCQ"] = processedData_JIRA_dealServiceFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRA_dealServiceFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 12.603689193725586 seconds


#### SCQ (JIRA Summaries as Query)

In [97]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRASummaries_dealServiceFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dealServiceFeaturesSCQ["JiraSummariesAsQuery_SCQ"] = processedData_dealServiceCartesian.apply(lambda x: calcSCQList(x.Summary, intermediateData_JIRA_dealService.Summary,
                                                                                                                                         processedData_JIRASummaries_dealServiceCountVectorizer,
                                                                                                                                         processedData_JIRASummaries_dealServiceCountTF_IDF,
                                                                                                                                         intermediateData_JIRA_dealService_documentCount),axis=1)

processedData_JIRASummaries_dealServiceFeaturesSCQ["JiraSummariesAsQuery_avgSCQ"] = processedData_JIRASummaries_dealServiceFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraSummariesAsQuery_SCQ, intermediateData_JIRA_dealService_documentCount), axis=1)
processedData_JIRASummaries_dealServiceFeaturesSCQ["JiraSummariesAsQuery_maxSCQ"] = processedData_JIRASummaries_dealServiceFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraSummariesAsQuery_SCQ), axis=1)
processedData_JIRASummaries_dealServiceFeaturesSCQ["JiraSummariesAsQuery_sumSCQ"] = processedData_JIRASummaries_dealServiceFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraSummariesAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRASummaries_dealServiceFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 35.5435311794281 seconds


#### SCQ (JIRA Descriptions as Query)

In [98]:
#Start timer
startTime = time.time() 

#Create new dataFrame
processedData_JIRADescriptions_dealServiceFeaturesSCQ = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dealServiceFeaturesSCQ["JiraDescriptionsAsQuery_SCQ"] = processedData_dealServiceCartesian.apply(lambda x: calcSCQList(x.Description, intermediateData_JIRA_dealService.Description,
                                                                                                                                         processedData_JIRADescriptions_dealServiceCountVectorizer,
                                                                                                                                         processedData_JIRADescriptions_dealServiceCountTF_IDF,
                                                                                                                                         intermediateData_JIRA_dealService_documentCount),axis=1)

processedData_JIRADescriptions_dealServiceFeaturesSCQ["JiraDescriptionsAsQuery_avgSCQ"] = processedData_JIRADescriptions_dealServiceFeaturesSCQ.apply(lambda x: calcAvgSCQ(x.JiraDescriptionsAsQuery_SCQ, intermediateData_JIRA_dealService_documentCount), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesSCQ["JiraDescriptionsAsQuery_maxSCQ"] = processedData_JIRADescriptions_dealServiceFeaturesSCQ.apply(lambda x: calcMaxSCQ(x.JiraDescriptionsAsQuery_SCQ), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesSCQ["JiraDescriptionsAsQuery_sumSCQ"] = processedData_JIRADescriptions_dealServiceFeaturesSCQ.apply(lambda x: calcSumSCQ(x.JiraDescriptionsAsQuery_SCQ), axis=1)


#Save results in pickle
processedData_JIRADescriptions_dealServiceFeaturesSCQ.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesSCQ.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 2 minutes and 4.489922285079956 seconds


#### SCQ (JIRA Comments as Query)

#### PMI (SVN as query)

In [99]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVN_dealServiceCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVN_dealServiceCountVectorizer, intermediateData_SVN_dealService.Commit_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_dealService.Commit_natural_text)

#Create new dataFrame
processedData_SVN_dealServiceFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVN_dealServiceFeaturesPMI["SvnAsQuery_PMI"] = processedData_dealServiceCartesian.apply(lambda x: calcPMIList(x.Commit_natural_text, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_dealService.Commit_natural_text),axis=1)

processedData_SVN_dealServiceFeaturesPMI["SvnAsQuery_avgPMI"] = processedData_SVN_dealServiceFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnAsQuery_PMI), axis=1)
processedData_SVN_dealServiceFeaturesPMI["SvnAsQuery_maxPMI"] = processedData_SVN_dealServiceFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnAsQuery_PMI), axis=1)



processedData_SVN_dealServiceFeaturesPMI.drop('SvnAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVN_dealServiceFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVN_dealServiceFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 52 minutes and 47.94470453262329 seconds


#### PMI (SVNLogs as query)

In [100]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVNLogs_dealServiceCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVNLogs_dealServiceCountVectorizer, intermediateData_SVN_dealService.Logs)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_dealService.Logs)

#Create new dataFrame
processedData_SVNLogs_dealServiceFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNLogs_dealServiceFeaturesPMI["SvnLogsAsQuery_PMI"] = processedData_dealServiceCartesian.apply(lambda x: calcPMIList(x.Logs, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_dealService.Logs),axis=1)

processedData_SVNLogs_dealServiceFeaturesPMI["SvnLogsAsQuery_avgPMI"] = processedData_SVNLogs_dealServiceFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnLogsAsQuery_PMI), axis=1)
processedData_SVNLogs_dealServiceFeaturesPMI["SvnLogsAsQuery_maxPMI"] = processedData_SVNLogs_dealServiceFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnLogsAsQuery_PMI), axis=1)



processedData_SVNLogs_dealServiceFeaturesPMI.drop('SvnLogsAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVNLogs_dealServiceFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVNLogs_dealServiceFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 15 minutes and 40.21407437324524 seconds


#### PMI (SVNUnitNames as query)

In [101]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_SVNUnitNames_dealServiceCountVectorizer)
termFrequencies = findTermFrequencies(processedData_SVNUnitNames_dealServiceCountVectorizer, intermediateData_SVN_dealService.Unit_names)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_SVN_dealService.Unit_names)

#Create new dataFrame
processedData_SVNUnitNames_dealServiceFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_SVNUnitNames_dealServiceFeaturesPMI["SvnUnitNamesAsQuery_PMI"] = processedData_dealServiceCartesian.apply(lambda x: calcPMIList(x.Unit_names, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_SVN_dealService.Unit_names),axis=1)

processedData_SVNUnitNames_dealServiceFeaturesPMI["SvnUnitNamesAsQuery_avgPMI"] = processedData_SVNUnitNames_dealServiceFeaturesPMI.apply(lambda x: calcAvgPMI(x.SvnUnitNamesAsQuery_PMI), axis=1)
processedData_SVNUnitNames_dealServiceFeaturesPMI["SvnUnitNamesAsQuery_maxPMI"] = processedData_SVNUnitNames_dealServiceFeaturesPMI.apply(lambda x: calcMaxPMI(x.SvnUnitNamesAsQuery_PMI), axis=1)



processedData_SVNUnitNames_dealServiceFeaturesPMI.drop('SvnUnitNamesAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_SVNUnitNames_dealServiceFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 22 minutes and 59.03677034378052 seconds


#### PMI (JIRA as query)

In [102]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRA_dealServiceCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRA_dealServiceCountVectorizer, intermediateData_JIRA_dealService.Jira_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_dealService.Jira_natural_text)

#Create new dataFrame
processedData_JIRA_dealServiceFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRA_dealServiceFeaturesPMI["JiraAsQuery_PMI"] = processedData_dealServiceCartesian.apply(lambda x: calcPMIList(x.Jira_natural_text, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_dealService.Jira_natural_text),axis=1)

processedData_JIRA_dealServiceFeaturesPMI["JiraAsQuery_avgPMI"] = processedData_JIRA_dealServiceFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraAsQuery_PMI), axis=1)
processedData_JIRA_dealServiceFeaturesPMI["JiraAsQuery_maxPMI"] = processedData_JIRA_dealServiceFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraAsQuery_PMI), axis=1)



processedData_JIRA_dealServiceFeaturesPMI.drop('JiraAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRA_dealServiceFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRA_dealServiceFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 6 minutes and 29.46059226989746 seconds


#### PMI (JIRA Summaries as query)

In [103]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRASummaries_dealServiceCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRASummaries_dealServiceCountVectorizer, intermediateData_JIRA_dealService.Summary)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_dealService.Summary)

#Create new dataFrame
processedData_JIRASummaries_dealServiceFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRASummaries_dealServiceFeaturesPMI["JiraSummariesAsQuery_PMI"] = processedData_dealServiceCartesian.apply(lambda x: calcPMIList(x.Summary, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_dealService.Summary),axis=1)

processedData_JIRASummaries_dealServiceFeaturesPMI["JiraSummariesAsQuery_avgPMI"] = processedData_JIRASummaries_dealServiceFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraSummariesAsQuery_PMI), axis=1)
processedData_JIRASummaries_dealServiceFeaturesPMI["JiraSummariesAsQuery_maxPMI"] = processedData_JIRASummaries_dealServiceFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraSummariesAsQuery_PMI), axis=1)



processedData_JIRASummaries_dealServiceFeaturesPMI.drop('JiraSummariesAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRASummaries_dealServiceFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



  maxPMI = np.nanmax(pmiList)


Finished creating query quality features in 0 minutes and 24.82874059677124 seconds


#### PMI (JIRA Descriptions as query)

In [104]:
#Start timer
startTime = time.time() 

#Create pairs and find frequencies
termPairs = createTermPairs(processedData_JIRADescriptions_dealServiceCountVectorizer)
termFrequencies = findTermFrequencies(processedData_JIRADescriptions_dealServiceCountVectorizer, intermediateData_JIRA_dealService.Description)
termPairFrequencies = findTermPairFrequencies(termPairs, intermediateData_JIRA_dealService.Description)

#Create new dataFrame
processedData_JIRADescriptions_dealServiceFeaturesPMI = pd.DataFrame()

#Calculate IDF stats for each svn
processedData_JIRADescriptions_dealServiceFeaturesPMI["JiraDescriptionsAsQuery_PMI"] = processedData_dealServiceCartesian.apply(lambda x: calcPMIList(x.Description, 
                                                                                                                                  termFrequencies, 
                                                                                                                                  termPairFrequencies, 
                                                                                                                                  intermediateData_JIRA_dealService.Description),axis=1)

processedData_JIRADescriptions_dealServiceFeaturesPMI["JiraDescriptionsAsQuery_avgPMI"] = processedData_JIRADescriptions_dealServiceFeaturesPMI.apply(lambda x: calcAvgPMI(x.JiraDescriptionsAsQuery_PMI), axis=1)
processedData_JIRADescriptions_dealServiceFeaturesPMI["JiraDescriptionsAsQuery_maxPMI"] = processedData_JIRADescriptions_dealServiceFeaturesPMI.apply(lambda x: calcMaxPMI(x.JiraDescriptionsAsQuery_PMI), axis=1)



processedData_JIRADescriptions_dealServiceFeaturesPMI.drop('JiraDescriptionsAsQuery_PMI', axis = 1, inplace=True)

#Save results in pickle
processedData_JIRADescriptions_dealServiceFeaturesPMI.to_pickle(path= "../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesPMI.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



Finished creating query quality features in 5 minutes and 56.17855978012085 seconds


#### PMI (JIRA Comments as query)

In [105]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

def normalizeData(dataFrame):
    scaler = preprocessing.MinMaxScaler()
    names = dataFrame.columns
    d = scaler.fit_transform(dataFrame)
    scaledDataFrame = pd.DataFrame(d, columns=names)
    return(scaledDataFrame)

# Normalize all data

In [106]:
from sklearn import preprocessing
import numpy as np

################################## Loading #################################
#Load Process-Related Features
processedData_dealServiceFeaturesTime = pd.read_pickle(r'../data/03_processed/processedData_dealServiceFeaturesTime.pkl')
processedData_dealServiceFeaturesStakeholder = pd.read_pickle(r'../data/03_processed/processedData_dealServiceFeaturesStakeholder.pkl')

#Load IR-Related Features - unigram
processedData_dealService_features_VsmLogsJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmLogsJiraAsQuery.pkl')
processedData_dealService_features_VsmLogsLogAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmLogsLogAsQuery.pkl')
processedData_dealService_features_VsmUnitNamesJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesJiraAsQuery.pkl')
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery.pkl')

#processedData_dealService_features_VsmUnitNamesCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesCommentsCommentsAsQuery.pkl')
#processedData_dealService_features_VsmUnitNamesCommentsUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesCommentsUnitNamesAsQuery.pkl')
processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery.pkl')
processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery.pkl')

#processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery.pkl')
#processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery.pkl')
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSummaryLogsSummaryAsQuery.pkl')
processedData_dealService_features_VsmSummaryLogsLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSummaryLogsLogsAsQuery.pkl')
processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery.pkl')
processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery.pkl')
processedData_dealService_features_VsmDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmDescriptionDescriptionAsQuery.pkl')
processedData_dealService_features_VsmDescriptionLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmDescriptionLogsAsQuery.pkl')
#processedData_dealService_features_VsmCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmCommentsCommentsAsQuery.pkl')
#processedData_dealService_features_VsmCommentsLogsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmCommentsLogsAsQuery.pkl')

processedData_dealService_features_VsmSvnJiraJiraAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnJiraJiraAsQuery.pkl')
processedData_dealService_features_VsmSvnJiraSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnJiraSvnAsQuery.pkl')
processedData_dealService_features_VsmSvnSummarySvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnSummarySvnAsQuery.pkl')
processedData_dealService_features_VsmSvnSummarySummaryAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnSummarySummaryAsQuery.pkl')
processedData_dealService_features_VsmSvnDescriptionSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnDescriptionSvnAsQuery.pkl')
processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery.pkl')
#processedData_dealService_features_VsmSvnCommentsSvnAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnCommentsSvnAsQuery.pkl')
#processedData_dealService_features_VsmSvnCommentsCommentsAsQuery = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmSvnCommentsCommentsAsQuery.pkl')


#Load IR-Related Features - bigram
#processedData_dealService_features_VsmLogsJiraAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmLogsJiraAsQuery_2gram.pkl')
#processedData_dealService_features_VsmLogsLogAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmLogsLogAsQuery_2gram.pkl')
#processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram.pkl')
#processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram.pkl')
#processedData_dealService_features_VsmCommentsLogsAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmCommentsLogsAsQuery_2gram.pkl')
#processedData_dealService_features_VsmCommentsCommentsAsQuery_2gram = pd.read_pickle(r'../data/03_processed/processedData_dealService_features_VsmCommentsCommentsAsQuery_2gram.pkl')


#Load Document Statistics Features
processedData_JIRA_dealServiceFeaturesUniqueWordCount = pd.read_pickle(r"../data/03_processed/processedData_JIRA_dealServiceFeaturesUniqueWordCount.pkl")
processedData_SVN_dealServiceFeaturesUniqueWordCount = pd.read_pickle(r"../data/03_processed/processedData_SVN_dealServiceFeaturesUniqueWordCount.pkl")
processedData_JIRA_dealServiceFeaturesTotalWordCount = pd.read_pickle(r"../data/03_processed/processedData_JIRA_dealServiceFeaturesTotalWordCount.pkl")
processedData_SVN_dealServiceFeaturesTotalWordCount = pd.read_pickle(r"../data/03_processed/processedData_SVN_dealServiceFeaturesTotalWordCount.pkl")
processedData_JIRA_dealServiceFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_JIRA_dealServiceFeaturesOverlapPercentage.pkl")
processedData_SVN_dealServiceFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_SVN_dealServiceFeaturesOverlapPercentage.pkl")
processedData_UNION_dealServiceFeaturesOverlapPercentage = pd.read_pickle(r"../data/03_processed/processedData_UNION_dealServiceFeaturesOverlapPercentage.pkl")

#Load Query Quality Features
#processedData_dealServiceFeaturesQueryQuality = pd.read_pickle(r'../data/03_processed/processedData_dealServiceFeaturesQueryQuality.pkl')
processedData_SVN_dealServiceFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVN_dealServiceFeaturesIDF.pkl')
processedData_SVNLogs_dealServiceFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dealServiceFeaturesIDF.pkl')
processedData_SVNUnitNames_dealServiceFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesIDF.pkl')
processedData_JIRA_dealServiceFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dealServiceFeaturesIDF.pkl')
processedData_JIRASummaries_dealServiceFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesIDF.pkl')
processedData_JIRADescriptions_dealServiceFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesIDF.pkl')
#processedData_JIRAComments_dealServiceFeaturesIDF = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dealServiceFeaturesIDF.pkl')


processedData_SVN_dealServiceFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVN_dealServiceFeaturesICTF.pkl')
processedData_SVNLogs_dealServiceFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dealServiceFeaturesICTF.pkl')
processedData_SVNUnitNames_dealServiceFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesICTF.pkl')
processedData_JIRA_dealServiceFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dealServiceFeaturesICTF.pkl')
processedData_JIRASummaries_dealServiceFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesICTF.pkl')
processedData_JIRADescriptions_dealServiceFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesICTF.pkl')
#processedData_JIRAComments_dealServiceFeaturesICTF = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dealServiceFeaturesICTF.pkl')


processedData_SVN_dealServiceFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVN_dealServiceFeaturesEntropy.pkl')
processedData_SVNLogs_dealServiceFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dealServiceFeaturesEntropy.pkl')
processedData_SVNUnitNames_dealServiceFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesEntropy.pkl')
processedData_JIRA_dealServiceFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dealServiceFeaturesEntropy.pkl')
processedData_JIRASummaries_dealServiceFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesEntropy.pkl')
processedData_JIRADescriptions_dealServiceFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesEntropy.pkl')
#processedData_JIRAComments_dealServiceFeaturesEntropy = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dealServiceFeaturesEntropy.pkl')


processedData_SVN_dealServiceFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVN_dealServiceFeaturesQueryScope.pkl')
processedData_SVNLogs_dealServiceFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dealServiceFeaturesQueryScope.pkl')
processedData_SVNUnitNames_dealServiceFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesQueryScope.pkl')
processedData_JIRA_dealServiceFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dealServiceFeaturesQueryScope.pkl')
processedData_JIRASummaries_dealServiceFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesQueryScope.pkl')
processedData_JIRADescriptions_dealServiceFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesQueryScope.pkl')
#processedData_JIRAComments_dealServiceFeaturesQueryScope = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dealServiceFeaturesQueryScope.pkl')


processedData_SVN_dealServiceFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVN_dealServiceFeaturesSCS.pkl')
processedData_SVNLogs_dealServiceFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dealServiceFeaturesSCS.pkl')
processedData_SVNUnitNames_dealServiceFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesSCS.pkl')
processedData_JIRA_dealServiceFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dealServiceFeaturesSCS.pkl')
processedData_JIRASummaries_dealServiceFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesSCS.pkl')
processedData_JIRADescriptions_dealServiceFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesSCS.pkl')
#processedData_JIRAComments_dealServiceFeaturesSCS = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dealServiceFeaturesSCS.pkl')


processedData_SVN_dealServiceFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVN_dealServiceFeaturesSCQ.pkl')
processedData_SVNLogs_dealServiceFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dealServiceFeaturesSCQ.pkl')
processedData_SVNUnitNames_dealServiceFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesSCQ.pkl')
processedData_JIRA_dealServiceFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dealServiceFeaturesSCQ.pkl')
processedData_JIRASummaries_dealServiceFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesSCQ.pkl')
processedData_JIRADescriptions_dealServiceFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesSCQ.pkl')
#processedData_JIRAComments_dealServiceFeaturesSCQ = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dealServiceFeaturesSCQ.pkl')


#processedData_SVN_dealServiceFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVN_dealServiceFeaturesPMI.pkl')
processedData_SVNLogs_dealServiceFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVNLogs_dealServiceFeaturesPMI.pkl')
#processedData_SVNUnitNames_dealServiceFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_SVNUnitNames_dealServiceFeaturesPMI.pkl')
#processedData_JIRA_dealServiceFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRA_dealServiceFeaturesPMI.pkl')
processedData_JIRASummaries_dealServiceFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRASummaries_dealServiceFeaturesPMI.pkl')
#processedData_JIRADescriptions_dealServiceFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRADescriptions_dealServiceFeaturesPMI.pkl')
#processedData_JIRAComments_dealServiceFeaturesPMI = pd.read_pickle(r'../data/03_processed/processedData_JIRAComments_dealServiceFeaturesPMI.pkl')


################################## Drop query array for normalization ###############################################


processedData_SVN_dealServiceFeaturesIDF.drop('SvnAsQuery_IDF', axis = 1, inplace=True)
processedData_SVNLogs_dealServiceFeaturesIDF.drop('SvnLogsAsQuery_IDF', axis = 1, inplace=True)
processedData_SVNUnitNames_dealServiceFeaturesIDF.drop('SvnUnitNamesAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRA_dealServiceFeaturesIDF.drop('JiraAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRASummaries_dealServiceFeaturesIDF.drop('JiraSummariesAsQuery_IDF', axis = 1, inplace=True)
processedData_JIRADescriptions_dealServiceFeaturesIDF.drop('JiraDescriptionsAsQuery_IDF', axis = 1, inplace=True)
#processedData_JIRAComments_dealServiceFeaturesIDF.drop('JiraCommentsAsQuery_IDF', axis = 1, inplace=True)

processedData_SVN_dealServiceFeaturesICTF.drop('SvnAsQuery_ICTF', axis = 1, inplace=True)
processedData_SVNLogs_dealServiceFeaturesICTF.drop('SvnLogsAsQuery_ICTF', axis = 1, inplace=True)
processedData_SVNUnitNames_dealServiceFeaturesICTF.drop('SvnUnitNamesAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRA_dealServiceFeaturesICTF.drop('JiraAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRASummaries_dealServiceFeaturesICTF.drop('JiraSummariesAsQuery_ICTF', axis = 1, inplace=True)
processedData_JIRADescriptions_dealServiceFeaturesICTF.drop('JiraDescriptionsAsQuery_ICTF', axis = 1, inplace=True)
#processedData_JIRAComments_dealServiceFeaturesICTF.drop('JiraCommentsAsQuery_ICTF', axis = 1, inplace=True)

processedData_SVN_dealServiceFeaturesEntropy.drop('SvnAsQuery_Entropy', axis = 1, inplace=True)
processedData_SVNLogs_dealServiceFeaturesEntropy.drop('SvnLogsAsQuery_Entropy', axis = 1, inplace=True)
processedData_SVNUnitNames_dealServiceFeaturesEntropy.drop('SvnUnitNamesAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRA_dealServiceFeaturesEntropy.drop('JiraAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRASummaries_dealServiceFeaturesEntropy.drop('JiraSummariesAsQuery_Entropy', axis = 1, inplace=True)
processedData_JIRADescriptions_dealServiceFeaturesEntropy.drop('JiraDescriptionsAsQuery_Entropy', axis = 1, inplace=True)
#processedData_JIRAComments_dealServiceFeaturesEntropy.drop('JiraCommentsAsQuery_Entropy', axis = 1, inplace=True)

processedData_SVN_dealServiceFeaturesSCQ.drop('SvnAsQuery_SCQ', axis = 1, inplace=True)
processedData_SVNLogs_dealServiceFeaturesSCQ.drop('SvnLogsAsQuery_SCQ', axis = 1, inplace=True)
processedData_SVNUnitNames_dealServiceFeaturesSCQ.drop('SvnUnitNamesAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRA_dealServiceFeaturesSCQ.drop('JiraAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRASummaries_dealServiceFeaturesSCQ.drop('JiraSummariesAsQuery_SCQ', axis = 1, inplace=True)
processedData_JIRADescriptions_dealServiceFeaturesSCQ.drop('JiraDescriptionsAsQuery_SCQ', axis = 1, inplace=True)
#processedData_JIRAComments_dealServiceFeaturesSCQ.drop('JiraCommentsAsQuery_SCQ', axis = 1, inplace=True)

################################## Normalizing ################################################

processedData_dealServiceFeaturesTime_normalized = normalizeData(processedData_dealServiceFeaturesTime)
processedData_dealServiceFeaturesStakeholder_normalized = normalizeData(processedData_dealServiceFeaturesStakeholder)

#Load IR-Related Features - unigram
processedData_dealService_features_VsmLogsJiraAsQuery_normalized = normalizeData(processedData_dealService_features_VsmLogsJiraAsQuery)
processedData_dealService_features_VsmLogsLogAsQuery_normalized = normalizeData(processedData_dealService_features_VsmLogsLogAsQuery)
processedData_dealService_features_VsmUnitNamesJiraAsQuery_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesJiraAsQuery)
processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery)
#processedData_dealService_features_VsmUnitNamesCommentsCommentsAsQuery_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesCommentsCommentsAsQuery)
#processedData_dealService_features_VsmUnitNamesCommentsUnitNamesAsQuery_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesCommentsUnitNamesAsQuery)
processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery)
processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery)

#processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery_normalized = normalizeData(processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery)
#processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery)
processedData_dealService_features_VsmSummaryLogsSummaryAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSummaryLogsSummaryAsQuery)
processedData_dealService_features_VsmSummaryLogsLogsAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSummaryLogsLogsAsQuery)
processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery)
processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery)
processedData_dealService_features_VsmDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_dealService_features_VsmDescriptionDescriptionAsQuery)
processedData_dealService_features_VsmDescriptionLogsAsQuery_normalized = normalizeData(processedData_dealService_features_VsmDescriptionLogsAsQuery)
#processedData_dealService_features_VsmCommentsCommentsAsQuery_normalized = normalizeData(processedData_dealService_features_VsmCommentsCommentsAsQuery)
#processedData_dealService_features_VsmCommentsLogsAsQuery_normalized = normalizeData(processedData_dealService_features_VsmCommentsLogsAsQuery)

processedData_dealService_features_VsmSvnJiraJiraAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnJiraJiraAsQuery)
processedData_dealService_features_VsmSvnJiraSvnAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnJiraSvnAsQuery)
processedData_dealService_features_VsmSvnSummarySvnAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnSummarySvnAsQuery)
processedData_dealService_features_VsmSvnSummarySummaryAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnSummarySummaryAsQuery)
processedData_dealService_features_VsmSvnDescriptionSvnAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnDescriptionSvnAsQuery)
processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery)
#processedData_dealService_features_VsmSvnCommentsSvnAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnCommentsSvnAsQuery)
#processedData_dealService_features_VsmSvnCommentsCommentsAsQuery_normalized = normalizeData(processedData_dealService_features_VsmSvnCommentsCommentsAsQuery)



#Load IR-Related Features - bigram
#processedData_dealService_features_VsmLogsJiraAsQuery_2gram_normalized = normalizeData(processedData_dealService_features_VsmLogsJiraAsQuery_2gram)
#processedData_dealService_features_VsmLogsLogAsQuery_2gram_normalized = normalizeData(processedData_dealService_features_VsmLogsLogAsQuery_2gram)
#processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram)
#processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram_normalized = normalizeData(processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram)
#processedData_dealService_features_VsmCommentsLogsAsQuery_2gram_normalized = normalizeData(processedData_dealService_features_VsmCommentsLogsAsQuery_2gram)
#processedData_dealService_features_VsmCommentsCommentsAsQuery_2gram_normalized = normalizeData(processedData_dealService_features_VsmCommentsCommentsAsQuery_2gram)


#Load Document Statistics Features
processedData_JIRA_dealServiceFeaturesUniqueWordCount_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesUniqueWordCount)
processedData_SVN_dealServiceFeaturesUniqueWordCount_normalized = normalizeData(processedData_SVN_dealServiceFeaturesUniqueWordCount)
processedData_JIRA_dealServiceFeaturesTotalWordCount_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesTotalWordCount)
processedData_SVN_dealServiceFeaturesTotalWordCount_normalized = normalizeData(processedData_SVN_dealServiceFeaturesTotalWordCount)
processedData_JIRA_dealServiceFeaturesOverlapPercentage_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesOverlapPercentage)
processedData_SVN_dealServiceFeaturesOverlapPercentage_normalized = normalizeData(processedData_SVN_dealServiceFeaturesOverlapPercentage)
processedData_UNION_dealServiceFeaturesOverlapPercentage_normalized = normalizeData(processedData_UNION_dealServiceFeaturesOverlapPercentage)

#Load Query Quality Features
processedData_SVN_dealServiceFeaturesIDF_normalized = normalizeData(processedData_SVN_dealServiceFeaturesIDF)
processedData_SVNLogs_dealServiceFeaturesIDF_normalized = normalizeData(processedData_SVNLogs_dealServiceFeaturesIDF)
processedData_SVNUnitNames_dealServiceFeaturesIDF_normalized = normalizeData(processedData_SVNUnitNames_dealServiceFeaturesIDF)
processedData_JIRA_dealServiceFeaturesIDF_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesIDF)
processedData_JIRASummaries_dealServiceFeaturesIDF_normalized = normalizeData(processedData_JIRASummaries_dealServiceFeaturesIDF)
processedData_JIRADescriptions_dealServiceFeaturesIDF_normalized = normalizeData(processedData_JIRADescriptions_dealServiceFeaturesIDF)
#processedData_JIRAComments_dealServiceFeaturesIDF_normalized = normalizeData(processedData_JIRAComments_dealServiceFeaturesIDF)

processedData_SVN_dealServiceFeaturesICTF_normalized = normalizeData(processedData_SVN_dealServiceFeaturesICTF)
processedData_SVNLogs_dealServiceFeaturesICTF_normalized = normalizeData(processedData_SVNLogs_dealServiceFeaturesICTF)
processedData_SVNUnitNames_dealServiceFeaturesICTF_normalized = normalizeData(processedData_SVNUnitNames_dealServiceFeaturesICTF)
processedData_JIRA_dealServiceFeaturesICTF_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesICTF)
processedData_JIRASummaries_dealServiceFeaturesICTF_normalized = normalizeData(processedData_JIRASummaries_dealServiceFeaturesICTF)
processedData_JIRADescriptions_dealServiceFeaturesICTF_normalized = normalizeData(processedData_JIRADescriptions_dealServiceFeaturesICTF)
#processedData_JIRAComments_dealServiceFeaturesICTF_normalized = normalizeData(processedData_JIRAComments_dealServiceFeaturesICTF)

processedData_SVN_dealServiceFeaturesEntropy_normalized = normalizeData(processedData_SVN_dealServiceFeaturesEntropy)
processedData_SVNLogs_dealServiceFeaturesEntropy_normalized = normalizeData(processedData_SVNLogs_dealServiceFeaturesEntropy)
processedData_SVNUnitNames_dealServiceFeaturesEntropy_normalized = normalizeData(processedData_SVNUnitNames_dealServiceFeaturesEntropy)
processedData_JIRA_dealServiceFeaturesEntropy_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesEntropy)
processedData_JIRASummaries_dealServiceFeaturesEntropy_normalized = normalizeData(processedData_JIRASummaries_dealServiceFeaturesEntropy)
processedData_JIRADescriptions_dealServiceFeaturesEntropy_normalized = normalizeData(processedData_JIRADescriptions_dealServiceFeaturesEntropy)
#processedData_JIRAComments_dealServiceFeaturesEntropy_normalized = normalizeData(processedData_JIRAComments_dealServiceFeaturesEntropy)

processedData_SVN_dealServiceFeaturesQueryScope_normalized = normalizeData(processedData_SVN_dealServiceFeaturesQueryScope)
processedData_SVNLogs_dealServiceFeaturesQueryScope_normalized = normalizeData(processedData_SVNLogs_dealServiceFeaturesQueryScope)
processedData_SVNUnitNames_dealServiceFeaturesQueryScope_normalized = normalizeData(processedData_SVNUnitNames_dealServiceFeaturesQueryScope)
processedData_JIRA_dealServiceFeaturesQueryScope_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesQueryScope)
processedData_JIRASummaries_dealServiceFeaturesQueryScope_normalized = normalizeData(processedData_JIRASummaries_dealServiceFeaturesQueryScope)
processedData_JIRADescriptions_dealServiceFeaturesQueryScope_normalized = normalizeData(processedData_JIRADescriptions_dealServiceFeaturesQueryScope)
#processedData_JIRAComments_dealServiceFeaturesQueryScope_normalized = normalizeData(processedData_JIRAComments_dealServiceFeaturesQueryScope)

processedData_SVN_dealServiceFeaturesSCS_normalized = normalizeData(processedData_SVN_dealServiceFeaturesSCS)
processedData_SVNLogs_dealServiceFeaturesSCS_normalized = normalizeData(processedData_SVNLogs_dealServiceFeaturesSCS)
processedData_SVNUnitNames_dealServiceFeaturesSCS_normalized = normalizeData(processedData_SVNUnitNames_dealServiceFeaturesSCS)
processedData_JIRA_dealServiceFeaturesSCS_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesSCS)
processedData_JIRASummaries_dealServiceFeaturesSCS_normalized = normalizeData(processedData_JIRASummaries_dealServiceFeaturesSCS)
processedData_JIRADescriptions_dealServiceFeaturesSCS_normalized = normalizeData(processedData_JIRADescriptions_dealServiceFeaturesSCS)
#processedData_JIRAComments_dealServiceFeaturesSCS_normalized = normalizeData(processedData_JIRAComments_dealServiceFeaturesSCS)

processedData_SVN_dealServiceFeaturesSCQ_normalized = normalizeData(processedData_SVN_dealServiceFeaturesSCQ)
processedData_SVNLogs_dealServiceFeaturesSCQ_normalized = normalizeData(processedData_SVNLogs_dealServiceFeaturesSCQ)
processedData_SVNUnitNames_dealServiceFeaturesSCQ_normalized = normalizeData(processedData_SVNUnitNames_dealServiceFeaturesSCQ)
processedData_JIRA_dealServiceFeaturesSCQ_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesSCQ)
processedData_JIRASummaries_dealServiceFeaturesSCQ_normalized = normalizeData(processedData_JIRASummaries_dealServiceFeaturesSCQ)
processedData_JIRADescriptions_dealServiceFeaturesSCQ_normalized = normalizeData(processedData_JIRADescriptions_dealServiceFeaturesSCQ)
#processedData_JIRAComments_dealServiceFeaturesSCQ_normalized = normalizeData(processedData_JIRAComments_dealServiceFeaturesSCQ)

#processedData_SVN_dealServiceFeaturesPMI_normalized = normalizeData(processedData_SVN_dealServiceFeaturesPMI)
processedData_SVNLogs_dealServiceFeaturesPMI_normalized = normalizeData(processedData_SVNLogs_dealServiceFeaturesPMI)
#processedData_SVNUnitNames_dealServiceFeaturesPMI_normalized = normalizeData(processedData_SVNUnitNames_dealServiceFeaturesPMI)
#processedData_JIRA_dealServiceFeaturesPMI_normalized = normalizeData(processedData_JIRA_dealServiceFeaturesPMI)
processedData_JIRASummaries_dealServiceFeaturesPMI_normalized = normalizeData(processedData_JIRASummaries_dealServiceFeaturesPMI)
#processedData_JIRADescriptions_dealServiceFeaturesPMI_normalized = normalizeData(processedData_JIRADescriptions_dealServiceFeaturesPMI)
#processedData_JIRAComments_dealServiceFeaturesPMI_normalized = normalizeData(processedData_JIRAComments_dealServiceFeaturesPMI)


## 3.8 Preprocess Data - Load and transform feature families needed for training

In [107]:

#Merge features into 1 dataframe
processedData_dealServiceFeatures_normalized = pd.concat([processedData_dealServiceFeaturesTime_normalized,
                                                  processedData_dealServiceFeaturesStakeholder_normalized,
                                                  #IR-based
                                                  processedData_dealService_features_VsmLogsJiraAsQuery_normalized,
                                                  processedData_dealService_features_VsmLogsLogAsQuery_normalized,
                                                  processedData_dealService_features_VsmUnitNamesJiraAsQuery_normalized,
                                                  processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_normalized,
                                                #  processedData_dealService_features_VsmUnitNamesCommentsCommentsAsQuery_normalized,
                                                #  processedData_dealService_features_VsmUnitNamesCommentsUnitNamesAsQuery_normalized,
                                                  processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery_normalized,
                                                  processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery_normalized,
                                                  processedData_dealService_features_VsmSummaryLogsSummaryAsQuery_normalized,
                                                  processedData_dealService_features_VsmSummaryLogsLogsAsQuery_normalized,
                                                  processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery_normalized,
                                                  processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery_normalized,
                                                  processedData_dealService_features_VsmDescriptionDescriptionAsQuery_normalized,
                                                  processedData_dealService_features_VsmDescriptionLogsAsQuery_normalized,
                                                 # processedData_dealService_features_VsmCommentsCommentsAsQuery_normalized,
                                                #  processedData_dealService_features_VsmCommentsLogsAsQuery_normalized,
                                                 # processedData_dealService_features_VsmLogsJiraAsQuery_2gram_normalized,
                                                 # processedData_dealService_features_VsmLogsLogAsQuery_2gram_normalized,
                                                 # processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram_normalized,
                                                 # processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram_normalized,
                                                  #processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery_normalized,
                                                 # processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery_normalized,
                                                  processedData_dealService_features_VsmSvnJiraJiraAsQuery_normalized,
                                                  processedData_dealService_features_VsmSvnJiraSvnAsQuery_normalized,
                                                  processedData_dealService_features_VsmSvnSummarySvnAsQuery_normalized,
                                                  processedData_dealService_features_VsmSvnSummarySummaryAsQuery_normalized,
                                                  processedData_dealService_features_VsmSvnDescriptionSvnAsQuery_normalized,
                                                  processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery_normalized,
                                                #  processedData_dealService_features_VsmSvnCommentsSvnAsQuery_normalized,
                                                #  processedData_dealService_features_VsmSvnCommentsCommentsAsQuery_normalized,

                                                  
                                                  #Document Statistics
                                                  processedData_JIRA_dealServiceFeaturesUniqueWordCount_normalized,
                                                  processedData_SVN_dealServiceFeaturesUniqueWordCount_normalized,
                                                  processedData_JIRA_dealServiceFeaturesTotalWordCount_normalized,
                                                  processedData_SVN_dealServiceFeaturesTotalWordCount_normalized,
                                                  processedData_JIRA_dealServiceFeaturesOverlapPercentage_normalized,
                                                  processedData_SVN_dealServiceFeaturesOverlapPercentage_normalized,
                                                  processedData_UNION_dealServiceFeaturesOverlapPercentage_normalized,
                                                 #Query Quality
                                                  processedData_SVN_dealServiceFeaturesIDF_normalized['SvnAsQuery_avgIDF'],
                                                  processedData_SVN_dealServiceFeaturesIDF_normalized['SvnAsQuery_maxIDF'],
                                                  processedData_SVN_dealServiceFeaturesIDF_normalized['SvnAsQuery_devIDF'],
                                                  processedData_SVNLogs_dealServiceFeaturesIDF_normalized['SvnLogsAsQuery_avgIDF'],
                                                  processedData_SVNLogs_dealServiceFeaturesIDF_normalized['SvnLogsAsQuery_maxIDF'],
                                                  processedData_SVNLogs_dealServiceFeaturesIDF_normalized['SvnLogsAsQuery_devIDF'],
                                                  processedData_SVNUnitNames_dealServiceFeaturesIDF_normalized['SvnUnitNamesAsQuery_avgIDF'],
                                                  processedData_SVNUnitNames_dealServiceFeaturesIDF_normalized['SvnUnitNamesAsQuery_maxIDF'],
                                                  processedData_SVNUnitNames_dealServiceFeaturesIDF_normalized['SvnUnitNamesAsQuery_devIDF'],
                                                  processedData_JIRA_dealServiceFeaturesIDF_normalized['JiraAsQuery_avgIDF'],
                                                  processedData_JIRA_dealServiceFeaturesIDF_normalized['JiraAsQuery_maxIDF'],
                                                  processedData_JIRA_dealServiceFeaturesIDF_normalized['JiraAsQuery_devIDF'],  
                                                  processedData_JIRASummaries_dealServiceFeaturesIDF_normalized['JiraSummariesAsQuery_avgIDF'],
                                                  processedData_JIRASummaries_dealServiceFeaturesIDF_normalized['JiraSummariesAsQuery_maxIDF'],
                                                  processedData_JIRASummaries_dealServiceFeaturesIDF_normalized['JiraSummariesAsQuery_devIDF'],  
                                                  processedData_JIRADescriptions_dealServiceFeaturesIDF_normalized['JiraDescriptionsAsQuery_avgIDF'],
                                                  processedData_JIRADescriptions_dealServiceFeaturesIDF_normalized['JiraDescriptionsAsQuery_maxIDF'],
                                                  processedData_JIRADescriptions_dealServiceFeaturesIDF_normalized['JiraDescriptionsAsQuery_devIDF'],  
                                                #  processedData_JIRAComments_dealServiceFeaturesIDF_normalized['JiraCommentsAsQuery_avgIDF'],
                                                #  processedData_JIRAComments_dealServiceFeaturesIDF_normalized['JiraCommentsAsQuery_maxIDF'],
                                                #  processedData_JIRAComments_dealServiceFeaturesIDF_normalized['JiraCommentsAsQuery_devIDF'],  
                                                  
                                                  processedData_SVN_dealServiceFeaturesICTF_normalized["SvnAsQuery_avgICTF"],
                                                  processedData_SVN_dealServiceFeaturesICTF_normalized["SvnAsQuery_maxICTF"],
                                                  processedData_SVN_dealServiceFeaturesICTF_normalized["SvnAsQuery_devICTF"],
                                                  processedData_SVNLogs_dealServiceFeaturesICTF_normalized["SvnLogsAsQuery_avgICTF"],
                                                  processedData_SVNLogs_dealServiceFeaturesICTF_normalized["SvnLogsAsQuery_maxICTF"],
                                                  processedData_SVNLogs_dealServiceFeaturesICTF_normalized["SvnLogsAsQuery_devICTF"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesICTF_normalized["SvnUnitNamesAsQuery_avgICTF"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesICTF_normalized["SvnUnitNamesAsQuery_maxICTF"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesICTF_normalized["SvnUnitNamesAsQuery_devICTF"],
                                                  processedData_JIRA_dealServiceFeaturesICTF_normalized["JiraAsQuery_avgICTF"],
                                                  processedData_JIRA_dealServiceFeaturesICTF_normalized["JiraAsQuery_maxICTF"],
                                                  processedData_JIRA_dealServiceFeaturesICTF_normalized["JiraAsQuery_devICTF"],
                                                  processedData_JIRASummaries_dealServiceFeaturesICTF_normalized["JiraSummariesAsQuery_avgICTF"],
                                                  processedData_JIRASummaries_dealServiceFeaturesICTF_normalized["JiraSummariesAsQuery_maxICTF"],
                                                  processedData_JIRASummaries_dealServiceFeaturesICTF_normalized["JiraSummariesAsQuery_devICTF"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesICTF_normalized["JiraDescriptionsAsQuery_avgICTF"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesICTF_normalized["JiraDescriptionsAsQuery_maxICTF"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesICTF_normalized["JiraDescriptionsAsQuery_devICTF"],
                                              #    processedData_JIRAComments_dealServiceFeaturesICTF_normalized["JiraCommentsAsQuery_avgICTF"],
                                              #    processedData_JIRAComments_dealServiceFeaturesICTF_normalized["JiraCommentsAsQuery_maxICTF"],
                                              #    processedData_JIRAComments_dealServiceFeaturesICTF_normalized["JiraCommentsAsQuery_devICTF"],
                                                  
                                                  processedData_SVN_dealServiceFeaturesEntropy_normalized["SvnAsQuery_avgEntropy"],
                                                  processedData_SVN_dealServiceFeaturesEntropy_normalized["SvnAsQuery_medEntropy"],
                                                  processedData_SVN_dealServiceFeaturesEntropy_normalized["SvnAsQuery_maxEntropy"],
                                                  processedData_SVN_dealServiceFeaturesEntropy_normalized["SvnAsQuery_devEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy_normalized["SvnLogsAsQuery_avgEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy_normalized["SvnLogsAsQuery_medEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy_normalized["SvnLogsAsQuery_maxEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy_normalized["SvnLogsAsQuery_devEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy_normalized["SvnUnitNamesAsQuery_avgEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy_normalized["SvnUnitNamesAsQuery_medEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy_normalized["SvnUnitNamesAsQuery_maxEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy_normalized["SvnUnitNamesAsQuery_devEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy_normalized["JiraAsQuery_avgEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy_normalized["JiraAsQuery_medEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy_normalized["JiraAsQuery_maxEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy_normalized["JiraAsQuery_devEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy_normalized["JiraSummariesAsQuery_avgEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy_normalized["JiraSummariesAsQuery_medEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy_normalized["JiraSummariesAsQuery_maxEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy_normalized["JiraSummariesAsQuery_devEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy_normalized["JiraDescriptionsAsQuery_avgEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy_normalized["JiraDescriptionsAsQuery_medEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy_normalized["JiraDescriptionsAsQuery_maxEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy_normalized["JiraDescriptionsAsQuery_devEntropy"],
                                               #   processedData_JIRAComments_dealServiceFeaturesEntropy_normalized["JiraCommentsAsQuery_avgEntropy"],
                                               #   processedData_JIRAComments_dealServiceFeaturesEntropy_normalized["JiraCommentsAsQuery_medEntropy"],
                                               #   processedData_JIRAComments_dealServiceFeaturesEntropy_normalized["JiraCommentsAsQuery_maxEntropy"],
                                               #   processedData_JIRAComments_dealServiceFeaturesEntropy_normalized["JiraCommentsAsQuery_devEntropy"],
                                                  
                                                  processedData_SVN_dealServiceFeaturesQueryScope_normalized,
                                                  processedData_SVNLogs_dealServiceFeaturesQueryScope_normalized,
                                                  processedData_SVNUnitNames_dealServiceFeaturesQueryScope_normalized,
                                                  processedData_JIRA_dealServiceFeaturesQueryScope_normalized,
                                                  processedData_JIRASummaries_dealServiceFeaturesQueryScope_normalized,
                                                  processedData_JIRADescriptions_dealServiceFeaturesQueryScope_normalized,
                                                #  processedData_JIRAComments_dealServiceFeaturesQueryScope_normalized,
                                                  
                                                  processedData_SVN_dealServiceFeaturesSCS_normalized,
                                                  processedData_SVNLogs_dealServiceFeaturesSCS_normalized,
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCS_normalized,
                                                  processedData_JIRA_dealServiceFeaturesSCS_normalized,
                                                  processedData_JIRASummaries_dealServiceFeaturesSCS_normalized,
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCS_normalized,
                                                #  processedData_JIRAComments_dealServiceFeaturesSCS_normalized,
                                                  
                                                  processedData_SVN_dealServiceFeaturesSCQ_normalized["SvnAsQuery_avgSCQ"],
                                                  processedData_SVN_dealServiceFeaturesSCQ_normalized["SvnAsQuery_maxSCQ"],
                                                  processedData_SVN_dealServiceFeaturesSCQ_normalized["SvnAsQuery_sumSCQ"],
                                                  processedData_SVNLogs_dealServiceFeaturesSCQ_normalized["SvnLogsAsQuery_avgSCQ"],
                                                  processedData_SVNLogs_dealServiceFeaturesSCQ_normalized["SvnLogsAsQuery_maxSCQ"],
                                                  processedData_SVNLogs_dealServiceFeaturesSCQ_normalized["SvnLogsAsQuery_sumSCQ"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCQ_normalized["SvnUnitNamesAsQuery_avgSCQ"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCQ_normalized["SvnUnitNamesAsQuery_maxSCQ"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCQ_normalized["SvnUnitNamesAsQuery_sumSCQ"],
                                                  processedData_JIRA_dealServiceFeaturesSCQ_normalized["JiraAsQuery_avgSCQ"],
                                                  processedData_JIRA_dealServiceFeaturesSCQ_normalized["JiraAsQuery_maxSCQ"],
                                                  processedData_JIRA_dealServiceFeaturesSCQ_normalized["JiraAsQuery_sumSCQ"],
                                                  processedData_JIRASummaries_dealServiceFeaturesSCQ_normalized["JiraSummariesAsQuery_avgSCQ"],
                                                  processedData_JIRASummaries_dealServiceFeaturesSCQ_normalized["JiraSummariesAsQuery_maxSCQ"],
                                                  processedData_JIRASummaries_dealServiceFeaturesSCQ_normalized["JiraSummariesAsQuery_sumSCQ"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCQ_normalized["JiraDescriptionsAsQuery_avgSCQ"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCQ_normalized["JiraDescriptionsAsQuery_maxSCQ"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCQ_normalized["JiraDescriptionsAsQuery_sumSCQ"],
                                                 # processedData_JIRAComments_dealServiceFeaturesSCQ_normalized["JiraCommentsAsQuery_avgSCQ"],
                                                #  processedData_JIRAComments_dealServiceFeaturesSCQ_normalized["JiraCommentsAsQuery_maxSCQ"],
                                                #  processedData_JIRAComments_dealServiceFeaturesSCQ_normalized["JiraCommentsAsQuery_sumSCQ"],
                                                  
                                                 # processedData_SVN_dealServiceFeaturesPMI_normalized["SvnAsQuery_avgPMI"],
                                                 # processedData_SVN_dealServiceFeaturesPMI_normalized["SvnAsQuery_maxPMI"],
                                                  processedData_SVNLogs_dealServiceFeaturesPMI_normalized["SvnLogsAsQuery_avgPMI"],
                                                  processedData_SVNLogs_dealServiceFeaturesPMI_normalized["SvnLogsAsQuery_maxPMI"],
                                                 # processedData_SVNUnitNames_dealServiceFeaturesPMI_normalized["SvnUnitNamesAsQuery_avgPMI"],
                                                 # processedData_SVNUnitNames_dealServiceFeaturesPMI_normalized["SvnUnitNamesAsQuery_maxPMI"],
                                                 # processedData_JIRA_dealServiceFeaturesPMI_normalized["JiraAsQuery_avgPMI"],
                                                 # processedData_JIRA_dealServiceFeaturesPMI_normalized["JiraAsQuery_maxPMI"],
                                                  processedData_JIRASummaries_dealServiceFeaturesPMI_normalized["JiraSummariesAsQuery_avgPMI"],
                                                  processedData_JIRASummaries_dealServiceFeaturesPMI_normalized["JiraSummariesAsQuery_maxPMI"],
                                                 # processedData_JIRADescriptions_dealServiceFeaturesPMI_normalized["JiraDescriptionsAsQuery_avgPMI"],
                                                 # processedData_JIRADescriptions_dealServiceFeaturesPMI_normalized["JiraDescriptionsAsQuery_maxPMI"],
                                                #  processedData_JIRAComments_dealServiceFeaturesPMI_normalized["JiraCommentsAsQuery_avgPMI"],
                                               #   processedData_JIRAComments_dealServiceFeaturesPMI_normalized["JiraCommentssAsQuery_maxPMI"],                                                  
                                                 ], axis=1)
#Set the NaN to 0
processedData_dealServiceFeatures_normalized = processedData_dealServiceFeatures_normalized.fillna(0)

#Saving feature names for later use
processedData_dealServiceFeatureNames_normalized = list(processedData_dealServiceFeatures_normalized.columns)

#Transform pandas data frame into numpy arrays
processedData_dealServiceFeatures_normalized = np.array(processedData_dealServiceFeatures_normalized)

#Load labels
processedData_dealServiceLabels_normalized = pd.read_pickle(r'../data/03_processed/processedData_dealServiceLabels.pkl')
processedData_dealServiceLabels_normalized = np.array(processedData_dealServiceLabels_normalized["is_valid"])


In [108]:

#Merge features into 1 dataframe
processedData_dealServiceFeatures = pd.concat([processedData_dealServiceFeaturesTime,
                                                  processedData_dealServiceFeaturesStakeholder,
                                                  #IR-based
                                                  processedData_dealService_features_VsmLogsJiraAsQuery,
                                                  processedData_dealService_features_VsmLogsLogAsQuery,
                                                  processedData_dealService_features_VsmUnitNamesJiraAsQuery,
                                                  processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery,
                                                 # processedData_dealService_features_VsmUnitNamesCommentsCommentsAsQuery,
                                                 # processedData_dealService_features_VsmUnitNamesCommentsUnitNamesAsQuery,
                                                  processedData_dealService_features_VsmUnitNamesDescriptionDescriptionAsQuery,
                                                  processedData_dealService_features_VsmUnitNamesDescriptionUnitNamesAsQuery,
                                                  processedData_dealService_features_VsmSummaryLogsSummaryAsQuery,
                                                  processedData_dealService_features_VsmSummaryLogsLogsAsQuery,
                                                  processedData_dealService_features_VsmSummaryUnitNamesSummaryAsQuery,
                                                  processedData_dealService_features_VsmSummaryUnitNamesUnitNamesAsQuery,
                                                  processedData_dealService_features_VsmDescriptionDescriptionAsQuery,
                                                  processedData_dealService_features_VsmDescriptionLogsAsQuery,
                                                 # processedData_dealService_features_VsmLogsJiraAsQuery_2gram,
                                                 # processedData_dealService_features_VsmLogsLogAsQuery_2gram,
                                                 # processedData_dealService_features_VsmUnitNamesJiraAsQuery_2gram,
                                                 # processedData_dealService_features_VsmUnitNamesUnitNamesAsQuery_2gram,
                                                 # processedData_dealService_features_VsmVerbPruningUnitNamesJiraAsQuery,
                                                 # processedData_dealService_features_VsmVerbPruningUnitNamesUnitNamesAsQuery,
                                                  processedData_dealService_features_VsmSvnJiraJiraAsQuery,
                                                  processedData_dealService_features_VsmSvnJiraSvnAsQuery,
                                                  processedData_dealService_features_VsmSvnSummarySvnAsQuery,
                                                  processedData_dealService_features_VsmSvnSummarySummaryAsQuery,
                                                  processedData_dealService_features_VsmSvnDescriptionSvnAsQuery,
                                                  processedData_dealService_features_VsmSvnDescriptionDescriptionAsQuery,
                                                #  processedData_dealService_features_VsmSvnCommentsSvnAsQuery,
                                                #  processedData_dealService_features_VsmSvnCommentsCommentsAsQuery,

                                                  
                                                  #Document Statistics
                                                  processedData_JIRA_dealServiceFeaturesUniqueWordCount,
                                                  processedData_SVN_dealServiceFeaturesUniqueWordCount,
                                                  processedData_JIRA_dealServiceFeaturesTotalWordCount,
                                                  processedData_SVN_dealServiceFeaturesTotalWordCount,
                                                  processedData_JIRA_dealServiceFeaturesOverlapPercentage,
                                                  processedData_SVN_dealServiceFeaturesOverlapPercentage,
                                                  processedData_UNION_dealServiceFeaturesOverlapPercentage,
                                                 #Query Quality
                                                  processedData_SVN_dealServiceFeaturesIDF['SvnAsQuery_avgIDF'],
                                                  processedData_SVN_dealServiceFeaturesIDF['SvnAsQuery_maxIDF'],
                                                  processedData_SVN_dealServiceFeaturesIDF['SvnAsQuery_devIDF'],
                                                  processedData_SVNLogs_dealServiceFeaturesIDF['SvnLogsAsQuery_avgIDF'],
                                                  processedData_SVNLogs_dealServiceFeaturesIDF['SvnLogsAsQuery_maxIDF'],
                                                  processedData_SVNLogs_dealServiceFeaturesIDF['SvnLogsAsQuery_devIDF'],
                                                  processedData_SVNUnitNames_dealServiceFeaturesIDF['SvnUnitNamesAsQuery_avgIDF'],
                                                  processedData_SVNUnitNames_dealServiceFeaturesIDF['SvnUnitNamesAsQuery_maxIDF'],
                                                  processedData_SVNUnitNames_dealServiceFeaturesIDF['SvnUnitNamesAsQuery_devIDF'],
                                                  processedData_JIRA_dealServiceFeaturesIDF['JiraAsQuery_avgIDF'],
                                                  processedData_JIRA_dealServiceFeaturesIDF['JiraAsQuery_maxIDF'],
                                                  processedData_JIRA_dealServiceFeaturesIDF['JiraAsQuery_devIDF'], 
                                                  processedData_JIRASummaries_dealServiceFeaturesIDF['JiraSummariesAsQuery_avgIDF'],
                                                  processedData_JIRASummaries_dealServiceFeaturesIDF['JiraSummariesAsQuery_maxIDF'],
                                                  processedData_JIRASummaries_dealServiceFeaturesIDF['JiraSummariesAsQuery_devIDF'], 
                                                  processedData_JIRADescriptions_dealServiceFeaturesIDF['JiraDescriptionsAsQuery_avgIDF'],
                                                  processedData_JIRADescriptions_dealServiceFeaturesIDF['JiraDescriptionsAsQuery_maxIDF'],
                                                  processedData_JIRADescriptions_dealServiceFeaturesIDF['JiraDescriptionsAsQuery_devIDF'], 
                                                #  processedData_JIRAComments_dealServiceFeaturesIDF['JiraCommentsAsQuery_avgIDF'],
                                               #   processedData_JIRAComments_dealServiceFeaturesIDF['JiraCommentsAsQuery_maxIDF'],
                                               #   processedData_JIRAComments_dealServiceFeaturesIDF['JiraCommentsAsQuery_devIDF'], 
                                                  
                                                  processedData_SVN_dealServiceFeaturesICTF["SvnAsQuery_avgICTF"],
                                                  processedData_SVN_dealServiceFeaturesICTF["SvnAsQuery_maxICTF"],
                                                  processedData_SVN_dealServiceFeaturesICTF["SvnAsQuery_devICTF"],
                                                  processedData_SVNLogs_dealServiceFeaturesICTF["SvnLogsAsQuery_avgICTF"],
                                                  processedData_SVNLogs_dealServiceFeaturesICTF["SvnLogsAsQuery_maxICTF"],
                                                  processedData_SVNLogs_dealServiceFeaturesICTF["SvnLogsAsQuery_devICTF"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesICTF["SvnUnitNamesAsQuery_avgICTF"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesICTF["SvnUnitNamesAsQuery_maxICTF"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesICTF["SvnUnitNamesAsQuery_devICTF"],
                                                  processedData_JIRA_dealServiceFeaturesICTF["JiraAsQuery_avgICTF"],
                                                  processedData_JIRA_dealServiceFeaturesICTF["JiraAsQuery_maxICTF"],
                                                  processedData_JIRA_dealServiceFeaturesICTF["JiraAsQuery_devICTF"],
                                                  processedData_JIRASummaries_dealServiceFeaturesICTF["JiraSummariesAsQuery_avgICTF"],
                                                  processedData_JIRASummaries_dealServiceFeaturesICTF["JiraSummariesAsQuery_maxICTF"],
                                                  processedData_JIRASummaries_dealServiceFeaturesICTF["JiraSummariesAsQuery_devICTF"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesICTF["JiraDescriptionsAsQuery_avgICTF"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesICTF["JiraDescriptionsAsQuery_maxICTF"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesICTF["JiraDescriptionsAsQuery_devICTF"],
                                                 # processedData_JIRAComments_dealServiceFeaturesICTF["JiraCommentsAsQuery_avgICTF"],
                                                #  processedData_JIRAComments_dealServiceFeaturesICTF["JiraCommentsAsQuery_maxICTF"],
                                               #   processedData_JIRAComments_dealServiceFeaturesICTF["JiraCommentsAsQuery_devICTF"],
                                                  
                                                  processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_avgEntropy"],
                                                  processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_medEntropy"],
                                                  processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_maxEntropy"],
                                                  processedData_SVN_dealServiceFeaturesEntropy["SvnAsQuery_devEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_avgEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_medEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_maxEntropy"],
                                                  processedData_SVNLogs_dealServiceFeaturesEntropy["SvnLogsAsQuery_devEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_avgEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_medEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_maxEntropy"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesEntropy["SvnUnitNamesAsQuery_devEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_avgEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_medEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_maxEntropy"],
                                                  processedData_JIRA_dealServiceFeaturesEntropy["JiraAsQuery_devEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_avgEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_medEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_maxEntropy"],
                                                  processedData_JIRASummaries_dealServiceFeaturesEntropy["JiraSummariesAsQuery_devEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_avgEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_medEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_maxEntropy"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesEntropy["JiraDescriptionsAsQuery_devEntropy"],
                                                #  processedData_JIRAComments_dealServiceFeaturesEntropy["JiraCommentsAsQuery_avgEntropy"],
                                                 # processedData_JIRAComments_dealServiceFeaturesEntropy["JiraCommentsAsQuery_medEntropy"],
                                                 # processedData_JIRAComments_dealServiceFeaturesEntropy["JiraCommentsAsQuery_maxEntropy"],
                                                 # processedData_JIRAComments_dealServiceFeaturesEntropy["JiraCommentsAsQuery_devEntropy"],
                                                  
                                                  processedData_SVN_dealServiceFeaturesQueryScope,
                                                  processedData_SVNLogs_dealServiceFeaturesQueryScope,
                                                  processedData_SVNUnitNames_dealServiceFeaturesQueryScope,
                                                  processedData_JIRA_dealServiceFeaturesQueryScope,
                                                  processedData_JIRASummaries_dealServiceFeaturesQueryScope,
                                                  processedData_JIRADescriptions_dealServiceFeaturesQueryScope,
                                                #  processedData_JIRAComments_dealServiceFeaturesQueryScope,
                                                  
                                                  processedData_SVN_dealServiceFeaturesSCS,
                                                  processedData_SVNLogs_dealServiceFeaturesSCS,
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCS,
                                                  processedData_JIRA_dealServiceFeaturesSCS,
                                                  processedData_JIRASummaries_dealServiceFeaturesSCS,
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCS,
                                                 # processedData_JIRAComments_dealServiceFeaturesSCS,
                                                  
                                                  processedData_SVN_dealServiceFeaturesSCQ["SvnAsQuery_avgSCQ"],
                                                  processedData_SVN_dealServiceFeaturesSCQ["SvnAsQuery_maxSCQ"],
                                                  processedData_SVN_dealServiceFeaturesSCQ["SvnAsQuery_sumSCQ"],
                                                  processedData_SVNLogs_dealServiceFeaturesSCQ["SvnLogsAsQuery_avgSCQ"],
                                                  processedData_SVNLogs_dealServiceFeaturesSCQ["SvnLogsAsQuery_maxSCQ"],
                                                  processedData_SVNLogs_dealServiceFeaturesSCQ["SvnLogsAsQuery_sumSCQ"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCQ["SvnUnitNamesAsQuery_avgSCQ"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCQ["SvnUnitNamesAsQuery_maxSCQ"],
                                                  processedData_SVNUnitNames_dealServiceFeaturesSCQ["SvnUnitNamesAsQuery_sumSCQ"],
                                                  processedData_JIRA_dealServiceFeaturesSCQ["JiraAsQuery_avgSCQ"],
                                                  processedData_JIRA_dealServiceFeaturesSCQ["JiraAsQuery_maxSCQ"],
                                                  processedData_JIRA_dealServiceFeaturesSCQ["JiraAsQuery_sumSCQ"],
                                                  processedData_JIRASummaries_dealServiceFeaturesSCQ["JiraSummariesAsQuery_avgSCQ"],
                                                  processedData_JIRASummaries_dealServiceFeaturesSCQ["JiraSummariesAsQuery_maxSCQ"],
                                                  processedData_JIRASummaries_dealServiceFeaturesSCQ["JiraSummariesAsQuery_sumSCQ"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCQ["JiraDescriptionsAsQuery_avgSCQ"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCQ["JiraDescriptionsAsQuery_maxSCQ"],
                                                  processedData_JIRADescriptions_dealServiceFeaturesSCQ["JiraDescriptionsAsQuery_sumSCQ"],
                                                 # processedData_JIRAComments_dealServiceFeaturesSCQ["JiraCommentsAsQuery_avgSCQ"],
                                                 # processedData_JIRAComments_dealServiceFeaturesSCQ["JiraCommentsAsQuery_maxSCQ"],
                                                 # processedData_JIRAComments_dealServiceFeaturesSCQ["JiraCommentsAsQuery_sumSCQ"],
                                                  
                                                  #processedData_SVN_dealServiceFeaturesPMI["SvnAsQuery_avgPMI"],
                                                  #processedData_SVN_dealServiceFeaturesPMI["SvnAsQuery_maxPMI"],
                                                  processedData_SVNLogs_dealServiceFeaturesPMI["SvnLogsAsQuery_avgPMI"],
                                                  processedData_SVNLogs_dealServiceFeaturesPMI["SvnLogsAsQuery_maxPMI"],
                                                 # processedData_SVNUnitNames_dealServiceFeaturesPMI["SvnUnitNamesAsQuery_avgPMI"],
                                                 # processedData_SVNUnitNames_dealServiceFeaturesPMI["SvnUnitNamesAsQuery_maxPMI"],
                                                 # processedData_JIRA_dealServiceFeaturesPMI["JiraAsQuery_avgPMI"],
                                                 # processedData_JIRA_dealServiceFeaturesPMI["JiraAsQuery_maxPMI"],
                                                  processedData_JIRASummaries_dealServiceFeaturesPMI["JiraSummariesAsQuery_avgPMI"],
                                                  processedData_JIRASummaries_dealServiceFeaturesPMI["JiraSummariesAsQuery_maxPMI"],
                                                #  processedData_JIRADescriptions_dealServiceFeaturesPMI["JiraDescriptionsAsQuery_avgPMI"],
                                                #  processedData_JIRADescriptions_dealServiceFeaturesPMI["JiraDescriptionsAsQuery_maxPMI"],
                                                  #processedData_JIRAComments_dealServiceFeaturesPMI["JiraCommentsAsQuery_avgPMI"],
                                                  #processedData_JIRAComments_dealServiceFeaturesPMI["JiraCommentssAsQuery_maxPMI"],
                                                 ], axis=1)
#Set the NaN to 0
processedData_dealServiceFeatures = processedData_dealServiceFeatures.fillna(0)

#Saving feature names for later use
processedData_dealServiceFeatureNames = list(processedData_dealServiceFeatures.columns)

#Transform pandas data frame into numpy arrays
processedData_dealServiceFeatures = np.array(processedData_dealServiceFeatures)

#Load labels
processedData_dealServiceLabels = pd.read_pickle(r'../data/03_processed/processedData_dealServiceLabels.pkl')
processedData_dealServiceLabels = np.array(processedData_dealServiceLabels["is_valid"])


# 4. Modeling - Normalization
First select which data set to train:


In [109]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve



def showModelPerformance(trainedModel, testFeatures, testLabels):
    # Use the forest's predict method on the test data
    predictionLabels = trainedModel.predict(testFeatures)
    
    accuracyValue = accuracy_score(testLabels.astype(bool), predictionLabels)
    precisionValue = precision_score(testLabels.astype(bool), predictionLabels, average='binary')
    f1Value = f1_score(testLabels.astype(bool), predictionLabels)
    f2Value = fbeta_score(testLabels.astype(bool), predictionLabels, beta=2.0)
    f05Value = fbeta_score(testLabels.astype(bool), predictionLabels, beta=0.5)
    recallValue = recall_score(testLabels.astype(bool), predictionLabels)
    averagePrecisionValue = average_precision_score(testLabels.astype(bool), predictionLabels)
          
    performanceData = {'Accuracy':  [accuracyValue],
                       'Precision': [precisionValue],
                       'Recall': [recallValue],
                       'F1': [f1Value],
                       'F2': [f2Value],
                       'F0.5': [f05Value],
                       'Average Precision': [averagePrecisionValue]
                      }
    performanceDf = pd.DataFrame(performanceData)
    return(performanceDf)

In [110]:
features_normalized = processedData_dealServiceFeatures_normalized
labels_normalized = processedData_dealServiceLabels_normalized

## 4.1 Rebalancing Strategy - None

### 4.1.1 Random Forests

In [111]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

none_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    none_randomforest_normalized_performance_df = pd.concat([none_randomforest_normalized_performance_df, new_performance_df])
    
none_randomforest_normalized_performance_df.to_csv("../data/05_model_output/none_randomforest_normalized_performance_df.csv")



### 4.1.2 XGBoost

In [112]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


none_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    none_xgboost_normalized_performance_df = pd.concat([none_xgboost_normalized_performance_df, new_performance_df])
    

none_xgboost_normalized_performance_df.to_csv("../data/05_model_output/none_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 160.301 seconds
Cross-validation score: 0.6975767239336215
Test score: 0.7072368421052632
Best Hyperparameters: {}
0.008676752
0.041046172
0.03783664
0.027425356
0.012683119
0.014372032
0.004837089
0.014786544
0.0061999713
0.013560761
0.0072678886
0.022684457
0.016365405
0.0
0.00642399
0.0
0.010634785
0.021724585
0.005430614
0.02617823
0.015862387
0.0024125613
0.017791377
0.0002615639
0.0024053021
0.011152072
0.003732035
0.021721495
0.0077337874
0.0032589065
0.0070524197
0.0030311965
0.0031285384
0.0028522299
0.0064769783
0.0
0.0
0.0
0.004492225
0.0049051875
0.0068654637
0.010090493
0.0025620935
0.01604737
0.02124385
0.006715815
0.00976126
0.0
0.0
0.0
0.0057116044
0.005588794
0.0039776857
0.0036056032
0.005677693
0.006532783
0.007880179
0.004733837
0.010231195
0.016302396
0.0059092892
0.004788617
0.0016809874
0.006257256
0.0055345166
0.0060888533
0.008320094
0.00032796955
0.0042191474
0.0076027303
0.006322667
0.0067233965
0.010090252
0.008976779
0.0050



Elapsed time to compute best fit: 154.406 seconds
Cross-validation score: 0.7219720279632081
Test score: 0.6866197183098592
Best Hyperparameters: {}
0.0076344763
0.043080136
0.0333627
0.023952913
0.013950366
0.012324427
0.0034579886
0.007176553
0.0047375085
0.0053114356
0.0050250487
0.017246474
0.021670703
0.0
0.0028269046
0.0
0.010854925
0.013974388
0.006968393
0.019092666
0.02924549
0.0025979653
0.02284454
0.0058252434
0.011473054
0.0026613178
0.011723341
0.026235664
0.0073189135
0.004955614
0.0075790132
0.0068779625
0.003767105
0.005452502
0.003082172
0.0
0.0
0.0
0.0055593755
0.0021068647
0.009379482
0.0059739077
0.01999961
0.0071776775
0.022710185
0.025643846
0.0075132656
0.0
0.0
0.0
0.0029918884
0.0036156238
0.004691074
0.00739229
0.0021606712
0.0067131193
0.007060285
0.005548398
0.008259317
0.014691463
0.012935008
0.004795292
0.0015046116
0.006877376
0.0063303476
0.0016481703
0.0065911342
0.0077683902
0.012403285
0.002500307
0.0065762848
0.0
0.009735252
0.00893691
0.0061905463
0.



Elapsed time to compute best fit: 165.419 seconds
Cross-validation score: 0.7261635759270344
Test score: 0.7094594594594594
Best Hyperparameters: {}
0.01313893
0.04887395
0.03645095
0.027303288
0.006903566
0.010851587
0.0048144893
0.011238398
0.004890115
0.0146056535
0.009777247
0.022971075
0.013267507
0.0
0.0034442805
0.0
0.014577277
0.021265801
0.009663576
0.01822702
0.0054985755
0.00771229
0.01972655
0.0036385995
0.0021896095
0.010993027
0.006365493
0.020471929
0.005680272
0.0067989416
0.0022548889
0.0068610716
0.0031791031
0.004149125
0.004642306
0.0
0.0
0.0
0.0075104963
0.00078871025
0.0043632253
0.009141988
0.016994001
0.008461997
0.009038323
0.0
0.00901598
0.0
0.0
0.0
0.00768846
0.0052730828
0.0044304146
0.007240389
0.012557003
0.005708186
0.005673455
0.010544952
0.009469525
0.0111135915
0.009590491
0.0091786785
0.00081173424
0.0077932742
0.014330602
0.0061246194
0.004115085
0.0006226082
0.0090618385
0.0048232046
0.008527233
0.0031722602
0.004857067
0.005360722
0.0054142387
0.00



Elapsed time to compute best fit: 159.768 seconds
Cross-validation score: 0.6839282777508706
Test score: 0.705128205128205
Best Hyperparameters: {}
0.0075288625
0.043139067
0.037982345
0.031757798
0.007908647
0.01144899
0.0043044304
0.0062790206
0.008745107
0.013542978
0.008814933
0.015359522
0.0070487754
0.0
0.009151662
0.0
0.0073665
0.025362441
0.005892015
0.026650324
0.008936673
0.012561691
0.021858634
0.015906325
0.008060173
0.0
0.004445849
0.017939216
0.0042044935
0.008699071
0.0038882832
0.0064675454
0.003517493
0.0025396692
0.0043708542
0.0
0.0
0.0
0.009829166
0.0032180895
0.009311647
0.008415
0.01586161
0.014144896
0.009281235
0.0048171487
0.012302275
0.0
0.0
0.0
0.0061960677
0.0058228206
0.0049152486
0.004467266
0.008645989
0.0060746605
0.009554876
0.007862665
0.005418272
0.009868109
0.007409298
0.007021217
0.008657259
0.0031059284
0.0015136455
0.0057457383
0.0046912236
0.01321716
0.0065782857
0.006871228
0.0045086024
0.0025564982
0.0057608057
0.01123755
0.009019739
0.0
0.0024



Elapsed time to compute best fit: 163.235 seconds
Cross-validation score: 0.7093859958279315
Test score: 0.7363013698630136
Best Hyperparameters: {}
0.0076752105
0.042973146
0.03497234
0.02626525
0.004700014
0.008484248
0.009278529
0.011489794
0.0028471828
0.0026729712
0.013619806
0.01702789
0.013974563
0.0
0.004160907
0.0
0.008800302
0.02078093
0.0039138636
0.027809843
0.01663048
0.0061806957
0.028904127
0.0032691832
0.0009647439
0.014762008
0.0026708506
0.025962694
0.0047752345
0.0053604245
0.004289346
0.0065991646
0.0040665655
0.0038706018
0.0050511113
0.0
0.0
0.0
0.00988806
0.017017258
0.006345611
0.012468793
0.008242788
0.010498594
0.013925442
0.01941597
0.0043797377
0.0
0.0
0.0
0.008158704
0.005268092
0.0053234515
0.0076967105
0.02200001
0.0063089184
0.006433626
0.0069505153
0.004675476
0.020604854
0.010326608
0.0076606614
0.0096147
0.0027299256
0.004379721
0.006065523
0.009042865
0.0019468964
0.011468658
0.0034586256
0.006273191
0.023960143
0.009456375
0.0053674243
0.005659067
0



Elapsed time to compute best fit: 157.513 seconds
Cross-validation score: 0.7237965714258199
Test score: 0.7465277777777778
Best Hyperparameters: {}
0.010066988
0.048839215
0.03842698
0.026797367
0.012267853
0.0101787755
0.0066876486
0.0047810297
0.006553445
0.004971817
0.013274401
0.013621223
0.019669183
0.0
0.010685435
0.0
0.012227418
0.01690378
0.0053327433
0.021361027
0.006989405
0.006057644
0.02897865
0.0037675998
0.0
0.00811073
0.007461981
0.026514808
0.003682021
0.0056445757
0.0015311576
0.008808085
0.0056780227
0.004375134
0.0033938852
0.0
0.0
0.0
0.0053100507
0.0
0.008804225
0.0070760706
0.0
0.007947201
0.001975726
0.01665115
0.006810128
0.0
0.0
0.0
0.004571253
0.009140446
0.0025567643
0.0068213916
0.0046817386
0.00619713
0.008634977
0.0074303807
0.009519299
0.0131427385
0.008670783
0.010042392
0.0
0.0066991583
0.022294952
0.0067602573
0.0062135146
0.0
0.0022094748
0.004617414
0.0070616375
0.0040490767
0.009272472
0.0021855866
0.0076709683
0.0007749311
0.00433881
0.015895827
0



Elapsed time to compute best fit: 162.480 seconds
Cross-validation score: 0.7272571580175713
Test score: 0.6891025641025641
Best Hyperparameters: {}
0.013053508
0.04790651
0.037859377
0.035815813
0.009888893
0.00976741
0.0036774387
0.0056028906
0.0035753944
0.020731173
0.009736008
0.013277918
0.014372824
0.0
0.008202434
0.0
0.013714745
0.01184087
0.003334856
0.027838591
0.0062365686
0.006967361
0.024202738
0.0028073192
0.0016586807
0.009966216
0.0073876884
0.021717472
0.008908405
0.004895085
0.0036011648
0.0054898704
0.0061849602
0.0018277232
0.006209359
0.0
0.0
0.0
0.010617438
0.008424636
0.011220872
0.0096623795
0.0031588194
0.008820668
0.009790922
0.0
0.011684113
0.0
0.0
0.0
0.008361229
0.006533572
0.0059370813
0.00780109
0.00550746
0.019588696
0.008115864
0.005769295
0.008829817
0.011352494
0.015501305
0.0071266023
0.0
0.007776148
0.015242375
0.006245475
0.006352693
0.0
0.009189549
0.011707136
0.0056497823
0.000877897
0.005658848
0.009545564
0.00647738
0.0
0.0071568824
0.016289938




Elapsed time to compute best fit: 158.603 seconds
Cross-validation score: 0.7268439162081116
Test score: 0.6925675675675675
Best Hyperparameters: {}
0.010288162
0.040975455
0.033774383
0.025411637
0.014842935
0.008363655
0.003337505
0.006597003
0.013658597
0.007880912
0.008400027
0.013111804
0.018786645
0.0
0.0081176
0.0
0.011023104
0.019009806
0.0134268515
0.020430462
0.01858315
0.0018819281
0.024916645
0.0131837865
0.050272364
0.0010142676
0.012304819
0.013030619
0.005685344
0.0043154224
0.004318037
0.008626709
0.003996432
0.006511223
0.0044119526
0.0
0.0
0.0
0.006117501
0.0004400514
0.009927482
0.0058770743
0.009393434
0.014558452
0.019445736
0.0
0.012986311
0.0
0.0
0.0
0.0053259665
0.007282493
0.0042891987
0.0068004983
0.00032529802
0.0039217635
0.005470868
0.010208493
0.009071839
0.0125039555
0.0066515887
0.010550204
0.002655454
0.004083127
0.0028236723
0.005064834
0.0068535963
0.00052372046
0.004896252
0.0074611455
0.0047687883
0.0050794985
0.0093705105
0.012770945
0.009648727
0.



Elapsed time to compute best fit: 159.945 seconds
Cross-validation score: 0.7457355403575944
Test score: 0.7166666666666667
Best Hyperparameters: {}
0.007832577
0.03937018
0.036891572
0.029859599
0.012697918
0.011025932
0.0051563047
0.017131552
0.012960034
0.0034081687
0.009629793
0.013872887
0.018863771
0.0
0.0014936536
0.0
0.008354155
0.008255242
0.0049435766
0.021601263
0.017271271
0.0032781283
0.021585993
0.0071679857
0.013892184
0.015246118
0.009345977
0.01604938
0.0065501053
0.007762213
0.00091280247
0.0050032833
0.004132501
0.0020548278
0.004171094
0.0
0.0
0.0
0.009570516
0.007543322
0.010541651
0.0065576257
0.006767364
0.012168246
0.044247653
0.0
0.01864015
0.0
0.0
0.0
0.007986032
0.010165921
0.004735037
0.0076999073
0.0012927852
0.0063520046
0.013008702
0.0068758815
0.0088968165
0.014252525
0.014608277
0.0073757917
0.033106856
0.00074894703
0.017983874
0.0025058286
0.006226323
0.0
0.008065601
0.006625749
0.004692673
0.004310035
0.0067185834
0.0064508864
0.0056577004
0.00094659



Elapsed time to compute best fit: 158.574 seconds
Cross-validation score: 0.7048467116834389
Test score: 0.6944444444444444
Best Hyperparameters: {}
0.009055512
0.042199045
0.03652361
0.028224166
0.013223873
0.0072470694
0.005801563
0.009973408
0.014303543
0.003501961
0.0051182453
0.016817583
0.00893501
0.0
0.0047616316
0.0
0.011929207
0.01855891
0.0096979225
0.020398298
0.019541914
0.012796057
0.021565147
0.004181804
0.0060171955
0.0033102045
0.0065096216
0.019303093
0.011078627
0.004241876
0.0011478588
0.0059023043
0.004512364
0.0017895157
0.0051124236
0.0
0.0
0.0
0.010000223
0.003287916
0.00910429
0.008249035
0.0
0.0060240934
0.010974904
0.0
0.010390211
0.0
0.0
0.0
0.0065093557
0.007441859
0.0054774326
0.003962927
0.0
0.004626269
0.00936819
0.008530555
0.0119819585
0.0099241845
0.009241786
0.0066050086
0.0009411751
0.012004576
0.009625688
0.0072770347
0.004805835
0.0011334601
0.013146591
0.0044836244
0.0077497084
0.0037929588
0.0067151315
0.0036612628
0.00429301
0.0
0.005618267
0.00



Elapsed time to compute best fit: 166.979 seconds
Cross-validation score: 0.6890585780727551
Test score: 0.7862903225806451
Best Hyperparameters: {}
0.009729646
0.04658307
0.036489245
0.027909279
0.014605697
0.01099834
0.0057956055
0.0042266715
0.0021231845
0.008572727
0.007809568
0.0125787305
0.011933025
0.0
0.0051163253
0.0
0.0074058473
0.012604846
0.011979149
0.032964934
0.008625515
0.0058386647
0.017340884
0.009728676
0.0022631006
0.012049667
0.011934012
0.019570792
0.008510529
0.009104653
0.0033993684
0.0060204226
0.0049133827
0.002740052
0.0067240437
0.0
0.0
0.0
0.010720282
0.003644723
0.0058803796
0.011542977
0.0
0.010756971
0.02291917
0.0
0.014703081
0.0
0.0
0.0
0.0048843115
0.0074044755
0.0026462579
0.0076974453
0.0064928783
0.008722722
0.013862583
0.013928375
0.0067260326
0.009177026
0.014503712
0.0071440917
0.01687067
0.004606254
0.0038231437
0.0065861344
0.0060305367
0.0
0.0050692093
0.007819845
0.009580943
0.018925129
0.0067042806
0.0061609535
0.004594536
0.0
0.0034336597




Elapsed time to compute best fit: 159.784 seconds
Cross-validation score: 0.6998290109000866
Test score: 0.75
Best Hyperparameters: {}
0.008340516
0.044026177
0.037548617
0.026557386
0.01273086
0.010944213
0.0063833822
0.005505898
0.019901691
0.01031063
0.00710887
0.016049506
0.010371368
0.0
0.0052475934
0.0
0.010715972
0.037138328
0.0076866653
0.022108302
0.0074745337
0.004084898
0.016950754
0.0005908842
0.0
0.0020759704
0.0050891084
0.015149334
0.006037538
0.0046667946
0.0037249567
0.0053044236
0.0040729186
0.0027831746
0.0058259657
0.0
0.0
0.0
0.009976191
0.0066624815
0.0056492323
0.009160193
0.004901779
0.007754765
0.009359901
0.035825945
0.017024687
0.0
0.0
0.0
0.0058179894
0.0037178234
0.0038614715
0.0034183792
0.0042142933
0.010632726
0.008965063
0.011846483
0.0047842315
0.009671066
0.004650685
0.00971273
0.0026866312
0.0015450271
0.005813442
0.0071469955
0.0067005097
0.0022242814
0.009696454
0.0061734566
0.009596237
0.01733291
0.009104716
0.0052812453
0.0051486744
0.0
0.0056536



Elapsed time to compute best fit: 161.104 seconds
Cross-validation score: 0.6917285025397344
Test score: 0.7904411764705882
Best Hyperparameters: {}
0.008103107
0.04072
0.033588145
0.024955662
0.010870771
0.013089288
0.0033159119
0.0037854586
0.0029572384
0.013601708
0.00945332
0.012579947
0.020649765
0.0
0.00978144
0.0
0.0031846429
0.006882921
0.011835881
0.02032895
0.028819451
0.006701381
0.017617676
0.0049784044
0.006133057
0.0054080095
0.004345094
0.014904413
0.0073181307
0.0092835175
0.003192985
0.0061666435
0.004633141
0.009087383
0.005450287
0.0
0.0
0.0
0.00870573
0.0
0.0076048025
0.0047835554
0.004124781
0.009134758
0.04092288
0.00060844736
0.011742933
0.0
0.0
0.0
0.0045104045
0.0056616855
0.005168528
0.0048399675
0.005996983
0.007089933
0.008590128
0.009929804
0.003812507
0.013275029
0.011944535
0.0055348515
0.004627604
0.008310763
0.0085197715
0.0029832006
0.0055755107
0.0026908123
0.013203667
0.007178019
0.0072183073
0.0050034206
0.0047842218
0.002737209
0.009296526
0.0
0.00



Elapsed time to compute best fit: 165.960 seconds
Cross-validation score: 0.7317200455100368
Test score: 0.6702898550724637
Best Hyperparameters: {}
0.010929132
0.041494213
0.03970613
0.028704524
0.009891574
0.010471781
0.008513915
0.008973531
0.013284989
0.006566021
0.008078097
0.014671807
0.019686148
0.0
0.0065930565
0.0
0.008050867
0.02062101
0.006071376
0.02890067
0.0040896703
0.010822069
0.01886535
0.0059664254
0.005877648
0.01045274
0.005867466
0.024301067
0.006623429
0.0077724936
0.0038772637
0.0044201934
0.004603503
0.0037368764
0.005450186
0.0
0.0
0.0
0.0066752224
0.006659391
0.008077042
0.012664389
0.027679011
0.012353706
0.009809447
0.0
0.017944092
0.0
0.0
0.0
0.0075929365
0.0075516864
0.00439247
0.0103882365
0.0006242969
0.01000021
0.005205145
0.014075232
0.005781206
0.01468176
0.0056069666
0.011301742
0.0017813765
0.0010144466
0.007805858
0.005810295
0.009410516
0.0
0.0073525077
0.0058481637
0.008178664
0.0009498561
0.005164883
0.012390309
0.007053234
0.0
0.00088342594
0.0



Elapsed time to compute best fit: 175.314 seconds
Cross-validation score: 0.7097734301879532
Test score: 0.704225352112676
Best Hyperparameters: {}
0.01178214
0.047198515
0.04018263
0.02596043
0.014485608
0.0052778306
0.008110324
0.009779245
0.00816727
0.003975021
0.008913456
0.020989418
0.015272824
0.0
0.004493989
0.0
0.0062009282
0.026224794
0.010958266
0.028639881
0.011703951
0.004147551
0.022862412
0.008771643
0.004608633
0.00090291136
0.015026907
0.012368513
0.006108401
0.0036202406
0.0022824367
0.006148897
0.0067374655
0.0030427508
0.003913144
0.0
0.0
0.0
0.013634798
0.002273068
0.0075030834
0.011617249
0.0029720406
0.006173667
0.020328
0.0
0.01751059
0.0
0.0
0.0
0.005303857
0.005576296
0.008321415
0.0063628945
0.0031151746
0.0060535395
0.0093735885
0.006188402
0.013426027
0.0131917065
0.005398574
0.012300858
0.014429675
0.007970747
0.013981288
0.0034586817
0.003905824
0.0032713409
0.0076852217
0.0038971496
0.0065956977
0.004324606
0.006864077
0.008276925
0.008212765
0.0
0.005304



Elapsed time to compute best fit: 154.515 seconds
Cross-validation score: 0.7171336297800329
Test score: 0.7601351351351352
Best Hyperparameters: {}
0.008576781
0.03870938
0.037197765
0.027618974
0.017165465
0.008935568
0.0046818145
0.006043104
0.0119457
0.006827096
0.0052372916
0.012425112
0.021097356
0.0
0.0055752895
0.0
0.011919027
0.03566738
0.0055475663
0.018372677
0.014064548
0.005708135
0.012693333
0.0049836347
0.010647037
0.01478747
0.0074631367
0.021361312
0.0075173755
0.007883011
0.0043663993
0.0069905766
0.0038072912
0.0026536928
0.0057754675
0.0
0.0
0.0
0.011018328
0.0017929232
0.007963859
0.012785226
0.0032414687
0.0057506645
0.013914989
0.0
0.011616605
0.0
0.0
0.0
0.005230903
0.00954804
0.0069389725
0.013856937
0.0038071608
0.009507517
0.012187195
0.0051379222
0.008625679
0.01477046
0.008518796
0.0063045477
0.0006497693
0.0040240427
0.016425928
0.0057963873
0.0032792287
0.0
0.0038072453
0.0072945673
0.0062307953
0.0109883575
0.006141598
0.0038818691
0.006436394
0.0
0.0074



Elapsed time to compute best fit: 160.573 seconds
Cross-validation score: 0.7103232230029163
Test score: 0.7765151515151515
Best Hyperparameters: {}
0.011273327
0.046122354
0.039540306
0.028733036
0.009567229
0.012724702
0.009705507
0.0035581437
0.0091159595
0.0073629282
0.008247272
0.01649588
0.02121938
0.0
0.0047558136
0.0
0.0048444313
0.023805609
0.011313493
0.020257128
0.0147545785
0.0036510597
0.01414449
0.0077820993
0.014273753
0.014057129
0.008110611
0.017988365
0.0050875684
0.013682425
0.0071009295
0.0057922485
0.0058181565
0.0041656494
0.0045899614
0.0
0.0
0.0
0.007630067
0.0
0.0049691056
0.00900337
0.0013346064
0.011164521
0.018069932
0.0
0.007592319
0.0
0.0
0.0
0.0073221154
0.008753981
0.0032855219
0.010432907
0.011796016
0.003269437
0.009859697
0.004307328
0.00786806
0.012447619
0.008988537
0.009711905
0.0028310162
0.005140927
0.0074530984
0.0028687776
0.0053305086
0.0
0.009978807
0.0066128364
0.0070816344
0.0007689786
0.006836052
0.007956667
0.006702984
0.0
0.0021004742
0.



Elapsed time to compute best fit: 160.351 seconds
Cross-validation score: 0.739410204273211
Test score: 0.6785714285714285
Best Hyperparameters: {}
0.008512042
0.038208246
0.031194797
0.024158176
0.0052756486
0.010437371
0.010386423
0.0072633526
0.0066016093
0.011616127
0.007603771
0.01844949
0.017894365
0.0
0.0055736885
0.0
0.008848397
0.030499762
0.012985251
0.024688093
0.009487062
0.00659108
0.026176581
0.0036505924
0.0
0.016722871
0.0009028695
0.019263024
0.0071572927
0.0068196696
0.007642062
0.0056123394
0.0038853132
0.004630809
0.0059218938
0.0
0.0
0.0
0.010683069
0.01529983
0.0059722117
0.0068926816
0.0006230043
0.0051929587
0.019673266
0.0
0.010820438
0.0
0.0
0.0
0.0059459875
0.004767801
0.005468591
0.0053402875
0.004060042
0.0041574854
0.006320704
0.011203787
0.005604716
0.012038965
0.005435564
0.009474419
0.0041019283
0.004273595
0.014435654
0.009036054
0.005649063
0.0
0.0033956724
0.004360731
0.0068400656
0.0016224051
0.0042977305
0.0068493807
0.004421825
0.03106941
0.004514



Elapsed time to compute best fit: 158.849 seconds
Cross-validation score: 0.7285398805219818
Test score: 0.6372549019607844
Best Hyperparameters: {}
0.009956276
0.050734393
0.046784177
0.025745485
0.013034879
0.00944798
0.0042949533
0.008624855
0.005446864
0.0069079143
0.0067330245
0.018729707
0.026894566
0.0
0.003975565
0.0
0.01168497
0.012405511
0.0082531
0.027523106
0.021986434
0.0044748774
0.017361471
0.0017052623
0.009357489
0.0010781634
0.008714503
0.022575181
0.009824905
0.009841941
0.004804743
0.006218944
0.0054497058
0.008303055
0.0056937085
0.0
0.0
0.0
0.011038594
0.0029952975
0.011929766
0.013760067
0.0
0.0065117357
0.02217927
0.0
0.011898006
0.0
0.0
0.0
0.007697755
0.0038950203
0.0051322402
0.004956088
0.0047667935
0.007949862
0.0058819116
0.009000188
0.0045847488
0.012666373
0.0028565545
0.011113494
0.0016444795
0.0061335578
0.0025927585
0.0036483018
0.006347147
0.0024176394
0.0069554006
0.008334494
0.0074061407
0.0
0.001883045
0.005409365
0.010105675
0.0
0.007495447
0.020



Elapsed time to compute best fit: 160.935 seconds
Cross-validation score: 0.708227359045408
Test score: 0.7427536231884058
Best Hyperparameters: {}
0.0072965114
0.0397082
0.037382953
0.026894461
0.008556746
0.008667165
0.0075434847
0.0032722421
0.015635021
0.018479416
0.006246952
0.014049224
0.012760843
0.0
0.0039573577
0.0
0.007989236
0.018394025
0.007196851
0.034431703
0.0068593738
0.007321287
0.01899527
0.004082976
0.0028057117
0.008402173
0.0032760713
0.022166722
0.0030839036
0.005586586
0.0021986875
0.0025027285
0.004908369
0.0013936138
0.00422104
0.0
0.0
0.0
0.008879035
0.0037554768
0.009048126
0.0059892144
0.0016278295
0.0075506223
0.015136437
0.0061861635
0.010060465
0.0
0.0
0.0
0.0051168366
0.005821658
0.0074153612
0.006920751
0.0070196874
0.009171789
0.0046347575
0.0070564994
0.0047154594
0.014198353
0.010250478
0.0077065905
0.002189417
0.003484221
0.007567495
0.007821157
0.0058042593
0.008705737
0.007406192
0.007973587
0.009345675
0.008309606
0.011020271
0.0030448777
0.00704



Elapsed time to compute best fit: 162.699 seconds
Cross-validation score: 0.7257083279158436
Test score: 0.6770833333333333
Best Hyperparameters: {}
0.008079947
0.045438867
0.038001757
0.023443196
0.016113717
0.009272265
0.005988652
0.004098229
0.0026955334
0.0023433499
0.010079752
0.016361102
0.01564211
0.0
0.0032580765
0.0
0.011302044
0.03215219
0.010760401
0.033456404
0.0059397807
0.0113195395
0.025074841
0.0008063144
0.039834887
0.004578123
0.0018192221
0.024128117
0.0069442578
0.006979532
0.0064530307
0.0071779094
0.0046471506
0.0020595652
0.004541391
0.0
0.0
0.0
0.0100189755
0.007037351
0.008485936
0.008336668
0.00063226075
0.007504626
0.015823036
0.005712604
0.007848038
0.0
0.0
0.0
0.003512793
0.0065069846
0.0046685827
0.011309551
0.0038487304
0.003960218
0.0032645778
0.006186256
0.011724626
0.013603218
0.0049705994
0.007732431
0.008475119
0.0030432793
0.01083706
0.0067068343
0.005171242
0.0
0.008854753
0.0064229583
0.008009343
0.0022205608
0.009565536
0.006155405
0.0052201347
0



Elapsed time to compute best fit: 162.753 seconds
Cross-validation score: 0.740661268450551
Test score: 0.6439393939393939
Best Hyperparameters: {}
0.0071715615
0.04255614
0.03457102
0.028725
0.0121601075
0.0080046775
0.008712806
0.010173317
0.017199878
0.0036732866
0.0049541043
0.019676564
0.0071752323
0.0
0.014383995
0.0
0.00787798
0.030780181
0.011109389
0.019864442
0.015667757
0.00426829
0.020359995
0.0039827973
0.0011034741
0.0063131433
0.0066669523
0.020881461
0.0067670117
0.0070860395
0.0031999785
0.005801755
0.004783832
0.008843752
0.0033641013
0.0
0.0
0.0
0.011423965
0.0
0.0063556326
0.011182796
0.0041116313
0.007539909
0.023770124
0.0
0.01498391
0.0
0.0
0.0
0.006268299
0.0048958403
0.0029876065
0.0070761973
0.0051537785
0.011566534
0.008780478
0.008157488
0.005950087
0.007087416
0.01132206
0.005321427
0.0008566948
0.0024140263
0.0045904927
0.0038534151
0.0044002566
0.0026240763
0.008336308
0.0046670423
0.005148015
0.001449067
0.0050371024
0.0068497434
0.00522649
0.0
0.0108086



Elapsed time to compute best fit: 160.294 seconds
Cross-validation score: 0.7145685145530314
Test score: 0.7228915662650601
Best Hyperparameters: {}
0.006867913
0.045058634
0.037241705
0.029342277
0.00616412
0.009462379
0.004841319
0.0051187146
0.0059490004
0.029068489
0.007871205
0.025700878
0.012898897
0.0
0.006468447
0.0
0.015318858
0.031165909
0.011917545
0.024553755
0.0086433785
0.003915788
0.02366858
0.0014474641
0.01583115
0.006336743
0.009725522
0.026455522
0.004120709
0.006120643
0.0015814304
0.012511161
0.003482344
0.005093621
0.0066095535
0.0
0.0
0.0
0.007628734
0.0006239748
0.0071812044
0.009524388
0.0
0.009897206
0.015445884
0.00034215505
0.010340434
0.0
0.0
0.0
0.008409305
0.008813997
0.0046112756
0.005621741
0.0072079934
0.006047675
0.015497297
0.009417027
0.009520722
0.012559356
0.011740972
0.0062581534
0.006948127
0.0037081519
0.0036883545
0.004181551
0.0071207937
0.0
0.007454223
0.0059118806
0.009306527
0.0
0.010801174
0.0046954057
0.0070581688
0.0
0.0073677967
0.0033



Elapsed time to compute best fit: 154.750 seconds
Cross-validation score: 0.7098478995079522
Test score: 0.7792207792207791
Best Hyperparameters: {}
0.009258751
0.04118765
0.03624064
0.024370242
0.016356971
0.010963822
0.003356467
0.0090365345
0.011639662
0.007838845
0.008115857
0.016882183
0.019938802
0.0
0.011060044
0.0
0.011815152
0.01789819
0.013136907
0.016768452
0.01887439
0.0027173457
0.033178885
0.009795108
0.0018699563
0.007061136
0.007141766
0.018949607
0.012650862
0.0044622347
0.0037459515
0.0048317723
0.0030937882
0.003762132
0.0054992144
0.0
0.0
0.0
0.01074833
0.00077898323
0.009884929
0.0048654107
0.0017409528
0.0120846
0.01792738
0.0
0.008409151
0.0
0.0
0.0
0.0070770443
0.007653828
0.003422537
0.0097854985
0.018518806
0.004132285
0.010991601
0.0067521627
0.005799521
0.0120544555
0.018915951
0.004432736
0.012996596
0.0012351557
0.016320039
0.0028442212
0.0063187736
0.0
0.009786839
0.007093095
0.0069907727
0.0012593353
0.009788873
0.00658431
0.00553212
0.0
0.0032798045
0.0



Elapsed time to compute best fit: 158.710 seconds
Cross-validation score: 0.7010275595633125
Test score: 0.7401315789473684
Best Hyperparameters: {}
0.00837712
0.040745515
0.034153298
0.027860496
0.010694059
0.010393136
0.004786148
0.01652476
0.007733648
0.00890526
0.007850475
0.0165015
0.020912787
0.0
0.015305346
0.0
0.008302042
0.02848995
0.009415773
0.028569032
0.0076665655
0.0020306746
0.01544096
0.007873887
0.018806767
0.0075938874
0.0043235472
0.018413026
0.0030482523
0.0032246066
0.004146014
0.00447011
0.003492443
0.004181845
0.005322399
0.0
0.0
0.0
0.008945209
0.0023799941
0.0121086
0.012819489
0.0048058038
0.007542294
0.01559177
0.0006834802
0.01993738
0.0
0.0
0.0
0.0058142426
0.009106778
0.0053155134
0.0039139893
0.008104934
0.007728861
0.009149
0.00890029
0.0073941653
0.012703767
0.010327189
0.0059985253
0.0
0.0040769954
0.004553089
0.0060976096
0.007700631
0.0005317195
0.0020816512
0.0038977545
0.007009059
0.0076850126
0.013156395
0.010787127
0.005676745
0.0
0.008229246
0.0

### 4.1.3 LightGBM

In [113]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

none_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    none_lightgbm_performance_normalized_df = pd.concat([none_lightgbm_performance_normalized_df, new_performance_df])
    

none_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/none_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 22.674 seconds
Cross-validation score: 0.5207737848168407
Test score: 0.5231143552311435
Best Hyperparameters: {}
946.9529880583286
11875.752541705966
3981.122979249805
2046.7272648513317
190.5205580741167
139.2675550058484
55.512152306735516
208.30775320529938
51.060007996857166
124.65562592446804
889.8140492737293
2499.2165074422956
234.2896012738347
0.0
92.93982706218958
0.0
113.65760836750269
1093.5899726450443
408.45586822927
1342.5016307383776
120.8470883667469
66.29647742211819
222.90027545392513
28.50414453446865
0.3734700083732605
100.51438981294632
34.42196626961231
3795.5822935402393
172.24252328276634
206.07512436062098
149.64549645781517
251.9278378933668
154.6847553551197
58.74646206200123
236.7387361228466
0.0
0.0
0.0
34.91791172325611
15.969499588012695
262.1838092431426
251.27020645141602
20.334441870450974
70.35611970722675
80.49306845664978
0.0
35.6222161129117
0.0
0.0
0.0
113.7151476368308
45.51053640246391
72.93437416106462
7.64914



Elapsed time to compute best fit: 19.038 seconds
Cross-validation score: 0.49635390221685666
Test score: 0.5555555555555556
Best Hyperparameters: {}
960.4097068533301
6531.8304538577795
5029.0678432658315
294.6027959100902
147.66780956089497
469.22319424152374
135.64960062503815
74.67883729934692
9.598986953496933
64.89102879911661
178.9183498546481
1622.1657815948129
1078.310499534011
0.0
54.82267400622368
0.0
170.2653465270996
5352.654862567782
261.41398184001446
1125.398879930377
162.49118888378143
24.877518489956856
142.46595086157322
63.31618911027908
34.61243784427643
5.422030046582222
243.45053246617317
1653.9343159645796
1270.1919535323977
75.12675002217293
76.20973415672779
287.621574729681
258.71292539685965
26.358706757426262
383.9079132601619
0.0
0.0
0.0
254.84371830523014
23.400169610977173
190.51697272062302
504.26991008222103
14.80730789899826
103.90423218905926
143.5478128194809
0.0
1558.5929265767336
0.0
0.0
0.0
131.2461247742176
149.43949589133263
117.05887185037136
1



Elapsed time to compute best fit: 18.844 seconds
Cross-validation score: 0.46991580993782495
Test score: 0.482233502538071
Best Hyperparameters: {}
792.1605598144233
11656.024755127728
2522.5027486011386
946.0120711289346
1943.757939465344
152.81275791674852
931.360115006566
167.4572631418705
31.072910115122795
136.80530175566673
454.3254881836474
346.3174158036709
84.45044326037169
0.0
314.56433130800724
0.0
309.1153139397502
1232.8085387051105
433.70085997879505
2049.380979090929
36.261925637722015
67.01630015671253
285.60100577771664
18.631518721580505
2.9690289199352264
4.258440136909485
64.60288762301207
2597.796355973929
54.20151390880346
105.76877090334892
55.67844544351101
426.71918009221554
72.22262950241566
54.461123287677765
225.40084075927734
0.0
0.0
0.0
236.88641015440226
0.7822759747505188
353.4207891821861
999.5823134407401
16.668136946856976
101.89549113810062
17.442857682704926
7.88077974319458
135.5095889866352
0.0
0.0
0.0
665.0895029902458
486.1587788686156
153.71754



Elapsed time to compute best fit: 19.449 seconds
Cross-validation score: 0.48264046168500796
Test score: 0.5321782178217821
Best Hyperparameters: {}
1394.7203286662698
11674.770505126566
3397.8852365911007
1355.0326816923916
1684.2446330860257
294.95981261879206
385.67079874128103
41.44573103636503
3.215140923857689
57.64057985693216
304.19658505916595
123.10556900501251
95.41960011422634
0.0
229.4381244853139
0.0
59.17419394105673
3141.494199886918
114.23104111850262
1204.9651103541255
57.49076206982136
107.6227867975831
145.41346868872643
62.73186206817627
0.0
25.908437192440033
56.76506444066763
1298.812048137188
137.9938736706972
172.13468964397907
476.5354066193104
628.6335898786783
315.60368470847607
38.11121017485857
200.74728932976723
0.0
0.0
0.0
43.977050229907036
2.5022919550538063
1311.242612078786
1824.405146613717
28.73384778946638
164.70717179775238
21.782624274492264
0.0
14.807214006781578
0.0
0.0
0.0
151.45182882994413
204.67103200405836
400.27997129410505
11.9497770369



Elapsed time to compute best fit: 20.171 seconds
Cross-validation score: 0.5298669311856388
Test score: 0.57356608478803
Best Hyperparameters: {}
669.5418869182467
5528.286052167416
5686.129549019039
973.1582825183868
815.7839595898986
110.26860962063074
955.8097238242626
114.19896547496319
244.3120628297329
42.87036511301994
95.1953516677022
2122.739420324564
1107.0601594299078
0.0
130.82019928097725
0.0
148.50525139272213
4256.1242981776595
526.7627726122737
131.1748249232769
325.0640944764018
11.136995151638985
99.9115718305111
43.437440656125546
103.08497142791748
25.026532500982285
135.8377389907837
1153.5197667777538
128.89153600484133
852.8366588056087
26.20331421494484
235.2674819305539
120.36827728152275
8.172916188836098
173.53723986446857
0.0
0.0
0.0
126.80124726891518
0.9423900246620178
201.41886673867702
63.89694030582905
1.2338449954986572
197.32323451340199
248.1295489668846
0.7948690056800842
1265.816853016615
0.0
0.0
0.0
86.14929558336735
104.38531009852886
1255.929249



Elapsed time to compute best fit: 21.080 seconds
Cross-validation score: 0.533438549631658
Test score: 0.4961832061068702
Best Hyperparameters: {}
1110.4574157707393
11530.980545978993
3066.95886458084
2410.809512730688
1718.6315454095602
115.25556191056967
117.29374876618385
38.905741825699806
14.21905666589737
32.62723993510008
260.08477521687746
318.84088046103716
152.9085090830922
0.0
145.97233119606972
0.0
158.6024205237627
1514.970252752304
63.346195712685585
1709.902750864625
122.82580582797527
155.61100286990404
206.03544482588768
55.90355537831783
133.86369478702545
81.09781178832054
83.10587726533413
2416.3763371258974
166.57233008742332
62.450250059366226
41.499419793486595
372.0290353745222
212.75730270147324
105.75847654044628
176.54532459378242
0.0
0.0
0.0
49.23506540060043
0.6131009981036186
508.7437471821904
65.97499844431877
0.16039299964904785
49.21664445847273
45.333140432834625
0.0
1004.1405847072601
0.0
0.0
0.0
155.9760072082281
29.855944477021694
208.8576852232217



Elapsed time to compute best fit: 18.971 seconds
Cross-validation score: 0.5070009849524773
Test score: 0.5236907730673317
Best Hyperparameters: {}
1453.6256717145443
11710.355071958154
4269.350451953709
669.0001371726394
268.38135144114494
137.73171462118626
196.52704848349094
122.58286634087563
97.10989901423454
70.86074578016996
766.9780667051673
2910.769841708243
1347.3147924095392
0.0
103.47995214164257
0.0
1104.7778286635876
143.44457726180553
1151.8454204797745
470.3434341698885
166.99992997944355
19.15548501163721
173.31911873817444
136.2102237045765
0.693494975566864
7.981176182627678
233.28900407254696
1925.202437311411
304.11368499696255
337.36103254556656
46.3088741004467
163.49825815856457
776.3152147084475
12.532496318221092
438.3355942592025
0.0
0.0
0.0
49.134771786630154
11.659523248672485
539.9362823963165
629.3061041384935
5.884880065917969
112.34069347381592
458.54496669769287
7.611690044403076
150.4284858033061
0.0
0.0
0.0
156.53084326535463
81.95258592069149
149.92



Elapsed time to compute best fit: 19.239 seconds
Cross-validation score: 0.4774088369592729
Test score: 0.5759803921568627
Best Hyperparameters: {}
538.3221790120006
7248.555700130761
2728.2348455637693
1642.9808473922312
93.24492362141609
70.4983893558383
93.88976662606001
142.13700599968433
20.500968724489212
83.6020633354783
1340.180204845965
2112.4660559371114
396.49198458343744
0.0
268.09837406128645
0.0
74.89170609414577
5802.288035236299
397.61101143062115
161.2210060209036
131.57582440972328
38.92356038093567
294.0575509816408
136.67497862875462
11.831079006195068
100.55071198940277
107.48932972550392
1991.3058844432235
153.14399176836014
149.01688745617867
863.882133319974
352.6269942075014
553.6663385555148
31.914783023297787
187.31598866730928
0.0
0.0
0.0
44.34699631482363
0.8065620064735413
114.26815710961819
231.1515666693449
22.282504349946976
181.36239811778069
1291.8669030070305
0.7565209865570068
290.6470542550087
0.0
0.0
0.0
88.30940192937851
140.95443426072598
168.95



Elapsed time to compute best fit: 19.575 seconds
Cross-validation score: 0.46933662582215174
Test score: 0.5348258706467661
Best Hyperparameters: {}
1915.509332947433
8393.479833018035
2594.7752093710005
782.3341591432691
543.6876036338508
97.80396503210068
502.1601218730211
18.374915972352028
9.683720782399178
19.709975965321064
1026.524997241795
2379.354610219598
455.1631872281432
0.0
144.7361722290516
0.0
182.90919621288776
6026.506635203958
461.31690050661564
973.7826629430056
218.42676393687725
7.932029902935028
21.32672668993473
48.25950264930725
0.35330501198768616
16.61508697271347
495.13625878095627
1145.2121252790093
658.088536247611
125.60055743157864
66.44175118207932
68.26126584038138
154.6068465411663
24.075299911201
103.24201630800962
0.0
0.0
0.0
79.47737205773592
19.87745399773121
231.16387951374054
216.79087244346738
0.0
30.941805072128773
7.092393055558205
0.4962120056152344
1776.471523180604
0.0
0.0
0.0
171.4543662518263
172.7679961323738
144.3399069942534
54.5209401



Elapsed time to compute best fit: 20.214 seconds
Cross-validation score: 0.48989055573036566
Test score: 0.49875311720698245
Best Hyperparameters: {}
1496.0348295457661
10480.699380192906
4166.98120585829
1952.3466504514217
1976.1115595772862
961.2976691573858
86.91944184154272
164.48928156495094
3.9576580226421356
70.83733107149601
1067.2756552696228
394.0277556851506
804.689906232059
0.0
170.46631165593863
0.0
31.993819497525692
152.7041326239705
158.77444696426392
1099.0837505459785
104.57339563965797
282.7784704044461
92.83030904829502
4.439303994178772
11.340414971113205
19.05528275668621
303.93833597004414
3343.411419212818
325.620753236115
374.2385113313794
14.10257737338543
257.02307553589344
175.1534092798829
63.007183223962784
143.86046882718801
0.0
0.0
0.0
67.6906690299511
0.0
193.09672697633505
90.43540125340223
159.870989382267
94.2354399561882
17.411825716495514
0.0
171.23453880101442
0.0
0.0
0.0
145.9241947233677
75.58279679715633
187.84242106229067
35.814950197935104
13



Elapsed time to compute best fit: 19.517 seconds
Cross-validation score: 0.5327395366110186
Test score: 0.46835443037974683
Best Hyperparameters: {}
2206.1430496014655
9735.371786773205
4760.437520250678
1395.917637374252
287.50947277247906
112.40125245600939
59.188924089074135
69.73302599042654
47.039574325084686
60.34283955395222
708.8567524440587
2641.091960903257
696.1410739831626
0.0
67.91164612025023
0.0
68.05675618350506
1083.4345057755709
130.0175866894424
1764.4229880571365
65.45588841289282
27.840035870671272
120.91993806511164
23.447109706699848
118.17904078960419
48.8352257758379
44.25035988166928
2801.8435788601637
186.22527646645904
114.61781167984009
193.24038759618998
92.10939548909664
1348.290966577828
223.34376364201307
112.47620389610529
0.0
0.0
0.0
48.880317494273186
19.535876616835594
42.06029003113508
39.69680044800043
5.530479833483696
109.91349197179079
150.46774902194738
0.0
52.09323497861624
0.0
0.0
0.0
315.53192861378193
205.22364189475775
135.32922557741404




Elapsed time to compute best fit: 19.814 seconds
Cross-validation score: 0.4905554818686291
Test score: 0.5137844611528822
Best Hyperparameters: {}
969.1038727536798
11631.60174947977
4058.125614695251
1280.8822904489934
1996.0067640990019
162.9423943310976
113.88741030544043
113.27282141149044
99.78992415964603
15.646704837679863
98.24632736295462
380.5009993761778
1786.1589099764824
0.0
153.07453671097755
0.0
154.9113966152072
2098.8531032055616
293.6356020644307
93.28319788724184
71.86328910291195
99.54295054078102
295.8270099312067
2.5589929819107056
0.7638250142335892
22.356093674898148
619.9775590151548
127.38607950508595
509.8302280306816
279.55801813304424
4.994058758020401
163.41678170859814
1017.697080925107
29.43394474685192
204.18310970067978
0.0
0.0
0.0
1135.476295515895
103.56020173430443
166.80280154943466
87.94123373180628
19.509103998541832
113.42554428428411
430.9616961479187
0.0
1475.5017869621515
0.0
0.0
0.0
78.61888758838177
64.89348418265581
225.52270358800888
280



Elapsed time to compute best fit: 20.113 seconds
Cross-validation score: 0.5223221119722996
Test score: 0.42307692307692313
Best Hyperparameters: {}
1607.634838692844
10478.531719863415
4105.689114268869
339.90903754904866
149.25138179957867
520.931530252099
71.96271549165249
392.74013470858335
58.08330835402012
10.101211294531822
859.0403841584921
2227.9700697399676
51.864541970193386
0.0
141.7793378829956
0.0
104.8939014673233
2880.351079508662
1167.9229053743184
1308.8439475670457
34.606427267193794
57.90427418798208
51.67569242417812
2.9567860439419746
0.2797519862651825
45.37252217531204
50.25756439566612
1362.633448652923
125.36756926774979
65.31960508227348
27.456372171640396
149.73474396765232
65.23251152783632
67.88894928991795
107.47743871808052
0.0
0.0
0.0
102.4414326697588
0.0
68.26727988570929
101.94051878154278
12.677473053336143
279.8808812946081
21.17035609483719
0.17226800322532654
3.372419960796833
0.0
0.0
0.0
117.97870706766844
118.62062738090754
131.21593007445335
1



Elapsed time to compute best fit: 19.661 seconds
Cross-validation score: 0.47251195583511374
Test score: 0.5486284289276809
Best Hyperparameters: {}
425.02204547822475
9302.506770532578
3684.8454908281565
1205.813711129129
274.36510775983334
123.64340762421489
89.5328411757946
305.17954444885254
563.2821154296398
219.26459784805775
516.4029388949275
2152.078144699335
536.3628330528736
0.0
153.96031938493252
0.0
204.82629988342524
5329.688323274255
235.8904033973813
397.0132352113724
117.25614431500435
31.639669567346573
166.17530769109726
19.903296813368797
378.53257620334625
87.77339535951614
60.64723900705576
2421.5173543691635
182.99213261157274
389.83644995093346
32.84026789665222
172.25902120769024
248.89289601147175
120.33256809413433
188.69102923572063
0.0
0.0
0.0
1280.7005349695683
145.22257475554943
127.24866542220116
555.6784406900406
46.230984061956406
163.04149368405342
15.37372986972332
0.0
848.1616009026766
0.0
0.0
0.0
251.84884098917246
173.56750364601612
141.77766104042



Elapsed time to compute best fit: 20.688 seconds
Cross-validation score: 0.482540946622824
Test score: 0.5474452554744527
Best Hyperparameters: {}
481.30901488661766
8056.811392530799
5722.352530613542
2361.741418682039
2206.948091700673
241.25329092144966
186.58088339865208
86.3894479572773
94.39163692295551
3448.749915406108
847.161745980382
2259.524942725897
506.91258566081524
0.0
104.59504866600037
0.0
200.55158565938473
70.59076900035143
289.0083611793816
2068.6156558021903
40.05491427332163
65.19604760408401
23.973111040890217
34.15651239454746
50.784799575805664
24.81089597940445
120.566534884274
923.3519151136279
299.4331863000989
319.37546023726463
62.57569733262062
409.6772518232465
183.81683358550072
1844.0709194093943
435.1384479701519
0.0
0.0
0.0
279.13454020023346
0.0
347.2503601759672
220.61965108662844
21.221027344465256
86.54810558259487
133.27875919640064
0.0
26.46478794515133
0.0
0.0
0.0
181.01836709678173
29.59163637459278
1354.2018556296825
66.07982537150383
2.7076



Elapsed time to compute best fit: 19.129 seconds
Cross-validation score: 0.49515248966740194
Test score: 0.5210918114143921
Best Hyperparameters: {}
1130.6938139088452
8868.969482224435
3440.5278813801706
484.0492209084332
255.18662648275495
1413.5875286608934
120.70582969114184
95.6769416257739
5.686640448868275
137.57331336289644
943.049195650965
1991.0371295511723
182.1215770356357
0.0
101.3253965973854
0.0
848.3573478907347
2452.053792349994
144.22021912038326
1156.752317801118
141.14054072648287
65.65609770268202
92.33056076616049
5.736384183168411
2.093764014542103
19.508628979325294
95.5026021450758
2203.4855971187353
41.75736293196678
2102.557486385107
36.08039930462837
370.2639607936144
196.02660538256168
86.37131363153458
182.86643712967634
0.0
0.0
0.0
99.46141043305397
1.471693016588688
244.5934346318245
146.12629936635494
5.545023135840893
98.77487065643072
81.05837548524141
0.0
85.53482884168625
0.0
0.0
0.0
64.00639013946056
74.56248228996992
128.19693329185247
102.9576099



Elapsed time to compute best fit: 19.845 seconds
Cross-validation score: 0.4874804357313932
Test score: 0.5243902439024389
Best Hyperparameters: {}
750.0809823349118
7577.201769527048
4579.451667189598
1506.4194065257907
1276.4374057129025
215.5968033373356
449.96412735432386
103.10737121850252
36.72421573847532
31.017757758498192
209.6106343716383
2190.64858302474
1290.2176193967462
0.0
130.94491512328386
0.0
114.85212567448616
1266.5143132135272
278.24227644503117
2177.6149257943034
10.156278058886528
26.44494404643774
484.6867762133479
3.756817303597927
24.24134922027588
22.020402647554874
105.51174575835466
5222.658422209322
1014.6709413602948
251.46526804566383
427.1486276835203
222.297428868711
278.5269049555063
39.95052498579025
68.16990491002798
0.0
0.0
0.0
49.99508037418127
0.9289209842681885
223.92465209960938
121.24640017747879
9.767084211111069
161.4939690977335
24.512976825237274
0.0
37.23578530550003
0.0
0.0
0.0
65.029710277915
63.34827309846878
282.3369814902544
85.89426



Elapsed time to compute best fit: 18.919 seconds
Cross-validation score: 0.5203535395217775
Test score: 0.49261083743842354
Best Hyperparameters: {}
2224.7629929557443
8316.587333384901
2208.130821440369
2403.026706084609
189.26871535182
197.08813323825598
629.044286698103
772.1207866296172
37.746805645525455
40.51126439869404
180.7803766876459
1600.1981286928058
346.71957890689373
0.0
93.09673511236906
0.0
47.002046421170235
5937.907134771347
197.3739776685834
477.9919931590557
30.95411491394043
27.02713680267334
38.566223092377186
5.045342028141022
0.6587180197238922
21.330565184354782
109.3179298043251
2182.578162178397
147.85819436609745
423.2542312294245
84.2971477881074
169.99347615242004
320.82844138145447
27.26732398569584
389.0588953718543
0.0
0.0
0.0
329.9023161381483
2.147944949567318
528.101803407073
84.55506069213152
0.3796539902687073
116.57458953559399
1248.0239900127053
0.174904003739357
29.236224949359894
0.0
0.0
0.0
136.31545041501522
32.603174820542336
262.2785606980



Elapsed time to compute best fit: 20.975 seconds
Cross-validation score: 0.48140301021985027
Test score: 0.5289672544080605
Best Hyperparameters: {}
673.0687588751316
6204.699177719653
4425.166252605617
1706.4506805501878
189.57088772952557
240.0913345515728
69.8413377404213
825.3941722214222
260.2581849396229
103.67958290874958
1127.0370048172772
1910.8172343969345
171.75809833407402
0.0
92.55935683846474
0.0
56.68084176629782
5527.560895830393
566.8260883763433
1487.4172076433897
72.46343645825982
25.107949301600456
121.32776121795177
472.4224678128958
9.019970774650574
45.45532077550888
70.09471635892987
2027.5717440247536
62.64342322200537
438.0647048577666
4.198056772351265
293.3871401846409
335.3894287198782
48.34971686452627
211.79409256577492
0.0
0.0
0.0
324.6521504148841
67.90907841175795
113.13415652513504
50.712987042963505
49.466336742043495
83.07655247300863
568.3786328956485
0.0
36.00850337743759
0.0
0.0
0.0
86.37673725187778
184.87208679318428
192.99707558006048
19.32612



Elapsed time to compute best fit: 21.656 seconds
Cross-validation score: 0.5176256750194362
Test score: 0.5802469135802469
Best Hyperparameters: {}
644.9585961848497
10757.95102251321
4806.014053188264
2489.0484783276916
476.1505327373743
173.3987010270357
128.41424590349197
51.028908513486385
65.24444335699081
92.74465201795101
178.28004025295377
2238.73547398299
49.16114577651024
0.0
324.5396056547761
0.0
808.7725957855582
1229.8798450902104
823.0922391414642
246.37047651410103
313.94161581993103
85.61715984344482
1119.8955129161477
89.1824911981821
4.809458136558533
77.45604172348976
219.81817843019962
1828.2665705531836
93.16013700515032
239.24477596580982
21.896888062357903
190.42992888391018
133.63652462512255
19.548625871539116
411.8727619498968
0.0
0.0
0.0
70.20187874138355
1.6937899589538574
94.16436046361923
148.64635115861893
9.787402845919132
2236.480481736362
72.8520589619875
0.6178970038890839
946.6666040793061
0.0
0.0
0.0
100.57180687785149
336.83965945243835
159.8914205



Elapsed time to compute best fit: 22.580 seconds
Cross-validation score: 0.48244523130816813
Test score: 0.4785894206549119
Best Hyperparameters: {}
813.623191408813
10680.306244939566
4661.786774426699
1325.4649445004761
665.3232791870832
116.6224710047245
70.41548612713814
285.77737847715616
26.139003455638885
27.636998265981674
250.81978488713503
2185.2568162232637
253.07511024922132
0.0
75.28654210269451
0.0
122.83392702043056
889.4565103948116
1138.0858458653092
271.0701623931527
68.61655655503273
79.45332831889391
88.02978557348251
28.535994917154312
7.616239905357361
21.089290618896484
90.86940650641918
2260.1201541125774
145.9756200760603
392.45457546412945
33.539856642484665
281.71514192968607
182.98602071404457
6.8938839212059975
174.3528765887022
0.0
0.0
0.0
198.6906960606575
19.713329136371613
181.01199366897345
102.05422760546207
0.8287239968776703
1012.3143501207232
54.19063015282154
0.1483599990606308
694.5722530633211
0.0
0.0
0.0
150.6104140728712
147.39558708667755
159



Elapsed time to compute best fit: 22.762 seconds
Cross-validation score: 0.48714413372214443
Test score: 0.4430379746835444
Best Hyperparameters: {}
502.6172454506159
8657.242532163858
2484.4674628525972
1163.1778304055333
2012.965832836926
450.08293079584837
737.2825246751308
67.00061827152967
152.47247257083654
52.833100236952305
494.35799553990364
195.39127988368273
877.8383002579212
0.0
216.78968165442348
0.0
181.72135204076767
1397.5059511885047
46.56705071032047
1287.7847918644547
111.57549580186605
220.33785559237003
143.66187676787376
25.722399353981018
9.48097026348114
27.985035985708237
70.20083712786436
2544.092221483588
170.52489399164915
158.83273332566023
85.08113570511341
222.57898870110512
170.67164503782988
37.22524216026068
261.881646476686
0.0
0.0
0.0
61.811977952718735
0.0
179.33761210739613
92.32967705279589
2.2976309582591057
351.1797526553273
56.93734888732433
0.0
25.64692808687687
0.0
0.0
0.0
146.03149788826704
53.43040473759174
149.2254381775856
49.426870949566



Elapsed time to compute best fit: 19.206 seconds
Cross-validation score: 0.5125171660487903
Test score: 0.42713567839195976
Best Hyperparameters: {}
712.9303293786943
11611.77997712791
4485.436182949692
1980.533368781209
2991.028120137751
261.2978833913803
113.09585542976856
42.755645878612995
56.823852613568306
104.60998462885618
917.7841900065541
1292.1229596585035
87.22108415514231
0.0
168.21525056660175
0.0
64.77933009713888
1182.5168602317572
215.46045178174973
687.2893946543336
133.5617711097002
64.68322338163853
1256.2971857637167
25.486299961805344
167.38336238265038
18.887489557266235
120.74574252963066
2516.06532792747
129.9954883903265
208.69709078222513
189.60154108703136
1416.3699000179768
243.02538055181503
3.4047539830207825
150.71917836368084
0.0
0.0
0.0
347.2174022421241
0.7692539840936661
668.8937317878008
755.2825531512499
1.032979965209961
158.7068872153759
421.3877668976784
5.662460088729858
121.42545226216316
0.0
0.0
0.0
123.74649780243635
61.04271610826254
141.78



Elapsed time to compute best fit: 20.181 seconds
Cross-validation score: 0.47813477106454727
Test score: 0.482233502538071
Best Hyperparameters: {}
1380.758540071547
5942.982654813677
2792.3509957045317
1779.3315336108208
208.75074153393507
144.0307492017746
82.71384100615978
229.74195437133312
23.8151064068079
90.76065162569284
147.25578790903091
2074.765208773315
801.6711453348398
0.0
163.1227879524231
0.0
473.54824064671993
5541.312488466501
596.6982495188713
1980.3997002989054
486.1029304713011
31.040303274989128
144.53055871278048
17.85973309725523
6.342812821269035
12.750129669904709
351.46304470300674
1891.1013624221087
198.42088174819946
215.55754506587982
23.50719678401947
151.60779245197773
822.4329065009952
62.760676860809326
327.83652321994305
0.0
0.0
0.0
400.4392766803503
28.91892296075821
24.78341330587864
125.21128863096237
17.660821735858917
1068.7801321595907
31.11034658551216
0.0
981.1224896013737
0.0
0.0
0.0
41.041285157203674
473.9581022784114
174.78638200461864
60.



Elapsed time to compute best fit: 19.994 seconds
Cross-validation score: 0.49753057389401467
Test score: 0.48872180451127817
Best Hyperparameters: {}
747.6966179050505
9013.136707503349
3305.466642856598
771.9701891094446
1959.175547413528
122.57713723182678
35.49004869163036
70.03094682842493
14.843184046447277
63.66341404616833
480.88322319835424
212.51380564272404
1492.9352016597986
0.0
19.667439505457878
0.0
166.00449034571648
97.24637042731047
83.63293416053057
2919.3545137047768
2143.414547652006
24.295209519565105
150.5928326845169
140.82470355927944
15.043499946594238
17.54285103082657
138.7295415252447
918.9936081543565
130.92733013629913
124.2946617603302
35.34093241393566
68.51825450360775
363.7451194599271
78.00059704482555
132.88229721784592
0.0
0.0
0.0
215.1746114268899
0.14262700080871582
272.9824561625719
343.9318820014596
86.71448361873627
120.5406685769558
1330.1247538030148
0.0
992.5860576927662
0.0
0.0
0.0
125.16944000124931
196.69171602278948
152.6453595161438
869.

## 4.2 Rebalancing Strategy - SMOTE

### 4.2.1 Random Forest

In [114]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

smote_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [['smote', SMOTE()],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    smote_randomforest_normalized_performance_df = pd.concat([smote_randomforest_normalized_performance_df, new_performance_df])
    
smote_randomforest_normalized_performance_df.to_csv("../data/05_model_output/smote_randomforest_normalized_performance_df.csv")



### 4.2.3 XGBoost

In [115]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


smote_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    smote_xgboost_normalized_performance_df = pd.concat([smote_xgboost_normalized_performance_df, new_performance_df])
    

smote_xgboost_normalized_performance_df.to_csv("../data/05_model_output/smote_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 663.625 seconds
Cross-validation score: 0.6420975745039922
Test score: 0.6701030927835052
Best Hyperparameters: {}
0.011878123
0.10058836
0.19551927
0.031478133
0.02844919
0.08031075
0.0018216533
0.003031354
0.0016200672
0.0011137606
0.02583761
0.056913733
0.010270053
0.0
0.0010911971
0.0
0.00662747
0.0025776783
0.026386417
0.0030010336
0.0015824548
0.0012287534
0.0061977655
0.0042924182
0.0011114543
0.003964141
0.00058897864
0.008731988
0.0047324826
0.0017008571
0.00046934633
0.0019943293
0.002145113
0.008065223
0.0012674461
0.0
0.0
0.0
0.0021944502
0.0018036191
0.0031078777
0.0033304705
0.02717927
0.002150641
0.0017921531
0.0044088117
0.003910546
0.0
0.0
0.0
0.00085441925
0.0017341938
0.0048809135
0.0023176814
0.0049065785
0.0007277916
0.0064976416
0.014326493
0.012036446
0.005486376
0.0057738046
0.004779802
0.0001473025
0.0011807694
0.0042384025
0.004414803
0.0030540412
0.00061645
0.013418433
0.004921072
0.0033908614
0.0070057516
0.0012868996
0.0096



Elapsed time to compute best fit: 672.568 seconds
Cross-validation score: 0.6376060961559314
Test score: 0.6536697247706422
Best Hyperparameters: {}
0.011297882
0.096010104
0.20570935
0.026239417
0.025053442
0.0059978585
0.0016295181
0.0012651039
0.00095437124
0.00032518452
0.030336455
0.05289409
0.009250457
0.0
0.0025872933
0.0
0.0043104314
0.0029895927
0.012062321
0.06262352
0.0024248248
0.0011922544
0.013108087
0.0014015378
0.003893098
0.0009309101
0.00038339256
0.0076236594
0.0038199842
0.002278284
0.0023390646
0.0024440514
0.005355797
0.011602134
0.0023100898
0.0
0.0
0.0
0.0032232252
0.00099865
0.00402368
0.0034475536
0.041224375
0.002705556
0.0135206
0.011926099
0.009901318
0.0
0.0
0.0
0.006268467
0.011379732
0.00058668986
0.012608101
0.009029655
0.001554678
0.00075164443
0.019274648
0.017126951
0.008886805
0.007868234
0.004025392
0.0018101424
0.0014514265
0.008281814
0.014653221
0.0012265864
0.00201882
0.00073866156
0.0057862755
0.001998283
0.002925753
0.0009571369
0.00071533205



Elapsed time to compute best fit: 665.947 seconds
Cross-validation score: 0.6514332946705647
Test score: 0.6721698113207547
Best Hyperparameters: {}
0.00927044
0.089885294
0.18836495
0.029790966
0.012961983
0.0022840982
0.0025015536
0.001602978
0.00042394598
0.0023506486
0.027818276
0.15084122
0.012279744
0.0
0.0074590384
0.0
0.011076967
0.0016143069
0.01875748
0.0030980522
0.0021067557
0.0024015803
0.01382545
0.005291439
0.0053490032
0.0035334907
0.0033378354
0.0117648635
0.0009823939
0.0004981786
0.005466008
0.0009025842
0.0007061564
0.015307668
0.002428352
0.0
0.0
0.0
0.0017451384
0.0032116987
0.0049182232
0.002211867
0.017912563
0.0049574403
0.0016173236
0.0022587816
0.013848867
0.0
0.0
0.0
0.0025885934
0.0033766995
0.0016105713
0.0008114524
0.0074885823
0.0039239903
0.008137277
0.020547079
0.009384052
0.007983571
0.0070881755
0.0036127581
0.000103989725
0.0027225285
0.00092783425
0.0044637425
0.0044867694
0.0061742514
0.0021460315
0.0034430414
0.0025328516
0.004341438
0.0032301322



Elapsed time to compute best fit: 662.298 seconds
Cross-validation score: 0.6532504841155864
Test score: 0.6818181818181818
Best Hyperparameters: {}
0.02755478
0.09593582
0.18183821
0.03346843
0.029745564
0.0017222076
0.0010650444
0.002361418
0.0007362932
0.00081581797
0.03994163
0.062128104
0.026981402
0.0
0.011941131
0.0
0.0037863273
0.0011675248
0.017889848
0.011984398
0.0014401936
0.0010287986
0.0077296537
0.0033923388
0.006761368
0.006182481
0.0003019061
0.0056393896
0.004466205
0.0034999782
0.0046984344
0.001840263
0.003373809
0.0034804402
0.0017873342
0.0
0.0
0.0
0.003151526
0.0065314434
0.0049000215
0.0031784347
0.037935603
0.0053534615
0.00065106485
0.008935772
0.0053295526
0.0
0.0
0.0
0.0057100914
0.0012749747
0.0006294644
0.008175491
0.016252898
0.0063596247
0.0084359255
0.014813347
0.013779138
0.007573429
0.008000055
0.001831276
0.0013613371
0.0011306198
0.0058281953
0.006588903
0.0038109052
0.0005702893
0.0026221196
0.0054010716
0.0023100344
0.002492451
0.001571694
0.00102



Elapsed time to compute best fit: 657.274 seconds
Cross-validation score: 0.6467967506573776
Test score: 0.6311881188118812
Best Hyperparameters: {}
0.011906981
0.088186316
0.19730645
0.028587569
0.023306789
0.03233282
0.0019792165
0.0033168318
0.0005391092
0.000763073
0.030908963
0.057608087
0.02280255
0.0
0.00093183736
0.0
0.0038719845
0.004883747
0.010766893
0.0050131544
0.0014748242
0.00068363576
0.004454222
0.002497528
0.013285387
0.0037289793
0.00066071376
0.0054956656
0.0030044715
0.0008901335
0.0021983236
0.0017470653
0.0032744482
0.006856515
0.000878452
0.0
0.0
0.0
0.0027168393
0.0029619879
0.0049616713
0.0014691433
0.05972962
0.001163544
0.0019938978
0.016558
0.009432893
0.0
0.0
0.0
0.005581318
0.008740617
0.00083266094
0.00045797488
0.015286403
0.0034056427
0.009104642
0.020350192
0.005283551
0.008284724
0.010735249
0.0034890242
0.0024071534
0.00068536564
0.0077792215
0.020850327
0.0061308634
0.00093042845
0.0016510559
0.0018305841
0.0008763482
0.0023625237
0.0028902455
0.00



Elapsed time to compute best fit: 660.235 seconds
Cross-validation score: 0.6326434209515845
Test score: 0.7311320754716981
Best Hyperparameters: {}
0.013906387
0.09873058
0.20388089
0.033569403
0.030735439
0.08174936
0.0025325757
0.0022199808
0.0005151115
0.0005471438
0.026774812
0.053080298
0.019719666
0.0
0.0016810783
0.0
0.005959087
0.0020233605
0.009284419
0.0039719823
0.002790469
0.0025701625
0.010974233
0.005380741
0.0034245988
0.0016297436
0.0011411977
0.008034064
0.005301416
0.0017523923
0.0033561836
0.000880117
0.0020697562
0.010710552
0.0006957126
0.0
0.0
0.0
0.0023509143
0.0014209847
0.002554811
0.004480884
0.031144088
0.0031158903
0.0011338273
0.00019744023
0.0051151696
0.0
0.0
0.0
0.002463145
0.0012903138
0.0052759713
0.00089874415
0.0035869947
0.0030036776
0.0049005155
0.015331432
0.007609903
0.007971639
0.008378465
0.0036017243
0.0005471732
0.0012135691
0.0022521531
0.010568919
0.0004233529
0.000910838
0.0033869455
0.002146488
0.0025933816
0.004217347
0.00028011424
0.00



Elapsed time to compute best fit: 664.338 seconds
Cross-validation score: 0.6546144254789266
Test score: 0.6432038834951457
Best Hyperparameters: {}
0.010397533
0.10357686
0.21994473
0.024143096
0.017490538
0.0049965587
0.004061687
0.0017498818
0.0005592304
0.0004719521
0.032706674
0.10123934
0.024589263
0.0
0.007347326
0.0
0.012686508
0.0026480886
0.012996923
0.010614214
0.0024607978
0.0032139332
0.005552322
0.00093636214
0.0034105836
0.00081668096
0.0013969468
0.012727787
0.002695234
0.0036208483
0.0022408538
0.0007454645
0.0008693422
0.01032805
0.0017743595
0.0
0.0
0.0
0.004179096
0.003517095
0.0024909605
0.0014307484
0.02976928
0.008628575
0.0023908077
0.002380048
0.01212847
0.0
0.0
0.0
0.0013445256
0.0016015712
0.0008314126
0.0008925807
0.009094343
0.0023695577
0.005850186
0.008140494
0.016396437
0.01081321
0.0056606545
0.0013872581
0.0010178307
0.0015790896
0.0060620103
0.008149532
0.0037944303
0.0024682034
0.0044520483
0.005535684
0.0020295703
0.004433566
0.000986688
0.000407482



Elapsed time to compute best fit: 659.648 seconds
Cross-validation score: 0.6478127460596422
Test score: 0.720108695652174
Best Hyperparameters: {}
0.03653324
0.090033494
0.22091429
0.045068845
0.017750783
0.037343714
0.006203773
0.0032374978
0.0009983584
0.0013520565
0.02565747
0.040801723
0.010201249
0.0
0.0017017548
0.0
0.004491311
0.00092573307
0.017302644
0.013137078
0.0008612245
0.0006478315
0.012441012
0.0031760163
0.010028935
0.0042425776
0.0014973084
0.005628099
0.0037176206
0.0008660484
0.0030192763
0.0015378458
0.0013118347
0.011086289
0.002694075
0.0
0.0
0.0
0.0054669296
0.0022369802
0.00479736
0.0036540083
0.024240486
0.004470257
0.00020330807
0.0022571015
0.007535106
0.0
0.0
0.0
0.0010978581
0.00841346
0.0005842013
0.0001882662
0.010226283
0.018299282
0.002256528
0.019432712
0.01134117
0.008586896
0.005235602
0.0014146473
0.0011543917
0.0014979733
0.00078116666
0.019322481
0.0019943232
0.0007852049
0.0027905598
0.0041369055
0.00084150134
0.004759493
0.004003252
0.00788492



Elapsed time to compute best fit: 666.584 seconds
Cross-validation score: 0.6651366538739877
Test score: 0.6674757281553398
Best Hyperparameters: {}
0.016961526
0.106405124
0.22766663
0.029940004
0.036206577
0.02146698
0.0024271808
0.0016087842
0.0022671039
0.00046528905
0.026541311
0.06661179
0.0153704835
0.0
0.0023161808
0.0
0.0048622857
0.0031965922
0.019005574
0.0061668903
0.0045181494
0.0020080796
0.0046803886
0.020048484
0.0021353513
0.0010551403
0.00087750336
0.0075771036
0.009432183
0.00042152536
0.0024029135
0.0018986093
0.00056954275
0.010473995
0.0018968544
0.0
0.0
0.0
0.0016070289
0.005627422
0.004165369
0.0033773445
0.038035873
0.002613168
0.0008458296
0.0
0.011647004
0.0
0.0
0.0
0.0025655997
0.0012863493
0.0011770383
0.001376407
0.011060597
0.015889123
0.003120375
0.007725865
0.008030551
0.010310544
0.00400998
0.0023707757
0.00056344835
0.0019878
0.00089279667
0.0004899864
0.004392704
0.0060285702
0.0013796864
0.0027573004
0.0038589274
0.0045339293
0.0053230845
0.00500051



Elapsed time to compute best fit: 667.403 seconds
Cross-validation score: 0.632945561403075
Test score: 0.6531531531531533
Best Hyperparameters: {}
0.003410815
0.09813422
0.22734116
0.025668968
0.038130652
0.032609574
0.00087922945
0.0024176526
0.002737628
0.0036856397
0.015125338
0.071699634
0.011658224
0.0
0.0028634176
0.0
0.0015123636
0.0028593554
0.022009876
0.008690069
0.0017786649
0.001524624
0.013452942
0.00086393143
0.0071783615
0.00022409951
0.0008807312
0.00421556
0.0012932637
0.0027935978
0.005595934
0.0011565819
0.0006699436
0.012208965
0.002118709
0.0
0.0
0.0
0.0033069593
0.003376592
0.0036560718
0.0039761127
0.014761507
0.0037389623
0.0022667178
0.0009647275
0.003103639
0.0
0.0
0.0
0.0008541953
0.0018721749
0.0017155237
0.0014103386
0.007958393
0.01222445
0.0053603593
0.01141107
0.010700211
0.009863877
0.004905556
0.0025725588
0.0015505329
0.000602493
0.0020887873
0.01093656
0.0075380737
0.0076327724
0.0004992165
0.0015590136
0.0017763374
0.004960397
0.00059908844
0.00964



Elapsed time to compute best fit: 655.959 seconds
Cross-validation score: 0.6469167068561342
Test score: 0.625
Best Hyperparameters: {}
0.0077247764
0.08752979
0.22550768
0.032577466
0.020121101
0.00089513726
0.0013184594
0.00211956
0.00094218063
0.0004315715
0.023781091
0.111744575
0.034071703
0.0
0.007958115
0.0
0.015826697
0.0027894147
0.016011413
0.00473634
0.0020601293
0.0010241751
0.009819528
0.0012844817
0.013009408
0.002301167
0.0017833572
0.0025674195
0.0069735986
0.0018674516
0.006781326
0.0030080865
0.00499222
0.009406811
0.0017894795
0.0
0.0
0.0
0.0016387049
0.0057727955
0.004863437
0.0048276894
0.02445326
0.008124937
0.0018574062
0.00836207
0.011012413
0.0
0.0
0.0
0.008140071
0.003910287
0.0008174953
0.0008968748
0.0055302125
0.0031062118
0.001911511
0.013893401
0.004249385
0.0079086935
0.006528939
0.006165104
0.0040039555
0.00077687134
0.00251883
0.0071862056
0.0042791916
0.0010106451
0.0031832089
0.0014560372
0.0028337317
0.0018658998
0.0004361542
0.010527623
0.003448037



Elapsed time to compute best fit: 656.175 seconds
Cross-validation score: 0.6671519413974822
Test score: 0.6119791666666666
Best Hyperparameters: {}
0.043256193
0.088356026
0.22120972
0.038148873
0.011872834
0.019064274
0.0008847054
0.0024239204
0.0010452787
0.0051181577
0.03371898
0.04191255
0.008746375
0.0
0.0010734674
0.0
0.0031783644
0.003069553
0.010598521
0.00548148
0.004356482
0.0016131046
0.004379232
0.00041812073
0.0003383927
0.0015280914
0.0010093404
0.008844463
0.0009510875
0.0014847746
0.0071766754
0.002160852
0.0009994351
0.011088319
0.0015122897
0.0
0.0
0.0
0.0043143462
0.0021377886
0.00417598
0.0022488076
0.03287959
0.0015207934
0.00032770116
0.00063922064
0.009455753
0.0
0.0
0.0
0.002928236
0.008581917
0.0014201469
0.03569832
0.009188609
0.00074997084
0.0014785997
0.008842062
0.023629239
0.00970646
0.010361417
0.0023539483
0.0010556424
0.002968329
0.0014119424
0.003870553
0.0012766776
0.005682495
0.005966062
0.005143473
0.0027509236
0.004903217
0.010175235
0.009210248
0



Elapsed time to compute best fit: 657.420 seconds
Cross-validation score: 0.6473358069590601
Test score: 0.6124999999999999
Best Hyperparameters: {}
0.003093268
0.0953975
0.20203444
0.015741779
0.035809223
0.030966295
0.0008033534
0.002373103
0.0022068024
0.0012200257
0.02092731
0.097122274
0.03210847
0.0
0.0011128866
0.0
0.013049401
0.0043983716
0.022203466
0.0012849473
0.0012796285
0.0009347246
0.014674179
0.004856703
0.009340027
0.00067604583
0.004158576
0.01016818
0.0026916869
0.0014557587
0.000772838
0.0017447176
0.0006789979
0.007487378
0.001484329
0.0
0.0
0.0
0.0030509036
0.0021478054
0.0065555386
0.0032952707
0.0353973
0.0053647254
0.0027299835
0.0
0.006633918
0.0
0.0
0.0
0.0018166967
0.00183894
0.0003649781
0.001454044
0.0053701345
0.0011372861
0.0008134915
0.019895459
0.015721304
0.010003775
0.0072005233
0.0008665168
9.716263e-05
0.0013422522
0.0028479556
0.010268645
0.0034113538
0.00090186246
0.0044652643
0.0015949514
0.0014614984
0.0031645193
0.0011402711
0.005112663
0.0014



Elapsed time to compute best fit: 664.702 seconds
Cross-validation score: 0.6407322824171509
Test score: 0.701530612244898
Best Hyperparameters: {}
0.03706293
0.08780824
0.19857125
0.031836383
0.029079862
0.0032574546
0.0020059873
0.0013621558
0.00093737675
0.00065444026
0.018141286
0.102885164
0.008139182
0.0
0.0035190121
0.0
0.0054315845
0.0028098014
0.013914039
0.005018177
0.003030354
0.0028369536
0.0057716067
0.007030936
0.0036373974
0.0020991927
0.0027994437
0.0051974393
0.00049824256
0.0013900639
0.0023200437
0.0037154765
0.0025657828
0.007729407
0.002635494
0.0
0.0
0.0
0.006877392
0.0055339914
0.0013536453
0.0044893897
0.027411696
0.006094842
0.011325577
0.0028078053
0.00791718
0.0
0.0
0.0
0.0014910374
0.0127653275
0.0020723199
0.0015749761
0.0053536687
0.020565452
0.0022736997
0.026649151
0.00883947
0.013084489
0.007821778
0.0057666237
0.005320214
0.0007027714
0.0008700357
0.011460488
0.0018578202
0.0005012678
0.004643231
0.0049649766
0.0023969817
0.0042702365
0.0014498663
0.00



Elapsed time to compute best fit: 659.133 seconds
Cross-validation score: 0.6610327394419031
Test score: 0.6760204081632653
Best Hyperparameters: {}
0.013186141
0.089492
0.19653271
0.02849849
0.01391347
0.02391099
0.0011704598
0.00044537892
0.0010604016
0.0019006425
0.030372273
0.13316196
0.009073022
0.0
0.007702075
0.0
0.0017717846
0.002475886
0.016485563
0.005538276
0.0019829078
0.00068783556
0.008396161
0.00091376016
0.0012760012
0.003297683
0.0036897834
0.00749909
0.004870837
0.0036204627
0.004691469
0.0018806478
0.005332409
0.008576219
0.00087147014
0.0
0.0
0.0
0.0052119154
0.006520438
0.0028175886
0.0033244255
0.034637768
0.0034894105
0.00018553456
0.01889726
0.01435969
0.0
0.0
0.0
0.0044737197
0.0011310562
0.0013244296
0.0034332501
0.00755703
0.002108481
0.00080783834
0.019521339
0.009599196
0.00735541
0.004226973
0.0015446465
0.0006439661
0.0014542947
0.0014210059
0.013610233
0.0045834198
0.0012038648
0.0029684785
0.0030245015
0.001212138
0.0032012938
0.0014147899
0.003980042
0



Elapsed time to compute best fit: 658.286 seconds
Cross-validation score: 0.6196669067691287
Test score: 0.675
Best Hyperparameters: {}
0.018834442
0.093153976
0.18842854
0.031368233
0.03155083
0.0024783178
0.007422162
0.0027831034
0.003953716
0.0038050837
0.03084237
0.094663985
0.018101463
0.0
0.0013980326
0.0
0.0055742497
0.001979548
0.013778867
0.0077796527
0.0033227988
0.0009108387
0.009863843
0.00082579337
0.0064969114
0.00069275
0.0019986383
0.006816205
0.006292032
0.0019404518
0.003746017
0.004512359
0.0010360887
0.0052306573
0.0006471189
0.0
0.0
0.0
0.002183426
0.002431286
0.0031108009
0.0025275848
0.02203704
0.0023619626
0.00048615152
0.023086654
0.008957611
0.0
0.0
0.0
0.005771474
0.008060712
0.0015278633
0.0066860924
0.01187035
0.0027096467
0.0016429839
0.01365306
0.008365726
0.008572018
0.0055543478
0.002585619
0.0075871595
0.0016544879
0.002332173
0.008595574
0.003790983
0.0011887926
0.008027726
0.0031529455
0.0012624129
0.0038573078
0.00068162236
0.00090916624
0.002629858



Elapsed time to compute best fit: 660.047 seconds
Cross-validation score: 0.6663015237551053
Test score: 0.7180851063829787
Best Hyperparameters: {}
0.017608248
0.09205379
0.19321683
0.023158371
0.026598914
0.08904354
0.0014804621
0.00028811418
0.006248648
0.0025798187
0.025444446
0.08665414
0.012862779
0.0
0.00057361677
0.0
0.0062070503
0.0024339552
0.01538429
0.0031541097
0.0023435026
0.0010170917
0.0059988135
0.0003792352
0.005189653
0.0041290116
0.0005048798
0.0072524673
0.0017394051
0.0019567143
0.0044737044
0.0016961447
0.00041869484
0.0048668473
0.0013437569
0.0
0.0
0.0
0.0009945893
0.006912284
0.002351137
0.0043028514
0.018931672
0.0049879663
0.00034917405
0.0
0.013564598
0.0
0.0
0.0
0.011245798
0.0014073041
0.0024100456
0.0010393873
0.006820734
0.004027752
0.0033438206
0.01202311
0.006693515
0.00881533
0.0125338305
0.0017392702
0.0008846463
0.00091287715
0.0009795969
0.004453337
0.005082385
0.0011829828
0.00809688
0.005581741
0.0020761157
0.0037711163
0.002688389
0.008723557
0



Elapsed time to compute best fit: 660.262 seconds
Cross-validation score: 0.663011985819313
Test score: 0.675
Best Hyperparameters: {}
0.02399635
0.0938005
0.20829313
0.036637455
0.01527619
0.0010681037
0.0026749962
0.0016803801
0.003953934
0.0005324871
0.033042062
0.07804956
0.02981752
0.0
0.0024158412
0.0
0.0019749987
0.0025503112
0.013061241
0.0033124234
0.0028891566
0.00021862067
0.0072706356
0.0014118741
8.698091e-05
0.0018846769
0.0017564152
0.0043868707
0.008024512
0.0020248762
0.0037656361
0.0011454568
0.00040371084
0.0070382766
0.0010382367
0.0
0.0
0.0
0.0055560204
0.008884462
0.004179191
0.0063231937
0.017363645
0.005220194
0.005253607
0.023040237
0.00646096
0.0
0.0
0.0
0.0022689293
0.0020417199
0.0009863921
0.0017373817
0.009849969
0.0011954465
0.0050062197
0.017087825
0.010365908
0.007209133
0.004897671
0.002856657
0.005248481
0.0001932913
0.0037308154
0.016214522
0.004343955
0.0006480864
0.002539015
0.0052895793
0.003482404
0.004090231
0.004001267
0.0005783252
0.0023136898



Elapsed time to compute best fit: 678.319 seconds
Cross-validation score: 0.6370825509526752
Test score: 0.6536697247706422
Best Hyperparameters: {}
0.012241607
0.09301522
0.19691677
0.027454453
0.02934836
0.081637435
0.0028127704
0.001092807
0.0015181893
0.00527951
0.021789694
0.05983691
0.0058493274
0.0
0.0013561901
0.0
0.006497842
0.0081450185
0.0169046
0.0018830759
0.00084411167
0.000732966
0.006000103
0.002800548
0.0053786603
0.0051132506
0.002793912
0.004628997
0.0015617452
0.0025655052
0.0010804349
0.0024749709
0.001156622
0.0077733058
0.0017556389
0.0
0.0
0.0
0.005105602
0.003177003
0.0071796356
0.004256237
0.025772441
0.0039351126
0.0009450625
0.00013941691
0.013179204
0.0
0.0
0.0
0.0035637913
0.011613434
0.002156082
0.0013686006
0.0016344811
0.006991898
0.0055909595
0.018390767
0.009429796
0.012386491
0.008670843
0.0024118342
0.003037259
0.0010675505
0.0030664594
0.022331765
0.0010451125
0.0026463787
0.00054431066
0.007382062
0.0017842114
0.00344416
0.0015598695
0.0015986525




Elapsed time to compute best fit: 661.793 seconds
Cross-validation score: 0.6706154005385733
Test score: 0.6887755102040817
Best Hyperparameters: {}
0.012765294
0.11116887
0.18532334
0.03532362
0.019218706
0.02578658
0.001954018
0.0014297946
0.003386973
0.0004302536
0.03576862
0.09141466
0.027544072
0.0
0.0066325353
0.0
0.0049903374
0.0013384171
0.010805309
0.008137795
0.0011197493
0.0008514594
0.013155435
0.0013716731
0.008573292
0.0004661403
0.009336813
0.0057958565
0.00516116
0.00060276676
0.0012701045
0.0013996599
0.001171882
0.0105578285
0.0038805618
0.0
0.0
0.0
0.004064998
0.0036314866
0.0064781057
0.003955399
0.026900837
0.004336412
0.0017809976
0.003342518
0.011942722
0.0
0.0
0.0
0.002129706
0.0065406915
0.0031634332
0.0010612096
0.009878505
0.0027430414
0.002479173
0.01885956
0.0114831235
0.009513056
0.004239341
0.0028402777
0.00522667
0.00093594694
0.0019448978
0.013439705
0.0011081372
0.006690444
0.00071690057
0.0018922052
0.0023081675
0.0022286084
0.001014853
0.0028370027
0



Elapsed time to compute best fit: 678.167 seconds
Cross-validation score: 0.6332226841701376
Test score: 0.6850961538461539
Best Hyperparameters: {}
0.023255602
0.09323889
0.23145421
0.032739233
0.02176578
0.027504776
0.0015009987
0.0013821282
0.0006398659
0.0004854515
0.0327105
0.042049844
0.015802609
0.0
0.0061741783
0.0
0.0025106557
0.0043045734
0.021040084
0.009282161
0.0037812628
0.0011416569
0.0078357
0.0012167719
0.004926169
0.00444388
0.002045958
0.0057881
0.00080790184
0.0009314706
0.0018883392
0.0028479234
0.00078134186
0.006637866
0.0029836912
0.0
0.0
0.0
0.0035822159
0.0021820937
0.0038754009
0.0059229303
0.022538727
0.0049106916
0.0016681829
0.039835557
0.0069560967
0.0
0.0
0.0
0.004187723
0.009759216
0.0011598463
0.0036904428
0.007976904
0.0022135014
0.0012927954
0.012556363
0.0062207133
0.00957397
0.0054272963
0.0037718143
0.00048902334
0.0101763345
0.0023096765
0.0014174654
0.0045610717
0.006929214
0.0029850292
0.0028193144
0.0023316212
0.002999727
0.00030725892
0.00231



Elapsed time to compute best fit: 667.333 seconds
Cross-validation score: 0.6218207932766767
Test score: 0.6919642857142858
Best Hyperparameters: {}
0.038172893
0.10049602
0.21444787
0.038095552
0.020837532
0.0010726013
0.0030174318
0.0032399034
0.0013083
0.0003757484
0.024456568
0.027321428
0.016259177
0.0
0.0049861153
0.0
0.0025141356
0.004342344
0.014296658
0.03660324
0.002211562
0.0020406141
0.0132500315
0.001039072
0.0069357837
0.0025985122
0.006723508
0.009246976
0.00822081
0.0020393038
0.004905396
0.0022542367
0.005243408
0.011769875
0.0033545264
0.0
0.0
0.0
0.0069267824
0.0047548823
0.0036057474
0.0022591932
0.02832798
0.0051994617
0.0038120567
0.009539868
0.00795109
0.0
0.0
0.0
0.0018638244
0.0011505334
0.0009361743
0.0011542477
0.0035295887
0.0012117125
0.00056349265
0.01124793
0.014563279
0.013595806
0.0058285557
0.0052032345
0.027634887
0.00056930736
0.0042233416
0.0054197623
0.0011709249
0.0024514683
0.0018717669
0.0041334177
0.0019862454
0.003077944
0.0013723662
0.0009663



Elapsed time to compute best fit: 660.638 seconds
Cross-validation score: 0.6267998886351906
Test score: 0.7500000000000001
Best Hyperparameters: {}
0.011133523
0.09235101
0.22547288
0.033511225
0.025574882
0.10670096
0.00121964
0.0006137831
0.0021192369
0.00033024384
0.023436796
0.029274084
0.011587003
0.0
0.00070203154
0.0
0.008977943
0.001052977
0.023903804
0.001937455
0.001980219
0.000980371
0.009535132
0.0007993925
0.0018074895
0.000719343
0.0033256814
0.005522686
0.0012497823
0.0009425414
0.0027850103
0.0011264363
0.0009323867
0.0071382914
0.0009982041
0.0
0.0
0.0
0.0020491593
0.0035972602
0.001409172
0.0018775802
0.027263783
0.0017993428
0.0031210424
0.008151898
0.01282661
0.0
0.0
0.0
0.0022580193
0.003129476
0.00091311376
0.00044474253
0.0067330766
0.0037450858
0.0012870228
0.011492186
0.008408239
0.008214528
0.0073475624
0.0051281867
0.0042055175
0.0028168876
0.000622855
0.00056325714
0.0029130387
0.003053437
0.0015125404
0.0044917725
0.004439656
0.004615019
0.003297512
0.0093



Elapsed time to compute best fit: 656.161 seconds
Cross-validation score: 0.6478603670075762
Test score: 0.5934343434343434
Best Hyperparameters: {}
0.009800954
0.08284971
0.21294306
0.034774117
0.019937616
0.021332985
0.0024092153
0.0023521658
0.0038105797
0.00036773976
0.015603283
0.14874502
0.021542093
0.0
0.007862381
0.0
0.009601559
0.0021496885
0.025231242
0.0054906304
0.0015516224
0.0017733262
0.0046471986
0.008996353
0.00173111
0.003841203
0.0017626487
0.0075494167
0.0004817473
0.0020106153
0.0008380546
0.0007248635
0.00051244063
0.006164692
0.00096636853
0.0
0.0
0.0
0.0020302539
0.004840903
0.00064177206
0.00089489715
0.03825127
0.007883585
0.0006623265
0.0
0.0066205687
0.0
0.0
0.0
0.0035203963
0.006791454
0.0027735499
0.0010519829
0.008834258
0.0033590982
0.0012905069
0.008714762
0.0029816309
0.008597955
0.0046918355
0.0034847595
0.0012921054
0.0011559774
0.0007041197
0.0003350765
0.0052593485
0.006861698
0.003059305
0.0020212824
0.0023787734
0.0042144903
0.002059314
0.0007323



Elapsed time to compute best fit: 688.682 seconds
Cross-validation score: 0.6335118390221386
Test score: 0.7270408163265307
Best Hyperparameters: {}
0.012262308
0.09309683
0.20569246
0.04126832
0.012855923
0.004030625
0.00074822834
0.0012536297
0.0016267259
0.00088225584
0.044110756
0.07497675
0.010329008
0.0
0.0068469336
0.0
0.0038934716
0.0026030038
0.029338174
0.011290139
0.0024762335
0.0015618147
0.012846513
0.0025440275
0.0008868612
0.0021246746
0.0022816546
0.007984354
0.0019249249
0.00051330676
0.0029216167
0.0018359238
0.0010399822
0.013001781
0.0015669272
0.0
0.0
0.0
0.0028808648
0.0011677834
0.0033572079
0.0030147906
0.036045443
0.004823846
0.00043898146
0.005063768
0.010263588
0.0
0.0
0.0
0.001871116
0.0072012106
0.0016616557
0.00015693458
0.005525431
0.0020229926
0.0019995263
0.02041798
0.0054919715
0.011705295
0.008192168
0.0046254774
0.0005291036
0.0046204375
0.003890237
0.009353341
0.0036432147
0.0017888402
0.0012957772
0.0016597209
0.004168266
0.004257311
0.016441636
0.

### 4.2.4 LightGBM

In [116]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

smote_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    smote_lightgbm_performance_normalized_df = pd.concat([smote_lightgbm_performance_normalized_df, new_performance_df])
    

smote_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/smote_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 47.271 seconds
Cross-validation score: 0.6651982301103263
Test score: 0.6208425720620843
Best Hyperparameters: {}
37269.607696056366
261046.40476846695
462971.9622955322
33384.93755722046
12694.711158752441
18737.02147769928
444.00741958618164
244.32965850830078
488.9177746772766
17.594170093536377
20772.30537891388
5997.105810642242
13919.585157871246
0.0
686.9189414978027
0.0
3425.3514790534973
502.95841360092163
2548.957640647888
3514.6406536102295
98.31614923477173
287.34751415252686
3880.720860481262
598.9390449523926
2399.768335342407
259.7451992034912
220.11283016204834
2448.7377281188965
194.388596534729
412.9979977607727
1248.3417897224426
781.503448009491
895.1760411262512
7600.385234832764
860.6148390769958
0.0
0.0
0.0
414.97322130203247
1026.205822467804
626.6276955604553
936.0289297103882
23067.829770088196
1152.3444681167603
45.37786912918091
94.87573194503784
1035.2208762168884
0.0
0.0
0.0
1988.575689792633
3322.509639263153
1242.1847734



Elapsed time to compute best fit: 42.756 seconds
Cross-validation score: 0.6633771527968733
Test score: 0.6627906976744187
Best Hyperparameters: {}
25776.387248039246
249348.91211652756
435661.1815047264
33880.03420519829
10216.525091648102
3988.4183440208435
1692.262921333313
269.6616973876953
92.46243190765381
94.94060850143433
25581.00134420395
55626.94676589966
6409.610678195953
0.0
913.175742149353
0.0
1341.7359566688538
584.3759660720825
5164.657745838165
23.220720291137695
219.50427150726318
313.8491973876953
3395.4969234466553
432.92065715789795
490.8462038040161
371.03284645080566
339.0652298927307
4355.354782581329
518.0507102012634
138.17543029785156
2657.785861492157
526.7532434463501
1094.3273329734802
6029.57425069809
1298.6017937660217
0.0
0.0
0.0
1431.0938386917114
520.4662203788757
1663.0870490074158
1263.8090138435364
28732.44468975067
978.5031366348267
113.39810180664062
77.44042158126831
306.8230652809143
0.0
0.0
0.0
850.9039006233215
5347.768787384033
131.108988761



Elapsed time to compute best fit: 41.690 seconds
Cross-validation score: 0.6316206945129383
Test score: 0.6705882352941176
Best Hyperparameters: {}
17904.638590812683
261313.71141004562
426168.74907541275
30564.471714019775
16802.940900325775
443.91959285736084
372.63649463653564
327.7172861099243
171.32572221755981
197.62897729873657
26456.040015220642
59236.79150915146
7071.53219127655
0.0
1327.224904537201
0.0
1030.753589630127
394.6865072250366
6960.246623516083
2436.1796002388
276.25071835517883
294.7524585723877
2585.6368193626404
724.1448011398315
1174.4783148765564
422.9957084655762
1624.5219411849976
5354.560347080231
1927.4187240600586
1890.0422205924988
1164.4819583892822
432.20323753356934
1599.6262273788452
7001.600757598877
1961.2286944389343
0.0
0.0
0.0
1064.7697229385376
595.742561340332
2636.2090673446655
1116.862196445465
16107.726251602173
985.6840009689331
236.7806224822998
227.86997509002686
2199.5818195343018
0.0
0.0
0.0
2841.9790625572205
4259.905261516571
408.70



Elapsed time to compute best fit: 45.891 seconds
Cross-validation score: 0.6552207207074024
Test score: 0.6570155902004454
Best Hyperparameters: {}
12636.187673568726
254649.63294744492
482251.5922598839
25051.449955940247
39279.50655698776
210.60862922668457
232.05518770217896
293.32733488082886
303.91529989242554
77.10951042175293
18670.462157726288
18635.447313308716
7499.747295379639
0.0
905.7468252182007
0.0
1867.166757106781
405.54218435287476
13827.949680805206
2053.4825162887573
179.9519271850586
72.56893873214722
3191.284384727478
1598.032099723816
531.6108465194702
75.15290117263794
1362.2000422477722
4159.029299259186
146.9374499320984
840.5681164264679
292.0471701622009
847.8103342056274
582.559280872345
7497.303552150726
260.91339015960693
0.0
0.0
0.0
1139.6500911712646
223.7164821624756
834.6975154876709
1825.7268261909485
12595.152756690979
1139.9196124076843
244.8585557937622
725.2409172058105
2119.448571205139
0.0
0.0
0.0
1858.1454586982727
1206.672667503357
565.513226



Elapsed time to compute best fit: 43.484 seconds
Cross-validation score: 0.6361756593103448
Test score: 0.6695464362850972
Best Hyperparameters: {}
14073.074268341064
264362.70280122757
436470.89112472534
32075.270758152008
9884.745722770691
880.9267678260803
1038.58935546875
208.4769992828369
242.72917079925537
360.3900408744812
27195.15700483322
65980.91881990433
1706.1117248535156
0.0
1073.3074598312378
0.0
440.76442289352417
426.3827543258667
18749.80916929245
539.7517743110657
413.6213526725769
61.59649848937988
2716.6492776870728
444.7959108352661
1314.0727610588074
192.71432065963745
975.7110424041748
3191.1308908462524
469.7307620048523
3007.307951927185
308.7461504936218
469.0332360267639
571.9092435836792
6624.047168254852
765.6054983139038
0.0
0.0
0.0
1165.4556198120117
415.60148096084595
3156.3388266563416
779.3824400901794
14562.020275592804
473.4069790840149
261.62677526474
385.469500541687
2056.7590007781982
0.0
0.0
0.0
1774.3011946678162
7739.368684768677
507.9336667060



Elapsed time to compute best fit: 44.955 seconds
Cross-validation score: 0.6490548002048867
Test score: 0.7666666666666666
Best Hyperparameters: {}
17664.946454048157
267109.7153019905
444541.509547472
23123.64053440094
10495.242426872253
3119.910886287689
487.67892360687256
126.97999954223633
334.7329030036926
85.11837911605835
32098.403900146484
43147.55991077423
6901.875765323639
0.0
358.56741523742676
0.0
1121.2536873817444
458.57441759109497
6035.528380870819
1997.3480739593506
457.77025985717773
163.1604242324829
3316.274101257324
152.93912982940674
591.2333326339722
312.63415336608887
193.82455158233643
3763.698121547699
102.06909132003784
659.94801902771
754.8550248146057
2251.7920327186584
1328.7691683769226
4332.22022151947
955.6615772247314
0.0
0.0
0.0
1218.348223209381
652.3399682044983
1791.2683453559875
888.0100040435791
27984.896342754364
1235.152142047882
449.40413665771484
284.4972620010376
669.3286051750183
0.0
0.0
0.0
2551.003047466278
4801.067378520966
387.330700397



Elapsed time to compute best fit: 43.223 seconds
Cross-validation score: 0.6369886450363254
Test score: 0.7189542483660132
Best Hyperparameters: {}
14850.572010993958
271123.01217365265
424449.00734329224
31469.368318080902
22751.361711025238
1048.9632325172424
151.86715745925903
551.0829339027405
758.8185653686523
228.88764572143555
25933.31528520584
52545.39035463333
5193.142411231995
0.0
2564.30428314209
0.0
2457.4766025543213
979.6937909126282
10453.122702121735
1581.0236945152283
548.7194023132324
158.94306707382202
3968.00088596344
1136.0925312042236
889.2805066108704
344.04576206207275
508.9332890510559
2749.561336040497
605.4288325309753
972.5320792198181
1390.8292932510376
280.3556694984436
804.9042592048645
5573.630518436432
853.7292671203613
0.0
0.0
0.0
594.3456654548645
1049.0680584907532
268.98904037475586
2875.109399795532
14739.5974650383
1221.299087524414
495.20050382614136
835.9584093093872
1783.8097615242004
0.0
0.0
0.0
889.1617412567139
8794.376458644867
708.26125431



Elapsed time to compute best fit: 48.708 seconds
Cross-validation score: 0.6363752898384197
Test score: 0.6407322654462243
Best Hyperparameters: {}
7513.549520015717
276762.4040398598
479867.149600029
19441.55130624771
11334.453983306885
798.0209217071533
1054.3532190322876
392.5497508049011
80.84743022918701
137.1077332496643
19370.29305410385
4213.710512161255
5994.5767068862915
0.0
349.0087642669678
0.0
676.2189812660217
913.9543061256409
34383.78367424011
26375.81458044052
503.39436388015747
427.87362003326416
3061.287071943283
258.2168302536011
216.10056591033936
35.145700454711914
459.0252060890198
5245.633492469788
175.30772256851196
731.7880277633667
1540.369035243988
782.0898098945618
1520.8214263916016
5654.216742515564
586.4110321998596
0.0
0.0
0.0
1630.804247379303
589.7981505393982
708.2885208129883
887.8255486488342
5957.368784427643
919.5405840873718
211.6731128692627
165.62909984588623
688.7382864952087
0.0
0.0
0.0
724.2344822883606
690.7508807182312
455.14952659606934




Elapsed time to compute best fit: 43.331 seconds
Cross-validation score: 0.6352174405461939
Test score: 0.7094594594594594
Best Hyperparameters: {}
13860.99083852768
279043.88382434845
436437.77644610405
32570.551671504974
24909.19651699066
12791.204457521439
325.46949911117554
436.91775846481323
15.910599708557129
23.60849952697754
16689.867120742798
33998.30903816223
9618.895418167114
0.0
2397.1258416175842
0.0
4230.008321762085
796.1972250938416
9871.648934841156
1534.0253009796143
243.70867156982422
155.85915899276733
2229.3967685699463
783.0867457389832
564.3043489456177
54.389870166778564
535.356493473053
3399.420359134674
1164.9859838485718
380.9515142440796
2385.3666796684265
1795.0163278579712
919.21484375
5564.414425134659
1402.2762141227722
0.0
0.0
0.0
1714.5926237106323
310.48319911956787
816.3175525665283
1008.7137479782104
9006.067757606506
2673.9578280448914
194.03548765182495
264.5638999938965
1376.0582900047302
0.0
0.0
0.0
1406.5692257881165
1531.5903949737549
302.9768



Elapsed time to compute best fit: 44.321 seconds
Cross-validation score: 0.657517050869642
Test score: 0.6142241379310344
Best Hyperparameters: {}
18887.00879430771
275505.85545539856
425428.1547007561
30624.45784854889
14483.515372753143
12939.351431369781
861.8373203277588
243.82742834091187
17.487300872802734
10.219300270080566
29644.248540878296
39183.708753585815
5274.760531902313
0.0
494.6543664932251
0.0
1601.9676003456116
208.772038936615
9236.94619178772
1718.2618432044983
184.92390060424805
792.9359922409058
5956.706223487854
1231.4216918945312
715.5866279602051
196.95052242279053
566.2598595619202
5383.192672729492
156.52964782714844
303.42969274520874
2293.696669101715
428.9079065322876
1038.6375079154968
5089.4697732925415
1291.4311933517456
0.0
0.0
0.0
1105.5847840309143
239.52349758148193
2895.3719573020935
1694.0234112739563
13644.378039836884
2392.4505105018616
208.7035026550293
608.0289287567139
1112.9082870483398
0.0
0.0
0.0
541.3059515953064
7190.040511608124
246.10



Elapsed time to compute best fit: 43.190 seconds
Cross-validation score: 0.6671684767106782
Test score: 0.5518763796909493
Best Hyperparameters: {}
5592.465928554535
262943.24840402603
438009.18111658096
23475.167254924774
25380.02308177948
15080.607161998749
452.0190382003784
268.7166838645935
614.2919688224792
75.95596075057983
27079.491271972656
46975.63137483597
15610.1552901268
0.0
728.2115731239319
0.0
4755.859838485718
415.0685167312622
14490.311521053314
5379.229449272156
317.4532413482666
649.4482045173645
4291.8520584106445
109.65230131149292
640.2049102783203
600.9515228271484
718.2861542701721
2321.6037182807922
467.298166513443
179.20409965515137
611.7118740081787
215.23223876953125
531.1944694519043
6842.403801202774
1064.2358207702637
0.0
0.0
0.0
1453.5665502548218
142.79507207870483
3142.2401266098022
1125.00621175766
10059.163558959961
1036.2872796058655
138.52023220062256
443.7795376777649
1641.4082770347595
0.0
0.0
0.0
701.4932689666748
879.9730916023254
1140.2086093



Elapsed time to compute best fit: 44.114 seconds
Cross-validation score: 0.6431453088154934
Test score: 0.6264236902050114
Best Hyperparameters: {}
16435.670485973358
263211.1104848385
450990.38179421425
31372.491347312927
15334.152647018433
2672.521411895752
461.5456259250641
155.0163493156433
58.542420864105225
122.83199977874756
20669.877901792526
56505.97979545593
9546.995244026184
0.0
723.3561458587646
0.0
5272.327090263367
692.2383890151978
4738.784343719482
948.2762508392334
574.0293369293213
331.8731002807617
3033.4405674934387
389.417200088501
675.2729473114014
597.3985161781311
582.1705532073975
4281.264220714569
64.65756797790527
828.8544149398804
592.3760604858398
1372.1230697631836
1409.5970430374146
6763.17648935318
347.5094976425171
0.0
0.0
0.0
972.6142525672913
753.8922033309937
2554.2526059150696
488.7478322982788
10039.509203910828
1134.2620940208435
79.46235084533691
288.6373038291931
2135.44407081604
0.0
0.0
0.0
782.318519115448
4149.379794120789
363.43241596221924




Elapsed time to compute best fit: 44.621 seconds
Cross-validation score: 0.6743392205836987
Test score: 0.599128540305011
Best Hyperparameters: {}
12329.558814287186
252468.54682803154
455505.6142926216
26896.517308712006
23902.683785438538
28899.8144364357
216.61330842971802
273.64175271987915
146.12952995300293
40.04637050628662
32815.654994249344
8870.083209991455
10487.164425849915
0.0
1108.7177867889404
0.0
1790.2272419929504
217.89901304244995
9910.499341726303
1462.8304252624512
367.31743025779724
260.0432152748108
2569.9663124084473
1477.5446543693542
581.5684909820557
256.54876136779785
815.0053930282593
3612.005875110626
1145.7688488960266
160.49666023254395
1087.5975017547607
599.1822166442871
534.0181837081909
5635.022364139557
493.0286464691162
0.0
0.0
0.0
2462.307703971863
637.5887298583984
1672.8761529922485
777.6589527130127
17491.882469177246
446.01393818855286
403.56248664855957
239.82050323486328
948.4926972389221
0.0
0.0
0.0
533.1071152687073
5603.946776866913
867.6



Elapsed time to compute best fit: 44.444 seconds
Cross-validation score: 0.6398145599475122
Test score: 0.6947608200455581
Best Hyperparameters: {}
20618.668728351593
251562.78753089905
452083.6975927353
33968.470583200455
13121.527917861938
399.3432002067566
195.64411973953247
101.25476837158203
636.4642677307129
141.54349184036255
28619.58319759369
42800.35258436203
4345.329514980316
0.0
3843.064606189728
0.0
672.754994392395
767.4868679046631
11945.033064842224
2885.7958517074585
1009.0439281463623
38.16069984436035
4238.70755815506
153.14196014404297
967.396487236023
238.94757080078125
402.1835732460022
2198.548852920532
1264.743185043335
291.86544847488403
763.82066822052
552.528284072876
1191.0788292884827
6061.663763999939
860.0609006881714
0.0
0.0
0.0
1890.380337715149
584.4309601783752
548.4931707382202
728.1220970153809
17054.161137104034
1169.3312907218933
243.32064628601074
154.68090057373047
1167.2024993896484
0.0
0.0
0.0
2060.3886790275574
6730.251846790314
647.5264849662



Elapsed time to compute best fit: 46.569 seconds
Cross-validation score: 0.6540645868674357
Test score: 0.6877729257641921
Best Hyperparameters: {}
40966.386972904205
265301.6231713295
456557.7548456192
31236.08434510231
6915.980945587158
266.032657623291
993.7950940132141
146.852068901062
438.1850299835205
116.48482847213745
33509.16760587692
18878.787474632263
11749.32419204712
0.0
2315.924252986908
0.0
1310.395206451416
599.0148067474365
2499.9235429763794
2022.9300417900085
154.1865997314453
58.4289493560791
4327.169774055481
421.9744110107422
257.6172456741333
0.0
376.6602854728699
4757.829283237457
131.65680170059204
1748.3716230392456
1423.079008102417
405.8124647140503
493.0695562362671
4349.573250770569
422.14660930633545
0.0
0.0
0.0
1872.935348033905
342.86568784713745
1245.2397375106812
2071.681833267212
12739.85361623764
592.8742837905884
316.7412815093994
83.91875171661377
1977.2038412094116
0.0
0.0
0.0
2166.5637488365173
795.6858043670654
766.3254561424255
7308.1222004890



Elapsed time to compute best fit: 43.267 seconds
Cross-validation score: 0.6560392502747556
Test score: 0.6989247311827957
Best Hyperparameters: {}
15534.190348625183
247444.2051372528
464204.02266311646
38545.13648080826
21610.657280921936
194.06859064102173
150.4019694328308
729.7017331123352
163.91370725631714
140.69829940795898
23195.098262310028
46600.426619529724
6265.237218379974
0.0
2056.8912620544434
0.0
1350.5168361663818
336.98566007614136
7841.111341953278
1022.5274472236633
108.8519983291626
8.993880271911621
4963.307340145111
1479.4345655441284
248.50450706481934
554.5207033157349
2218.0251173973083
3682.955413341522
584.1881632804871
200.20108366012573
805.3572111129761
773.864068031311
851.5742120742798
7911.9453983306885
301.719847202301
0.0
0.0
0.0
1752.7756171226501
636.711498260498
926.4099659919739
940.8127684593201
13743.975033283234
1063.0445203781128
385.5436496734619
662.1576843261719
1825.9159832000732
0.0
0.0
0.0
763.5120658874512
960.2609157562256
898.165434



Elapsed time to compute best fit: 43.874 seconds
Cross-validation score: 0.6597052018311591
Test score: 0.592255125284738
Best Hyperparameters: {}
47931.3392367363
259294.2164490223
467233.2330131531
29126.701082229614
9681.635118961334
3291.4566259384155
847.4240710735321
317.1975522041321
183.55810976028442
33.34627914428711
23195.435772895813
14886.93369102478
14221.979503631592
0.0
275.5704345703125
0.0
480.254638671875
1003.5899958610535
6895.166296482086
1800.4125499725342
715.7736415863037
33.85193920135498
5373.003027915955
690.9320106506348
669.6640853881836
104.61052083969116
194.8364176750183
2789.4506664276123
185.80064010620117
974.1645035743713
1142.4159467220306
883.6168336868286
849.0132174491882
5341.602974891663
617.2730231285095
0.0
0.0
0.0
1730.9720587730408
292.9411301612854
190.27911043167114
2255.6614689826965
23742.751505851746
1214.3890690803528
134.87125062942505
58.062798500061035
1715.6756134033203
0.0
0.0
0.0
1632.3723230361938
6964.687686920166
417.7058897



Elapsed time to compute best fit: 45.080 seconds
Cross-validation score: 0.6431612479990119
Test score: 0.6759656652360514
Best Hyperparameters: {}
19783.960394382477
241326.83751249313
440629.75227069855
37495.274196624756
21558.38064432144
628.8217530250549
587.5941677093506
123.909499168396
176.40241813659668
17.995800018310547
30360.55572938919
37428.93789577484
15070.951555728912
0.0
3001.2375288009644
0.0
880.4468331336975
1153.6604776382446
8921.715772151947
2284.6222925186157
268.6502618789673
277.5759048461914
3584.9865713119507
964.0204472541809
76.36139678955078
1081.4627571105957
559.7177858352661
5566.639961242676
188.8973183631897
970.8515911102295
1426.3186736106873
688.6191635131836
806.5759100914001
3997.509757041931
1231.4342460632324
0.0
0.0
0.0
1189.1056814193726
364.37240839004517
644.5467247962952
1127.7997555732727
22455.74812555313
549.561448097229
257.92505168914795
25.768699645996094
1846.8368577957153
0.0
0.0
0.0
818.3047547340393
7132.879330158234
495.769256



Elapsed time to compute best fit: 42.705 seconds
Cross-validation score: 0.6733595321822793
Test score: 0.5955056179775281
Best Hyperparameters: {}
14081.379521846771
271624.8927414417
451433.4001555443
24038.704506397247
25699.928415298462
749.7821750640869
696.5721378326416
915.9932131767273
284.1676845550537
250.501859664917
27137.535573005676
41034.908103466034
7912.017242431641
0.0
3322.344941139221
0.0
1943.485013961792
201.08052825927734
8411.461044311523
387.05571365356445
1177.0627222061157
24.821410179138184
2257.9489154815674
1362.9683861732483
595.1756925582886
41.16367053985596
1314.3973202705383
3175.526981830597
316.113730430603
676.8979496955872
1115.052943944931
2107.293644428253
595.7244172096252
4725.519961357117
1662.9160656929016
0.0
0.0
0.0
1789.4920325279236
912.473840713501
1193.5302877426147
1350.952540397644
16183.728607654572
749.9545946121216
278.2706937789917
29.19950008392334
1214.8329772949219
0.0
0.0
0.0
1782.50279712677
5385.827935218811
223.22272157669



Elapsed time to compute best fit: 46.754 seconds
Cross-validation score: 0.653668427052297
Test score: 0.6843267108167771
Best Hyperparameters: {}
44707.37938833237
259331.23551177979
454103.70988607407
31028.739334583282
15451.209638595581
15837.37802362442
535.6468033790588
855.4360947608948
384.8741602897644
244.96521949768066
24204.026831150055
6211.983901500702
5653.825532436371
0.0
1504.5102472305298
0.0
1437.209608078003
478.1428384780884
8465.183172225952
2297.694049358368
435.0913152694702
76.67379951477051
4552.333510398865
197.89160346984863
978.305835723877
290.83120346069336
768.0080046653748
3344.7437562942505
1535.5754790306091
504.3045845031738
917.4372844696045
1267.482382774353
526.5498690605164
2647.500400543213
1048.8457489013672
0.0
0.0
0.0
2049.042362689972
468.1903657913208
388.08923292160034
1527.302167415619
6168.4947056770325
798.1175694465637
77.38170051574707
47.232308864593506
1316.2709369659424
0.0
0.0
0.0
1775.2781081199646
5610.8621554374695
791.62128305



Elapsed time to compute best fit: 42.681 seconds
Cross-validation score: 0.6691433272403142
Test score: 0.6162790697674418
Best Hyperparameters: {}
39961.73473763466
226796.98728513718
501879.32699775696
33869.32755994797
14780.250699043274
7579.358478546143
1795.8118586540222
416.60454988479614
621.3042478561401
633.027973651886
15085.17491197586
19732.12681412697
8982.156722784042
0.0
1300.0720829963684
0.0
348.77614641189575
778.3163995742798
9759.00768995285
1112.6253242492676
597.7603378295898
88.64042901992798
2965.0991587638855
502.2984080314636
967.6825261116028
286.7533550262451
1260.2117137908936
3142.2504119873047
1036.676547050476
810.9414687156677
233.7755880355835
628.2486925125122
764.4958763122559
5370.388078689575
628.7664294242859
0.0
0.0
0.0
895.1161465644836
761.8911519050598
1904.2479667663574
971.0204348564148
25312.969116210938
472.046537399292
308.5135269165039
83.0072021484375
1570.7333464622498
0.0
0.0
0.0
936.2273588180542
1544.5937881469727
1604.015213489532



Elapsed time to compute best fit: 43.367 seconds
Cross-validation score: 0.6471084298734108
Test score: 0.7045454545454546
Best Hyperparameters: {}
20133.1149020195
268723.9880785942
420643.82031822205
37664.596271038055
7611.123942375183
4281.231147766113
890.2571225166321
248.76881074905396
393.2720069885254
50.32343816757202
35944.99404859543
56520.8870716095
4346.04327249527
0.0
1281.591700553894
0.0
2059.617392539978
1032.7445859909058
10158.902519702911
1430.0678720474243
570.7391157150269
133.8329300880432
3755.294722557068
687.0430898666382
1389.923738002777
14.973690032958984
241.49525928497314
3462.0171394348145
1624.2046155929565
1388.1556129455566
1877.351501941681
840.2354617118835
1071.6437196731567
3558.3527932167053
829.252790927887
0.0
0.0
0.0
623.1512551307678
312.45955419540405
1762.001799106598
738.6729865074158
14299.389180660248
649.7308239936829
110.14170169830322
0.0
1537.2076454162598
0.0
0.0
0.0
2186.337031364441
5068.452298641205
635.8897213935852
154.9987926



Elapsed time to compute best fit: 50.754 seconds
Cross-validation score: 0.6204211427831345
Test score: 0.6828193832599119
Best Hyperparameters: {}
51334.14863777161
261406.95384979248
446453.8656902313
39307.440799713135
20965.391763687134
6686.433934688568
327.9792580604553
178.53956365585327
118.27146005630493
94.21890258789062
21585.3803730011
14706.209492206573
10248.43762588501
0.0
421.8141622543335
0.0
1036.7220177650452
301.824453830719
5833.1339502334595
1893.6931200027466
84.29578971862793
95.60672855377197
4111.1874079704285
450.5478572845459
314.2029056549072
394.3219690322876
220.71848964691162
4418.63636636734
613.632504940033
1102.4898376464844
937.0001697540283
696.6345672607422
1011.6321859359741
9269.676369667053
860.7991428375244
0.0
0.0
0.0
1138.4406361579895
286.38410663604736
1591.5395889282227
949.5282049179077
5924.232549667358
1448.3899021148682
216.56830835342407
511.9006881713867
612.9480953216553
0.0
0.0
0.0
2905.7577805519104
1082.2982077598572
1138.9014883



Elapsed time to compute best fit: 53.099 seconds
Cross-validation score: 0.6409196660136368
Test score: 0.6802721088435373
Best Hyperparameters: {}
23432.335438728333
266084.1427140236
426351.9355959892
33886.90133523941
23344.642677783966
352.003466129303
792.0801134109497
265.7258529663086
37.76914930343628
106.5823974609375
23328.57886648178
52197.00415325165
12612.210998535156
0.0
1777.004379272461
0.0
1490.633556842804
795.132520198822
6024.162584781647
1498.1454591751099
539.3964099884033
214.5310263633728
3205.455015182495
779.8745617866516
818.3631448745728
493.88876962661743
1111.683253288269
3993.705936908722
721.9199867248535
823.6266555786133
604.9953727722168
512.5126566886902
536.9534687995911
6988.977234840393
1123.479076385498
0.0
0.0
0.0
823.1114768981934
666.2330198287964
978.5248594284058
1895.7012362480164
16575.973158359528
1150.3211526870728
314.53743267059326
328.27333879470825
2442.9905128479004
0.0
0.0
0.0
1802.389732837677
5166.46261548996
2407.614535331726
78



Elapsed time to compute best fit: 52.391 seconds
Cross-validation score: 0.6801197980792673
Test score: 0.6278538812785388
Best Hyperparameters: {}
21275.1103246212
259996.65425157547
447971.6863641739
31863.338779449463
16757.190856933594
251.33756160736084
444.0421075820923
346.5834114551544
242.4856095314026
24.126830101013184
20042.26941871643
57106.55487728119
8514.771134376526
0.0
1646.9578084945679
0.0
676.8993992805481
337.0836133956909
6793.452771186829
1514.863030910492
541.2677254676819
25.07332944869995
2542.2704038619995
1003.7926683425903
1323.5527391433716
68.57186841964722
4126.440500736237
3402.625358104706
553.2179870605469
1492.920877456665
892.302412033081
1621.9793934822083
941.4650683403015
5530.722378730774
382.8453917503357
0.0
0.0
0.0
1274.2717685699463
429.36076259613037
1847.5462999343872
1679.0690355300903
16605.929430007935
573.067419052124
83.54410171508789
718.3174476623535
1029.5867085456848
0.0
0.0
0.0
1335.006860256195
3707.825183868408
1126.4865612983

## 4.3 Rebalancing Strategy - UNDER

### 4.3.1 Random Forest

In [117]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

under_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    under_randomforest_normalized_performance_df = pd.concat([under_randomforest_normalized_performance_df, new_performance_df])
    
under_randomforest_normalized_performance_df.to_csv("../data/05_model_output/under_randomforest_normalized_performance_df.csv")



### 4.3.2 XGBoost

In [118]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


under_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    under_xgboost_normalized_performance_df = pd.concat([under_xgboost_normalized_performance_df, new_performance_df])
    

under_xgboost_normalized_performance_df.to_csv("../data/05_model_output/under_xgboost_normalized_performance_df.csv")




Elapsed time to compute best fit: 3.062 seconds
Cross-validation score: 0.08599712553745037
Test score: 0.08866024518388792
Best Hyperparameters: {}
0.022555681
0.067271896
0.13355409
0.011312832
0.014317993
0.01282184
0.0065475833
0.0
0.0
0.0014906706
0.009620549
0.08235731
0.0044902326
0.0
0.020694945
0.0
0.0020649573
0.002378944
0.0040786252
0.025440386
0.0
0.005157022
0.0066734003
0.0078639975
0.005917439
0.0018476575
0.0
0.033487212
0.0
0.0022251161
0.019692438
0.0024524329
0.003938794
0.00037000878
0.008827455
0.0
0.0
0.0
0.02307649
0.0
0.07165545
0.0078013567
0.01897325
0.003968927
0.009649804
0.0
0.0062634125
0.0
0.0
0.0
0.00395143
0.008059532
0.0083457
0.015489318
0.0
0.0
0.0014191307
0.016332727
0.017734213
0.0018853423
0.00091532303
0.0046116044
0.0
0.011039541
0.0007339437
0.011528654
0.0031011743
0.0
0.0013074973
0.0039926837
0.0036224283
0.0
0.00082554546
0.0
0.0035662649
0.0
0.0012124488
0.003971866
0.0008134258
0.0
0.0
0.0010607963
0.00575234
0.014157874
0.0027324148
0.



Elapsed time to compute best fit: 2.907 seconds
Cross-validation score: 0.08147674406201486
Test score: 0.08174668874172185
Best Hyperparameters: {}
0.077210695
0.05152781
0.12512317
0.0096934
0.027526688
0.03152079
0.005301287
0.0005336418
0.0
0.0
0.019468589
0.0076794853
0.0024656996
0.0
0.0023440542
0.0
0.016414285
0.0006723394
0.014204322
0.0130501855
0.005817615
0.0
0.018233456
0.009338014
0.0019239562
0.0
0.0074373526
0.016109068
0.012716893
0.012280743
0.0045192856
0.0048054587
0.003645924
0.0057606655
0.0039135166
0.0
0.0
0.0
0.008935866
0.0
0.016460368
0.0014616706
0.0
0.0028840008
0.0005424565
0.0
0.0025910756
0.0
0.0
0.0
0.00912541
0.012551095
0.0036956992
0.013237396
0.0
0.0013846819
0.00084984046
0.015197082
0.00068406505
0.009281975
0.0068666823
0.0042186882
0.0
0.0012392685
0.0036525591
0.005265598
0.004595351
0.028487328
0.003413905
0.0011893751
0.011964193
0.0
0.0036817647
0.0013333897
0.020189017
0.0
0.018844102
0.004626176
0.0011228462
0.002615944
0.005576964
0.00707



Elapsed time to compute best fit: 2.690 seconds
Cross-validation score: 0.0824160668819791
Test score: 0.0834658187599364
Best Hyperparameters: {}
0.009199132
0.0509336
0.12657878
0.009598985
0.07273002
0.04595085
0.0011021738
0.0
0.0
0.0
0.0075142663
0.023414915
0.0062815994
0.0
0.019763215
0.0
0.0064602266
0.0011903442
0.02710322
0.03451306
0.010473652
0.009668884
0.019815456
0.014074516
0.0024525723
0.0016517221
0.005797852
0.017123772
0.005197825
0.0010208602
0.00066968735
0.0015042983
0.0014453488
0.0019528726
0.012251886
0.0
0.0
0.0
0.0019018324
0.0
0.0034218072
0.008025357
0.0030142278
0.001989702
0.0021919096
0.005970103
0.0009137573
0.0
0.0
0.0
0.0012685151
0.020789746
0.0042261807
0.0005454346
0.0
0.02486762
0.012615078
0.004090484
0.005594357
0.004255345
0.006122182
0.0036407998
0.0
0.0
0.0025357415
0.006948757
0.004628186
0.007970499
0.037000995
0.0019435808
0.0020776957
0.0012805858
0.0077372044
0.0
0.008066287
0.0
0.0
0.018295165
0.010591852
0.014506994
0.0
0.002815907
0.



Elapsed time to compute best fit: 2.644 seconds
Cross-validation score: 0.08506988756706074
Test score: 0.08226324237560192
Best Hyperparameters: {}
0.013394313
0.06478346
0.15121847
0.020175247
0.045056578
0.05759462
0.003733918
0.014486595
0.0
0.0
0.035858937
0.03348425
0.006987875
0.0
0.0077201882
0.0
0.01454137
0.025670098
0.0005467029
0.027904456
0.03542743
0.0
0.0005606969
0.0
0.00050848065
0.0015675968
0.0018004958
0.022742486
0.005422841
0.0029462148
0.00083473336
0.0041573793
0.002078068
0.0007676815
0.0046408377
0.0
0.0
0.0
0.002387659
0.0
0.016684728
0.0028097855
0.0110211475
0.0014289382
0.0
0.0
0.0043123425
0.0
0.0
0.0
0.0033890647
0.01627321
0.0027448377
0.0
0.0
0.0
0.0077917143
0.0267928
0.013363729
0.012774696
0.014571827
0.006747428
0.0
3.502024e-05
0.005751133
0.017017985
0.0020822946
0.0
0.00060772116
0.0034171997
0.0012609039
0.0
0.005293237
0.00096083875
0.013251311
0.0
0.0008712987
0.0
0.0061699087
0.0
0.0017556307
0.032289103
0.002334346
0.009600907
0.019829178
0



Elapsed time to compute best fit: 2.671 seconds
Cross-validation score: 0.08710639253809314
Test score: 0.09189548272807795
Best Hyperparameters: {}
0.020745521
0.062858425
0.15068907
0.008684478
0.051472694
0.042570885
0.011225435
0.0
0.0
0.0
0.018281795
0.0038504018
0.006840987
0.0
0.0
0.0
0.029829277
0.0
0.016826034
0.016679177
0.00042105967
0.0
0.03354213
0.007982733
0.0
0.007943031
0.021748729
0.022589916
0.0
0.00446659
0.0006376305
0.0035485611
0.0021846397
0.011612829
0.017520621
0.0
0.0
0.0
0.004868284
0.0
0.009649803
0.0010828228
0.0
0.0055070156
0.005100421
0.0
0.0068135676
0.0
0.0
0.0
0.0044249925
0.019944068
0.0027376374
0.0
0.0
0.0005174188
0.021726483
0.018457256
0.0014339604
0.060791865
0.014492344
0.0038398297
0.00041295667
0.0
0.00038300827
0.007558939
0.039498575
0.0
0.004211035
0.0018725835
0.0025482771
0.0
0.0008410077
0.0
0.000604741
0.0
0.0
0.0012292347
0.0022546693
0.00038569074
0.0006000579
0.0041034804
0.009921634
0.011538023
0.0045443596
0.008664396
0.0
0.0
0.



Elapsed time to compute best fit: 2.792 seconds
Cross-validation score: 0.08374818187851588
Test score: 0.08201827242524917
Best Hyperparameters: {}
0.019729134
0.063914336
0.13255589
0.019655377
0.033478282
0.05902393
0.0
0.0
0.0
0.0
0.011702327
0.061426815
0.0
0.0
0.0019118413
0.0
0.04551563
0.004161743
0.002908115
0.017321708
0.0072109136
0.002946014
0.0099549815
0.0
0.0007330019
0.0
0.0007529055
0.008172297
0.0026854547
0.005644623
0.0013964233
0.012099391
0.00613839
0.010665255
0.010058924
0.0
0.0
0.0
0.0072739664
0.0
0.008573949
0.00033839088
0.0031118377
0.008282753
0.0
0.0
0.026894037
0.0
0.0
0.0
0.00482809
0.008159287
0.0047108647
0.0
0.0008088914
0.0023823439
0.010681542
0.0069255335
0.002363776
0.0018386778
0.012974894
0.010512357
0.0
0.00077290845
0.0
0.005286814
0.01726068
0.0004641573
0.00046043072
0.0006655992
0.0001985175
0.0
0.0028794731
0.0030274969
0.009415783
0.0
0.00086289935
0.015885472
0.0016019669
0.0
0.0
0.02950788
0.0010108837
0.0016958227
0.008947193
0.0
0.00



Elapsed time to compute best fit: 2.628 seconds
Cross-validation score: 0.0904083433486079
Test score: 0.08191747572815533
Best Hyperparameters: {}
0.045140315
0.0646764
0.1488886
0.01152567
0.018633232
0.030144347
0.002733185
0.0
0.0
0.004935284
0.023337921
0.012347109
0.0118069025
0.0
0.014169825
0.0
0.014084274
0.0
0.0034408565
0.04349932
0.0
0.0
0.01270364
0.003833874
0.0
0.0036487652
0.00515535
0.006818903
0.008663777
0.036009587
0.0007753014
0.0016131138
0.0030643092
0.01588528
0.0013515645
0.0
0.0
0.0
0.0025288714
0.0
0.0076657413
0.0
0.0
0.002763654
0.013253806
0.0
0.0065321685
0.0
0.0
0.0
0.00075237954
0.008260418
0.008065143
0.004938209
0.0
0.015851704
0.0013706499
0.03279343
0.0046727974
0.010977417
0.005643541
0.0022731822
0.0
0.0
0.018642759
0.0026846116
0.007940826
0.0007909665
0.004156926
0.006222931
0.0076010185
0.0035827484
0.00050926564
0.0025690221
0.0024558862
0.0
0.008320882
0.002437199
0.0
0.0
0.018974246
0.0028500834
0.0025482567
0.0013277945
0.017101368
0.031783



Elapsed time to compute best fit: 2.756 seconds
Cross-validation score: 0.08105972948622399
Test score: 0.08389261744966443
Best Hyperparameters: {}
0.014404899
0.05912777
0.12435288
0.005010791
0.0066324356
0.077320784
0.003084608
0.0
0.0
0.002023677
0.09077504
0.017728508
0.007938531
0.0
0.0
0.0
0.019431556
0.0012915933
0.017364101
0.0125628775
0.0012807208
0.0
0.0063902875
0.0
0.007848138
0.0
0.004950849
0.011810766
0.0037614969
0.014947119
0.0022148278
0.0023287665
0.002553235
0.0005442378
0.011795724
0.0
0.0
0.0
0.00931904
0.005436806
0.017372463
0.018902248
0.024249634
0.01235777
0.003379771
0.0
0.012153289
0.0
0.0
0.0
0.00093857036
0.0
0.0040530674
0.054697186
0.014883106
0.012917164
0.003363066
0.013529724
0.006992807
0.011203111
0.0019214127
0.0067563644
0.0
0.00080660486
0.004005148
0.015217742
0.0
0.00039260252
0.0065246653
0.003991072
0.0010845026
0.0
0.0007961901
0.0024271314
0.0026261199
0.0
0.0
0.021829749
0.0079084635
0.0
0.0
0.0063529224
0.0056466674
0.025558773
0.0082



Elapsed time to compute best fit: 2.652 seconds
Cross-validation score: 0.08164483343111975
Test score: 0.08790737564322469
Best Hyperparameters: {}
0.04788173
0.055254113
0.18520984
0.009875431
0.02630063
0.025844133
0.0
0.0018299044
0.0
0.0
0.0009476614
0.015686916
0.005182532
0.0
0.01840228
0.0
0.0019543513
0.008014503
0.026649302
0.013577592
0.0
0.0
0.05480346
0.017283792
0.0064083487
0.0
0.005283605
0.0022083048
0.0019199181
0.0
0.0015837941
0.0
0.002261638
0.0024299768
0.00586719
0.0
0.0
0.0
0.0045232615
0.0
0.0345642
0.0030354054
0.0
0.00065910275
0.013219462
0.0003677438
0.048943613
0.0
0.0
0.0
0.016493239
0.010023681
0.010373792
0.0
0.0
0.0046023056
0.010152425
0.014436001
0.010969408
0.0031058341
0.0036999001
0.006489343
0.0057355063
0.008375474
0.005913519
0.0
0.012643615
0.0
0.0073374826
0.0005290718
0.004387411
0.0
0.00014194923
0.011413747
0.0013675572
0.0
0.0
0.0
0.0013951942
0.0017676747
0.0011431135
0.0018675532
0.0026325453
0.0018609132
0.00023768796
0.0
0.003171135
0



Elapsed time to compute best fit: 2.654 seconds
Cross-validation score: 0.08270731510385762
Test score: 0.08226324237560192
Best Hyperparameters: {}
0.01849558
0.048773404
0.10721363
0.008456301
0.012322471
0.011382312
0.0070482544
0.030566983
0.0
0.0
0.028122418
0.2047157
0.008534843
0.0
0.0013717928
0.0
0.010085573
0.0035718428
0.010664807
0.0041193888
0.0
0.0
0.039987247
0.006075495
0.00071702857
0.028064953
0.0022519834
0.009693785
0.0016158613
0.008106132
0.014462265
0.008750243
0.0016126033
0.00033561658
0.004032607
0.0
0.0
0.0
0.003183167
0.0
0.0008042975
0.003920462
0.0
0.016788037
0.00458074
0.00069150527
0.015119446
0.0
0.0
0.0
0.009627037
0.008209069
0.0023303952
0.0110509405
0.0
0.01393725
0.0
0.00920749
0.015970223
0.0078005195
0.0
0.0038324348
0.0
0.0
0.0035081045
0.0005712742
0.002955248
0.0
0.002967697
0.0036548662
0.009082991
0.0
0.000786578
0.0
0.007982532
0.0
0.03766393
0.003385158
0.0002897468
0.0
0.00041234272
0.004840627
0.0
0.01686967
0.0007564967
0.012282434
0.0



Elapsed time to compute best fit: 2.535 seconds
Cross-validation score: 0.08818176413230902
Test score: 0.07587859424920128
Best Hyperparameters: {}
0.02691089
0.0506837
0.1600311
0.022256885
0.006296872
0.032159425
0.001743335
0.0
0.0
0.0
0.013253095
0.08283358
0.00490578
0.0
0.017553754
0.0
0.020508235
0.0
0.010455737
0.065186806
0.0
0.0
0.0060343905
0.0021205214
0.0
0.0024882187
0.0055637606
0.0
0.009606253
0.0069121206
0.008172504
0.0063486076
0.004618143
0.004095944
0.0012065669
0.0
0.0
0.0
0.017566206
0.0
0.007916334
0.00825603
0.0
0.0070664836
0.0
0.0005744965
0.00068063434
0.0
0.0
0.0
0.005856983
0.019813817
0.0034632252
0.0
0.038997643
0.014765363
0.0
0.0016375033
0.0033371935
0.010961478
0.010800787
0.0029717158
0.0
0.0
0.002009039
0.008603931
0.0060778763
0.0
0.0020734402
0.0035085618
0.0020076565
0.014450729
0.0
0.005722975
0.00045863632
0.0
0.016953481
0.00020852084
0.0030847986
0.0
0.0
0.0009814927
0.007230636
0.0
0.025278438
0.0
0.0012539901
0.0
0.009553135
0.006217784
0



Elapsed time to compute best fit: 2.594 seconds
Cross-validation score: 0.08538553302084576
Test score: 0.0957943925233645
Best Hyperparameters: {}
0.013943615
0.059676997
0.13795134
0.010319712
0.055613246
0.025678527
0.0059682857
0.0029098797
0.0
0.0
0.014981095
0.0050260834
0.014494353
0.0
0.0
0.0
0.01800514
0.02202761
0.010056385
0.015163962
0.0
0.0014869635
0.028044475
0.0010652023
0.0
0.0027241928
0.002231088
0.003836793
0.0037656245
0.003961744
0.0028370908
0.0028951985
0.0025644964
0.0
0.0026683982
0.0
0.0
0.0
0.0076792454
0.0
0.01248262
0.007229211
0.020000074
0.010457568
0.0084187575
0.0
0.0042139348
0.0
0.0
0.0
0.0075789257
0.009364648
0.001071084
0.0
0.0057277177
0.0072321533
0.024321333
0.008972858
0.008097697
0.000703558
0.0135861635
0.018748475
0.0028623273
0.0005745401
0.0
0.0
0.00052351505
0.0
0.01160468
0.025924977
0.004470318
0.0
0.0
0.0
0.013128882
0.0005010883
0.0
0.0044357846
0.013801549
0.0
0.004141269
0.027971696
0.016311675
0.0
0.016798446
0.0036892113
0.001250



Elapsed time to compute best fit: 2.678 seconds
Cross-validation score: 0.08302225532616656
Test score: 0.08453085376162299
Best Hyperparameters: {}
0.030022483
0.052915264
0.13940679
0.009835037
0.01752037
0.038707532
0.0
0.0027812624
0.0035242601
0.0
0.02257952
0.054683015
0.011458157
0.0
0.016918326
0.0
0.011065182
0.0003475252
0.006321297
0.02828528
0.0
0.0
0.0012084091
0.049204513
0.0
0.001032131
0.004345644
0.0
0.0
0.0020701536
0.0031216894
0.033595163
0.0017018671
0.0019404661
0.004277164
0.0
0.0
0.0
0.0013645269
0.0
0.0035429306
0.0091493875
0.002178057
0.008742798
0.0
0.00081209897
0.002870262
0.0
0.0
0.0
0.0069283666
0.010113087
0.001298772
0.0
0.0
0.018908609
0.004023453
0.0058173193
0.004899463
0.0025113027
0.042851515
0.011910165
0.0
0.0
0.0058930693
0.017190674
0.023769999
0.015544542
0.0011303872
0.0008268078
0.0032077779
0.0
0.0011399153
0.05372984
0.0033337316
0.0
0.0
0.001416236
0.027833859
0.007423596
0.004294227
0.0015783985
0.0012923311
0.002900526
0.0
0.018628463




Elapsed time to compute best fit: 2.545 seconds
Cross-validation score: 0.09223567822774517
Test score: 0.08305298570227083
Best Hyperparameters: {}
0.0068800044
0.06367963
0.1296148
0.009472017
0.04177001
0.04819703
0.056480605
0.0021482909
0.0066858735
0.003817119
0.0016971482
0.026725762
0.0028437406
0.0
0.0
0.0
0.041643873
0.00063488906
0.0030006159
0.06330475
0.0
0.0008213499
0.009050233
0.0028250522
0.005588715
0.008038599
0.012818743
0.0
0.009082526
0.0040909015
0.0019192504
0.0027095426
0.00404601
0.0
0.0018815155
0.0
0.0
0.0
0.0024732742
0.0
0.005134616
0.0037389845
0.0
0.0029532625
0.023320694
0.0
0.0007990253
0.0
0.0
0.0
0.007952189
0.004386634
0.00037347872
0.023134647
0.0
0.025468856
0.0
0.0072266445
0.0070637944
0.0034880463
0.00079591526
0.012894142
0.0065218275
0.0009328501
0.002493475
0.01501585
0.0024137434
0.0
0.0012888879
0.00078702805
0.0024761106
0.0
0.0012788532
0.0
0.0033594526
0.0
0.0
0.008203207
0.0007741258
0.003461095
0.0055028824
0.0091279885
0.0011905273
0



Elapsed time to compute best fit: 2.581 seconds
Cross-validation score: 0.08120807171858976
Test score: 0.08297872340425531
Best Hyperparameters: {}
0.025246855
0.051489975
0.15345943
0.005847552
0.021005
0.0069816173
0.013831062
0.0
0.0008650373
0.0018126558
0.009281281
0.068917066
0.00389816
0.0
0.007848976
0.0
0.01270535
0.027338548
0.008796339
0.017156448
0.0015908562
0.0
0.007375863
0.02876942
0.0056552878
0.029478887
0.0
0.020710476
0.008191447
0.0021356589
0.0008689828
0.0023880533
0.0025050736
0.0
0.001798046
0.0
0.0
0.0
0.0011731907
0.0
0.002671408
0.0154292295
0.0
0.0042376877
0.0023884599
0.0
0.0009016885
0.0
0.0
0.0
0.009497452
0.016353324
0.0033331844
0.0004123333
0.0
0.0012293196
0.0012782194
0.018857632
0.0017810201
0.009183873
0.017604839
0.0031773266
0.00040838393
0.0082533695
0.009726754
0.0027440237
0.0014210568
0.0
0.0037447792
0.0021131365
0.012205305
0.0
0.016427632
0.0
0.0
0.0
0.021552922
0.0051982114
0.0027549786
0.0056514507
0.011699494
0.0029807335
0.001205376



Elapsed time to compute best fit: 2.589 seconds
Cross-validation score: 0.08800935382858308
Test score: 0.08686440677966102
Best Hyperparameters: {}
0.046744697
0.055726916
0.17576675
0.010691279
0.04639074
0.029167885
0.00078966
0.0
0.0
0.0
0.036677796
0.024260664
0.012350029
0.0
0.0
0.0
0.018892387
0.0013656957
0.0068672867
0.021894379
0.0
0.0
0.0017517898
0.0030247085
0.005455291
0.0
0.016454171
0.001830295
0.008164111
0.004965879
0.0036171833
0.011918021
0.005004046
0.0048506893
0.013689317
0.0
0.0
0.0
0.0062629767
0.0
0.0035959275
0.008289309
0.0
0.022263033
0.0
0.0
0.008029318
0.0
0.0
0.0
0.003543521
0.007162877
8.783631e-05
0.002092924
0.0
0.0017382395
0.0046720947
0.0032383215
0.0062355003
0.018238697
0.05063334
0.004111931
0.0
0.0
0.0013589886
0.006696462
0.02100015
0.0
0.0037879013
0.0010051723
0.0011354391
0.0
0.002815739
0.0006029352
0.00081326405
0.0
0.0005246494
0.0
0.017952645
0.0
0.07196939
0.008529435
0.005877517
0.00529122
0.0026922054
0.008153595
0.0
0.009734319
0.0




Elapsed time to compute best fit: 2.585 seconds
Cross-validation score: 0.08046225543059587
Test score: 0.07841672890216578
Best Hyperparameters: {}
0.018282041
0.058184795
0.1457102
0.018667087
0.017803999
0.07456995
0.007880869
0.0054171053
0.04423688
0.024724301
0.020792356
0.023325017
0.008724509
0.0
0.011228887
0.0
0.014485435
0.0
0.015139446
0.01151583
0.003307722
0.013072176
0.022750366
0.0016686288
0.0
0.0048738522
0.006696711
0.0022047344
0.0007113599
0.0059228963
0.020585287
0.0011725086
0.0032577282
0.0
0.0039813938
0.0
0.0
0.0
0.0043806466
0.0
0.00088079745
0.004614831
0.0
0.013748492
5.481075e-05
0.00040528257
0.0
0.0
0.0
0.0
0.014595342
0.0019458906
0.0010242221
0.0
0.0
0.007088199
0.0022935232
0.0149952695
0.0043559987
0.007142788
0.0038464505
0.0057118456
0.0
0.0
0.0013141526
0.00488944
0.0013334781
0.0
0.001932809
0.024422726
0.0024077073
0.0029897608
0.00046027172
0.0
0.007757001
0.0
0.0038204463
0.0012383126
0.023394275
0.00048290842
0.002291207
0.025422577
0.0034544



Elapsed time to compute best fit: 2.611 seconds
Cross-validation score: 0.0808083029205106
Test score: 0.08457095709570957
Best Hyperparameters: {}
0.028174976
0.07418731
0.14335296
0.019536143
0.039095636
0.09241377
0.0
0.010686572
0.0
0.0
0.0
0.013940652
0.011720972
0.0
0.0
0.0
0.0035401944
0.011571545
0.014097989
0.006252326
0.0
0.0018172192
0.011122763
0.0
0.0
0.003828436
0.007880868
0.0103842495
0.0
0.0022896593
0.008283783
0.0011705509
0.0060800207
0.0008445008
0.009777595
0.0
0.0
0.0
0.014036945
0.0
0.016942551
0.013697456
0.0
0.008054976
0.0013718422
0.0
0.0008331349
0.0
0.0
0.0
0.01716652
0.011435675
0.00038276275
0.0
0.025050681
0.049622953
0.0025210378
0.0152079705
0.0046791234
0.012579119
0.005424135
0.004254519
0.0
0.0
0.0008911302
0.006364382
0.0017320985
0.0
0.0005547437
0.0016960145
0.030240497
0.005538215
0.0053469315
0.0
0.00603099
0.0
0.0
0.0
0.012889093
0.0
0.0021560276
0.020071523
0.004823945
0.016075514
0.0006022622
0.0026058103
0.0018689649
0.0
0.0
0.00042217146




Elapsed time to compute best fit: 2.637 seconds
Cross-validation score: 0.08311140748843866
Test score: 0.08634868421052633
Best Hyperparameters: {}
0.050427277
0.0589562
0.11674949
0.0076726037
0.0539524
0.028496662
0.012074729
0.009624871
0.0
0.0
0.0
0.024213463
0.00059614354
0.0
0.0
0.0
0.0088381935
0.010850296
0.022927094
0.044201184
0.0
0.0
0.003621923
0.009219083
0.0
0.03483383
0.0046537514
0.0024670651
0.002696012
0.010781833
0.002212989
0.0014495023
0.006560361
0.0071898773
0.011791532
0.0
0.0
0.0
0.005851811
0.0
0.00873034
0.006881718
0.0
0.0009309288
0.015560492
0.00040128586
0.019087045
0.0
0.0
0.0
0.0045895777
0.009336262
0.0033048103
0.005154631
0.007003427
0.011372389
0.0006367942
0.015564014
0.0040189703
0.0024751143
0.0
0.006269674
0.0
0.0
0.00092277775
0.0021276658
0.014094341
0.0
0.0067815734
0.0
0.0
0.008720485
0.0002919662
0.006056267
0.0072878697
0.0
0.0009763481
0.0047471346
0.0010642699
0.049867548
0.0047060717
0.0032766643
0.0011772572
0.005077032
0.008685883
0.



Elapsed time to compute best fit: 2.523 seconds
Cross-validation score: 0.08686210561655358
Test score: 0.09401114206128135
Best Hyperparameters: {}
0.01656531
0.052119453
0.1351855
0.017379303
0.01439131
0.026356231
0.0
0.0
0.0
0.0
0.0
0.073285624
0.0019263956
0.0
0.0
0.0
0.0
0.02816561
0.0064444807
0.02400751
0.0
0.0
0.014948182
0.11727865
0.009804261
0.009613633
0.0034177396
0.012133182
0.0025682123
0.0055561513
0.0036079702
0.003656841
0.0035749846
0.0058070035
0.0030024066
0.0
0.0
0.0
0.003573113
0.0
0.004654315
0.00833454
0.0014074296
0.0165157
0.0122484835
0.0
0.001426032
0.0
0.0
0.0
0.011325861
0.019382956
0.0026410131
0.0
0.0012238084
0.0010238386
0.010681311
0.008993684
0.011049879
0.0075316182
0.0033043737
0.0013138902
0.0014565578
0.0
0.0019977652
0.003103997
0.00048153425
0.00048636599
0.0017394837
0.004532834
0.0013349312
0.0037617434
0.0043990016
0.013668534
0.002274187
0.0
0.0
0.007774019
0.014747874
0.0
0.0010993683
0.00029903257
0.023495337
0.012290231
0.00091558933
0



Elapsed time to compute best fit: 2.468 seconds
Cross-validation score: 0.08500657509383709
Test score: 0.08547008547008547
Best Hyperparameters: {}
0.012112956
0.050129727
0.12965949
0.018153612
0.0016172102
0.023467883
0.0031393673
0.0037560153
0.021017304
0.0054354114
0.017219072
0.09653895
0.0059340536
0.0
0.008630211
0.0
0.037211366
0.004638031
0.033815928
0.026232636
0.0030317511
0.0
0.0
0.013057765
0.018910239
0.0
0.009840251
0.0028103832
0.0068446803
0.0037569785
0.0
0.0028581466
0.004962633
0.0015675713
0.004302681
0.0
0.0
0.0
0.008466109
0.013959344
0.014086536
0.0
0.0
0.0
0.014269636
0.0
0.0019546528
0.0
0.0
0.0
0.007478343
0.011422543
0.0016399858
0.0034069896
0.0005132084
0.003279481
0.004911203
0.01585396
0.016767494
0.008847398
0.0
0.004892104
0.0
0.0
0.0
0.008192151
0.0010654146
0.0
0.0047092917
0.0047835074
0.001998479
0.0007529522
0.0007160318
0.0023751878
0.003820038
0.0
0.0070237243
0.0083790645
0.007898569
0.0
0.0
0.010706174
0.0
0.0062229168
0.014873373
0.01042293



Elapsed time to compute best fit: 2.466 seconds
Cross-validation score: 0.08113702373197937
Test score: 0.08735979292493527
Best Hyperparameters: {}
0.02999427
0.059862535
0.13077448
0.017671803
0.013514584
0.0194896
0.012263752
0.007797577
0.0026078098
0.0
0.027010461
0.03256801
0.0
0.0
0.0041677193
0.0
0.0016559771
0.038893323
0.029569607
0.010365305
0.0
0.0
0.003947037
1.7773675e-05
0.012226979
0.018155089
0.0046245595
0.008123577
0.004149992
0.005517035
0.0045863297
0.004002025
0.0011701994
0.005821847
0.0055401735
0.0
0.0
0.0
0.016093628
0.0
0.009902406
0.011278081
0.0
0.011962582
0.0
0.0
0.0
0.0
0.0
0.0
0.012332464
0.01728147
0.014234586
0.022096131
0.0
0.005199893
0.0017417443
0.007082327
0.008114016
0.0024723178
0.0018426968
0.0011322475
0.0
0.0
0.0037231953
0.00033134012
0.009530026
0.018362414
0.0
0.0077827983
0.007941244
0.0
0.006810907
0.0006746587
0.0
0.0
0.0
0.0074267755
0.01255185
0.008940847
0.013352289
0.009370706
0.01663797
0.0014727635
0.015134882
0.009544342
0.01085



Elapsed time to compute best fit: 2.488 seconds
Cross-validation score: 0.0815800084661478
Test score: 0.0893719806763285
Best Hyperparameters: {}
0.025800874
0.04515597
0.12583649
0.010210563
0.02095686
0.015669039
0.007935668
0.0
0.0
0.0
0.013170319
0.038094662
0.009196332
0.0
0.0027238003
0.0
0.0
0.027086345
0.011969624
0.028998641
0.0
0.0011660283
0.003471859
0.0
0.0
0.0059360154
0.010688531
0.0
0.0045651277
0.008009405
0.0005049157
0.0061816038
0.0053349347
0.0023924334
0.0030393503
0.0
0.0
0.0
0.002163355
0.0
0.0124106575
0.014757642
0.012022588
0.010167128
0.018920697
0.0
0.023102263
0.0
0.0
0.0
0.00650599
0.013428284
0.0011762608
0.0
0.0
0.0020625088
0.0026449715
0.0034027135
0.01591511
0.006618715
0.018819215
0.0032861196
0.0
0.0
0.0028671338
0.009733703
0.0010034769
0.0
0.005277505
0.0039888145
0.008708071
0.007734441
0.0048821694
0.0
0.0065670726
0.0
0.0004339085
0.002354967
0.0012745183
0.0
0.013906391
0.0053879987
0.004299617
0.0011922824
0.0059239604
0.0022158804
0.002146



Elapsed time to compute best fit: 2.505 seconds
Cross-validation score: 0.09029240590553556
Test score: 0.0723870056497175
Best Hyperparameters: {}
0.059710164
0.047431916
0.15159439
0.0017285206
0.023748375
0.020446861
0.007626691
0.0
0.0
0.0
0.023169316
0.0038796468
0.00453754
0.0
0.0137337055
0.0
0.008694067
0.024487143
0.01477931
0.008621669
0.0
0.0
0.00073531544
0.0
0.0
0.0
0.0
0.016597532
0.00046808404
0.0029607166
0.0063517913
0.0009801796
0.004340135
0.0003250393
0.0032800888
0.0
0.0
0.0
0.010994564
0.0
0.025958566
0.011695214
0.0
0.0067401975
0.0049916613
0.0
0.0
0.0
0.0
0.0
0.0
0.012758263
0.0012824284
0.0
0.0
0.025186485
0.00563656
0.016520055
0.0009908555
0.023500407
0.0023119075
0.0064029605
0.00027987646
0.0
0.0010444825
0.03590153
0.0025745144
0.0
0.0
0.007194259
0.0011045334
0.0014248892
0.0029349816
0.0040590973
0.0075842678
0.0
0.028219463
0.006738039
0.015070441
0.023737164
0.018794505
0.003918416
0.0
0.021020565
0.0038297547
0.020926664
0.0023709142
0.0
0.0026466448



Elapsed time to compute best fit: 2.572 seconds
Cross-validation score: 0.0849122969749266
Test score: 0.0893635571054926
Best Hyperparameters: {}
0.042591512
0.050336357
0.13922939
0.0049310667
0.04149383
0.036671083
0.0
0.0016057435
0.03007203
0.0013954808
0.009613862
0.022600846
0.005337571
0.0
0.0
0.0
0.03451215
0.0004401175
0.0
0.020257538
0.0
0.0
0.009340335
0.011222718
0.028866563
0.0
0.010220972
0.0046859076
0.0025191645
0.011925252
0.0071739755
0.0062584146
0.001125463
0.0060203956
0.0013082061
0.0
0.0
0.0
0.008205888
0.0
0.004779314
0.003295275
0.0
0.0012097504
0.00063877733
0.0
0.004967877
0.0
0.0
0.0
0.0087255845
0.01250517
0.0045112283
0.0063136294
0.0
0.004142955
0.0040047197
0.013164448
0.01878125
0.0021638118
0.026896523
0.0034112628
0.002532725
0.0
0.005546778
0.0134937065
0.00082911924
0.0
0.003837893
0.024186946
0.011779344
0.0
0.002941747
0.0046171853
0.0013729937
0.0
0.0
0.0040032174
0.0050895363
0.0
0.0
0.011505343
0.02219637
0.023162814
0.00045555647
0.020735841


### 4.2.3 LightGBM

In [119]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

under_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    under_lightgbm_performance_normalized_df = pd.concat([under_lightgbm_performance_normalized_df, new_performance_df])
    

under_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/under_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 2.604 seconds
Cross-validation score: 0.26358958230266055
Test score: 0.2447129909365559
Best Hyperparameters: {}
57.78304586900472
1160.4013781266099
2504.2948398759004
3.298063797845316
103.00339536641106
91.41364170765173
0.9127465247556081
29.399687302757705
0.0007686709868721664
0.0
1.6091538441829698
130.60071944277757
1.130074317244862
0.0
6.086156721729641
0.0
12.881589958212299
30.020936274757
72.61366761054512
4.236354519360972
0.0007719250279478729
1.9060200293838854
0.4879993137810743
11.12381405054839
1.518327246147802
0.2522307571817244
5.5159141493461865
8.155618143349372
0.292206296265223
7.781493977424816
13.515126725069422
0.6595318048994618
1.3952099604863548
4.06484278672724
11.662537268074772
0.0
0.0
0.0
5.358687922697462
0.0
24.745546672927862
10.446610871303834
1.1205800092284335e-06
10.142317248857134
0.4263912746049803
7.718589656313668e-11
8.001482138000732
0.0
0.0
0.0
61.928145269774866
21.9023835559586
8.692116021313005
0.03



Elapsed time to compute best fit: 2.336 seconds
Cross-validation score: 0.2877863378794544
Test score: 0.265685019206146
Best Hyperparameters: {}
16.833223887596247
1068.9091983653902
2468.180371986735
74.82097414085622
36.1436734230071
134.5987605643204
21.3665503823778
0.040395995394248985
2.3389500711346045e-05
4.2433159025412337e-07
227.53158021952294
40.480918681747156
35.79166723938922
0.0
26.066882317002527
0.0
26.722975108143
9.407262506737606
72.14479069984822
68.11957150265893
1.1207899888177053e-05
4.938908944041032e-06
16.488678773561997
3.7848054357240937
0.19094822739316442
6.671796439268926
6.8082055942088555
6.111126300755494
0.2275239134261362
10.701278538820755
1.8902839591408267
2.4413924671954135
6.068525718080197
1.4808305056545672
5.99967759660412
0.0
0.0
0.0
1.226584901176122
0.0
4.08319086525205
3.4872563182031895
0.0
2.5657617346672605
1.2707837338682566
0.00023148094011737896
9.11599835814112
0.0
0.0
0.0
13.9167124932618
0.636838774050478
2.33991778886684
4.43



Elapsed time to compute best fit: 2.391 seconds
Cross-validation score: 0.25458889400059687
Test score: 0.26315789473684215
Best Hyperparameters: {}
54.60282961170985
1082.558856472116
2314.2112563545043
96.50532121062001
188.1762097207392
127.04053960421687
0.4493729946791305
0.040756547034892954
0.0
0.0
31.506958405653762
165.62457319205322
5.9975410628031565
0.0
0.6417609798415924
0.0
9.110309075069786
36.666550870045555
49.97078151007543
45.10215759637508
0.9149116228523773
0.0016550713808101136
15.601645474478005
1.7189456740838551
0.721581497695297
0.09219261957332492
5.8673372299032565
3.2278434748641267
4.6676268633139095
9.708880171221976
1.6183492704856373
1.2170744523516017
10.406613101803087
7.76504442399073e-05
14.377077904598153
0.0
0.0
0.0
21.236624603002838
5.9249199694022536e-05
32.560293355742346
32.537102251437126
7.867158967655996e-05
16.715714898476364
2.1315308377139672
0.004432568122865632
2.187551197170251
0.0
0.0
0.0
8.931530892949283
5.345278062337911
2.385328



Elapsed time to compute best fit: 2.410 seconds
Cross-validation score: 0.265897799517915
Test score: 0.2605540897097625
Best Hyperparameters: {}
30.6954806957385
1086.4449349458487
2539.4158461928446
22.69499825627554
36.264017073590864
114.07778185427232
3.3180298368823093
0.3955248791853343
0.0
0.0
193.29572740872362
55.793991848917294
54.1340726693788
0.0
8.21581403228349
0.0
46.92742092390841
5.953277085607005
14.99863629624662
76.85319327558594
0.02489813478052838
0.3115026970299368
0.0901292303913408
0.0919729132881173
0.679298407368151
0.0001602896547510113
0.0038120791396529796
1.3004789633855607
9.509942470619185
2.5321159074348065
1.8700012573782896
2.8713320839942753
6.464972365110874
0.9142815804716551
13.872046103728572
0.0
0.0
0.0
12.404529854197499
0.05229249969124794
1.8795104170629884
6.577786874939651
4.1204499211744405e-05
2.1033168565046925
5.821920402925872
0.0
5.39267233645603
0.0
0.0
0.0
9.236584784501474
25.06890683155637
8.714955479898915
2.2867078510837473
0.



Elapsed time to compute best fit: 2.431 seconds
Cross-validation score: 0.27574257189497464
Test score: 0.28928571428571426
Best Hyperparameters: {}
55.55476207487417
1199.5124326196033
2351.299159748118
110.83150173380116
82.02701774305393
108.3763300219963
1.6405407598425694
21.9460143734339
0.030292702029538265
3.3810314177535474
26.692535741820958
102.73563821552375
20.26310509643882
0.0
0.8900421031990056
0.0
16.53799150768407
24.961017015034603
69.87395418536332
19.48966742412448
1.0043884367551073
11.38978959954693
23.292914785595457
4.498823833243401
1.3584703940781253
9.023012815965615
12.920290215758328
7.330366526299975
1.6238506947215683
2.6513529248959635
9.354364793274664
20.778976309172556
0.5429035599860903
4.095269230359053
12.12118055406028
0.0
0.0
0.0
9.686180015374049
0.0
10.582232285857401
7.147523928004915
0.2367767914529808
5.73066188095612
0.05476289052236183
0.0
20.199542496120415
0.0
0.0
0.0
44.08130843351106
12.043050017852096
1.0080726567962666
12.7473400300



Elapsed time to compute best fit: 2.338 seconds
Cross-validation score: 0.2723162101695527
Test score: 0.2814001372683596
Best Hyperparameters: {}
20.61638088037244
1442.4474194276875
2153.0743120827724
40.70599681396651
79.36441813781927
208.3538055265389
8.442039776686157
0.0019872347661348266
0.00033295917366160666
0.13980597880109258
14.899465143966365
9.744641808421632
18.016823348065373
0.0
0.4057763657028701
0.0
20.600358948511595
1.0595764042757387
56.819813600306865
26.031967111152277
0.47975588012923254
0.5488908413398255
67.52978078372371
11.601030524652444
0.040545153478888096
2.672946167293894
2.6932111985381297
32.18233632335735
16.455661075972134
1.6510193727928009
0.2761802121094661
5.941192890177242
1.4026419656047158
8.750056246616168
29.51769632469615
0.0
0.0
0.0
16.641882300143468
4.887196093797684
10.991830239309634
2.701614276586028
6.7110926346589155
6.798730410721399
2.4941658250887713
0.0
3.0252405323275973
0.0
0.0
0.0
15.193202374183452
36.61416070926104
11.77



Elapsed time to compute best fit: 2.473 seconds
Cross-validation score: 0.2705822733140661
Test score: 0.26907356948228883
Best Hyperparameters: {}
87.43935496113488
1033.1790243199503
2307.162652275893
80.0954209677875
82.54312286938335
63.7163141917806
0.635732262991052
2.219829858804644
0.015088099986314774
0.11390600353479385
108.92999196355558
200.12309624296336
14.739648642653805
0.0
0.0
0.0
2.139523536956643
1.3724744637855295
108.08588101648289
98.44567878913726
8.17740285583568e-05
3.0280269980430603
14.751460609267586
0.04669707737956763
0.24222374102631505
0.04711751298600575
2.3176818878452456
32.239677475691295
0.011779647428982187
5.146008816047114
0.8115643890182582
5.659888137727009
17.235637520732364
3.6964842174705304
7.6163206372060746
0.0
0.0
0.0
9.1597167799101
0.0
15.247954927368411
2.1125079783822898
3.4755545957013965
8.070638411397072
0.9384893383899566
0.11445999890565872
13.792243996804132
0.0
0.0
0.0
7.217612058979206
17.227622682990397
12.256479469092577
0.



Elapsed time to compute best fit: 2.519 seconds
Cross-validation score: 0.26705379014336456
Test score: 0.2521525215252153
Best Hyperparameters: {}
43.06971793073396
1058.66197451956
2600.7454091593354
0.053185198456048965
156.25567392615662
46.492905459842405
0.0028722638121392308
1.1053544662227526
0.036585584250843795
0.03265504904980787
124.84184542833519
38.495606609063614
38.741373110571885
0.0
3.502410213949986
0.0
32.91801957808581
23.992148418195256
25.623537430649986
61.32342790797706
2.3331927792746843e-05
7.511133543464076
26.066626986419806
13.92394397433408
1.3095998576946752
2.2299600800024706
4.750394263785637
11.505224788340342
0.06014307855362044
11.683027259670325
0.1244972694020543
4.624485106679202
1.8343897699768061
34.34991555814129
1.7780691810834945
0.0
0.0
0.0
4.185237047005256
0.0
9.80013095129863
1.1123117598947787
8.729210065894222e-08
3.243022355375186
1.202508775860636
3.773720163735561e-05
1.2609152047255705
0.0
0.0
0.0
23.03572309397943
11.9360351291452



Elapsed time to compute best fit: 2.572 seconds
Cross-validation score: 0.2733642887312332
Test score: 0.24983776768332253
Best Hyperparameters: {}
94.00613837557874
1142.0908687206083
2281.0665961228074
14.526241786777973
108.98356644207024
238.19433682312632
0.4242381352389213
6.424929509999913
0.07703245198354125
0.06971002370482893
38.883795797984405
30.93383452414735
114.74136742600957
0.0
5.885751803902725
0.0
6.063460546549458
66.08659052602543
41.11287162969892
85.48119547284922
3.0652635754080393e-06
1.2807383624871549e-05
0.7941283729842326
8.83596914254717
0.06467089802026749
0.008243459058215774
13.601159855204155
4.959362340021536
3.2949062563188374
28.33107846136801
0.10931965596800097
6.838568769364467
18.764743534952196
0.9242533007956101
2.035070307867727
0.0
0.0
0.0
32.298498597706676
0.002395340008661151
8.326317182113524
1.7629614927719786
5.252370908856392
5.398216961315775
0.9715314771911778
0.0
1.8852674741892506
0.0
0.0
0.0
4.805311105637169
0.139598799440959
5.



Elapsed time to compute best fit: 2.317 seconds
Cross-validation score: 0.2655718249271957
Test score: 0.2709847984137475
Best Hyperparameters: {}
36.717389307392544
1129.564238872298
2403.2033657394068
24.90705087734689
25.1617586321658
196.26168163681126
0.038636801652222275
8.990313304490272
0.5588648798325524
0.6153827730934224
82.40102566336677
98.20142406169848
18.493145570763726
0.0
6.6979599264299585
0.0
8.057754653826251
0.4399702152800711
65.98478228625753
42.959738205937775
3.617499899633003e-08
1.6951300085832344e-10
42.18001657072455
12.276148379336632
2.377345025539398
0.3578337749952144
0.7336147298058325
5.3334192543275485
2.860236504037907
7.057500059160249
1.926820500970945
2.378172347924155
9.263322551994966
1.7576775096058554
9.374235563445726
0.0
0.0
0.0
27.7524065597171
0.057668399065732956
3.5600575603378726
16.86201632958237
3.3318586498571676
2.931608658291731
0.03701379266813909
0.0
7.8374241452167235
0.0
0.0
0.0
2.860962898971138
15.239568212136874
12.0384734



Elapsed time to compute best fit: 2.401 seconds
Cross-validation score: 0.27917302012325046
Test score: 0.26720106880427524
Best Hyperparameters: {}
46.182483411706926
1404.7530781388507
2205.318129625615
23.057153913151744
170.30796029482252
40.86066984140092
0.021634461744341804
1.8853211164235049
0.00019406700448598713
0.6745618805289268
21.927857302391345
144.91316824233627
43.070744311294135
0.0
7.076704339221969
0.0
9.54831367555672
2.9948719857764745
7.6152411528321124
126.03831222239697
0.20501110848552084
0.21387454792420613
1.3831430366996766
9.97068729677599
1.0151030546222293
13.640026389899162
2.6660668476231466
5.0219813405875575
2.023335327419835
3.833256287045466
27.41967342253197
3.9434638488811573
9.361052204784393
0.22341383081514654
0.5699586330374586
0.0
0.0
0.0
34.41379649748463
1.977019934429336e-07
4.68749206507605
12.605377225939264
0.029187752530162925
3.6801721963586544
2.3730396237160476
0.0010422374884910823
2.0298226212532526
0.0
0.0
0.0
27.61412237898621




Elapsed time to compute best fit: 2.318 seconds
Cross-validation score: 0.26795813098126214
Test score: 0.23809523809523805
Best Hyperparameters: {}
13.051464705514539
1058.0296836338948
2549.6117142666476
0.07643689960241318
21.741878070494717
79.93570204247604
7.28722921124747e-05
3.3081694562162025
0.33263603712893397
6.016552815957255e-07
174.1901076982663
254.50555196503416
4.518214007748611
0.0
4.4821385103705325
0.0
0.7723352980890659
39.88317276444954
53.308461326128516
21.712897587669463
0.3232243912489352
2.4759070527125004e-06
4.796884042601846
5.935778521995015
0.03678798247710802
5.121331388130784
0.6748537193292958
69.43811262808907
1.0113051164380877
2.1300107861036577
7.3840837155431585
1.7910059753447185
1.216314936121421
8.964647781145683
11.759161916927058
0.0
0.0
0.0
30.67414695947107
5.699299920891576e-09
1.7903583232909788
3.8393563492027916
0.0
3.639319385870472
0.5787037897456315
0.0
1.0448438944790688
0.0
0.0
0.0
5.496104661232039
17.016551647880416
15.76594238



Elapsed time to compute best fit: 2.306 seconds
Cross-validation score: 0.2591332121840334
Test score: 0.2921840759678597
Best Hyperparameters: {}
56.98560655241923
1184.525222331335
2177.5062136877305
65.16149972687364
8.631694818390486
124.34992693109812
1.5049919930397664
0.01883166958577931
4.2126700350308965e-07
0.0641105646500364
14.852666567584848
214.32693897406568
58.8837610947078
0.0
0.0021582714845931328
0.0
12.977859803289725
11.052473791255824
123.87410220728064
43.7999427961716
0.04146579938200967
5.702892065428955e-05
6.794495878873249
9.23859495543173
0.11797047753499612
0.7495478825004813
4.949993109373054
18.981774194878003
2.0149947858569863
4.906520686722942
2.5103542481526455
3.4063437536891144
2.874296023521458
15.475742167751243
14.795589161341923
0.0
0.0
0.0
26.17163851151524
0.0
21.70470566428095
11.725903052065778
0.0
3.694740616741683
1.5832928014048262
0.06285449862480164
0.009121918042060972
0.0
0.0
0.0
7.93439077118378
47.182410910825325
2.0497936098674314



Elapsed time to compute best fit: 2.319 seconds
Cross-validation score: 0.27014481091700376
Test score: 0.28388928317955997
Best Hyperparameters: {}
11.068132417028664
1027.009024173628
2239.790871390754
117.17455551771062
65.8573123985232
40.369304462548826
6.649130471436905
1.6027734325116825
0.8175647480529733
0.0007484053318407291
177.5048485480308
102.4884415406559
141.81928916191373
0.0
1.1375789758468855
0.0
0.06700198963671289
8.157734691028102
24.28411153272286
69.98980536966755
5.236135089394061
2.8155564534192763
27.235508067160254
1.4287004237485519
0.2634853075703609
5.953105307540149
4.915688781496188
18.02893095084673
3.7558279145107907
8.152537540728034
3.2660849135938563
15.110418283970233
7.71601883702069
1.6214304303416829
8.3635359310911
0.0
0.0
0.0
23.933403363907715
2.069969892501831
9.94497353568074
10.682396311124583
9.286330282520794e-08
6.669427081175959
0.48761851942287393
0.0
0.8276031729744524
0.0
0.0
0.0
7.731971161091255
19.285736996976354
8.9841605744465



Elapsed time to compute best fit: 2.382 seconds
Cross-validation score: 0.2704131059837098
Test score: 0.25792417650714733
Best Hyperparameters: {}
43.88773949631404
1162.7817760045637
2356.963333252574
82.93826784243979
14.7315341641887
200.19491291449694
0.630664209017292
0.23790015525451125
0.09397402796457754
0.00029741430626017973
13.450556942517936
142.94362257793406
31.293881345255215
0.0
2.3712127821672766
0.0
28.446173249546643
3.9723308343136843
65.4290195292136
80.5606092414345
0.14325869642198086
1.8684779703617096
6.081594261003374
0.9298483277279956
0.3575360029935837
0.19865869948989712
0.06680443465947794
8.95186059197553
22.056893567230947
2.6931015706897643
2.171970738616933
6.009173317477432
7.515970722604221
5.582622991559216
35.91342059633928
0.0
0.0
0.0
2.5141295181658454
0.32106900215148926
1.00151074926141
11.408634142461784
0.21042179316282272
8.93477727540808
2.501908034124649
0.003964725083292819
4.50728598068771
0.0
0.0
0.0
14.526521337380013
15.619898709833



Elapsed time to compute best fit: 2.460 seconds
Cross-validation score: 0.26419745285470364
Test score: 0.25450031036623216
Best Hyperparameters: {}
124.61492977086574
1106.1599762492963
2322.1319749496133
2.448615524946945
123.66444055873285
127.63432546277022
1.2786309552839157
1.639203237988113
0.0006491660024039447
2.2194108374419557
26.496110708367603
55.28951468536125
12.575472127624074
0.0
2.7740524920859038
0.0
4.250420712975597
17.283764013901642
73.79412459242157
56.54324954999721
1.9905196524898585
11.567624517489968
25.842921284573578
1.6584142505891748
1.234141420234664
0.10704605651324073
3.5073822344784276
5.53700416913863
1.7206058413354413
16.189872061957978
6.90681063845841
12.693661079768406
7.431968908605301
5.294424028762666
14.941734559767465
0.0
0.0
0.0
30.454195242801376
0.9875801410526037
23.910551474170575
7.916758083290006
0.14607757551129907
19.924681788451572
0.0003205172743045992
0.023988140281289816
4.906635619978488
0.0
0.0
0.0
6.020307206079906
24.75420



Elapsed time to compute best fit: 2.312 seconds
Cross-validation score: 0.2652940034899884
Test score: 0.2488972904851922
Best Hyperparameters: {}
42.013109923542565
1216.064339416177
2321.0597129421353
21.463935201833237
100.91131153510631
60.40973780801355
0.6049931970773272
0.1462453403521522
1.5431846911931189
2.112030035957382
65.48135782124534
95.03288258807945
72.06262629263651
0.0
3.0212155075130003
0.0
3.70183262688721
8.511697259313575
11.88236957448375
82.74893578473373
0.4186260342593613
0.003995524629260672
41.63197708976091
1.41317763932177
1.566290020942688
1.5040358735537556
0.5486067599217677
6.935730444207984
3.5956225730351434
8.916747045760445
7.8318801028434635
1.3342749678369508
1.9243765812539766
2.8105696072444264
10.13892494308675
0.0
0.0
0.0
36.15449493879479
2.977365042171577
18.41285976900516
6.894239148171378
0.7478111909996272
28.530540233761897
12.543804159267452
0.001512030023150146
3.815116835485135
0.0
0.0
0.0
15.78539242184817
29.615904400440304
11.52



Elapsed time to compute best fit: 2.526 seconds
Cross-validation score: 0.255093047519594
Test score: 0.2668918918918919
Best Hyperparameters: {}
49.26975670895717
1170.7810236682487
2271.2451090386267
40.40069590899591
247.81195600772634
42.72695880750385
0.07335864057631625
2.110725060450039
1.4786370221991092
5.986359741250169e-10
10.85685584874227
40.770609754884745
22.83754260680543
0.0
0.007162426250497234
0.0
15.670994695762973
5.374581873857717
76.31620886319978
141.21350553439973
6.569346878444776
3.620657294988632
18.212224987675917
4.9951276086701455
0.2943902127444744
0.21883134121350167
7.695625712629145
7.337977922997398
49.28783106356289
4.681242971143393
10.42586458121086
4.8198278338819565
3.720415740214875
0.07346378102738527
23.331205961516726
0.0
0.0
0.0
4.358046619916388
0.0
8.844704190466615
19.895447342661612
4.6080174719465035
3.434230214394556
0.55882569246819
0.012702651672270804
0.2898196879499153
0.0
0.0
0.0
22.225766484205177
0.4970578996903896
3.7231675805



Elapsed time to compute best fit: 2.410 seconds
Cross-validation score: 0.2693178708091534
Test score: 0.2643754130865829
Best Hyperparameters: {}
88.190610199668
1142.8567879767736
2251.3194132534745
42.17977506495366
36.562596623367675
138.0755388091405
0.14286999219239682
9.37892508584764
0.03160299270075484
0.03077537402100461
49.40052415689958
100.39556131062673
19.196992271354375
0.0
19.63185076675518
0.0
60.55772095708634
12.486859235065936
12.545102328026548
90.34783739245262
0.2951116270330463
2.7681079506874084
6.417978166674582
3.063988089905912
1.1568683975184513
0.4207061345769034
1.2037225494264128
27.62867600566654
0.3513268265557447
2.750891134143721
0.2104773037135601
2.8177952174391976
29.651924495314354
0.2574201623564747
64.87795009611224
0.0
0.0
0.0
18.417209570650932
0.0
8.58150178194046
9.409559543520809
1.2250400232005632e-06
12.394874697531497
2.4385482473601705
1.9348599380464293e-06
8.325610705756759
0.0
0.0
0.0
30.601029340467626
14.84899306184705
8.56610445



Elapsed time to compute best fit: 2.375 seconds
Cross-validation score: 0.26712402390626644
Test score: 0.2762619372442019
Best Hyperparameters: {}
41.968075599347266
1021.967597368964
2375.679783516923
75.24108072771138
115.99760433912813
126.94322951512451
0.48781705613890836
0.7880640158182359
0.05111140105873346
0.019861100241541862
13.944879246564327
114.41980989911474
36.157110523835534
0.0
1.3979258669827601
0.0
19.25122432176088
16.46999363526379
46.79118738043632
73.53767656911714
0.23438750840552336
1.3224048132087773e-06
36.87607482794374
8.69608715896696
13.230066227653879
0.0060144886792841135
11.640529755706844
19.47824710364388
52.44981963332701
4.197502198619463
0.45343445458432186
35.59457387849352
33.81274013266294
0.051106837029053054
9.946414700259679
0.0
0.0
0.0
16.4061536005253
1.1841700077056885
53.4896421121173
1.7497194051815295
0.0
17.87094438707882
5.0875637902365085
4.397522993385792
1.324937952532042
0.0
0.0
0.0
2.2108199997874127
14.354147336343395
8.68543



Elapsed time to compute best fit: 2.316 seconds
Cross-validation score: 0.2707171056309984
Test score: 0.30688622754491024
Best Hyperparameters: {}
24.607912483149498
903.2238579337794
2424.9867679478825
31.38801709301428
47.096413343540945
171.89381238981704
0.7373303853203264
2.4308609438737676
0.014035020339179027
0.0
27.061599954838222
190.7099146804991
74.63731638726748
0.0
0.6512842767024267
0.0
22.89178689339467
12.31376841253812
37.39608015741417
89.47847371078062
0.0027470150380395353
4.082410752236683
15.914540023042827
4.488103646368721
4.5943993452893075
12.887651814167736
1.220367495992245
3.552156628281012
1.0744628686924678
14.115889968730094
42.31533960619841
3.4161024886093116
2.4970486402385177
13.771737587599047
12.855009015772682
0.0
0.0
0.0
31.45376298181401
0.0008319880793123957
2.90510358537827
7.439490170198042
12.378453641362285
2.3688616935444315
5.528300474347494
0.138635941199027
4.366419112249387
0.0
0.0
0.0
46.571859835498806
21.095745231777983
4.401939866



Elapsed time to compute best fit: 2.467 seconds
Cross-validation score: 0.27259075645678166
Test score: 0.2757619738751814
Best Hyperparameters: {}
149.37868437056196
1220.9950368567163
2231.983831547111
27.04420905899847
59.533886465463574
72.02814210875947
0.034624525237752835
1.0376287894755953
9.436734842438455e-06
0.0
101.9206464713647
182.10013456163875
4.995167661572246
0.0
4.8607001775702985
0.0
6.699340163792921
92.69274339227948
83.32799925461158
3.766136146505177
0.21711157554852267
23.874966711943223
43.44122544032186
0.11641211382141137
6.906078796197562
0.10462601522340265
1.101373288747709e-05
0.2489785242720095
0.9254101340867847
2.121290674290564
0.48617027367753174
3.1352864307170614
1.825468358303768
5.125787643119111
1.0675462888687806
0.0
0.0
0.0
8.050919435590403
0.0
8.75941511061957
0.1727950867511402
0.0
12.777801451241343
0.13735709528432927
0.21897560358047485
3.5816620619093555
0.0
0.0
0.0
6.417711182213251
3.243937493885558
2.74617885369591
0.616510424615718



Elapsed time to compute best fit: 2.449 seconds
Cross-validation score: 0.2694467828750175
Test score: 0.24030110017371167
Best Hyperparameters: {}
59.03852865487158
1399.0522554429094
2222.4787018131233
27.68377005495131
12.666945027746754
80.74886505486076
0.671457420323577
0.151540081840011
0.6542426682242279
1.4371788334101439
129.6037839524031
142.85783283087136
40.3973192757239
0.0
0.1628105966673239
0.0
0.7581472101260829
11.702338801300357
63.454513865279154
48.586784183316446
0.1501410561000598
0.2491578747724077
7.36053973692583
1.4657638900286132
0.573694525454659
2.9950697086998828
0.0272610049541937
28.554140481501694
0.6343571164049602
0.8375912178906901
0.822538499072117
13.58882653677313
24.846352017847153
0.9241068001992794
20.265252529009295
0.0
0.0
0.0
14.378206973869482
0.8648061975161312
17.389330866865123
4.775603767040623
0.17979988610136388
10.743497635507875
6.086817119778701
0.6097339987754822
0.8047035494992119
0.0
0.0
0.0
13.125602850000895
39.50772020344307



Elapsed time to compute best fit: 2.448 seconds
Cross-validation score: 0.2697935168455954
Test score: 0.26281635301752104
Best Hyperparameters: {}
141.58932673233082
933.998746445106
2515.9359180021947
116.10716191292958
31.879242692050774
119.33562249950006
0.6028995802771533
5.823485195828664
0.4337703971625455
0.2263595014810562
131.6843128420211
17.42237313712758
6.402793418966155
0.0
38.526316890458176
0.0
26.60045786925008
7.09755804700972
22.17392922158713
49.46082817546555
0.2247184877055588
3.894115359994297
12.551180370235443
0.7924233859155834
0.09882184254892268
3.7608531868554564
0.04013721122104208
8.05373352533116
7.408569601372195
5.653517393531015
1.9310604085465046
4.146567059340132
8.503113763957934
1.6688626173829206
2.1623834966917057
0.0
0.0
0.0
25.02161098027642
0.0
5.705522116689707
7.315962903995569
0.00735562015324831
16.363489889695494
2.660923986396604
1.4823399782180786
7.958282396299637
0.0
0.0
0.0
2.322552902975112
4.728271838079763
9.423635049992635
2.6



Elapsed time to compute best fit: 2.351 seconds
Cross-validation score: 0.2596660005117719
Test score: 0.29517638588912887
Best Hyperparameters: {}
8.397885194632307
1306.1148170093043
2019.4057120637194
24.88776385533662
80.7672199578922
79.57617108418512
0.6729590124522751
0.1793559814323089
1.4280450045589532e-05
0.957554703050846
62.97103868472914
147.26718271762124
34.540228749560775
0.0
9.472017011896241
0.0
6.297771478376726
12.37970032295378
88.94713314965176
138.14003307205354
13.08033286922083
1.365006681207136
46.18442544250837
5.0779825171279045
23.361572201436502
0.10599499940872192
1.5416366169511249
81.70031838281488
5.060862515596455
0.9934662889854444
4.313050890125498
8.485053067489854
5.097293713600186
0.775005374167705
12.22625878126645
0.0
0.0
0.0
10.404013260267675
0.0
7.825177573254651
7.736320700688111
4.140260102758475e-06
14.769747053128906
2.2284940080345104
0.0
69.03862529022865
0.0
0.0
0.0
23.055701303689553
22.24858415212239
22.055574703647313
0.3069534630

## 4.4 Rebalancing Strategy - 5050

### 4.4.1 Random Forest

In [122]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

fiftyfifty_randomforest_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    pipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    fiftyfifty_randomforest_normalized_performance_df = pd.concat([fiftyfifty_randomforest_normalized_performance_df, new_performance_df])
    
fiftyfifty_randomforest_normalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_randomforest_normalized_performance_df.csv")



### 4.4.2 XGBoost

In [123]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


fiftyfifty_xgboost_normalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    fiftyfifty_xgboost_normalized_performance_df = pd.concat([fiftyfifty_xgboost_normalized_performance_df, new_performance_df])
    

fiftyfifty_xgboost_normalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_xgboost_normalized_performance_df.csv")



Elapsed time to compute best fit: 328.387 seconds
Cross-validation score: 0.5552853710616941
Test score: 0.6407563025210085
Best Hyperparameters: {}
0.008213203
0.0740317
0.18115638
0.023409331
0.025575502
0.0011970132
0.000937525
0.0032342207
0.003512246
0.0008302695
0.019140704
0.18674402
0.007286747
0.0
0.0065664626
0.0
0.0062433095
0.0019110653
0.022544676
0.009321324
0.0015628877
0.0014284629
0.010649279
0.003302912
0.0014573256
0.0010171402
0.001106818
0.0048666224
0.0035975105
0.0010007828
0.0015743576
0.0016296909
0.00078339776
0.011843124
0.0016023851
0.0
0.0
0.0
0.005905623
0.0009378646
0.0049738274
0.0044090566
0.033139784
0.003556894
0.00023329562
0.02812802
0.005473448
0.0
0.0
0.0
0.0046601677
0.0062604505
0.0011090263
0.006161742
0.0068675256
0.006610001
0.00086440414
0.014990058
0.009100717
0.0065768366
0.008210446
0.0036318821
0.0021104778
0.0036850246
0.0034162188
0.006471666
0.004748791
0.0050059063
0.0023839404
0.0038697792
0.0011878056
0.0044144206
0.00794353
0.0063



Elapsed time to compute best fit: 317.078 seconds
Cross-validation score: 0.5749875916667382
Test score: 0.5230496453900709
Best Hyperparameters: {}
0.018732533
0.09800744
0.18962249
0.030189289
0.045280878
0.05440948
0.0010861823
0.0022603637
0.005381973
0.0009108456
0.027759403
0.055659875
0.0071801525
0.0
0.0019461667
0.0
0.008024561
0.0048836907
0.023028925
0.014699801
0.005203829
0.0016320425
0.008893654
0.005610192
0.011937913
0.0023813331
0.0015473189
0.005611762
0.0016478001
0.0074212328
0.006710742
0.0034618448
0.0045599374
0.0038273053
0.0019021359
0.0
0.0
0.0
0.0011048708
0.0035379461
0.004203973
0.003856208
0.019863212
0.0032264574
0.00026611937
5.2278876e-05
0.014395474
0.0
0.0
0.0
0.0068138796
0.003778391
0.0028166482
0.0031667303
0.011950737
0.000984355
0.0025102606
0.029814657
0.0079365745
0.008664113
0.0069548865
0.0023447517
0.0
0.0009997695
0.0017678791
0.017117271
0.0013048137
0.006620017
0.0023284345
0.0051064705
0.0018114125
0.0033113817
0.0011114731
0.0074424953




Elapsed time to compute best fit: 347.296 seconds
Cross-validation score: 0.552122247681966
Test score: 0.6109022556390977
Best Hyperparameters: {}
0.013494303
0.088481985
0.20184368
0.03412458
0.026602404
0.0020439483
0.0023299076
0.0014564671
0.0013331972
0.00079410325
0.017761271
0.13486981
0.0065436554
0.0
0.0025922612
0.0
0.005945369
0.0026810481
0.022159902
0.0071224645
0.001967532
0.0019804242
0.005766314
0.002202839
0.014905843
0.0042355536
0.0014202965
0.0077996417
0.0024099613
0.0016638158
0.0021959338
0.001893
0.0010127933
0.006229017
0.00066312583
0.0
0.0
0.0
0.0036840911
0.002068994
0.004771385
0.0032890209
0.02859658
0.0018601734
0.00076320046
0.0028268865
0.011370162
0.0
0.0
0.0
0.0011499715
0.001339371
0.003525199
0.00033528858
0.0074291965
0.010947118
0.0023138148
0.013881034
0.011359341
0.006579949
0.009507522
0.0023669468
0.013976873
0.0028603538
0.0014980454
0.011223803
0.0010395471
0.00010506177
0.012592523
0.0055202567
0.0012641776
0.0028442459
0.00051302824
0.011



Elapsed time to compute best fit: 370.616 seconds
Cross-validation score: 0.5527232940834415
Test score: 0.6404958677685951
Best Hyperparameters: {}
0.010874956
0.07999912
0.19460106
0.02635401
0.024443228
0.0012726266
0.005897586
0.001397818
0.00091893575
0.0041618287
0.017557055
0.14247453
0.033888914
0.0
0.0042660898
0.0
0.0034993577
0.0014269382
0.020788232
0.0055796
0.0016976302
0.007870818
0.009992053
0.002420524
0.006724179
0.0027464812
0.0026934692
0.007854348
0.0011011933
0.00066348613
0.0011319636
0.0012390015
0.0033692943
0.011006414
0.00077371084
0.0
0.0
0.0
0.0040506255
0.00323179
0.00086379185
0.003603222
0.021983126
0.0031744242
4.6297326e-05
0.0020053475
0.0065923333
0.0
0.0
0.0
0.0031908785
0.0077579883
0.0031224675
0.00029174995
0.012768486
0.0056767357
0.0006057808
0.017394345
0.0077576907
0.008323805
0.0058916905
0.0023774179
0.0004850315
0.0013246537
0.000726212
0.005215259
0.0042517837
0.008538413
0.0026762704
0.0015967502
0.003410049
0.0017116661
0.012419664
0.00



Elapsed time to compute best fit: 370.377 seconds
Cross-validation score: 0.5735450728840137
Test score: 0.6199186991869918
Best Hyperparameters: {}
0.0346266
0.09123541
0.18975018
0.033526015
0.027404256
0.057833236
0.0016448314
0.0020971692
0.010165889
0.00093119947
0.025779106
0.03700674
0.014965059
0.0
0.00083714107
0.0
0.006483079
0.0011910648
0.013089941
0.0030815874
0.001880899
0.010962266
0.0059775272
0.0044674627
0.0036176287
0.0033290265
0.0013313831
0.006610633
0.0051533342
0.00079319614
0.003035882
0.006786561
0.0017607241
0.008406416
0.0011497427
0.0
0.0
0.0
0.00615866
0.0047691045
0.003132731
0.0033782069
0.019851506
0.0038618809
0.0036142576
0.017648105
0.011985465
0.0
0.0
0.0
0.0013847151
0.0035539789
0.004929309
0.0017716396
0.030216975
0.0010737326
0.0024965843
0.0122563895
0.0042073606
0.007041691
0.0056274645
0.0018754796
0.0
0.0029341811
0.0048661046
0.0032013734
0.006041627
0.0059520015
0.0017191741
0.0010481792
0.0026581916
0.0014703898
0.0011816027
0.014952845
0



Elapsed time to compute best fit: 358.655 seconds
Cross-validation score: 0.5802387918844358
Test score: 0.5820610687022901
Best Hyperparameters: {}
0.02457455
0.08884834
0.20504545
0.031290665
0.029293098
0.0018457926
0.0025157498
0.00068341347
0.0014593967
0.0005664733
0.029176608
0.1058117
0.026271973
0.0
0.0055085793
0.0
0.004406756
0.0007777025
0.01129848
0.0063045453
0.004039226
0.0018485712
0.006525264
0.0016527129
0.004720644
0.0055625173
0.00265068
0.0048069376
0.0012011096
0.003471611
0.008111679
0.0024826012
0.0009791785
0.0056577306
0.0023938124
0.0
0.0
0.0
0.00094144995
0.0026327604
0.003642784
0.0027748877
0.026524749
0.006052746
0.0017670841
8.771534e-05
0.009330796
0.0
0.0
0.0
0.003583042
0.014269037
0.001834646
9.967947e-05
0.00410511
0.02293477
0.0064647445
0.014581235
0.005970748
0.010566627
0.005138572
0.003575386
0.00012452072
0.0026848875
0.00256015
0.009239461
0.00092327717
0.0010101724
0.0022366878
0.0016474401
0.0015662655
0.0043137106
0.0050720656
0.0021866183



Elapsed time to compute best fit: 369.529 seconds
Cross-validation score: 0.5655004881214386
Test score: 0.549645390070922
Best Hyperparameters: {}
0.018670116
0.084679104
0.19430311
0.03583692
0.020625237
0.0036057497
0.002000017
0.001885684
0.0006508823
0.0007879209
0.03424526
0.11651849
0.020556552
0.0
0.0032888008
0.0
0.0046061114
0.0015767494
0.010328065
0.012110719
0.0012141276
0.0009645936
0.0075438702
0.0027669868
0.008019491
0.00021572458
0.0035316374
0.00982429
0.0057915687
0.00092843594
0.0014171135
0.0025198965
0.0010032004
0.006481908
0.00095929427
0.0
0.0
0.0
0.00538427
0.004763267
0.0022109337
0.005700922
0.032872066
0.0021625203
0.0017520827
0.008063019
0.008412563
0.0
0.0
0.0
0.0027867893
0.0049331505
0.004015486
0.003317855
0.00866319
0.010981763
0.0009313331
0.012045576
0.0040400825
0.008693108
0.0079644425
0.0019576205
0.0011896417
0.0018573375
0.0017893483
0.0069035823
0.0014982279
0.000964993
0.004513201
0.0014680185
0.0035154345
0.009126131
0.0011502453
0.0004971



Elapsed time to compute best fit: 382.187 seconds
Cross-validation score: 0.5721360104802498
Test score: 0.6152343749999999
Best Hyperparameters: {}
0.018130789
0.07869677
0.1848646
0.03370671
0.014814817
0.02005805
0.0013645112
0.0014778284
0.00091483735
0.0070190947
0.024640508
0.17476559
0.013354414
0.0
0.0010576317
0.0
0.006026339
0.0032327038
0.019894078
0.00067345594
0.002988438
0.0013842805
0.0080236485
0.0021257328
0.0033056533
0.0038648685
0.0008975335
0.005846156
0.0015753008
0.0014368872
0.005416152
0.0025007287
0.0006572152
0.0074827913
0.0025813452
0.0
0.0
0.0
0.0025958223
0.0021550532
0.004120598
0.0046255407
0.025211887
0.0038173737
0.0009122961
0.0
0.009165964
0.0
0.0
0.0
0.0016246656
0.006902521
0.004275582
0.00096092833
0.005460864
0.0054442473
0.009103572
0.011749149
0.006740511
0.01027716
0.0045822705
0.0031106416
0.0005122391
0.0014219017
0.0013008637
0.0059764115
0.003532585
0.0024215144
0.011830875
0.0047291936
0.001084573
0.006560422
0.005288508
0.006650166
0.00



Elapsed time to compute best fit: 363.357 seconds
Cross-validation score: 0.5904969926304109
Test score: 0.587121212121212
Best Hyperparameters: {}
0.018285178
0.07877556
0.21495631
0.025802251
0.049457323
0.054998167
0.0028544948
0.0017396797
0.0013961055
0.00068773725
0.025799796
0.013669486
0.010649109
0.0
0.0024422186
0.0
0.01424281
0.003909571
0.00741026
0.01996728
0.0027195434
0.011574008
0.005511935
0.0026566242
0.0011351034
0.00031546975
0.0007170823
0.006985959
0.007950894
0.0011757193
0.003498544
0.002083215
0.0013584228
0.011872219
0.002227437
0.0
0.0
0.0
0.0014721835
0.0047834367
0.0057234974
0.00083305413
0.015957698
0.0023560992
0.0014832444
0.034176327
0.0076233004
0.0
0.0
0.0
0.0050779595
0.0044356342
0.004188306
0.0064057736
0.008807364
0.0006475876
0.0032182287
0.022023665
0.017860558
0.007904151
0.0047629056
0.0024554546
0.00085264
0.0005824467
0.00601617
0.016437948
0.0014437156
0.002452835
0.00091392465
0.0010800672
0.0022805484
0.002722733
0.008673495
0.001988922




Elapsed time to compute best fit: 341.143 seconds
Cross-validation score: 0.5599110707665538
Test score: 0.5130597014925373
Best Hyperparameters: {}
0.038124166
0.079698905
0.22144558
0.02954767
0.054448728
0.0025461346
0.0032895028
0.0018490744
0.0008601242
0.0065698456
0.013616442
0.039763786
0.034851927
0.0
0.0026906861
0.0
0.012877523
0.001145193
0.010507826
0.0056988923
0.003590214
0.00090296636
0.009988855
0.002584928
0.0023721252
0.0019399904
0.0031158542
0.007509679
0.002244136
0.001600179
0.004069977
0.0019039555
0.0004669519
0.010011555
0.0021107704
0.0
0.0
0.0
0.0030644662
0.0021228932
0.00327285
0.0031358695
0.01783637
0.0030530442
0.0009653262
0.00039802675
0.004841284
0.0
0.0
0.0
0.004537397
0.0076184687
0.0010535953
0.06538183
0.0038767175
0.0016557964
0.003057327
0.010909493
0.0027584217
0.009356262
0.0044716853
0.0039501083
0.00075139955
0.006619516
0.0061576557
0.0057530096
0.0011002498
0.0042567914
0.0014606261
0.0014917995
0.0022734872
0.0032109104
0.0013410412
0.00



Elapsed time to compute best fit: 341.649 seconds
Cross-validation score: 0.5580249965451716
Test score: 0.6066176470588237
Best Hyperparameters: {}
0.031832
0.08319016
0.1888618
0.026769122
0.025472462
0.057574026
0.0008867033
0.004696967
0.001096117
0.00032272958
0.030938178
0.014949323
0.024298009
0.0
0.0052698557
0.0
0.0021510562
0.002209708
0.017948031
0.002995087
0.002368904
0.0027246063
0.010411723
0.015581782
0.00056209974
0.0033611995
0.0036066526
0.007795824
0.0018067125
0.008843405
0.0058782916
0.0019005495
0.0010371233
0.007015939
0.0020768843
0.0
0.0
0.0
0.0023996814
0.003889195
0.0033791745
0.0029108026
0.03542257
0.0022305637
0.0011141365
0.009920393
0.008407565
0.0
0.0
0.0
0.0051548057
0.003785076
0.0034346168
0.0033333567
0.0071069575
0.0018809764
0.0019354844
0.02605154
0.0060397866
0.008078233
0.0038027107
0.003687783
0.00275409
0.0006040485
0.0019251463
0.014698329
0.00429528
0.011653291
0.0035480398
0.0028545826
0.0014866239
0.0035503984
0.0018865824
0.006862431
0.



Elapsed time to compute best fit: 315.270 seconds
Cross-validation score: 0.5685515396576369
Test score: 0.5589430894308943
Best Hyperparameters: {}
0.015601818
0.08173401
0.18537967
0.02580239
0.022705635
0.004226802
0.0023575956
0.0014924079
0.0013553284
0.00036227968
0.025134621
0.10351837
0.008010839
0.0
0.008112204
0.0
0.010871092
0.0047184047
0.026060496
0.015790254
0.0028107932
0.013430118
0.0053004865
0.0027895279
0.0032342512
0.0014243757
0.0023270098
0.0059072254
0.0037926238
0.0015244505
0.008008366
0.0037145542
0.0013947265
0.0057569006
0.0011973345
0.0
0.0
0.0
0.0058949497
0.001066186
0.0061567635
0.0045516845
0.053115383
0.0032562094
0.00024893996
0.025666757
0.0068362663
0.0
0.0
0.0
0.0017785841
0.014816293
0.0018803484
0.0037277783
0.016121745
0.0068053426
0.005631822
0.017091455
0.00846272
0.0074122576
0.0070799645
0.0012377437
0.0030641193
0.0015307369
0.011969909
0.01035032
0.0007047869
0.0033978587
0.0009194706
0.0017857866
0.003088612
0.0045902156
0.00067035516
0.0



Elapsed time to compute best fit: 333.485 seconds
Cross-validation score: 0.5782369334636901
Test score: 0.5700000000000001
Best Hyperparameters: {}
0.036177423
0.08335179
0.20420958
0.027401444
0.020223007
0.02313786
0.0003873599
0.0010821708
0.0026702771
0.00026477416
0.01931763
0.05672638
0.009129855
0.0
0.0014335776
0.0
0.0019007055
0.0018106756
0.016916819
0.0057305866
0.0019691798
0.0017714914
0.019143617
0.0023240817
0.0011889227
0.0033428988
0.0021668812
0.0045397757
0.0067256065
0.0016804877
0.00053674006
0.0019595637
0.003947971
0.0066556134
0.000905071
0.0
0.0
0.0
0.001620249
0.002336441
0.0069029224
0.004329203
0.02517849
0.0021200092
0.007836595
0.0022890295
0.0135637
0.0
0.0
0.0
0.007420827
0.0036946544
0.0006648586
0.0
0.012929091
0.0006451197
0.0020930215
0.027554842
0.0041465773
0.0071913064
0.0033025348
0.0047139428
0.006298283
0.0016965315
0.0005347445
0.012942277
0.0055176793
0.0023540081
0.0053648404
0.0072188787
0.0030362855
0.005493142
0.0020355242
0.0005893697
0



Elapsed time to compute best fit: 359.676 seconds
Cross-validation score: 0.5544665381094191
Test score: 0.5910852713178294
Best Hyperparameters: {}
0.0059804847
0.086980075
0.20613593
0.022439118
0.048794027
0.0072345706
0.0017516302
0.0015255604
0.0030350976
0.0012162552
0.02666935
0.072544456
0.019484786
0.0
0.004265354
0.0
0.0042452132
0.0030590473
0.03072402
0.014956027
0.0017091266
0.00091071776
0.006512654
0.0057599726
0.0024146414
0.00027926406
0.0011711451
0.0076583503
0.0024952732
0.0009081364
0.009064478
0.0014012989
0.0008444569
0.0073945494
0.0011859714
0.0
0.0
0.0
0.003942569
0.0025118385
0.005751437
0.00449247
0.046548154
0.002477574
0.007682984
0.0007102214
0.00996951
0.0
0.0
0.0
0.0018295015
0.001637094
0.0017728873
0.0009299465
0.0039942483
0.0021544767
0.0031117112
0.010847925
0.012228155
0.012524405
0.00514967
0.0024590841
0.0021453053
0.00033023782
0.0008140109
0.016990885
0.0039667427
0.0012338371
0.0068876212
0.0018226319
0.0011110757
0.0021351043
0.018094087
0.0



Elapsed time to compute best fit: 338.301 seconds
Cross-validation score: 0.5432450959006137
Test score: 0.6003937007874016
Best Hyperparameters: {}
0.009640811
0.09420741
0.20803005
0.026728947
0.019214537
0.030139728
0.0016487838
0.0012586553
0.0013263018
0.00017665261
0.028681537
0.048020586
0.024176525
0.0
0.0032355834
0.0
0.0052452725
0.0059577115
0.0076451832
0.0056092013
0.0020506363
0.010015047
0.009963764
0.0012095867
0.0015061825
0.0027063065
0.0024404947
0.006641746
0.0031326213
0.0032666305
0.0047098906
0.0024854648
0.0028351406
0.008895942
0.000793811
0.0
0.0
0.0
0.0007836094
0.000818757
0.003035856
0.0006505317
0.046001058
0.0035288485
0.0006367925
0.0015755981
0.0144391535
0.0
0.0
0.0
0.0045100753
0.011053654
0.0013315444
0.021851774
0.0065753167
0.0020081548
0.0016355286
0.020836744
0.009611295
0.012440784
0.007951748
0.005030725
0.00026913406
0.0013981387
0.0054295766
0.008733945
0.001264505
0.0009038066
0.0020761604
0.0038106136
0.0020609312
0.0019410437
0.002485247
0



Elapsed time to compute best fit: 349.837 seconds
Cross-validation score: 0.5474769554579623
Test score: 0.5690298507462687
Best Hyperparameters: {}
0.009880016
0.0872629
0.17854643
0.02396776
0.026568394
0.052754052
0.0038303882
0.0011875889
0.0010673968
0.001091267
0.023912283
0.10416027
0.012752452
0.0
0.0015120186
0.0
0.009481972
0.0012679079
0.013091326
0.005875231
0.0015352425
0.0017396451
0.0067625646
0.0025954524
0.006550836
0.0048208856
0.0022381062
0.0058309855
0.003655281
0.0034786374
0.0052523687
0.0010989993
0.0055876356
0.006076408
0.0027937118
0.0
0.0
0.0
0.0027976304
0.0008189617
0.005699766
0.0050263647
0.013410978
0.004015698
0.0027792563
0.008054282
0.0087944325
0.0
0.0
0.0
0.0033008922
0.007917362
0.0022216577
0.019404802
0.011663191
0.007662688
0.0005789375
0.017355174
0.0038047945
0.010005214
0.0069423746
0.0028059557
0.0043579647
0.00075548346
0.0015608108
0.015192647
0.00096199557
0.0036821356
0.0023385494
0.0069796974
0.0017203531
0.0053912187
0.0015187538
0.00



Elapsed time to compute best fit: 353.604 seconds
Cross-validation score: 0.570461367616775
Test score: 0.5545774647887325
Best Hyperparameters: {}
0.008270593
0.09171843
0.19856547
0.02480126
0.057649124
0.039237447
0.0022088897
0.0025822318
0.00030089723
0.0016752243
0.019085128
0.07024549
0.026462028
0.0
0.0022314992
0.0
0.005754499
0.0048904824
0.012609086
0.016503321
0.0014668431
0.00038316252
0.001387336
0.0016735721
0.01033774
0.00018766013
0.0013803345
0.009532118
0.0024427364
0.0017453745
0.0015035631
0.0022659611
0.0018040603
0.008110737
0.0040712166
0.0
0.0
0.0
0.0017202258
0.004654122
0.0031730605
0.0064059524
0.025360422
0.0030031407
0.0118780555
0.01721343
0.0062187174
0.0
0.0
0.0
0.003063972
0.004827607
0.0028941918
0.0002505939
0.003968099
0.006335251
0.0055765053
0.013325835
0.006729981
0.010119258
0.009840548
0.0020475849
0.00010985971
0.0027647626
0.0013038077
0.009032404
0.0009262864
0.0017328848
0.002477519
0.0015820365
0.002320692
0.004970668
0.00725159
0.01300928



Elapsed time to compute best fit: 330.919 seconds
Cross-validation score: 0.5688306236851988
Test score: 0.5615942028985508
Best Hyperparameters: {}
0.024716245
0.08653763
0.23013495
0.033157237
0.02073142
0.007724762
0.0026175878
0.0013204217
0.0004256831
0.002247992
0.023139797
0.10183552
0.00837455
0.0
0.0035944837
0.0
0.004487041
0.009197672
0.021701833
0.007297707
0.0032465449
0.0025817845
0.014152499
0.001158292
0.0048605427
0.0010300447
0.0020164587
0.0069641136
0.0026244908
0.0048877634
0.0040339963
0.0010202272
0.0008454084
0.005262468
0.002537787
0.0
0.0
0.0
0.0030462113
0.0026857802
0.004154353
0.0033813452
0.018230757
0.0064651333
0.0032690011
2.9654011e-05
0.013895297
0.0
0.0
0.0
0.0032029517
0.003097985
0.0007593589
0.0017930472
0.015582966
0.003926582
0.001026908
0.012168462
0.015173945
0.0076054283
0.009399132
0.004671056
0.007824681
0.0013141877
0.009459946
0.0019319431
0.0008104456
0.0011657487
0.007247545
0.006305417
0.002301483
0.0021586986
0.0026200083
0.0006472248



Elapsed time to compute best fit: 358.772 seconds
Cross-validation score: 0.5580270534809217
Test score: 0.5615942028985508
Best Hyperparameters: {}
0.0037376701
0.09182476
0.1862418
0.017942877
0.020049635
0.0030698415
0.0034117925
0.0024902956
0.0075588794
0.00015638451
0.023457343
0.15479432
0.010608634
0.0
0.0038400525
0.0
0.0012328078
0.0038560042
0.0501404
0.0010879292
0.004333146
0.0010456776
0.005751614
0.0036676556
0.004687771
0.00033364102
0.0023606387
0.011139501
0.0022249313
0.001540318
0.0010436005
0.0016445386
0.0020858226
0.007370277
0.00059001753
0.0
0.0
0.0
0.0023871558
0.006252885
0.0029801407
0.008593732
0.020643251
0.0035305445
0.0032587615
0.0026791843
0.017618276
0.0
0.0
0.0
0.0068494887
0.0009396534
0.004444211
0.0009978479
0.009447926
0.008421705
0.007489568
0.0075477306
0.0071915016
0.011771169
0.008646601
0.0017003862
0.005605971
0.0010898366
0.003155644
0.008935663
0.0021568125
0.008365673
0.000519189
0.004508053
0.0021759262
0.0046698744
0.0005329116
0.00146



Elapsed time to compute best fit: 362.991 seconds
Cross-validation score: 0.5579055627716455
Test score: 0.5813953488372092
Best Hyperparameters: {}
0.04355183
0.083679184
0.18071645
0.0319099
0.020832438
0.030215435
0.0056922142
0.0016549195
0.0014929688
0.003026228
0.022896845
0.053051434
0.03648069
0.0
0.00083667075
0.0
0.010795382
0.0012958638
0.013907527
0.010605156
0.0010274905
0.00037280304
0.0080343485
0.00086161814
0.00921507
0.00403908
0.011220766
0.0047189235
0.0036689357
0.0022350051
0.0042183828
0.00035622093
0.0015826602
0.0036653832
0.0010640151
0.0
0.0
0.0
0.003427731
0.010585347
0.004129885
0.0059433538
0.03041621
0.0017402709
0.0011493389
0.0071921917
0.008437636
0.0
0.0
0.0
0.003850772
0.006760445
0.0018321425
0.003885121
0.014156218
0.008478962
0.009881894
0.010645594
0.009813533
0.00944012
0.0066999784
0.0029831317
7.346936e-05
0.0029306603
0.0014218729
0.006452365
0.00052114844
0.00077182276
0.0028653976
0.0021806227
0.0014573187
0.0025795028
0.0010445053
0.010405



Elapsed time to compute best fit: 341.972 seconds
Cross-validation score: 0.5656464528326618
Test score: 0.6025179856115108
Best Hyperparameters: {}
0.007220923
0.087245926
0.20703356
0.035422396
0.019635256
0.0010102829
0.0008436159
0.0015459055
0.0028135192
0.00053918274
0.02842969
0.14219889
0.03015998
0.0
0.0023836363
0.0
0.006488414
0.0017221548
0.015305289
0.014186086
0.0024630285
0.00094678474
0.013440212
0.0012835341
0.0007951304
0.0028015666
0.0065529076
0.0065523987
0.0005851332
0.002928042
0.0010686083
0.00085362996
0.0011840357
0.008610249
0.0027067612
0.0
0.0
0.0
0.0069471756
0.0037240032
0.005143707
0.0040510884
0.02863099
0.0042925435
0.00055502227
0.00015408745
0.0036648784
0.0
0.0
0.0
0.004619618
0.0070698885
0.0049684034
0.0015406237
0.003598913
0.004364566
0.007050221
0.017010141
0.010115444
0.0081573
0.0062537203
0.004877064
0.00013601632
0.0015057304
0.0040076477
0.011319114
0.0020899037
0.0031239805
0.003707082
0.008306682
0.0035281407
0.0034737631
0.0011995552
0.



Elapsed time to compute best fit: 351.757 seconds
Cross-validation score: 0.600915648224274
Test score: 0.5217391304347826
Best Hyperparameters: {}
0.029394891
0.09157795
0.2168654
0.029253742
0.02721789
0.02389837
0.004576979
0.0012316946
0.0011747248
0.000640224
0.02175392
0.05868358
0.025296232
0.0
0.001983987
0.0
0.0037562838
0.0014597013
0.014682139
0.017298194
0.002258483
0.0008954316
0.007024057
0.017851193
0.002813143
0.0005803062
0.0013680776
0.007934198
0.0039814822
0.006532744
0.0008066804
0.0018519254
0.00064184464
0.00565659
0.0042812335
0.0
0.0
0.0
0.0023202444
0.0029844136
0.005931966
0.003515094
0.01211063
0.0032533947
0.0013408375
0.007154383
0.0070583783
0.0
0.0
0.0
0.0010568104
0.0021954773
0.0057685585
0.0011235767
0.00807868
0.0043553337
0.008993595
0.015454274
0.0064169914
0.009594005
0.002931728
0.0023450213
1.4641699e-05
0.0059994315
0.0014159348
0.005831733
0.0031241488
0.004411758
0.00011240929
0.002642677
0.0043116123
0.0059774923
0.0011343793
0.00054115383
0



Elapsed time to compute best fit: 383.370 seconds
Cross-validation score: 0.5533531129087338
Test score: 0.6402439024390244
Best Hyperparameters: {}
0.012078198
0.09333026
0.2010629
0.02933016
0.027888369
0.0024986442
0.0018362432
0.0011418856
0.002662923
0.0007847308
0.015140373
0.1468924
0.008005905
0.0
0.006461602
0.0
0.008701895
0.002517806
0.018098222
0.015721753
0.002867776
0.0009555167
0.005908243
0.007529901
0.008062213
0.0009687278
0.01699204
0.007042316
0.00052357593
0.002489582
0.0010880514
0.0012020746
0.00081477023
0.010231986
0.0020436747
0.0
0.0
0.0
0.0054987227
0.0027788104
0.0017396163
0.0019787198
0.033154953
0.0038585952
0.0030945484
0.010463626
0.008322518
0.0
0.0
0.0
0.0035692963
0.004196985
0.0021400999
0.00073996413
0.007264536
0.004956434
0.002739479
0.009587201
0.0031349012
0.0077277105
0.004712572
0.0030536798
0.0008090049
0.002006294
0.0037736443
0.00031883558
0.0076281438
0.0069589484
0.0011024812
0.0032756594
0.0016941684
0.0046739145
0.0026210868
0.0030263



Elapsed time to compute best fit: 386.155 seconds
Cross-validation score: 0.5823091973198309
Test score: 0.5523255813953488
Best Hyperparameters: {}
0.0074173147
0.0815514
0.20083302
0.02386513
0.02709111
0.11642974
0.0020257242
0.001536078
0.0024012362
0.00037928336
0.013310354
0.02302131
0.022639006
0.0
0.0020430302
0.0
0.0029810767
0.001726555
0.025922405
0.014058302
0.0017818458
0.0068131043
0.0068084076
0.0006978566
0.00274879
0.0018688696
0.003196715
0.00588062
0.0036576476
0.0005040725
0.004908861
0.00041877074
0.0012565127
0.0038711259
0.0018962454
0.0
0.0
0.0
0.009310218
0.0021385066
0.003914177
0.0029627907
0.035646643
0.0036377292
0.0011946107
0.00039250177
0.008443913
0.0
0.0
0.0
0.007311518
0.005824117
0.0014887905
0.0012372414
0.012827762
0.006042249
0.0018397472
0.011705747
0.008739005
0.010963427
0.0067730043
0.0020728782
0.0021059548
0.0016299152
0.0017403839
0.003540057
0.0006304326
0.0007101305
0.0015163698
0.005866859
0.0030057621
0.0036920195
0.0046217334
0.0044326



Elapsed time to compute best fit: 362.004 seconds
Cross-validation score: 0.606680231615192
Test score: 0.6201550387596899
Best Hyperparameters: {}
0.030643102
0.081929214
0.22450805
0.03550385
0.058232732
0.008193369
0.0018391052
0.0019853008
0.002436632
0.0005153513
0.02837303
0.04695794
0.021453226
0.0
0.0013057485
0.0
0.0041352455
0.00258305
0.017296713
0.0061359783
0.0022613832
0.0008699861
0.004906944
0.003521222
0.012693659
0.005901201
0.0014169392
0.009260558
0.0025276386
0.0018703002
0.00015213729
0.0020052579
0.0040667523
0.007235329
0.0033571606
0.0
0.0
0.0
0.0031158149
0.0018608423
0.00530982
0.003925872
0.025525687
0.002556471
0.001192876
0.0
0.0060975044
0.0
0.0
0.0
0.003550084
0.0032184913
0.003138317
0.002268173
0.02586291
0.013668725
0.0025632705
0.015467302
0.00869707
0.006171937
0.008494242
0.0035746638
0.008901677
0.00084509695
0.0018572783
0.0038714863
0.003991019
0.00040754554
0.0034068257
0.0046600723
0.0018522722
0.004127286
0.004526695
0.0034054045
0.0013331416

### 4.4.3 LightGBM

In [124]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

fiftyfifty_lightgbm_performance_normalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features_normalized,
                                                    labels_normalized,
                                                    test_size=0.2,
                                                    stratify=labels_normalized)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    fiftyfifty_lightgbm_performance_normalized_df = pd.concat([fiftyfifty_lightgbm_performance_normalized_df, new_performance_df])
    

fiftyfifty_lightgbm_performance_normalized_df.to_csv("../data/05_model_output/fiftyfifty_lightgbm_performance_normalized_df.csv")




Elapsed time to compute best fit: 26.889 seconds
Cross-validation score: 0.6416525718537833
Test score: 0.6673960612691465
Best Hyperparameters: {}
11049.550123929977
122809.74846220016
228348.88435673714
15055.791818857193
5150.469569444656
5248.648651838303
229.28939056396484
92.49226093292236
28.469899892807007
3.4128100872039795
11808.452516794205
24486.231491088867
7570.001002073288
0.0
221.29809951782227
0.0
1265.688229560852
122.73808121681213
2574.008068561554
296.4226198196411
121.22879004478455
91.36183905601501
1396.058708190918
595.2151336669922
176.88661861419678
104.05828976631165
181.4058482646942
1827.9775731563568
386.51252818107605
668.6883103847504
162.01818943023682
176.02502942085266
145.8270604610443
1966.5427750349045
164.24018907546997
0.0
0.0
0.0
263.16384196281433
498.7430992126465
664.6156160831451
402.1708097457886
7469.618593931198
523.0647268295288
76.77638971805573
38.83284950256348
452.4039649963379
0.0
0.0
0.0
496.67347526550293
6049.083718061447
652.61



Elapsed time to compute best fit: 22.366 seconds
Cross-validation score: 0.6648010300034608
Test score: 0.6745182012847967
Best Hyperparameters: {}
8976.782280445099
131352.44734048843
219654.139731884
19806.496074438095
7913.265506029129
494.56751799583435
160.34930109977722
161.93143939971924
181.42329931259155
98.21561861038208
9462.02279162407
30741.945014953613
1593.1677293777466
0.0
1371.431336402893
0.0
1962.9775846004486
150.26874113082886
6013.732450723648
550.7407312393188
224.76487064361572
31.0450599193573
850.7298457622528
460.35590648651123
224.08715963363647
388.0572278499603
146.14726042747498
1815.3244712352753
417.2704744338989
237.85596013069153
339.6005642414093
437.8126401901245
196.53131103515625
3052.1252076625824
171.33846974372864
0.0
0.0
0.0
1103.2385172843933
818.2275056838989
303.76397573947906
553.4259314537048
6738.732562303543
631.5722546577454
297.05240631103516
396.8247947692871
448.54619789123535
0.0
0.0
0.0
1340.250247478485
4723.249593734741
282.1479



Elapsed time to compute best fit: 21.831 seconds
Cross-validation score: 0.6736392793680103
Test score: 0.7024793388429752
Best Hyperparameters: {}
8225.606984615326
122508.74421095848
227512.2492518425
14945.907955169678
9883.508477926254
514.7607200145721
93.2614495754242
184.18243145942688
313.61340856552124
294.6496877670288
8887.404393553734
37034.35743498802
2673.680190563202
0.0
301.89781188964844
0.0
1985.0796370506287
564.1384580135345
3232.611634016037
229.92170023918152
337.7842495441437
101.81981229782104
1622.4868297576904
291.40026688575745
108.38454723358154
191.49144172668457
1149.22208070755
1015.1925239562988
976.1235928535461
377.89405512809753
14.341430187225342
213.73318696022034
102.59400868415833
3268.269954919815
152.00241041183472
0.0
0.0
0.0
876.4595205783844
339.95454239845276
756.1844253540039
505.871866941452
9579.455384254456
323.47822093963623
106.0817699432373
94.445148229599
700.850090265274
0.0
0.0
0.0
504.1489305496216
2458.1807959079742
345.075052261



Elapsed time to compute best fit: 21.622 seconds
Cross-validation score: 0.6550696885993897
Test score: 0.6827309236947791
Best Hyperparameters: {}
8913.807422876358
130428.71684265137
222700.76372122765
20928.89561665058
5254.020076274872
3390.3885958194733
683.4241511821747
67.69986057281494
51.31640028953552
91.78048872947693
10904.377135753632
29754.50388801098
3061.984360218048
0.0
761.0052852630615
0.0
1861.3694672584534
183.3394067287445
5216.232329368591
768.3544595241547
177.39128065109253
62.069589614868164
2240.7356305122375
107.11394047737122
122.48800849914551
113.148681640625
582.3948354721069
916.7300696372986
250.95259368419647
264.5575797557831
591.475665807724
165.99210047721863
126.09535050392151
1665.9825248718262
269.1943018436432
0.0
0.0
0.0
728.054881811142
255.67528009414673
532.2316701412201
795.9866228103638
9709.252644777298
989.4220252037048
192.2809238433838
260.8567476272583
496.1994035243988
0.0
0.0
0.0
507.51138496398926
504.54844903945923
201.6060934066



Elapsed time to compute best fit: 22.256 seconds
Cross-validation score: 0.6529106627186035
Test score: 0.6167400881057268
Best Hyperparameters: {}
8002.308705687523
133907.4848845005
223503.8850747347
16652.242874503136
14729.18077659607
20664.582399845123
300.8456885814667
240.71371841430664
124.31756210327148
136.87865924835205
7351.318361878395
11026.554829359055
6218.487335681915
0.0
233.280091047287
0.0
119.9667296409607
622.8421277999878
2791.4863379001617
840.324609041214
251.03560972213745
19.71800994873047
1701.5667338371277
435.4050409793854
92.1173803806305
156.02707171440125
115.82128071784973
2054.1939091682434
153.28565764427185
234.6471300125122
312.4015941619873
702.9107694625854
474.3096306324005
2047.8732025623322
94.10655879974365
0.0
0.0
0.0
1110.541537284851
375.90927505493164
448.74350476264954
773.48717045784
8366.973577618599
735.1800541877747
482.90434074401855
29.465410232543945
524.4906351566315
0.0
0.0
0.0
331.6530992984772
85.33102083206177
752.23515605926



Elapsed time to compute best fit: 22.394 seconds
Cross-validation score: 0.6596767113803759
Test score: 0.6808510638297871
Best Hyperparameters: {}
16397.358658909798
122560.7263007164
224748.60231626034
16616.26975440979
6408.739308476448
283.33422088623047
75.25280117988586
140.153968334198
51.82360076904297
24.577219009399414
10953.637992501259
25391.021300792694
6359.390107393265
0.0
767.2831451892853
0.0
395.4680573940277
206.3918013572693
4783.296165943146
74.54386949539185
93.39029002189636
254.942476272583
3239.8274347782135
459.343505859375
201.83946108818054
118.86623907089233
386.4663076400757
1848.5222022533417
409.7995026111603
89.70973014831543
315.0729765892029
325.1223738193512
165.80771255493164
2652.0378259420395
161.7744710445404
0.0
0.0
0.0
465.7559378147125
169.89281272888184
933.0802128314972
1013.039657831192
8203.720456123352
572.5297758579254
118.47996830940247
182.36334681510925
1122.8644766807556
0.0
0.0
0.0
656.9273416996002
4196.692440986633
428.68541431427



Elapsed time to compute best fit: 22.947 seconds
Cross-validation score: 0.6521160236043164
Test score: 0.6846473029045642
Best Hyperparameters: {}
24047.093515872955
138331.13797545433
211072.13784778118
20639.98254752159
12383.394466400146
6179.999531507492
70.73061943054199
161.39248824119568
50.889668226242065
25.38271975517273
13188.875512838364
9697.095495462418
2148.1380019187927
0.0
468.2010703086853
0.0
420.6772780418396
196.91547751426697
3311.2775242328644
1690.1519720554352
176.95408153533936
49.555830121040344
2440.820424079895
613.3173608779907
147.34737014770508
185.85810089111328
777.4460015296936
1143.1444792747498
235.96937727928162
107.71362948417664
1075.9703333377838
158.68393850326538
165.29697704315186
2740.742521762848
214.364750623703
0.0
0.0
0.0
666.2909960746765
162.3015570640564
680.841566324234
358.3697602748871
3200.7943583726883
287.640310049057
331.59756231307983
237.50000023841858
499.47220730781555
0.0
0.0
0.0
668.1443750858307
455.3225402832031
259.25



Elapsed time to compute best fit: 23.092 seconds
Cross-validation score: 0.66071462206329
Test score: 0.6659619450317125
Best Hyperparameters: {}
9506.419652938843
127047.98231244087
224976.39970231056
14378.427535772324
7822.4538143873215
5404.861137866974
291.0272674560547
214.91467952728271
379.3496242761612
141.69806003570557
8748.22997879982
32047.704243183136
6399.166468381882
0.0
359.3146640062332
0.0
682.1646931171417
240.7056107521057
2585.4433631896973
822.845300912857
189.55516982078552
91.67090225219727
4242.095221996307
73.20727062225342
235.02056407928467
27.315359592437744
171.45052194595337
2284.6696965694427
17.89389967918396
160.5656819343567
485.53170812129974
515.1278450489044
338.2012391090393
1756.3134932518005
407.58792304992676
0.0
0.0
0.0
374.63180804252625
465.70632815361023
840.3454859256744
373.7262580394745
2916.574478030205
500.7641906738281
26.841009616851807
8.859609842300415
917.838870048523
0.0
0.0
0.0
1232.4139306545258
2551.816662788391
513.875711917



Elapsed time to compute best fit: 23.125 seconds
Cross-validation score: 0.6613502445433384
Test score: 0.6097560975609756
Best Hyperparameters: {}
14760.322875976562
125466.72257566452
219756.6503932476
20061.053760290146
15168.58333659172
3535.5538246631622
223.5042986869812
71.50196099281311
30.187469959259033
79.03250074386597
12804.503455162048
11933.948206424713
1360.1527490615845
0.0
1142.9050989151
0.0
313.9718451499939
1031.673640012741
6744.43986582756
1214.4882271289825
184.08450782299042
299.8480956554413
1782.0277364253998
321.8158723115921
716.0169982910156
180.77486419677734
359.73969650268555
1917.600828886032
99.92182064056396
405.2741334438324
593.2206392288208
409.46977710723877
570.9258275032043
3463.604909181595
754.8642802238464
0.0
0.0
0.0
596.0656385421753
218.52422833442688
902.7165524959564
434.4252254962921
4453.605751037598
424.5131742954254
319.1054458618164
8.767109870910645
821.6243829727173
0.0
0.0
0.0
697.9110043048859
2803.784621477127
552.735197305679



Elapsed time to compute best fit: 23.410 seconds
Cross-validation score: 0.6555131401104215
Test score: 0.6359649122807017
Best Hyperparameters: {}
27339.38508939743
128599.11805319786
228210.73671209812
15960.965533018112
4033.4678404331207
2210.0799872875214
646.4980952739716
285.63935804367065
54.07189059257507
2.211549997329712
11324.215852022171
12672.185236215591
2190.5554320812225
0.0
327.6053092479706
0.0
787.239842414856
208.87992978096008
4870.299649000168
385.38364243507385
138.4985809326172
109.61418890953064
1473.992343902588
326.6496169567108
677.459979057312
93.13517022132874
227.34476256370544
1788.0866522789001
302.4582827091217
159.15106797218323
630.576988697052
258.0149371623993
261.76714539527893
2357.476180791855
496.1440796852112
0.0
0.0
0.0
870.4537951946259
337.3133726119995
196.35062503814697
527.3970241546631
6176.694396972656
482.8003180027008
165.24343347549438
8.2080397605896
841.0655813217163
0.0
0.0
0.0
794.1941401958466
2555.7207980155945
664.9008450508



Elapsed time to compute best fit: 23.789 seconds
Cross-validation score: 0.6607923632511331
Test score: 0.6553911205073997
Best Hyperparameters: {}
5180.936473846436
127695.39557242393
241224.7214462757
10604.075785160065
7658.090433597565
3207.598554611206
678.2266030311584
588.6774899959564
36.95759963989258
35.64922070503235
9065.131587982178
29619.33487892151
4349.207729101181
0.0
263.90692377090454
0.0
569.917692899704
429.89665031433105
2340.4468710422516
1036.3921194076538
318.10339403152466
20.560250282287598
1723.549854516983
491.8802559375763
455.3864254951477
192.6735692024231
2939.172742128372
1947.2102081775665
194.99985003471375
453.17378211021423
545.5245852470398
1126.7622725963593
345.7147685289383
1284.6493601799011
287.65186977386475
0.0
0.0
0.0
507.77003145217896
333.0396237373352
536.8623780012131
669.0932283401489
6497.928395748138
330.5400218963623
136.46540069580078
7.796780109405518
670.958963394165
0.0
0.0
0.0
1828.0914089679718
1213.2050762176514
63.524549961



Elapsed time to compute best fit: 25.346 seconds
Cross-validation score: 0.6467253406624648
Test score: 0.6993736951983299
Best Hyperparameters: {}
4547.593893289566
128629.60637617111
229886.39505171776
14657.113959550858
8744.205864667892
2363.671763896942
109.68627214431763
161.33431339263916
172.1063780784607
10.15231990814209
9117.496294021606
26050.49350833893
1920.9519703388214
0.0
312.61182022094727
0.0
750.2178840637207
206.4925618171692
5768.162975072861
379.1573178768158
109.27446985244751
15.352899551391602
2335.932847738266
585.1686313152313
415.54288244247437
182.61996722221375
394.1214735507965
2016.5978701114655
320.2151834964752
188.1196141242981
685.6122896671295
94.5718789100647
166.00877714157104
783.3274779319763
511.68080163002014
0.0
0.0
0.0
661.8220608234406
407.5451018810272
587.3565356731415
867.3468332290649
5331.463979721069
979.8732852935791
242.18962979316711
337.28490352630615
1137.2683036327362
0.0
0.0
0.0
1581.1757552623749
3998.374125957489
397.5593440



Elapsed time to compute best fit: 25.255 seconds
Cross-validation score: 0.6574956921373895
Test score: 0.6645569620253164
Best Hyperparameters: {}
6191.796803474426
133417.64151906967
234000.82265615463
14321.397981405258
14488.240484952927
7672.2435421943665
193.48300981521606
377.83545565605164
89.26405048370361
130.95771312713623
7298.637183189392
10661.450627088547
3821.871301651001
0.0
1147.5530643463135
0.0
581.4012489318848
284.9207227230072
5920.016312599182
2625.370810031891
238.47164106369019
18.461859703063965
2053.970353126526
109.6461615562439
51.536189556121826
30.670830726623535
444.141809463501
1360.2313439846039
182.7874526977539
675.213041305542
725.7852816581726
630.8547217845917
650.7689006328583
3071.0083684921265
323.999963760376
0.0
0.0
0.0
835.3368210792542
46.06904077529907
457.7755994796753
1088.925588130951
1971.5638177394867
633.9601039886475
231.36279034614563
306.6921033859253
660.8461894989014
0.0
0.0
0.0
650.1688981056213
683.2399382591248
1350.03639888



Elapsed time to compute best fit: 23.666 seconds
Cross-validation score: 0.6505976515585036
Test score: 0.6616052060737527
Best Hyperparameters: {}
11802.389243364334
120795.30780553818
233128.24883377552
16955.77417564392
7028.373439788818
438.7027771472931
220.36046838760376
1082.027634382248
144.91218733787537
84.3301305770874
14103.538478732109
21878.361550807953
4795.069112062454
0.0
1529.3149299621582
0.0
981.2957932949066
294.91863441467285
1845.4595394134521
376.6960668563843
122.5158703327179
147.21078848838806
2009.4684653282166
132.35205841064453
325.13290309906006
72.27089786529541
103.99862003326416
2702.9279642105103
850.9957885742188
219.06364226341248
466.3154766559601
161.12694084644318
495.9021534919739
2431.974223971367
163.49482774734497
0.0
0.0
0.0
675.811220407486
134.94787979125977
412.09730672836304
637.5216517448425
17698.91873240471
575.1612157821655
106.78448343276978
140.97840213775635
779.33156645298
0.0
0.0
0.0
2050.0566288232803
745.0851821899414
173.3176



Elapsed time to compute best fit: 23.767 seconds
Cross-validation score: 0.6676208214223049
Test score: 0.631578947368421
Best Hyperparameters: {}
4889.308439612389
131814.56025981903
225911.656126976
13782.03677392006
4909.004864692688
16547.16939353943
345.92579317092896
36.00295042991638
229.10060036182404
11.459650039672852
8301.091279506683
19602.49702477455
5915.5013692379
0.0
427.0703835487366
0.0
2291.8170256614685
212.86319589614868
2395.5073385238647
785.697226524353
122.83473086357117
65.38790845870972
1965.80286693573
272.2050817012787
115.14434146881104
109.62328958511353
345.83273220062256
1667.5453927516937
99.07924127578735
317.26451206207275
132.45847082138062
324.25567054748535
126.84950041770935
2252.856169939041
581.5285446643829
0.0
0.0
0.0
698.459451675415
141.6349973678589
818.9420392513275
532.7565774917603
8924.535365343094
477.3281807899475
90.03606009483337
0.0
733.6201119422913
0.0
0.0
0.0
1384.9196028709412
3625.2285096645355
499.4225630760193
3.07275009155



Elapsed time to compute best fit: 23.838 seconds
Cross-validation score: 0.6463214204423432
Test score: 0.6924643584521385
Best Hyperparameters: {}
10053.500629425049
133604.05063605309
210983.98459148407
18255.529238939285
9809.062470674515
1247.7625768184662
425.61690878868103
269.33633947372437
28.60534954071045
17.21179986000061
12004.668005466461
31218.716757297516
4478.358401298523
0.0
444.4065890312195
0.0
682.135217666626
98.19677948951721
2713.8657660484314
769.2823173999786
101.88090085983276
214.85850739479065
1222.6320021152496
447.11146783828735
411.8759093284607
80.49833846092224
232.74027156829834
2334.1362487077713
468.54847145080566
124.48274874687195
324.8534278869629
655.488365650177
1056.6880412101746
3670.9756295681
877.4147841930389
0.0
0.0
0.0
424.447988986969
95.83941125869751
370.4438021183014
626.4101068973541
5143.368479251862
297.6612503528595
53.51449108123779
34.7804594039917
552.4146499633789
0.0
0.0
0.0
322.78184819221497
4829.2654485702515
561.256660223



Elapsed time to compute best fit: 23.878 seconds
Cross-validation score: 0.6559991694588408
Test score: 0.6645569620253164
Best Hyperparameters: {}
6192.648020744324
130202.58715033531
214923.00172543526
16071.07014799118
8411.698887825012
1392.7029242515564
731.8392717838287
285.48573899269104
25.17340087890625
117.41443920135498
10498.227345705032
33204.21662127972
3957.8671438694
0.0
511.62555170059204
0.0
1750.3364214897156
288.6973168849945
1575.446861743927
108.1093590259552
135.48132038116455
218.7875623703003
1362.4408040046692
500.02160024642944
436.45185804367065
286.0833086967468
188.65231013298035
2376.5891482830048
184.2088565826416
240.04014992713928
543.6083343029022
149.81376957893372
195.890878200531
2312.3190653324127
215.88465666770935
0.0
0.0
0.0
731.6107983589172
335.98233819007874
480.2859592437744
758.4021170139313
7677.169419527054
1024.2657039165497
142.27713823318481
36.46481990814209
807.364492893219
0.0
0.0
0.0
1211.3960423469543
3688.698296546936
888.764444



Elapsed time to compute best fit: 23.888 seconds
Cross-validation score: 0.6475166994180357
Test score: 0.7173447537473234
Best Hyperparameters: {}
8277.754762887955
129211.66266942024
217992.49856829643
14475.814189195633
14022.013226747513
475.30154061317444
106.38549017906189
111.0082483291626
114.98886299133301
64.0404405593872
11849.186889648438
26519.226704359055
2321.477861404419
0.0
1176.4554805755615
0.0
341.2967095375061
426.39971828460693
6301.799966812134
1232.7636458873749
137.1212182044983
100.64450931549072
1578.3094973564148
611.0683174133301
545.6753458976746
9.553730010986328
1055.7621142864227
1245.065869808197
486.0008749961853
127.22497129440308
120.90137982368469
567.0018994808197
945.4325077533722
2236.935067653656
271.55219745635986
0.0
0.0
0.0
357.28074526786804
625.5279080867767
1781.046588897705
316.80650806427
8090.350856781006
491.3835573196411
293.1330542564392
250.46876692771912
981.3629522323608
0.0
0.0
0.0
613.7137289047241
3084.413319826126
129.5688486



Elapsed time to compute best fit: 23.688 seconds
Cross-validation score: 0.6501373907283857
Test score: 0.6808943089430894
Best Hyperparameters: {}
13340.965451717377
125314.34010636806
217067.24576067924
22519.15434408188
2245.6325540542603
2612.2349054813385
372.44278287887573
59.72879958152771
124.54962944984436
47.64583992958069
14995.737937927246
30579.6617538929
1259.6663715839386
0.0
139.39889526367188
0.0
1072.1074466705322
603.5825438499451
7226.347841978073
638.40993309021
12.009459972381592
39.97924995422363
1926.7931745052338
95.24389052391052
209.46967124938965
334.237979888916
1460.2241184711456
1807.171752333641
603.1482203006744
722.237988948822
232.34533786773682
839.8236963748932
187.85333228111267
2234.529751777649
398.439195394516
0.0
0.0
0.0
542.3051989078522
223.90440964698792
1210.1292896270752
943.4408240318298
7093.861925125122
542.975759267807
227.72366070747375
45.81945991516113
845.3466165065765
0.0
0.0
0.0
337.4068467617035
3162.6746804714203
145.7746992111



Elapsed time to compute best fit: 25.407 seconds
Cross-validation score: 0.6411324999600095
Test score: 0.7173447537473234
Best Hyperparameters: {}
11317.860730409622
132770.41640877724
223030.9322655201
13431.170810461044
8679.026693105698
4201.742641925812
764.7963905334473
80.89414978027344
167.6734802722931
32.097299575805664
10116.257761240005
12839.199630975723
2194.1146745681763
0.0
210.79132056236267
0.0
724.1421537399292
439.10611605644226
1870.2496831417084
8642.352724313736
201.0940010547638
123.68426132202148
918.835168838501
378.012734413147
905.7761738300323
343.2056608200073
100.61378049850464
1619.9650120735168
285.8286564350128
520.4026794433594
617.3986432552338
469.3131048679352
767.2536516189575
2214.0236978530884
1004.4311921596527
0.0
0.0
0.0
1140.333705663681
339.16518807411194
895.8043131828308
478.8391990661621
5356.245355844498
405.0653350353241
34.41861009597778
52.56383943557739
581.6568927764893
0.0
0.0
0.0
786.4393029212952
1986.9908559322357
634.034304380



Elapsed time to compute best fit: 23.944 seconds
Cross-validation score: 0.6572698612913128
Test score: 0.6722689075630252
Best Hyperparameters: {}
4836.282293081284
126769.5664024353
241335.68493914604
10946.851886987686
19701.372455120087
227.14282739162445
189.48168754577637
208.72743570804596
49.671738386154175
158.43176174163818
12544.416783452034
12361.419703245163
2338.652347803116
0.0
176.37212944030762
0.0
810.7084360122681
456.4757900238037
7899.393360614777
1207.3374319076538
358.9928092956543
26.62883996963501
1642.4002604484558
564.1928791999817
296.6685838699341
68.02616024017334
352.2125973701477
1968.275262594223
280.5204927921295
359.3174339532852
1312.0446605682373
406.4295265674591
610.1419162750244
1493.402738571167
185.5636191368103
0.0
0.0
0.0
822.4957144260406
199.312153339386
574.6712367534637
689.0417358875275
2703.0985066890717
429.6609091758728
170.46020126342773
6.961659908294678
1118.5703756809235
0.0
0.0
0.0
638.2577607631683
324.0871880054474
196.69612002



Elapsed time to compute best fit: 23.989 seconds
Cross-validation score: 0.6417297148498824
Test score: 0.6236559139784946
Best Hyperparameters: {}
2458.7781443595886
127312.8452296257
218330.10865020752
16577.4201836586
9503.189569473267
259.4069335460663
46.688979625701904
291.34627747535706
126.30256080627441
20.303709983825684
14302.879905462265
25634.30464696884
4277.857082128525
0.0
1083.2858443260193
0.0
650.6257610321045
162.73920106887817
8715.783371448517
1282.3907635211945
128.2356994152069
72.10432863235474
2396.1878957748413
444.79324531555176
939.4432425498962
143.29574751853943
474.9612522125244
1843.4790534973145
1025.6474640369415
2363.2528808116913
490.76622581481934
407.9572551250458
465.8259024620056
1807.4472823143005
523.612154006958
0.0
0.0
0.0
807.9729237556458
262.55450081825256
2302.2655577659607
383.22753405570984
9646.525122404099
139.75739884376526
134.52036666870117
116.14673638343811
571.2087073326111
0.0
0.0
0.0
572.8892464637756
3809.078731775284
312.20



Elapsed time to compute best fit: 23.836 seconds
Cross-validation score: 0.6419480400173071
Test score: 0.6367432150313151
Best Hyperparameters: {}
29277.16590166092
127967.51976966858
220379.36378622055
18649.872777223587
7250.184095621109
1270.8597812652588
309.2576217651367
208.7180414199829
54.795408725738525
10.438679933547974
7047.302357673645
19016.19881105423
3898.045157432556
0.0
200.9880199432373
0.0
252.95784735679626
293.2242932319641
3673.352352142334
1337.0837752819061
39.92377018928528
234.1621389389038
1967.7677714824677
139.15914154052734
251.1218912601471
305.01427841186523
876.8787565231323
1011.3689475059509
201.1583800315857
866.8366115093231
967.1104595661163
316.7657415866852
126.68788123130798
1921.8226454257965
386.4913296699524
0.0
0.0
0.0
759.6889545917511
385.6005058288574
529.0091481208801
1037.4936695098877
5615.857574701309
690.2556657791138
147.98637676239014
447.7582893371582
636.825207233429
0.0
0.0
0.0
1003.7690823078156
767.700724363327
172.292490005



Elapsed time to compute best fit: 23.752 seconds
Cross-validation score: 0.6530399519308218
Test score: 0.6918238993710693
Best Hyperparameters: {}
25244.391271829605
125676.07018518448
226595.98479247093
15877.489344835281
5319.977232217789
934.3552436828613
218.1382794380188
118.7799003124237
40.17856025695801
0.0
8414.40387749672
14446.960265874863
2521.8148851394653
0.0
44.57141947746277
0.0
537.9985246658325
355.8448443412781
5724.484726667404
343.9816837310791
363.4835777282715
35.382879972457886
3094.4318499565125
413.67497062683105
662.7984044551849
198.0571391582489
229.06142020225525
1964.049694776535
296.6476137638092
941.3987419605255
113.02229976654053
719.693234205246
242.53213167190552
1958.2166759967804
650.4301295280457
0.0
0.0
0.0
420.88434970378876
204.6716504096985
412.1803569793701
507.2897138595581
10239.089464902878
282.8729615211487
348.7085723876953
251.02084231376648
755.3023715019226
0.0
0.0
0.0
1407.601480960846
3063.4657340049744
142.76836895942688
379.6603



Elapsed time to compute best fit: 23.790 seconds
Cross-validation score: 0.6394841688308042
Test score: 0.6938775510204082
Best Hyperparameters: {}
2624.6335458755493
132908.641477108
228857.30421864986
11223.301025867462
10802.030872106552
118.60743188858032
345.9604752063751
135.99867749214172
58.27038025856018
195.33519792556763
10689.684051275253
19543.14131975174
5785.863033771515
0.0
1982.497163772583
0.0
1695.107944726944
435.7306571006775
6726.397826433182
808.807902097702
74.29590940475464
342.74076795578003
1845.506177663803
542.6622183322906
489.8126232624054
216.21145033836365
158.08814811706543
1593.2161905765533
92.14103841781616
516.4621860980988
342.5537097454071
520.9767699241638
188.71563291549683
1364.5467956066132
973.6570243835449
0.0
0.0
0.0
342.91631603240967
330.79865646362305
843.2784588336945
472.9322874546051
4820.6964457035065
1206.6963453292847
159.50639867782593
48.26384973526001
687.1707711219788
0.0
0.0
0.0
849.5693120956421
1775.2197346687317
1031.48423

# 5. Modeling - Non-Normalization

In [125]:
features = processedData_dealServiceFeatures
labels = processedData_dealServiceLabels

## 5.1 Rebalancing Strategy - None

### 5.1.1 Random Forest

In [126]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

none_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [#['smote', SMOTE(sampling_strategy = 0.5, n_jobs=2)],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    none_randomforest_nonnormalized_performance_df = pd.concat([none_randomforest_nonnormalized_performance_df, new_performance_df])
    
none_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/none_randomforest_nonnormalized_performance_df.csv")



### 5.1.2 XGBoost

In [127]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


none_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    none_xgboost_nonnormalized_performance_df = pd.concat([none_xgboost_nonnormalized_performance_df, new_performance_df])
    

none_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/none_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 155.449 seconds
Cross-validation score: 0.7197207506916448
Test score: 0.6746031746031744
Best Hyperparameters: {}
0.010138035
0.04613013
0.031268656
0.023838531
0.01694456
0.008081574
0.0041829837
0.0036889052
0.008950735
0.013114819
0.0082103005
0.014120189
0.031912196
0.0
0.0069374377
0.0
0.0076576094
0.012493291
0.019045405
0.030423338
0.00942378
0.004769548
0.022423176
0.0049826815
0.0026523154
0.0072868727
0.006684529
0.023904886
0.009969261
0.0036416391
0.0040869974
0.008715483
0.007098338
0.0013958581
0.0038506438
0.0
0.0
0.0
0.006629899
0.001011573
0.010534315
0.0091191325
0.00989507
0.0054883836
0.010201433
0.0074885087
0.017724704
0.0
0.0
0.0
0.003892833
0.0061335545
0.0056253904
0.004424249
0.00051450694
0.0014057818
0.009488605
0.0069430447
0.00593394
0.011138991
0.0062970743
0.0045467205
0.010233939
0.0012318856
0.0039605056
0.0059259096
0.004910221
0.0
0.003026062
0.0068242126
0.006666357
0.0070025558
0.0069595273
0.0055990685
0.00957339



Elapsed time to compute best fit: 153.410 seconds
Cross-validation score: 0.7273479805797711
Test score: 0.6635802469135802
Best Hyperparameters: {}
0.0077620978
0.047519654
0.034038253
0.024935499
0.009723491
0.007714812
0.0048967563
0.006492876
0.01385441
0.0037706238
0.011776428
0.015020592
0.00996369
0.0
0.0042704903
0.0
0.008870867
0.008953986
0.0093186265
0.047779575
0.008386266
0.0054033347
0.013256996
0.007448646
0.005780918
0.0053442144
0.004106613
0.0211216
0.0059447265
0.0037605132
0.0041812207
0.00837383
0.0031474377
0.0036810974
0.007440079
0.0
0.0
0.0
0.0056479
0.009054547
0.0076540783
0.0076572313
0.031510256
0.0054683364
0.010744044
0.0
0.01842624
0.0
0.0
0.0
0.0028960458
0.006700699
0.0037793326
0.009519477
0.0023972278
0.00613796
0.009420117
0.009984953
0.008710048
0.018907504
0.005760467
0.0050046737
0.017780734
0.000786547
0.004000395
0.0072126696
0.0093651805
0.007503398
0.00524012
0.0056900755
0.006308641
0.017996801
0.0041302512
0.018467078
0.004169319
0.0
0.0059



Elapsed time to compute best fit: 156.714 seconds
Cross-validation score: 0.7039411391000598
Test score: 0.7166666666666667
Best Hyperparameters: {}
0.008734686
0.04621798
0.03345078
0.026834775
0.010227527
0.005675917
0.00388073
0.004760331
0.0033774355
0.007647707
0.007008196
0.01783156
0.011228591
0.0
0.008240957
0.0
0.011627526
0.012409166
0.0073632724
0.025090294
0.008827349
0.004832062
0.021621179
0.011022296
0.005292284
0.0067540808
0.004555622
0.029559873
0.0048977355
0.00514482
0.0051492583
0.008654799
0.0069391257
0.00326947
0.0057040863
0.0
0.0
0.0
0.011472257
0.0048952405
0.0075351093
0.0082753645
0.008005142
0.01678009
0.011377648
0.0
0.021467308
0.0
0.0
0.0
0.0036998137
0.009654914
0.0056533273
0.0026795187
0.00515772
0.0024961312
0.0069435094
0.0054704496
0.0027845136
0.011565632
0.010386811
0.012007891
0.013321293
0.013651483
0.02877432
0.0044403677
0.0055992366
0.0
0.0041147606
0.0041989298
0.010395933
0.00043311334
0.009533525
0.007723124
0.0046048407
0.0
0.013395879




Elapsed time to compute best fit: 164.512 seconds
Cross-validation score: 0.7611929742499458
Test score: 0.7746478873239437
Best Hyperparameters: {}
0.009485694
0.04978585
0.029495958
0.025715087
0.0092820525
0.0060560843
0.0053745178
0.007825484
0.008352215
0.009247243
0.008667628
0.01242909
0.019389722
0.0
0.0032179558
0.0
0.016025003
0.0099200085
0.0054411385
0.015083883
0.002608249
0.0074416846
0.00994823
0.0087070465
0.0011011997
0.0028547356
0.010724356
0.02874526
0.0051545496
0.006110587
0.006258297
0.007647888
0.0030630014
0.004976709
0.0039277496
0.0
0.0
0.0
0.005924416
0.009366177
0.0062567517
0.0057862294
0.025390022
0.009063339
0.0048245895
0.0
0.0122237485
0.0
0.0
0.0
0.004977728
0.009234094
0.004474093
0.0015124823
0.0017262414
0.006735169
0.0047443975
0.0054682004
0.010292875
0.0112792505
0.022689806
0.01061557
0.0035942476
0.002289974
0.004293991
0.0009830255
0.0076458957
0.009137927
0.0052067055
0.0063628852
0.00669491
0.0005012775
0.0046978057
0.008111336
0.0067233983



Elapsed time to compute best fit: 177.860 seconds
Cross-validation score: 0.744174913388003
Test score: 0.7020547945205479
Best Hyperparameters: {}
0.006696362
0.04526766
0.032574777
0.02586858
0.013044789
0.0064065964
0.0075879567
0.010680086
0.008041975
0.00418472
0.0071581486
0.018944645
0.017502801
0.0
0.0067381556
0.0
0.014045109
0.010857975
0.0110177
0.016811864
0.0046150675
0.0068502743
0.019032413
0.007903833
0.0013584229
0.008711438
0.010695099
0.012857985
0.006743061
0.0068096225
0.0025050468
0.0064704767
0.0026491839
0.0074313176
0.009306699
0.0
0.0
0.0
0.003477433
0.0
0.0074981917
0.00431507
0.0
0.009722554
0.038015034
0.0
0.01683245
0.0
0.0
0.0
0.007033564
0.007914985
0.004441367
0.0030798358
0.0041150325
0.0024333568
0.003170875
0.0076940246
0.007917176
0.012567468
0.01657717
0.005948617
0.03171285
0.006346917
0.0068299943
0.009800672
0.004205827
0.0
0.002449272
0.003440495
0.00915151
0.009544022
0.0053654015
0.0010969029
0.007593114
0.0
0.00894428
0.01587552
0.0135156745



Elapsed time to compute best fit: 186.358 seconds
Cross-validation score: 0.7133380366250883
Test score: 0.744047619047619
Best Hyperparameters: {}
0.008949682
0.045376003
0.032508295
0.025309961
0.010344943
0.009332206
0.0086519215
0.010007452
0.0040175584
0.007679171
0.009641696
0.013844101
0.011655545
0.0
0.0069392067
0.0
0.008270432
0.009599823
0.011751318
0.026868518
0.014130388
0.007231022
0.024962291
0.008020228
0.014941378
0.004800708
0.004819066
0.026155809
0.0035597847
0.008193082
0.0040575657
0.004501722
0.0054775793
0.0023929814
0.0052253697
0.0
0.0
0.0
0.009671251
0.0012253682
0.0034970555
0.0078094807
0.0
0.008354928
0.026638508
0.0
0.03237241
0.0
0.0
0.0
0.0065312646
0.005028318
0.0025398482
0.010300231
0.0068230643
0.0048930193
0.013102189
0.0053033303
0.010364914
0.015841598
0.004628842
0.008445584
0.015352675
0.008122085
0.0029413372
0.0064959875
0.0074684257
0.0013655291
0.005173253
0.0052144085
0.0048982175
0.026123969
0.00611685
0.0054800417
0.0055878903
0.01369352



Elapsed time to compute best fit: 176.922 seconds
Cross-validation score: 0.7138275899886095
Test score: 0.7089552238805971
Best Hyperparameters: {}
0.01011331
0.047923498
0.036232647
0.024614446
0.010376125
0.008129062
0.0052903187
0.010488783
0.0055377926
0.011338974
0.007875347
0.016045539
0.018304886
0.0
0.010483565
0.0
0.007095991
0.00960145
0.009363617
0.0420446
0.0037079281
0.009100945
0.010826681
0.006721048
0.01245867
0.002852297
0.0026392683
0.019594064
0.007611627
0.006257694
0.006845802
0.0055193715
0.0040854733
0.010595411
0.006847983
0.0
0.0
0.0
0.008477551
0.0019674848
0.0039229738
0.005757383
0.0035605538
0.009209346
0.022166576
0.002585228
0.013929372
0.0
0.0
0.0
0.004353336
0.0070707174
0.006773004
0.01530479
0.0048328605
0.0049579535
0.007586519
0.006201631
0.010675152
0.014165005
0.00544953
0.010928904
0.0074725747
0.0061730156
0.005735628
0.0011733114
0.005669281
0.00036283524
0.0039254706
0.0046926634
0.009282293
0.033127867
0.0051385267
0.0034846358
0.0082180565




Elapsed time to compute best fit: 183.640 seconds
Cross-validation score: 0.7419482645972794
Test score: 0.6716417910447761
Best Hyperparameters: {}
0.010328548
0.047500614
0.03035964
0.0275246
0.01706911
0.012682126
0.0029670668
0.014189807
0.016576232
0.0041208547
0.0076615624
0.01230612
0.014090632
0.0
0.0032068607
0.0
0.012931956
0.013092459
0.012378608
0.051389843
0.0064915954
0.0029111682
0.018875916
0.0046301195
0.02253104
0.004418088
0.005548792
0.021665618
0.0041532936
0.0048712445
0.0058552492
0.0066511366
0.0056335386
0.005517802
0.004969906
0.0
0.0
0.0
0.0063196057
0.0
0.010471093
0.00871787
0.0012032383
0.0063584377
0.019901209
0.0
0.016475175
0.0
0.0
0.0
0.004823222
0.0045943065
0.004245865
0.011035885
0.009903113
0.0057220566
0.0046597775
0.006899321
0.0051404443
0.016221546
0.009594849
0.011004601
0.031078037
0.014722233
0.00038699838
0.0021320884
0.0071300077
0.006490324
0.0050061806
0.0044359337
0.0058071176
0.00022594082
0.0031054725
0.0056675016
0.011121774
0.0
0.00



Elapsed time to compute best fit: 173.908 seconds
Cross-validation score: 0.7321705873033528
Test score: 0.7341269841269842
Best Hyperparameters: {}
0.0074885734
0.042339128
0.033918317
0.02624372
0.017782586
0.008529936
0.0098490035
0.016176842
0.013951471
0.0060298024
0.007643517
0.013350959
0.01160301
0.0
0.0066625876
0.0
0.014397855
0.016778383
0.009988964
0.023934256
0.011995985
0.0038955626
0.017404374
0.0034542638
0.0076154578
0.0049670883
0.010750876
0.029179636
0.007576516
0.00755492
0.0015620313
0.0072296252
0.007942445
0.006954401
0.0047319382
0.0
0.0
0.0
0.009387845
0.010245765
0.0123242615
0.009260546
0.0041942224
0.004082306
0.017546935
0.0
0.022368673
0.0
0.0
0.0
0.0034630382
0.007713791
0.0047200485
0.009936763
0.0028873638
0.0055967174
0.0041824705
0.006528588
0.01182329
0.014603098
0.0034658224
0.0076195053
0.019451367
0.0019964576
0.005247111
0.002028771
0.0025359853
0.00532721
0.0045570247
0.0034313558
0.006261092
0.0
0.009038846
0.0037821017
0.00926539
0.01598239
0



Elapsed time to compute best fit: 175.837 seconds
Cross-validation score: 0.708549780488763
Test score: 0.7191780821917808
Best Hyperparameters: {}
0.0088113705
0.044955857
0.030336063
0.026079088
0.016356597
0.00853809
0.0067247055
0.018033342
0.0056838896
0.007369031
0.0054443018
0.01714572
0.012006873
0.0
0.0046416414
0.0
0.012389377
0.010962834
0.011964517
0.025981225
0.008274357
0.0049624094
0.012979262
0.0070975805
0.0023038571
0.007819556
0.00540422
0.019972302
0.011240078
0.0056307986
0.0015003058
0.004911322
0.0042035314
0.004089537
0.0058389674
0.0
0.0
0.0
0.01547829
0.004860314
0.013236052
0.0091602
0.0013523955
0.0040768897
0.0030357675
0.0
0.014869155
0.0
0.0
0.0
0.0064489944
0.0065864315
0.0038699065
0.001996691
0.0180525
0.003059383
0.009492875
0.009052497
0.005215456
0.012341462
0.005403511
0.009510302
0.02311082
0.013985543
0.009141169
0.0071922955
0.0069147777
0.0
0.007424159
0.0050442186
0.0057741967
0.0032688864
0.0055641937
0.0025125428
0.007947963
0.0
0.004526603




Elapsed time to compute best fit: 172.301 seconds
Cross-validation score: 0.7009439334520658
Test score: 0.7291666666666666
Best Hyperparameters: {}
0.008849916
0.04654352
0.035545696
0.029241022
0.014869819
0.0050500566
0.010033683
0.007823411
0.010696878
0.007901731
0.006118095
0.014039189
0.024948537
0.0
0.007489107
0.0
0.009323832
0.012335807
0.007186185
0.018840078
0.005256375
0.0032986663
0.025630994
0.0029659388
0.009181395
0.008646148
0.004342402
0.020933662
0.0055160564
0.006960806
0.002283274
0.00946903
0.008123027
0.003619329
0.008785512
0.0
0.0
0.0
0.009578582
0.0
0.006742249
0.006664688
0.011342257
0.004760687
0.028968642
0.0
0.013010948
0.0
0.0
0.0
0.006713795
0.00537791
0.007021941
0.0069351145
0.0068864133
0.0029160336
0.010123077
0.0088200895
0.007507558
0.013903987
0.0043593035
0.0074900426
0.011153952
0.0076243714
0.0032067532
0.0027256527
0.0073282877
0.0
0.008876814
0.004199849
0.0074426834
0.0015193039
0.0040262053
0.0077651585
0.0069047837
0.0
0.004858867
0.00538



   Accuracy  Precision  Recall        F1        F2      F0.5  \
0  0.997142   0.823529     0.5  0.622222  0.542636  0.729167   

   Average Precision  
0           0.414118  
Elapsed time to compute best fit: 200.167 seconds
Cross-validation score: 0.7309331988970594
Test score: 0.7536764705882353
Best Hyperparameters: {}
0.009664975
0.045881744
0.03169723
0.02462865
0.012237643
0.007639838
0.006867264
0.016414098
0.012928511
0.01423521
0.009917811
0.012352791
0.007850945
0.0
0.0030745422
0.0
0.008512378
0.011083038
0.0037608712
0.035398975
0.00784175
0.0032313082
0.0113601275
0.013607031
0.011016027
0.00096832233
0.007961441
0.020403711
0.008382679
0.0052398765
0.005601503
0.0066023986
0.0050327256
0.0024525595
0.0052386024
0.0
0.0
0.0
0.011409275
0.0
0.0049585477
0.0063818004
0.0055468124
0.0070721586
0.029880647
0.0
0.014058868
0.0
0.0
0.0
0.0048447135
0.004401095
0.0058725565
0.006309981
0.01019198
0.008018636
0.00583055
0.004251638
0.009297757
0.015863575
0.008865892
0.009017132
0



Elapsed time to compute best fit: 187.471 seconds
Cross-validation score: 0.7121274493447014
Test score: 0.7608695652173914
Best Hyperparameters: {}
0.012144638
0.051319923
0.032244723
0.025501503
0.012310718
0.008512085
0.008196003
0.0021534264
0.008398784
0.015176811
0.008731013
0.015614709
0.012125295
0.0
0.005471822
0.0
0.013169659
0.021238318
0.008858729
0.033732463
0.008771239
0.003944027
0.018883022
0.0013950581
0.019275196
0.00087105663
0.0069859372
0.019655334
0.0071861837
0.005301234
0.009778022
0.0031224885
0.0055724583
0.005472109
0.005841754
0.0
0.0
0.0
0.00939538
0.0017668197
0.0064995955
0.005726212
0.0
0.008614967
0.0042685103
0.0
0.015595126
0.0
0.0
0.0
0.004455429
0.004751131
0.0032974891
0.0097349845
0.006996372
0.0070971847
0.007510156
0.0044560484
0.0056464584
0.012152534
0.008245774
0.007757105
0.012928295
0.0003247235
0.0028601354
0.009140812
0.007253171
0.015749605
0.0048977816
0.0036082836
0.010330949
0.0022719665
0.0064653424
0.0061037466
0.0049519236
0.033743



Elapsed time to compute best fit: 188.256 seconds
Cross-validation score: 0.6820408346744783
Test score: 0.7467532467532468
Best Hyperparameters: {}
0.0073892334
0.043016315
0.03353413
0.029018963
0.015646353
0.008046543
0.0068511735
0.014262471
0.00819731
0.007278838
0.0049914187
0.015106136
0.011040558
0.0
0.0033648552
0.0
0.008509245
0.0145653775
0.0073286286
0.03930426
0.0058216457
0.00612474
0.016865151
0.0077677825
0.0010156883
0.00931872
0.009905208
0.02142476
0.0045150653
0.0054568145
0.003719693
0.005901493
0.0056633763
0.004718119
0.005545554
0.0
0.0
0.0
0.005693631
0.009994955
0.010090982
0.006507874
0.008339483
0.0077900235
0.029375289
0.0
0.028424734
0.0
0.0
0.0
0.0033226765
0.005235043
0.002763336
0.0029377274
0.003241482
0.0068426793
0.005102132
0.004585838
0.0055081486
0.012182447
0.008092375
0.0070063095
0.016992033
0.012531239
0.0070991945
0.0070936363
0.0026698774
0.0
0.00929238
0.007765361
0.0063214083
0.009918755
0.005429596
0.0027884657
0.005809435
0.0
0.001778384



Elapsed time to compute best fit: 176.475 seconds
Cross-validation score: 0.7167675762057684
Test score: 0.6818181818181818
Best Hyperparameters: {}
0.008858564
0.046046097
0.03659924
0.024316773
0.01677469
0.0080597345
0.006969107
0.0037710639
0.008271289
0.007528635
0.005279904
0.0138953095
0.01626267
0.0
0.0019244893
0.0
0.013582441
0.018572882
0.011167791
0.034973588
0.0038260436
0.007971316
0.019609572
0.005167214
0.005937772
0.015074465
0.013619292
0.019116638
0.0052476013
0.0036750338
0.0035031918
0.0091213435
0.0047248816
0.005637065
0.0039230436
0.0
0.0
0.0
0.011487049
0.0038082805
0.008691841
0.0042957277
0.0047032293
0.009147242
0.019852791
0.011934746
0.018073225
0.0
0.0
0.0
0.006664487
0.008387845
0.005311235
0.009260203
0.008769281
0.004052665
0.004979292
0.007348197
0.0073633348
0.01464911
0.01520469
0.01139599
0.024026798
0.007724753
0.0054943636
0.00609823
0.0048835245
0.0
0.006041636
0.0049114903
0.007583719
0.0034255674
0.008808259
0.0008123415
0.0068611405
0.0
0.004



Elapsed time to compute best fit: 181.100 seconds
Cross-validation score: 0.6820700947505239
Test score: 0.7730263157894737
Best Hyperparameters: {}
0.008520947
0.046657294
0.03329485
0.023823882
0.011088079
0.0070731333
0.004020567
0.0173695
0.036431756
0.0062928977
0.008995098
0.013283904
0.010447463
0.0
0.004992181
0.0
0.010844515
0.012474348
0.010805505
0.024772622
0.005149893
0.004918899
0.017650453
0.008362313
0.00084943627
0.0028467167
0.011311221
0.02685393
0.0031028127
0.0045130937
0.003096044
0.0054051634
0.0070804595
0.0035122621
0.006844871
0.0
0.0
0.0
0.004321319
0.00607123
0.008055122
0.0073435567
0.0018649402
0.012795592
0.014309916
0.0
0.014292032
0.0
0.0
0.0
0.00629992
0.00464358
0.0047501624
0.009444955
0.0056010312
0.0025020668
0.014943492
0.0062373118
0.010204031
0.016181482
0.004896023
0.0074932096
0.008420171
0.007810205
0.002493543
0.00436905
0.008300272
0.0
0.013187224
0.0042379675
0.010072029
0.0050610565
0.0050165304
0.005840422
0.010551217
0.010690817
0.00594



Elapsed time to compute best fit: 179.158 seconds
Cross-validation score: 0.7154508102727853
Test score: 0.6801470588235294
Best Hyperparameters: {}
0.008634333
0.04406729
0.02944691
0.024592478
0.0150460815
0.009991348
0.0064736474
0.03861596
0.0077700447
0.008322841
0.0063283197
0.017643204
0.019652482
0.0
0.003426271
0.0
0.002639118
0.00794358
0.0048360014
0.035564076
0.010456552
0.005071488
0.008811435
0.01247775
0.0012779334
0.018986722
0.0040119514
0.011844062
0.007274683
0.005016397
0.006333722
0.0070880647
0.0046229865
0.006254396
0.006060534
0.0
0.0
0.0
0.015823025
0.0073552164
0.0054845237
0.006189105
0.00891027
0.008610013
0.0056514046
0.0
0.006183566
0.0
0.0
0.0
0.0042103636
0.004968604
0.004486285
0.009883543
0.0067071677
0.0025661616
0.010281073
0.0061143264
0.0094958665
0.015846254
0.0026826914
0.0077628638
0.0023317502
0.00080354954
0.003828111
0.005082198
0.009146595
0.006556769
0.004942681
0.0075001973
0.0056476947
0.0044666054
0.0066356524
0.010107702
0.005404213
0.0



Elapsed time to compute best fit: 181.439 seconds
Cross-validation score: 0.72549163434498
Test score: 0.6944444444444444
Best Hyperparameters: {}
0.010985646
0.047644705
0.031405415
0.02320995
0.009961911
0.010782101
0.00481354
0.010401257
0.0031056013
0.005951015
0.0067808046
0.012406284
0.021009423
0.0
0.005689213
0.0
0.011778757
0.008252105
0.0086028855
0.033705425
0.0032378163
0.003668602
0.008630487
0.008497552
0.012536104
0.017260455
0.004041049
0.021281045
0.0048387675
0.0045941765
0.0033751447
0.004753372
0.006881253
0.006006889
0.00781248
0.0
0.0
0.0
0.014326585
0.0018538961
0.0076954556
0.009954925
0.016843567
0.0075001437
0.017224496
0.0
0.007690997
0.0
0.0
0.0
0.0037986583
0.0059146425
0.004251575
0.0075389915
0.01651167
0.001674363
0.01364361
0.007974378
0.008996425
0.011687344
0.0034829853
0.0077360575
0.03424357
0.0030211667
0.008706008
0.0039842026
0.011005446
0.0023026802
0.008993951
0.0070257117
0.008371248
0.003691538
0.0052746222
0.005399495
0.009328683
0.014305939



Elapsed time to compute best fit: 178.758 seconds
Cross-validation score: 0.7021694846657525
Test score: 0.7922535211267605
Best Hyperparameters: {}
0.010138394
0.04039973
0.036371358
0.02205381
0.013699782
0.009648483
0.005112921
0.009073295
0.016978534
0.010454303
0.007598933
0.014747881
0.014847442
0.0
0.0046278895
0.0
0.014119724
0.036941294
0.008672355
0.028272266
0.0038053913
0.0152988965
0.015530404
0.011275392
0.018096587
0.009798078
0.0063243597
0.01827906
0.008588796
0.004959107
0.0029461745
0.005168876
0.0038447455
0.0019391348
0.0073475237
0.0
0.0
0.0
0.0066970373
0.0
0.008130979
0.0064812074
0.0057998584
0.011559844
0.018105304
0.0
0.0073755058
0.0
0.0
0.0
0.0046770982
0.004519321
0.0047200024
0.0025275736
0.0035725601
0.00847962
0.008255251
0.0060931477
0.0087729385
0.016168522
0.009502002
0.004025255
0.0008047097
0.0036442403
0.0071982057
0.00827978
0.007432503
0.0
0.008357709
0.006011874
0.006293931
0.030180851
0.0034738656
0.015268812
0.010418843
0.0
0.0058220583
0.017



Elapsed time to compute best fit: 179.267 seconds
Cross-validation score: 0.7173868718079262
Test score: 0.7305194805194806
Best Hyperparameters: {}
0.009329989
0.0485735
0.03523269
0.026651848
0.014068813
0.008885578
0.0066009643
0.007851885
0.002598021
0.008723361
0.014819675
0.012909104
0.012296017
0.0
0.0038742926
0.0
0.00740235
0.015481714
0.005722081
0.027904466
0.0075818053
0.0012369206
0.009982626
0.0027350436
0.012376041
0.0068914457
0.01023516
0.019575993
0.006668539
0.0073242006
0.0019591155
0.0045865444
0.005978135
0.0037611017
0.006561628
0.0
0.0
0.0
0.013896627
0.0
0.01454645
0.008795514
0.012326144
0.01055795
0.036586057
0.034218397
0.014239085
0.0
0.0
0.0
0.00577091
0.0071511534
0.0027493115
0.012695927
0.0034493378
0.00224683
0.01096248
0.004819119
0.004821475
0.0112961065
0.010076688
0.0095782
0.0
0.009240078
0.004792428
0.002986357
0.006225683
0.0
0.0105693955
0.00856482
0.010533085
3.966677e-05
0.004981074
0.014417507
0.010131685
0.0
0.010230211
0.013563156
0.010825



   Accuracy  Precision    Recall        F1       F2      F0.5  \
0  0.997198   0.803571  0.535714  0.642857  0.57398  0.730519   

   Average Precision  
0            0.43267  
Elapsed time to compute best fit: 207.504 seconds
Cross-validation score: 0.6861385526505865
Test score: 0.7142857142857143
Best Hyperparameters: {}
0.009331612
0.049566135
0.033183213
0.02390196
0.019719424
0.006597673
0.008468031
0.0048661334
0.011362116
0.007999518
0.004430882
0.013954045
0.025820963
0.0
0.007159456
0.0
0.010986243
0.00818067
0.011173937
0.01855445
0.0104277
0.0058547133
0.012704777
0.0068421974
0.010033786
0.008627534
0.008729225
0.03122119
0.005491754
0.009415818
0.007563394
0.0067805042
0.006305929
0.0057423254
0.0034947314
0.0
0.0
0.0
0.010813624
0.0045820656
0.007320081
0.008570333
0.020927433
0.009847371
0.022984546
0.0
0.015735924
0.0
0.0
0.0
0.0049399147
0.0062799375
0.0037760695
0.010264484
0.0044997674
0.007305998
0.008234022
0.0038153965
0.005947811
0.017978242
0.01657007
0.0113020



Elapsed time to compute best fit: 147.685 seconds
Cross-validation score: 0.7255689539032267
Test score: 0.6147540983606558
Best Hyperparameters: {}
0.01163676
0.04767792
0.03247467
0.02495655
0.011432137
0.010435959
0.004088424
0.026643965
0.011238749
0.005573269
0.010020481
0.014228589
0.009879879
0.0
0.005969442
0.0
0.00782297
0.0108931875
0.013143924
0.03335916
0.0050453185
0.004166729
0.02706137
0.009957981
0.0044013457
0.005226004
0.0041061947
0.017218474
0.009328868
0.006510243
0.004257227
0.002305431
0.008163889
0.0015103956
0.006103608
0.0
0.0
0.0
0.014141387
0.0058538085
0.006897215
0.012225375
0.0013034949
0.002060574
0.009984467
0.004901589
0.014300436
0.0
0.0
0.0
0.008255591
0.0063140015
0.004588057
0.0067318534
0.009168574
0.0043786815
0.010984201
0.008849933
0.009476092
0.014160548
0.0096754795
0.0069660093
0.0018564299
0.0143620595
0.0036977504
0.0051322686
0.0036080722
0.00018678991
0.001849546
0.0045517352
0.008543395
0.004586304
0.0045822426
0.0007129757
0.008485007




Elapsed time to compute best fit: 162.172 seconds
Cross-validation score: 0.7055117593989191
Test score: 0.7570422535211268
Best Hyperparameters: {}
0.009830384
0.04635117
0.031746246
0.022269832
0.010016517
0.010896092
0.002709082
0.017410487
0.0048891474
0.0052112653
0.007448375
0.018398304
0.013550755
0.0
0.0017064235
0.0
0.010118414
0.011802162
0.010637934
0.019360125
0.010460683
0.0062240385
0.013691349
0.0014569459
0.0151951
0.008296365
0.007859393
0.02021346
0.005691434
0.0053287884
0.0050228382
0.0040597375
0.00733731
0.0035764892
0.003973963
0.0
0.0
0.0
0.005638334
0.002385928
0.007436939
0.004140361
0.005469259
0.008229237
0.004651483
0.0
0.020133821
0.0
0.0
0.0
0.0053101773
0.005202833
0.0041152188
0.013270369
0.0032881075
0.0022255215
0.010747169
0.01039032
0.0042880187
0.01548899
0.01628856
0.00648521
0.017432008
0.0010526462
0.004357667
0.009684591
0.0063714692
0.0
0.010294889
0.0038870247
0.005558154
0.017329095
0.008108364
0.006568191
0.0104681095
0.018240193
0.00192443



Elapsed time to compute best fit: 166.450 seconds
Cross-validation score: 0.6951590886385742
Test score: 0.7462686567164178
Best Hyperparameters: {}
0.010146568
0.048024513
0.03391436
0.025211982
0.013567207
0.008218616
0.004297725
0.0033216977
0.0075808195
0.008229635
0.00933886
0.020434076
0.023432085
0.0
0.0048584095
0.0
0.01511239
0.020508746
0.007915956
0.028885279
0.018227618
0.0046309317
0.016199918
0.007403824
0.022962956
0.011346497
0.004914672
0.016357858
0.005565044
0.0023467995
0.0054412447
0.0073733986
0.005584544
0.0076411422
0.005210912
0.0
0.0
0.0
0.014135361
0.011191134
0.018181311
0.008448619
0.0008889979
0.008085586
0.0121692605
0.0005583506
0.013555585
0.0
0.0
0.0
0.0044979216
0.005382585
0.0070504854
0.0032336146
0.0073432936
0.0070838234
0.014502916
0.016437115
0.007853358
0.0111419065
0.004952449
0.013496725
0.0017263953
0.0043761292
0.0044433423
0.0063539227
0.00822639
0.0027939288
0.0097952485
0.0067275516
0.0049543804
0.002091006
0.0063554887
0.0037300074
0.00



Elapsed time to compute best fit: 151.705 seconds
Cross-validation score: 0.7110812731799794
Test score: 0.6884057971014492
Best Hyperparameters: {}
0.008839785
0.050040398
0.036938544
0.02627877
0.0128913
0.006418187
0.007296318
0.0055035967
0.001991507
0.016885258
0.0054012155
0.015095482
0.014124925
0.0
0.006909604
0.0
0.010119316
0.007984511
0.0046744244
0.041566074
0.005087896
0.012007084
0.009589607
0.0079128565
0.0148525415
0.020742157
0.008964027
0.025551515
0.0055942265
0.005549789
0.00446749
0.007044603
0.005724503
0.004024277
0.0043606996
0.0
0.0
0.0
0.009305054
0.0048520286
0.009171476
0.009707972
0.015730469
0.011670882
0.00913114
0.00048825672
0.016567666
0.0
0.0
0.0
0.008615006
0.0068024895
0.0037097698
0.003351073
0.0054007135
0.0043212753
0.00422706
0.006217587
0.013287819
0.01251241
0.011308515
0.0070737493
0.0
0.000696546
0.015740436
0.004423092
0.0063175494
0.0
0.009138342
0.009336652
0.004765838
0.013537606
0.005671853
0.0078100464
0.006123073
0.0
0.0014471156
0.00

### 5.1.3 LightGBM

In [128]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

none_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    none_lightgbm_performance_nonnormalized_df = pd.concat([none_lightgbm_performance_nonnormalized_df, new_performance_df])
    

none_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/none_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 22.057 seconds
Cross-validation score: 0.5056326779468614
Test score: 0.4962779156327543
Best Hyperparameters: {}
407.0251264758408
10373.985574606806
6816.155181840062
2517.937844272703
197.14728463813663
126.06625126302242
69.28392706811428
124.48733355104923
50.240965992212296
60.69285961985588
312.0991622284055
2101.8802295178175
2033.6908477395773
0.0
122.90034711360931
0.0
233.20187091827393
731.6417377889156
208.65172450989485
901.7565812915564
147.2139260172844
244.2511735111475
111.54081730544567
530.0273438245058
238.52669921517372
86.80942092835903
88.34586082398891
1552.5081155598164
334.7779320627451
593.3185428529978
9.37090902030468
92.78223326802254
222.06505851447582
14.714074030518532
460.7797964140773
0.0
0.0
0.0
88.24858552217484
1.5789242759346962
124.23124423623085
99.72903490066528
2.9943310618400574
94.78195229172707
10.798491910099983
0.0
23.399702288210392
0.0
0.0
0.0
330.22908475995064
52.20801144838333
82.83670282363892
18.0



Elapsed time to compute best fit: 18.322 seconds
Cross-validation score: 0.516521927698927
Test score: 0.5357142857142857
Best Hyperparameters: {}
1340.9492868483067
10952.735225237906
4251.8926512002945
325.60911409556866
2563.7113108038902
136.63991966843605
157.67837250232697
89.28652961552143
192.590543076396
69.17556263506413
58.31927656382322
272.04616629704833
1250.51189135015
0.0
321.4289901033044
0.0
1072.0477664917707
228.67682354152203
1155.3767034113407
1568.4071960374713
139.28009263426065
572.6531059369445
463.4676144272089
100.91368055343628
1.7999000549316406
11.264666870236397
201.6179146617651
2347.4648192301393
768.443105109036
147.8962927311659
73.23534555733204
1315.0503768399358
168.69501413404942
116.11761832982302
150.5456205010414
0.0
0.0
0.0
576.9741312488914
3.29278901219368
130.36443862318993
90.52853589504957
14.584632456302643
102.45831919461489
72.15455393493176
0.0
1661.7678823024035
0.0
0.0
0.0
525.1659990213811
73.10237511247396
111.19360659271479
60.5



Elapsed time to compute best fit: 20.844 seconds
Cross-validation score: 0.524280351230989
Test score: 0.5303030303030303
Best Hyperparameters: {}
1853.402029298246
10824.629078436643
4598.214181892574
790.3295242637396
648.4460282027721
175.56934782862663
76.06385986506939
823.823419764638
153.547451287508
20.767235152423382
78.06733324378729
1971.1506152004004
41.122632279992104
0.0
55.42897117882967
0.0
1358.7121490165591
758.6774611920118
105.96000717952847
198.8381448239088
135.01754827052355
534.0168227478862
32.93394911289215
2.2440859973430634
107.29574179649353
94.5306246727705
88.75130115821958
1147.4970539361238
578.6769392862916
99.38042201846838
44.03797143697739
189.16074925661087
110.80751769989729
11.15547202527523
274.36269676685333
0.0
0.0
0.0
77.93973010778427
1.6841620057821274
74.5952485948801
220.38245505839586
17.816236600279808
194.10932407528162
70.26867819577456
0.0
654.0422490239143
0.0
0.0
0.0
88.27896639704704
163.85649805516005
40.4916792139411
17.23902086



Elapsed time to compute best fit: 22.526 seconds
Cross-validation score: 0.48786052141463365
Test score: 0.5651105651105652
Best Hyperparameters: {}
1563.7374857105315
9123.975290626287
3495.754194907844
2401.883876003325
2600.0301452577114
422.73098772764206
852.3638124763966
79.85156665742397
989.4821384251118
63.83819855004549
260.0067135877907
176.84070170670748
360.63962649554014
0.0
80.60009868443012
0.0
132.91110014915466
747.6401837468147
141.8310745060444
3581.7109801843762
204.3187477439642
7.009477950632572
475.7521457821131
100.75605940818787
1.104083001613617
4.980119064450264
81.58489428460598
1379.5818136632442
361.9958125650883
195.75450916588306
22.778859302401543
672.0253984108567
215.81708800792694
36.99715845286846
262.70397636294365
0.0
0.0
0.0
63.0269550383091
1.4779409915208817
67.99550032615662
99.04403442144394
4.169523984193802
92.8012113571167
303.00325578451157
0.0
105.28036198019981
0.0
0.0
0.0
98.41112792491913
37.63463893532753
177.64887833595276
20.41356



Elapsed time to compute best fit: 20.712 seconds
Cross-validation score: 0.49753848262837136
Test score: 0.5269607843137255
Best Hyperparameters: {}
2224.246818229556
9758.724923066795
4415.976244602352
1237.4491541422904
104.81959216296673
113.2768694460392
38.04342249035835
271.82793644443154
3.643736034631729
261.74929582327604
800.4048790447414
2096.384069826454
887.8839757144451
0.0
117.43074141442776
0.0
1129.9032582342625
49.09232771396637
712.1836590766907
762.4758819267154
31.12074153870344
52.66365986317396
129.92893784493208
49.211530685424805
18.480493173003197
55.51685090363026
723.2442479133606
2215.0266976580024
161.94224556535482
257.9477538987994
28.969613380730152
149.91471097618341
58.12186435610056
23.445970632135868
121.01844504475594
0.0
0.0
0.0
107.4424514696002
223.69528160244226
809.1136708557606
279.42960526794195
6.244858920574188
141.02203245088458
768.9909129291773
0.0
82.89857774972916
0.0
0.0
0.0
128.6018423512578
139.72841672599316
183.9699512720108
20.3



Elapsed time to compute best fit: 22.981 seconds
Cross-validation score: 0.46557567032596436
Test score: 0.625
Best Hyperparameters: {}
300.66796190291643
6171.922607012093
4485.517335116863
516.974697176367
1967.7667375952005
883.6771674901247
64.18261374533176
213.68160454928875
244.11720301955938
96.81272465735674
86.1323738321662
1869.5563725754619
688.7872303575277
0.0
426.00885827094316
0.0
96.37582395225763
1291.2125183939934
822.3152962625027
1817.6024371385574
120.880555331707
65.40950427949429
370.96637086570263
75.30496567487717
3.113330125808716
27.117977000772953
245.94776060432196
4110.826349683106
133.03014995902777
1043.8770139589906
62.35695469379425
146.59238988161087
342.94158582389355
20.217921942472458
45.40282931923866
0.0
0.0
0.0
101.79793149232864
2.352532908320427
525.6062484234571
124.39383571594954
1.6723883152008057
225.88104934990406
528.5796018689871
6.353529930114746
89.1400468274951
0.0
0.0
0.0
474.4855501651764
507.8998667523265
68.18901644647121
20.226



Elapsed time to compute best fit: 21.894 seconds
Cross-validation score: 0.5043720240847134
Test score: 0.505050505050505
Best Hyperparameters: {}
1279.7328050732613
9324.926100939512
4554.319085974246
352.3213288746774
2160.3510541543365
394.9291532933712
51.49437153339386
189.87253506854177
124.37192387133837
166.99579180777073
385.2402470558882
231.55994990468025
694.6352391242981
0.0
313.43979097902775
0.0
1640.4484871029854
170.6305715739727
1440.510686866939
1919.0235247164965
90.98088672757149
128.31780287623405
91.02520629763603
7.897980779409409
1.2210620045661926
9.699071139097214
125.89847627282143
2796.3877144008875
247.30593041330576
96.306441642344
22.329474359750748
157.601508744061
72.84520894289017
134.32402677834034
107.41948690265417
0.0
0.0
0.0
130.56021865829825
0.2590959966182709
64.21148008108139
205.31806453317404
14.387565866112709
170.15270048379898
38.1845468133688
0.5808230042457581
128.27398885786533
0.0
0.0
0.0
175.40855190902948
84.67802765965462
151.0595



Elapsed time to compute best fit: 21.272 seconds
Cross-validation score: 0.4742985526224606
Test score: 0.5185185185185185
Best Hyperparameters: {}
319.89129815250635
9609.610999263823
4373.713569056243
246.72072249650955
304.3473605364561
201.61516918987036
20.742449805140495
696.2002168595791
111.82648825645447
82.25222188234329
1432.47253908962
2826.3723246380687
400.9914730191231
0.0
121.48794621229172
0.0
723.0855469182134
81.90220885723829
76.40439441427588
2029.165148884058
109.99374767392874
36.473549112677574
1447.867508739233
39.83557963371277
9.217714950442314
68.41938135027885
296.1334706246853
2480.672363355756
100.79329873621464
2129.0901186168194
27.559964768588543
128.3639162108302
57.37223221361637
134.40938176959753
122.11312471330166
0.0
0.0
0.0
498.10465809702873
0.6725520044565201
1712.9655591621995
210.140045709908
25.73310597240925
56.556305423378944
16.564752407371998
0.0
2394.3412078022957
0.0
0.0
0.0
384.3342500850558
49.75786678493023
149.82784163206816
247.1



Elapsed time to compute best fit: 20.888 seconds
Cross-validation score: 0.5212194005130295
Test score: 0.38860103626943004
Best Hyperparameters: {}
652.5036417581141
11352.600349918008
3052.3295820318162
651.0127822197974
187.316886164248
427.4137452542782
27.91357472538948
70.06277269124985
29.889962524175644
30.505546368658543
1012.557836972177
205.51937901228666
1472.5067866146564
0.0
83.12047957628965
0.0
1347.2077028900385
1068.943248257041
1392.1087264195085
1500.5144721716642
124.5375435128808
43.359074890613556
147.51806895434856
41.39536702632904
3.6766480058431625
5.5611419677734375
45.920997977256775
3138.3122823163867
67.3077338412404
199.1250707358122
141.89348463714123
92.29949535429478
80.30344476550817
41.89633207768202
295.43834383040667
0.0
0.0
0.0
113.14069921523333
2.2283260375261307
44.531670816242695
98.22427473962307
17.17666646838188
103.88853920996189
137.81211613863707
0.0
2547.0258404910564
0.0
0.0
0.0
175.22054344415665
57.452920131385326
199.75178238004446



Elapsed time to compute best fit: 21.275 seconds
Cross-validation score: 0.5056035038735959
Test score: 0.4385964912280702
Best Hyperparameters: {}
601.1674858406186
10335.448736034334
3981.9698392748833
733.2766766771674
350.0399838760495
1875.122082516551
15.759634152054787
1061.167017698288
1.1197600364685059
178.06458589434624
436.73861176520586
2456.3300440981984
782.6357019469142
0.0
329.3657726943493
0.0
234.80690722167492
78.1876400411129
1305.7780273109674
924.8781700357795
70.90698818862438
58.70733770728111
81.63485598564148
23.449969470500946
67.88718650490046
26.169528916478157
251.98427258431911
4403.583496816456
107.09629040211439
208.01497055590153
45.067981883883476
167.09312812238932
122.51157210767269
8.325815975666046
112.46705231815577
0.0
0.0
0.0
38.77758201956749
7.468400001525879
2679.4675077795982
59.62529468536377
49.81832620501518
254.7304442077875
911.5360618829727
0.0
532.0431096404791
0.0
0.0
0.0
68.37910431623459
55.49021653831005
190.98048301786184
28.23



Elapsed time to compute best fit: 18.318 seconds
Cross-validation score: 0.4640142485294499
Test score: 0.5172413793103449
Best Hyperparameters: {}
1294.739067722112
9997.475080288947
4561.338943608105
483.3066963404417
2336.8114671334624
149.59322749078274
104.17418587207794
104.97262006998062
66.55066750943661
453.12349743023515
720.609698548913
362.36170103400946
2013.7290228158236
0.0
136.49411579966545
0.0
187.1609015762806
994.6334213316441
103.59558027982712
131.49745894223452
36.60419178009033
59.9224353954196
53.446463003754616
70.8221056163311
49.923685386776924
21.31046360731125
195.8477404937148
3482.5551549196243
380.1599220186472
103.53125051409006
28.91269575059414
446.1780539602041
161.88598182797432
6.875221982598305
156.59950899332762
0.0
0.0
0.0
272.200064457953
4.886837989091873
133.8946440666914
581.6141080409288
26.346694752573967
128.4925771355629
273.65497383475304
0.0
485.65711779892445
0.0
0.0
0.0
303.52561992406845
492.34366931021214
356.1160374209285
43.5847



Elapsed time to compute best fit: 21.392 seconds
Cross-validation score: 0.49201705242526445
Test score: 0.5693069306930694
Best Hyperparameters: {}
519.4298123307526
8219.523739218712
6692.852151077241
565.5557571873069
339.1758527047932
124.10533501207829
252.4519346281886
552.1265354678035
47.86615481227636
115.16356489062309
1932.24142152071
1776.3770040757954
914.3681887239218
0.0
121.60500927269459
0.0
855.3555755615234
94.26049338653684
49.9811979457736
2173.9203887581825
51.40052418410778
252.5703842639923
348.91561802476645
24.900780841708183
1.7313549369573593
0.9837420210242271
428.22138949483633
4232.013774372637
92.30070979893208
133.1812326014042
22.479133173823357
160.1236582621932
206.11090555787086
27.178469873964787
144.8946876525879
0.0
0.0
0.0
127.03195090591908
3.3731890618801117
236.21895003318787
849.2115196213126
1.6525800228118896
237.17015262693167
9.155513614416122
0.0
7.702085971832275
0.0
0.0
0.0
432.0088699012995
90.71253041923046
268.2412749901414
182.800



Elapsed time to compute best fit: 20.793 seconds
Cross-validation score: 0.5281618507745657
Test score: 0.4659949622166246
Best Hyperparameters: {}
1365.8942363113165
11629.118043787777
5424.662337869406
1208.268591336906
1063.96110220626
76.86727625131607
754.3634372800589
197.41698885709047
39.7745099067688
102.10501739382744
106.7055700942874
2091.6691902950406
557.6845091804862
0.0
130.6440174281597
0.0
27.40282367914915
1080.089326262474
798.523055717349
1207.8101427964866
45.54324807971716
315.8796617016196
128.4933149665594
29.632218807935715
1.621231034398079
374.6398794427514
27.255401492118835
4559.158527329564
231.3415318503976
226.12139738351107
73.073138281703
138.65312875062227
55.45120833069086
76.08819872885942
55.88343635946512
0.0
0.0
0.0
465.4647903777659
49.580842062830925
198.04120532423258
114.87905237451196
1.2955770194530487
203.99391499906778
48.48340979218483
0.14668500423431396
2157.8365012407303
0.0
0.0
0.0
75.14723925292492
83.1306471042335
60.1813722178339



Elapsed time to compute best fit: 21.396 seconds
Cross-validation score: 0.4963165097369976
Test score: 0.426829268292683
Best Hyperparameters: {}
214923.14962485433
97556712.39483145
10995470.23923567
284.4856703877449
370556.6188007891
9177.096468389034
484.7919546365738
1030.7642392218113
337389.4155277908
40841.10210227966
6510.524897947907
41931.197198182344
1723.3412223756313
0.0
1655.6622607707977
0.0
1513.153711348772
112.87477058172226
1635.4966209679842
906.2081593871117
771.6285001933575
90.7612179517746
60554460.57218519
9530.134660065174
7459.966808259487
53.19630241394043
1208.94746991992
17544.89563715458
866.6963666975498
850856.0307349861
84.78715616464615
17801.699519574642
5193.415072083473
122.31197816133499
1491.146532535553
0.0
0.0
0.0
143465.76884186268
4.495217978954315
28358.243969976902
8411.361877471209
12.931185007095337
5002061.660879433
1340.1528200507164
0.0
1727.0404016971588
0.0
0.0
0.0
163359.12017911673
249749.01843464375
209396.8446007669
32897.19178



Elapsed time to compute best fit: 18.444 seconds
Cross-validation score: 0.5041191474678489
Test score: 0.5038759689922481
Best Hyperparameters: {}
1100.9089579731226
9806.1095386222
4927.800974600017
627.6479403376579
761.7886136770248
79.92695669084787
104.45231685042381
793.8143827840686
77.00027213990688
81.8537058159709
1827.3263526633382
228.6046979278326
1297.7109839469194
0.0
74.55330633372068
0.0
168.48037764430046
189.10763634741306
2574.115781866014
389.9873678907752
96.56385821476579
70.02945893257856
345.4039802029729
5.028373032808304
24.942997880280018
93.21507400274277
114.10149985551834
3262.653413929045
313.429800003767
160.8896383717656
17.860933393239975
330.3145987018943
23.76063196361065
39.5182069838047
182.49128714948893
0.0
0.0
0.0
839.1636866629124
21.08237224817276
193.71253569424152
322.499373242259
1.9533230066299438
185.3101388812065
299.7772782891989
0.0
780.0873549133539
0.0
0.0
0.0
96.00161552429199
57.718955025076866
671.4320815056562
21.92716749012470



Elapsed time to compute best fit: 22.496 seconds
Cross-validation score: 0.460802511324653
Test score: 0.5710659898477157
Best Hyperparameters: {}
1285.6050363853574
11234.692285921425
3034.5916485264897
812.8539464473724
2741.3017625510693
196.92558446153998
9.284544289112091
974.9170186705887
26.817674048244953
79.44400823116302
100.11883995682001
257.0254961550236
238.88004967570305
0.0
422.55008716881275
0.0
799.9421582147479
1148.088778346777
77.16674346476793
344.70574291050434
218.10612070560455
108.95840327441692
98.7275008559227
7.819623023271561
3.4476950764656067
73.0115232616663
58.33420734107494
2182.7695123031735
1400.0623949691653
130.68292754888535
113.67888881266117
219.7042037025094
152.30221769958735
15.697870299220085
271.9954408854246
0.0
0.0
0.0
125.64285025000572
17.20400047302246
81.49775886535645
920.9645108729601
7.163390934467316
50.58436833322048
204.77856954187155
0.0
106.11237468570471
0.0
0.0
0.0
166.85007801651955
130.9188313782215
119.04101048409939
62.



Elapsed time to compute best fit: 22.539 seconds
Cross-validation score: 0.5097232186260856
Test score: 0.4671717171717171
Best Hyperparameters: {}
2050.258845746517
6822.784222278744
5013.3315890952945
1871.0964379273355
502.69823909550905
169.3508274257183
27.70828129351139
3598.6752640753984
149.3577117472887
11.211827121675014
92.08491586521268
2448.9714648276567
49.23226075619459
0.0
125.27616565674543
0.0
2083.376938536763
163.97925624251366
381.38622330129147
343.9651942551136
291.2899634614587
191.5702039897442
91.16405376791954
2.9537619277834892
1.214840054512024
29.17324636876583
194.38372442126274
1122.729447670281
323.59693114832044
176.62566532194614
51.079514145851135
147.01802548766136
646.0805341005325
19.05308285355568
277.04223155230284
0.0
0.0
0.0
147.6994042545557
1.0561800003051758
106.00796981155872
898.0481649190187
40.95438864082098
205.76535214483738
112.14515521377325
0.0
14.985594183206558
0.0
0.0
0.0
81.30714812129736
89.97598324716091
217.5456526055932
32.



Elapsed time to compute best fit: 21.362 seconds
Cross-validation score: 0.49324067556879897
Test score: 0.4961832061068702
Best Hyperparameters: {}
435.33699820563197
7966.870481826365
5078.17471505329
1949.3827430568635
1163.6033020988107
515.9670787602663
193.81438209861517
37.797641046345234
108.26573778688908
15.06779745221138
99.23939763754606
2498.9502911940217
230.88257547467947
0.0
900.7343156263232
0.0
185.36906602978706
1199.0959524214268
1078.7176389619708
1482.9747679531574
494.03344245254993
276.02385821938515
485.59231059253216
23.3447687625885
0.5001720041036606
526.6770409047604
43.91572839021683
4330.188984684646
522.3951995521784
97.43776367604733
30.828802406787872
97.59146845340729
64.93126201629639
139.0650527626276
110.13010673969984
0.0
0.0
0.0
288.85232424736023
17.280643671751022
295.3345066830516
94.44882048666477
1.9257789850234985
153.74097256362438
37.90128421783447
0.0
1403.7922071516514
0.0
0.0
0.0
69.2639499232173
59.22073336690664
258.5072257220745
18.



Elapsed time to compute best fit: 21.059 seconds
Cross-validation score: 0.47433129940876134
Test score: 0.5443037974683543
Best Hyperparameters: {}
602.3792182579637
10297.86678152904
4401.319787636399
790.1904209926724
223.5554517135024
207.71567127853632
97.37030516564846
201.81641595065594
37.121061861515045
106.70040883123875
1272.8022989034653
2066.72063382715
660.3828612565994
0.0
80.19666791707277
0.0
796.3270573616028
117.74153436720371
1324.3151816800237
902.6035650596023
78.16860749199986
3.5481800884008408
127.77382822334766
4.583656966686249
6.6214799880981445
64.73663964122534
95.33759379386902
5020.09865155071
55.69528806954622
113.68654131144285
74.52637553215027
149.4890534169972
146.81031601130962
50.17057787999511
111.04902597516775
0.0
0.0
0.0
250.26078186929226
9.335258811712265
74.4321878105402
17.032704435288906
14.311482191085815
320.64452613145113
98.41635206341743
0.0
77.2824834510684
0.0
0.0
0.0
614.2132978886366
304.1335889622569
171.12654937058687
29.372190



Elapsed time to compute best fit: 21.394 seconds
Cross-validation score: 0.48174035494309686
Test score: 0.48346055979643765
Best Hyperparameters: {}
1863.0840918719769
9592.670355178416
3318.288410253823
1288.7050260007381
1948.1072095185518
190.3564960360527
122.48052901774645
85.44996120780706
549.4482105821371
136.64802470058203
117.1490139812231
481.2972705066204
172.7657793685794
0.0
325.52813628315926
0.0
335.0311435684562
2651.4557191729546
103.30028722435236
2738.338682472706
100.10884016752243
109.82385174930096
1231.2390205562115
29.337674900889397
0.7787629961967468
65.20016366243362
32.41765259206295
2042.2702524363995
74.88204193115234
183.3059878051281
31.94329410791397
441.5879833251238
85.5742894783616
29.834943905472755
293.5399850383401
0.0
0.0
0.0
45.84445479512215
3.008357957005501
119.397336140275
197.11823058873415
5.500899396836758
133.3911080956459
8.001714989542961
0.4527079910039902
21.51855903863907
0.0
0.0
0.0
1160.725908756256
68.54610323160887
72.86483461



Elapsed time to compute best fit: 21.592 seconds
Cross-validation score: 0.5227636259903614
Test score: 0.49999999999999994
Best Hyperparameters: {}
1401.9064922593534
11569.522207960486
3411.214326508343
1132.5960325971246
2337.2221094816923
488.7154190838337
44.66011464595795
553.570403650403
547.3537446707487
1346.2991580329835
271.20065776258707
986.465749129653
480.94613314419985
0.0
167.2364332973957
0.0
638.3237355351448
1669.470075726509
748.3091378360987
241.2879270836711
497.12257519364357
144.69523188471794
588.0024739801884
49.891810804605484
14.075658202171326
651.2749066501856
89.61408471316099
4697.61479754746
700.2166191786528
2444.050639078021
38.2567463517189
373.37953494489193
209.19597767293453
45.423351138830185
159.11009165644646
0.0
0.0
0.0
48.53104241937399
7.5568602085113525
71.61710654199123
1420.2827257551253
55.038856104016304
44.807349383831024
23.06506621837616
0.0
150.32169035077095
0.0
0.0
0.0
232.9168797507882
135.6963860914111
172.87079551815987
166.31



Elapsed time to compute best fit: 18.284 seconds
Cross-validation score: 0.4808672216857663
Test score: 0.5651105651105652
Best Hyperparameters: {}
1918.0971603877842
12101.636293061078
3169.9826393201947
2614.148568727076
2756.3023783266544
1297.2191203162074
105.78352485597134
70.89119212329388
54.93899883329868
37.57026281952858
192.46697540581226
772.9813895523548
417.7033641412854
0.0
50.413594879209995
0.0
25.859680607914925
2511.5583273917437
355.2137314043939
173.30870462581515
34.77873969078064
92.50015938282013
1109.5375552773476
27.78135457634926
28.19004988670349
20.838146314024925
921.3659799844027
36.59887616336346
357.6597002670169
59.1576075181365
19.282334938645363
267.0693825632334
86.68814744055271
35.87758465111256
162.42066571116447
0.0
0.0
0.0
1930.2574013248086
0.261678010225296
72.29409137368202
78.02387061715126
19.683749675750732
280.3282485306263
101.57495395094156
0.0
56.57690040767193
0.0
0.0
0.0
123.7753923535347
84.9039132297039
101.92139618843794
46.3858



Elapsed time to compute best fit: 21.515 seconds
Cross-validation score: 0.5157396320753389
Test score: 0.4691358024691357
Best Hyperparameters: {}
675.4766093492508
6729.545973025262
6056.200755797327
1361.2976031452417
1094.6868750154972
320.8022855594754
1389.890615195036
85.51404576003551
38.241915464401245
49.846513882279396
531.2679923400283
1918.258432134986
4719.508034467697
0.0
146.29803431779146
0.0
143.7600747793913
104.95538534224033
1524.8902572244406
777.9371379464865
510.7448841780424
137.22431901097298
148.031435161829
2.883919060230255
15.340309262275696
13.428975999355316
257.8786456808448
482.24488515406847
54.106806084513664
220.20786357671022
21.224229976534843
113.8851415142417
167.53951059281826
8.601309932768345
365.83382197469473
0.0
0.0
0.0
135.53087547421455
0.0
86.52489333599806
136.17342294752598
32.55716037750244
360.8588181436062
8.543761014938354
3.2900529503822327
155.6944823563099
0.0
0.0
0.0
149.82562838494778
158.83650565892458
343.7458230406046
5.87



Elapsed time to compute best fit: 21.308 seconds
Cross-validation score: 0.5348478850877468
Test score: 0.5163727959697733
Best Hyperparameters: {}
1477.01120236516
10049.23159835115
5042.695903439075
1089.5443216636777
295.10426457971334
94.55618565529585
86.48749692738056
902.5121378302574
5.82197605073452
60.10801042616367
275.9518651664257
1977.4179609939456
229.95567624270916
0.0
98.00013206154108
0.0
1051.5038469135761
54.99972726404667
181.00618682429194
2549.1692216172814
143.65606721490622
141.50164418667555
35.779700107872486
315.32296746969223
3.477614998817444
12.669816076755524
306.9295150488615
4841.022925287485
184.49208241701126
110.03321109712124
69.8941353186965
253.49744120240211
40.18036910146475
48.586968041956425
68.26683243364096
0.0
0.0
0.0
595.9938753992319
68.19047574698925
99.03315369784832
136.92732986807823
30.32275990396738
85.85305191576481
31.73393516242504
10.409899711608887
83.43056771159172
0.0
0.0
0.0
134.29867171496153
61.85558865964413
151.59979736



Elapsed time to compute best fit: 22.632 seconds
Cross-validation score: 0.5023902022013983
Test score: 0.5405405405405406
Best Hyperparameters: {}
948.4041052907705
5637.8706998080015
5548.735421285033
2466.8308284804225
697.2894767038524
199.72280737012625
306.09097846597433
191.5503306724131
69.98900124430656
77.14616083353758
2027.99708699435
2133.2993715256453
755.1486764773726
0.0
175.29875373840332
0.0
51.62124954909086
1106.6018437445164
115.54566560313106
3849.178627073765
10.01283896714449
283.1584098637104
1253.6641455888748
72.28402996063232
45.07634115219116
1.855300411581993
56.01622510701418
2027.6368643268943
197.36499647051096
102.5512929558754
35.43717838823795
133.3118343129754
209.3286620527506
28.0267965644598
215.2736014202237
0.0
0.0
0.0
224.91091798245907
5.226613745093346
104.94194509088993
226.16666620969772
0.2043640986084938
185.36029405891895
738.275777220726
0.0
1435.8938312605023
0.0
0.0
0.0
87.8701452165842
111.84500855952501
75.19838748127222
173.735356

## 5.2 Rebalancing Strategy - SMOTE

### 5.2.1 Random Forests

In [129]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

smote_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [['smote', SMOTE()],
                              #['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    smote_randomforest_nonnormalized_performance_df = pd.concat([smote_randomforest_nonnormalized_performance_df, new_performance_df])
    
smote_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/smote_randomforest_nonnormalized_performance_df.csv")



### 5.2.2 XGBoost

In [130]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


smote_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    smote_xgboost_nonnormalized_performance_df = pd.concat([smote_xgboost_nonnormalized_performance_df, new_performance_df])
    

smote_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/smote_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 670.265 seconds
Cross-validation score: 0.678420280221722
Test score: 0.6829896907216495
Best Hyperparameters: {}
0.015345782
0.11010442
0.22272816
0.044212606
0.019608442
0.0054910798
0.0019059173
0.0023412982
0.008189683
0.0038799162
0.024405811
0.097385764
0.01637542
0.0
0.0074730366
0.0
0.010648929
0.0030160195
0.026137143
0.007074622
0.0033991614
0.000507533
0.005187624
0.0008926561
0.00645607
0.00027103123
0.0013635984
0.00080640614
0.0010474785
0.0040843524
0.004955832
0.0005019541
0.00070637214
0.007825915
0.0008362427
0.0
0.0
0.0
0.0053892895
0.0003658376
0.006827981
0.0020141737
0.0031476384
0.0013524089
0.00043094976
0.0031536913
0.008352364
0.0
0.0
0.0
0.008936353
0.007918482
0.001647599
0.0044431533
0.006074531
0.0049785464
0.0016460185
0.008011446
0.009513523
0.0071387156
0.0073024603
0.0021719958
0.0007616936
0.0008878756
0.0021061304
0.0046783686
0.0020853018
0.011566612
0.00035337993
0.008161704
0.0037107444
0.0069329026
0.001289842
0.



Elapsed time to compute best fit: 665.086 seconds
Cross-validation score: 0.67434483821295
Test score: 0.7047872340425532
Best Hyperparameters: {}
0.010118916
0.10087753
0.24736442
0.032814234
0.038232934
0.0015080497
0.002725428
0.0012804153
0.0031653356
0.0018774407
0.02659459
0.067309745
0.0045786463
0.0
0.005299343
0.0
0.0040596626
0.024221139
0.037426323
0.007326723
0.001628856
0.0009819189
0.011128346
0.0022436646
0.0051321345
0.006661172
0.0021692012
0.0005238576
0.0016200816
0.0031207493
0.00807696
0.0010118923
0.00034754165
0.006601125
0.000563583
0.0
0.0
0.0
0.0052036094
0.009615627
0.006107053
0.0041390304
0.007519534
0.0017966784
0.0055077774
0.0021357364
0.011460915
0.0
0.0
0.0
0.00023221676
0.0005851998
0.00072686863
0.0031620755
0.008760118
0.0009132444
0.011563272
0.011465354
0.004908597
0.00763126
0.0029812444
0.0011604199
0.0009613125
0.0017510647
0.013758949
0.00031942714
0.00078684173
0.007578409
0.000519731
0.00047452256
0.0038058993
0.008805167
0.0060310457
0.0006



Elapsed time to compute best fit: 676.535 seconds
Cross-validation score: 0.6847852425507237
Test score: 0.65
Best Hyperparameters: {}
0.004619665
0.12498137
0.23462306
0.043482997
0.023008913
0.007329513
0.0020304827
0.0021363413
0.0030550284
0.0011581901
0.037218843
0.015969303
0.008528166
0.0
0.0065933857
0.0
0.0031936008
0.00286251
0.04867344
0.012793318
0.0019745235
0.0025659844
0.0013410695
0.0016457872
0.0035697909
0.00026130737
0.0007172647
0.0013190224
0.0023616205
0.008242196
0.004019367
0.0013973038
0.0014976482
0.009315606
0.0009880548
0.0
0.0
0.0
0.0059184027
0.00021798506
0.0058160275
0.007010738
0.0061816457
0.003895517
0.0037565688
0.010376839
0.009501554
0.0
0.0
0.0
0.00024594265
0.0028020947
0.0002563891
0.0069150263
0.0037828078
0.003142752
0.0021880742
0.0022218362
0.007250686
0.0063159545
0.003221605
0.0035812738
0.0
0.000110560155
0.003392047
0.0073671937
0.000929734
0.008251556
0.004148678
0.0057629743
0.0024998295
0.013474602
0.0006281452
0.00016111656
0.0005074



Elapsed time to compute best fit: 659.384 seconds
Cross-validation score: 0.6518228099496041
Test score: 0.7313829787234043
Best Hyperparameters: {}
0.012478223
0.0950969
0.22822738
0.049429625
0.022091737
0.0029712413
0.0027701086
0.0011120342
0.00084427436
0.003239189
0.01090504
0.10030974
0.009071917
0.0
0.0032737148
0.0
0.01221735
0.0027254114
0.061997205
0.019358901
0.0012081634
0.00026271728
0.0031022097
0.004851597
0.021017004
0.0006270837
0.001046744
0.0012111758
0.0008057619
0.0037297872
0.0092894845
0.00040957236
0.0004924174
0.0015735927
0.0010974945
0.0
0.0
0.0
0.010624042
0.0017813387
0.005111701
0.0035511623
0.012363395
0.002261768
0.0020513502
0.0025606689
0.008928162
0.0
0.0
0.0
0.007685512
0.012292612
0.0003850583
0.0017636187
0.006846379
0.00050539593
0.009728872
0.004903201
0.0031147979
0.0078077465
0.0034712232
0.009254001
0.00040371533
0.0013672727
0.019330157
0.0004878019
0.0009178784
0.008664366
0.0005266542
0.0038894305
0.010693187
0.009575781
0.0018689634
0.001



Elapsed time to compute best fit: 687.855 seconds
Cross-validation score: 0.6683971412334898
Test score: 0.6818181818181818
Best Hyperparameters: {}
0.0047910437
0.16570732
0.27179512
0.0443645
0.034984924
0.008898282
0.002497387
0.0011096271
0.0046236673
0.00047239504
0.009677849
0.035421275
0.0068571414
0.0
0.0036941543
0.0
0.0028158745
0.0038286743
0.062330846
0.0008288218
0.0042549744
0.00032587885
0.0050239884
0.0022615874
0.0005567591
0.0010028367
0.005148717
0.0024919643
0.00047539384
0.003288357
0.0069801975
0.0014399032
0.0012859419
0.004485985
0.00058701285
0.0
0.0
0.0
0.009742801
0.0013996123
0.0032756291
0.012069267
0.005180056
0.0014544597
0.0008045331
0.005009042
0.011456866
0.0
0.0
0.0
0.000718204
0.0014451619
0.0014683586
0.0006752299
0.00978704
0.004429929
0.0015676399
0.0058093253
0.0037030133
0.0105693415
0.00799173
0.002758506
0.00014375386
0.004862429
0.002510209
0.0004731303
0.00016439075
0.008042144
0.00046812827
0.0044060014
0.0023235758
0.011127722
0.0005061767



Elapsed time to compute best fit: 663.180 seconds
Cross-validation score: 0.666555905746806
Test score: 0.6601123595505618
Best Hyperparameters: {}
0.0067131706
0.09675401
0.21575403
0.053552896
0.015830088
0.012853415
0.0031852454
0.00055669434
0.0007076379
0.003234437
0.008938709
0.1342379
0.011244397
0.0
0.006097162
0.0
0.010940363
0.035166502
0.059967883
0.00096016587
0.0017962819
0.00067141274
0.006347612
0.0035105073
0.009486727
0.0038018182
0.002718244
0.0012873947
0.00073378533
0.0012289339
0.0056398595
0.00038147054
0.00028882734
0.0033935816
0.0006459176
0.0
0.0
0.0
0.0048384615
0.0033108913
0.0031515132
0.0015402463
0.0034390176
0.0028632793
0.0018064738
0.0059243715
0.00793591
0.0
0.0
0.0
0.0026802698
0.0060089757
0.0033687807
0.0023906932
0.011247474
0.0035671047
0.00052684825
0.0061321864
0.004220187
0.0050876746
0.0041802335
0.0014129316
0.0003751482
0.0012854084
0.00050103536
0.0030511569
0.00055085623
0.009358158
0.0005490222
0.0061201546
0.004595535
0.008428141
0.0013



Elapsed time to compute best fit: 677.296 seconds
Cross-validation score: 0.6558259747968105
Test score: 0.7183908045977012
Best Hyperparameters: {}
0.01218369
0.114253625
0.23994847
0.04928373
0.021136915
0.004909611
0.002352115
0.0006909587
0.0025047157
0.0011579138
0.01650759
0.09260002
0.009996263
0.0
0.0075725564
0.0
0.008037461
0.0026797175
0.05165217
0.0057480116
0.0031704006
0.0013414833
0.006797223
0.0010624093
0.00039239967
0.0002513741
0.0038678132
0.0023121329
0.002284331
0.0023962099
0.006985435
0.00088661385
0.0020535232
0.013029139
0.0032366302
0.0
0.0
0.0
0.006962369
0.0008593302
0.0054224203
0.0035155322
0.0046169455
0.001207199
0.0031669997
0.002565158
0.006421246
0.0
0.0
0.0
0.0017207983
0.004028416
0.0012701148
0.0037024568
0.007518492
0.0077866735
0.002394906
0.0065589217
0.0027036502
0.00889634
0.0029639006
0.0021867894
0.0
0.004836235
0.0014097431
0.0003704381
0.0010120283
0.011070104
0.0008954689
0.0033836218
0.0029256833
0.012118671
0.002006261
0.0011294151
0.0



Elapsed time to compute best fit: 664.058 seconds
Cross-validation score: 0.6474914241374873
Test score: 0.7162921348314606
Best Hyperparameters: {}
0.005628253
0.10334386
0.22365515
0.032560434
0.01851513
0.0038270794
0.0015710199
0.0010601982
0.0015653843
0.0005014381
0.022206645
0.10932851
0.006723848
0.0
0.008732737
0.0
0.008385314
0.005988233
0.062806524
0.0037777007
0.0025992303
0.000527615
0.002167948
0.0057993145
0.0028232937
0.0018665481
0.0025861936
0.0014039003
0.00026134547
0.0036155372
0.0053724614
0.0033254009
0.0010278776
0.012142057
0.0004303828
0.0
0.0
0.0
0.0050777127
0.0019322255
0.0046174135
0.004064878
0.0037869802
0.0022846248
0.00054754975
0.00450813
0.0065266015
0.0
0.0
0.0
0.0037118215
0.015457747
0.0005414191
0.0024141741
0.0074374964
0.0035138247
0.00094816374
0.004966014
0.011954684
0.009489681
0.0050281836
0.0030832472
0.00014778806
0.0027639046
0.0024390207
0.0012709344
0.00089605147
0.0082174335
0.00041218175
0.0063814456
0.0012014176
0.00832764
0.0009210



Elapsed time to compute best fit: 675.955 seconds
Cross-validation score: 0.665473514981489
Test score: 0.6901041666666666
Best Hyperparameters: {}
0.0061135525
0.10624774
0.24383484
0.042034194
0.019153649
0.00042736
0.0019350525
0.001718793
0.0021216837
0.00049630296
0.01425646
0.09078777
0.01498676
0.0
0.005732337
0.0
0.010819474
0.004783237
0.08277099
0.011441189
0.0031934723
0.0007960615
0.006694976
0.00077949994
0.0003988881
0.0033988506
0.001283245
0.0009327784
0.000708977
0.0015600603
0.00913151
0.00091285276
0.002455518
0.0014366673
0.0007935196
0.0
0.0
0.0
0.0062682107
0.004166763
0.005626629
0.006096983
0.0095890565
0.0021861112
0.0010278673
0.0035905826
0.0043264637
0.0
0.0
0.0
0.0003100682
0.007882608
0.002088688
0.0013295896
0.009844646
0.002727008
0.0065248352
0.005158814
0.004927051
0.011304825
0.003371028
0.00416467
0.0035814317
0.00088284386
0.0012637897
0.0034153876
0.0003647886
0.016644668
0.0007954543
0.0057276483
0.0020117993
0.011137366
0.0012622139
0.0009498261




Elapsed time to compute best fit: 674.995 seconds
Cross-validation score: 0.6670379734781745
Test score: 0.6586021505376344
Best Hyperparameters: {}
0.011995232
0.11150077
0.20133093
0.03334939
0.014179286
0.0026658825
0.0015496097
0.0037355996
0.0019816265
0.0028218103
0.019517818
0.14648429
0.0039502946
0.0
0.0045229318
0.0
0.010410352
0.0034356047
0.05802274
0.009547473
0.0035713448
0.003132769
0.012443525
0.002431273
0.0031153217
0.00014118836
0.00064446667
0.0010418308
0.0018101161
0.00161543
0.012078185
0.00030412208
0.00031805155
0.0016970646
0.00037188924
0.0
0.0
0.0
0.0070986836
0.0010343066
0.0025081483
0.0045726807
0.0048171342
0.0013120231
0.0025664992
0.0012292939
0.0065488764
0.0
0.0
0.0
0.0074810153
0.005668134
0.00042989105
0.0029703777
0.00467302
0.006066325
0.0029851403
0.0055811056
0.011211726
0.008765788
0.008273428
0.00532143
0.0
0.0006313755
0.001149841
0.006547381
0.0006373637
0.006623043
0.00030202998
0.0055094217
0.0026571543
0.009850406
0.0030275472
0.0006682




Elapsed time to compute best fit: 723.277 seconds
Cross-validation score: 0.6562373239057703
Test score: 0.6666666666666666
Best Hyperparameters: {}
0.0056651654
0.13179372
0.21422799
0.04187713
0.02774486
0.001057912
0.0020440405
0.001958847
0.0009687893
0.00238752
0.020168167
0.08313762
0.037516907
0.0
0.0076912283
0.0
0.00590592
0.002748484
0.057036385
0.010809184
0.0031463343
0.00027864237
0.0009788914
0.009344756
0.0024441755
0.0039447886
0.0003970639
0.0008433491
0.0007237216
0.0028881158
0.013329899
0.0006963663
0.0024744484
0.0034364266
0.00037678497
0.0
0.0
0.0
0.0055698254
0.0023370988
0.005326996
0.003198142
0.005825843
0.00332437
0.00060707994
0.0019286384
0.0060301824
0.0
0.0
0.0
0.0059690024
0.011905679
0.00027844598
0.0015334896
0.009228758
0.004733673
0.0007824944
0.009303566
0.009501649
0.00784172
0.006834558
0.003046257
0.003037096
0.00030294104
0.0015477414
0.00041837883
0.0012047857
0.009664783
0.0010216625
0.00570855
0.0019283292
0.009837402
0.0030376439
0.00060292



Elapsed time to compute best fit: 697.138 seconds
Cross-validation score: 0.667484873695488
Test score: 0.6447368421052632
Best Hyperparameters: {}
0.005336466
0.12872568
0.27012262
0.047636673
0.021717267
0.0053673135
0.0035408796
0.0029161996
0.0047290614
0.006439939
0.033566453
0.0057952944
0.0055010556
0.0
0.0057875584
0.0
0.00308303
0.005002353
0.03772969
0.024445849
0.0035389303
0.0010193964
0.00031449806
0.0016837146
0.0014205734
0.0017888002
0.0022062534
0.00084444456
0.0021542404
0.0037642883
0.007037817
0.006545377
0.0018673282
0.011561558
0.0014231122
0.0
0.0
0.0
0.0070199305
0.0014741269
0.008587464
0.0055023097
0.0033869206
0.0044320193
0.0019992976
0.0024056237
0.013728039
0.0
0.0
0.0
0.0047622784
0.017462773
0.00029289807
0.0032381644
0.0051646153
0.0057673035
0.012448532
0.008636635
0.0057636835
0.010480958
0.0016217855
0.0033928677
0.002122148
0.0007808603
0.0026720248
0.0077894167
0.0009840197
0.01740804
0.0024827814
0.0036470117
0.0014107755
0.008641501
0.005253577
0



Elapsed time to compute best fit: 694.733 seconds
Cross-validation score: 0.6463931031429875
Test score: 0.7352941176470588
Best Hyperparameters: {}
0.0048988387
0.0935154
0.23159285
0.050260775
0.017339403
0.0007708238
0.0015748589
0.000886722
0.007795755
0.002273232
0.0098355515
0.09232161
0.008247196
0.0
0.0045706052
0.0
0.016661167
0.002109313
0.05743134
0.015775315
0.0017339519
0.00091339176
0.0098995175
0.009196155
1.3762036e-05
0.0002797456
0.0009709946
0.0010733393
0.001243266
0.004342063
0.006559587
0.0006400359
0.0004476353
0.007914426
0.0007385943
0.0
0.0
0.0
0.013039846
0.0015596271
0.004277244
0.0070418078
0.0031761047
0.002204911
0.004218013
0.0066712466
0.005613907
0.0
0.0
0.0
0.0038617013
0.012135049
0.00092051626
0.0013232852
0.005492508
0.00078810664
0.00029918787
0.0059965733
0.003566337
0.0066559445
0.009520514
0.0031304627
0.014109041
0.0005279972
0.001322415
0.004476273
0.0009298884
0.010132538
0.0004873049
0.0037090152
0.001760443
0.010158641
0.0009919559
0.00088



Elapsed time to compute best fit: 710.762 seconds
Cross-validation score: 0.6951428208963158
Test score: 0.6586021505376344
Best Hyperparameters: {}
0.0051033753
0.12112972
0.23895654
0.04318109
0.03138671
0.0011179526
0.002373086
0.0009830643
0.0022487298
0.0035786086
0.021140058
0.048158456
0.0073592686
0.0
0.0186083
0.0
0.0063923127
0.005027647
0.04249157
0.0013908376
0.004876972
0.00056932616
0.0038963256
0.0012832421
0.01704082
0.0005581349
0.0026961507
0.001912498
0.00054904795
0.0034994755
0.005693039
0.0004011483
0.0019844673
0.0068277474
0.000566662
0.0
0.0
0.0
0.011172355
0.0027433299
0.0032205775
0.01058378
0.004809287
0.0014862798
0.00028075193
0.003637128
0.006999398
0.0
0.0
0.0
0.008103655
0.0034041905
0.00092499383
0.0009755643
0.0070749647
0.004234478
0.0011094547
0.007792892
0.010660767
0.012376964
0.002919284
0.003037363
0.00030107962
0.0009604972
0.027646748
0.0025961292
0.0017747744
0.010345754
0.00036620468
0.008104506
0.0011498276
0.00807836
0.00028805548
0.000570



Elapsed time to compute best fit: 672.339 seconds
Cross-validation score: 0.670908155167028
Test score: 0.6111111111111112
Best Hyperparameters: {}
0.0039916486
0.15763196
0.21242787
0.03501065
0.025256578
0.00280013
0.0038410812
0.0009372919
0.009748816
0.001856368
0.026659682
0.0331281
0.04213747
0.0
0.0039390377
0.0
0.009625905
0.0018583001
0.050911658
0.0066085286
0.0036314977
0.00022744451
0.0077461563
0.0023327216
0.009887734
0.0012712195
0.0043430235
0.004825901
0.0046872594
0.0066657662
0.00780503
0.00031403187
0.0010010104
0.011238862
0.001560923
0.0
0.0
0.0
0.01149092
0.0019151657
0.0062909336
0.002160371
0.0026834728
0.0023245753
0.0017918979
0.002708152
0.0089227
0.0
0.0
0.0
0.0009832263
0.008627416
0.00035761393
0.00082045456
0.004802811
0.00095154555
0.0009161474
0.007532612
0.0132299485
0.0096454155
0.004329917
0.006480585
0.00020247525
0.0047498946
0.0012736844
0.0013044779
0.0019739654
0.008960638
0.0004884159
0.009063535
0.0023810407
0.01049631
0.0009713114
0.00896368



Elapsed time to compute best fit: 701.940 seconds
Cross-validation score: 0.6811905131302854
Test score: 0.685483870967742
Best Hyperparameters: {}
0.0046337694
0.11285458
0.21320273
0.048847478
0.01956866
0.002676789
0.0011402415
0.00091712014
0.003012599
0.002384036
0.018218704
0.10955979
0.0068891062
0.0
0.00584653
0.0
0.0083533935
0.0036523321
0.058926508
0.040363718
0.0041815517
0.0012261302
0.004128965
0.0031477092
0.0002124807
0.0006369874
0.0010017022
0.0006840085
0.0013222147
0.0026829352
0.01292352
0.0006322552
0.0013004043
0.002977601
0.0038172102
0.0
0.0
0.0
0.0074624275
0.003200719
0.0023027293
0.0050413273
0.0017497761
0.0010650018
0.003420892
0.003785658
0.00846072
0.0
0.0
0.0
0.0011509138
0.0008246699
0.0002492423
0.0019248476
0.012513214
0.0029741041
0.0017223035
0.0048868163
0.012879671
0.0066920533
0.0013649791
0.0009973021
0.0
0.0010498601
0.0027269952
0.005344428
0.0014075736
0.020434607
0.00027977643
0.007830556
0.0031333806
0.009031091
0.0029973164
0.00058811536




Elapsed time to compute best fit: 706.222 seconds
Cross-validation score: 0.6893847237800205
Test score: 0.65
Best Hyperparameters: {}
0.0057526645
0.12405515
0.20286451
0.047455803
0.020578928
0.00036446078
0.00045690787
0.0014946535
0.0029932633
0.001373029
0.03351203
0.05946192
0.0037084497
0.0
0.0053777574
0.0
0.010614423
0.0017579445
0.07860997
0.030505298
0.002571951
0.00078290515
0.0042443434
0.00042255115
0.005777849
0.0018396833
0.0020216024
0.00095538987
0.0006042262
0.0040470804
0.012072903
0.0008539702
0.0014692083
0.008149217
0.001342998
0.0
0.0
0.0
0.01210667
0.0018538786
0.0068113273
0.007692422
0.004956586
0.0011043202
0.0005351676
0.005277735
0.011259794
0.0
0.0
0.0
0.0018975813
0.006865008
0.000998988
0.0014259071
0.008828992
0.0052437037
0.0009029705
0.0059531205
0.0032188853
0.009129471
0.0017385648
0.005149402
0.0004494284
0.002296776
0.0012840552
0.00088069116
0.0028497858
0.0065987934
0.0005770048
0.0072751655
0.0064490046
0.0097471075
0.0006336405
0.000953638
0.



Elapsed time to compute best fit: 697.310 seconds
Cross-validation score: 0.6667501944470722
Test score: 0.6052631578947368
Best Hyperparameters: {}
0.004704477
0.1127151
0.21645996
0.032741144
0.029136078
0.000903097
0.002141041
0.0011931206
0.006732646
0.00030896533
0.012751862
0.0732142
0.036751285
0.0
0.0029929536
0.0
0.006090167
0.0215017
0.06103609
0.004089847
0.0015141751
0.0012854941
0.0124403
0.002694878
0.0040000924
0.00034594876
0.00071542466
0.0021546127
0.002289233
0.00098312
0.0055558616
0.0005234144
0.0005595608
0.007924954
0.0013384124
0.0
0.0
0.0
0.010459831
0.0027753308
0.005142369
0.0040658046
0.009327067
0.001676216
0.0005215668
0.0038527916
0.0042097084
0.0
0.0
0.0
0.0013360846
0.0014018349
0.0011085033
0.0014604693
0.009686107
0.00063166564
7.4033785e-05
0.006914513
0.010098324
0.009921461
0.0035487814
0.0017657368
0.0
0.0008308541
0.0013751514
0.0003191738
0.0004914861
0.0084721055
0.00036484151
0.010454905
0.0024011205
0.010213234
0.0017830635
0.00092159293
0.00



Elapsed time to compute best fit: 670.529 seconds
Cross-validation score: 0.6649016570623862
Test score: 0.6859756097560976
Best Hyperparameters: {}
0.009862568
0.105817676
0.2199755
0.0320728
0.019818677
0.0015280328
0.0013054542
0.003151748
0.004171815
0.002317564
0.013692238
0.12581494
0.012364358
0.0
0.0092677865
0.0
0.0088257035
0.024589423
0.053966943
0.03694689
0.002047917
0.00038108596
0.003580129
0.0033189813
0.0036656726
0.0010293329
0.0014116191
0.0021356456
0.0014586513
0.0036270705
0.012958944
0.00049984246
0.0010003947
0.00351011
0.0016032248
0.0
0.0
0.0
0.0063751377
0.002355985
0.0064521194
0.0023872303
0.002548748
0.0031300576
0.0073002507
0.0074807727
0.0050819134
0.0
0.0
0.0
0.0032060293
0.008699421
0.001826855
0.002502258
0.014117932
0.0050779004
0.0031224657
0.008761873
0.0044937367
0.006598956
0.0010062153
0.0039006958
0.00012248555
0.0023267185
0.00089822337
0.0002928511
0.00042401743
0.013022393
0.000621233
0.007492513
0.00244923
0.008564511
0.00083786354
0.00057



Elapsed time to compute best fit: 678.756 seconds
Cross-validation score: 0.6837688802549187
Test score: 0.7327586206896551
Best Hyperparameters: {}
0.004398287
0.11618243
0.21246211
0.037936542
0.023941673
0.0009176201
0.0020184144
0.0013629524
0.0033440914
0.0015321145
0.028363297
0.071143754
0.003457945
0.0
0.0065751895
0.0
0.009908115
0.0099159535
0.07943158
0.007876957
0.0034294939
0.0012611989
0.0003554741
0.00068945216
0.008087842
0.0071061323
0.00059858477
0.0008018804
0.0073716207
0.0015554554
0.002979707
0.00036959356
0.00055575464
0.008461955
0.0005770109
0.0
0.0
0.0
0.005317483
0.0042526694
0.001498818
0.0022296344
0.009031584
0.0016475553
0.008362009
0.006203524
0.004178825
0.0
0.0
0.0
0.008648397
0.00063401303
0.001132448
0.0024076924
0.008444172
0.007874925
0.003104151
0.008074241
0.0016951354
0.007837159
0.0033125323
0.0031393508
0.00041484283
0.0007280171
0.009665432
0.002101533
0.0012964554
0.008499564
0.0008437952
0.0029777125
0.0035435976
0.012321011
0.00090740185
0



Elapsed time to compute best fit: 666.043 seconds
Cross-validation score: 0.6531803112219057
Test score: 0.6806930693069306
Best Hyperparameters: {}
0.0040814276
0.10204393
0.22582984
0.040314034
0.014019229
0.002565482
0.0017646796
0.00090022956
0.00093965756
0.0013841067
0.012201673
0.14333183
0.016244564
0.0
0.004716712
0.0
0.008339905
0.0052176686
0.072363146
0.005632866
0.0009689364
0.00032290985
0.011677168
0.0019204937
0.0024920856
0.003386935
0.0015495429
0.0008253404
0.0014056745
0.0011026764
0.0054032635
0.00041870764
0.00028462382
0.007052722
0.0030508728
0.0
0.0
0.0
0.008488297
0.0017687733
0.007530358
0.0019513718
0.0040920903
0.0019858787
0.0018695219
0.002237943
0.008948374
0.0
0.0
0.0
0.002896515
0.0023459422
0.00379472
0.006819136
0.010427983
0.0033100515
0.019201197
0.0046328567
0.006760235
0.008124934
0.0018664236
0.0008679639
0.00031215054
0.0005002525
0.0016981207
0.0004712675
0.0011805352
0.0102397315
0.0013820403
0.004336396
0.0066317567
0.010052634
0.0018332684




Elapsed time to compute best fit: 688.918 seconds
Cross-validation score: 0.6796020887224278
Test score: 0.6308411214953272
Best Hyperparameters: {}
0.0032584777
0.12320737
0.21272324
0.042411372
0.021495052
0.0018726654
0.0025254083
0.0013855494
0.0019849131
0.0026643926
0.009860489
0.08208695
0.009813328
0.0
0.006062172
0.0
0.008145191
0.004843701
0.069087885
0.013015207
0.0018033143
0.00046317442
0.0067945374
0.000886151
0.010647033
0.0009765694
0.0011927871
0.0033763018
0.00033340763
0.0022815776
0.013102681
0.0008618217
0.0002651819
0.015712705
0.00044320608
0.0
0.0
0.0
0.005778416
0.0015631224
0.007254373
0.0059605115
0.012181757
0.001211869
0.0033688482
0.0027820233
0.008948104
0.0
0.0
0.0
0.0025447654
0.0048296265
0.0015908424
0.0017418157
0.004396016
0.005142676
0.0026326065
0.008467582
0.012821982
0.008965946
0.0018132777
0.0033522712
0.00031439523
0.0003013188
0.0010105425
0.004075741
0.0014154805
0.011796207
0.0007653449
0.0025666058
0.0032346835
0.00941765
0.00070372125
0.



Elapsed time to compute best fit: 670.910 seconds
Cross-validation score: 0.6603079542443387
Test score: 0.6976744186046512
Best Hyperparameters: {}
0.00609717
0.12460751
0.27542797
0.054784525
0.020897718
0.009904567
0.0026055747
0.004580981
0.0037633432
0.00049588276
0.02326948
0.041781016
0.01542009
0.0
0.003637881
0.0
0.002520329
0.0026326166
0.060879633
0.008425129
0.0020907845
0.0005577891
0.008148384
0.0062427563
0.000575758
0.00457656
0.006124491
0.001725195
0.0007399246
0.003466853
0.010802751
0.0005534227
0.0006373891
0.0076961215
0.0009837055
0.0
0.0
0.0
0.005626638
0.0049647577
0.0029862344
0.0019081804
0.0077441405
0.0021231505
0.0019628743
0.0077180564
0.0069178077
0.0
0.0
0.0
0.00038297402
0.006702144
0.0031438072
0.006701542
0.005948349
0.0013699691
0.0017959473
0.0043223086
0.0025772254
0.008138742
0.003724672
0.0015201068
0.00046340388
0.0011095047
0.0053574373
0.00039310145
0.00062040245
0.008542532
0.00075729995
0.0063617597
0.005943866
0.007849248
0.0055709467
0.00



Elapsed time to compute best fit: 667.562 seconds
Cross-validation score: 0.6520257984254794
Test score: 0.7070707070707071
Best Hyperparameters: {}
0.005930151
0.11444912
0.20960237
0.04094844
0.013808249
0.0023976727
0.0027764442
0.0013170349
0.0005026529
0.00091706746
0.013218804
0.092729226
0.003120519
0.0
0.0027932122
0.0
0.010149761
0.0044019194
0.05969471
0.08090906
0.0016008458
0.0002185169
0.0004293157
0.0074337064
0.015125582
9.7537144e-05
0.0018065599
0.00086237135
0.00246
0.001937396
0.010662231
0.0004783001
0.00078490924
0.0076619065
0.00069890014
0.0
0.0
0.0
0.007162958
0.0013154225
0.0024622274
0.0050989846
0.007208944
0.001692558
0.0030516358
0.0034807324
0.006575404
0.0
0.0
0.0
0.00080933137
0.0024667915
0.0010678965
0.0013848698
0.005198372
0.0009615951
0.0016000303
0.0049509886
0.011675364
0.0067985826
0.0023718611
0.003735045
0.0011196791
0.002771937
0.0018860088
0.00059959677
0.0008579548
0.009536781
0.0009326546
0.0074435514
0.006282323
0.009663104
0.0030385603
0.



Elapsed time to compute best fit: 662.118 seconds
Cross-validation score: 0.6674809570193745
Test score: 0.6862745098039216
Best Hyperparameters: {}
0.004655235
0.1299028
0.26732314
0.04293579
0.022614058
0.001558822
0.00089279324
0.0034209741
0.0042317426
0.00043349224
0.028880462
0.004575195
0.011024958
0.0
0.007245049
0.0
0.003949823
0.0045368457
0.078795165
0.035451084
0.002775521
0.001018799
0.00059422135
0.0019301726
0.00041744692
0.0007988588
0.0025299003
0.00075391517
0.00037101636
0.0043432247
0.009700042
0.0011810035
0.0006948434
0.0049960916
0.0008361669
0.0
0.0
0.0
0.0102427965
0.0023390732
0.0076867254
0.010153186
0.007533701
0.0008570171
0.0047064857
0.0077737477
0.0062021906
0.0
0.0
0.0
0.00086420623
0.006134559
0.0014131775
0.00016009843
0.006942269
0.0039592427
0.005164965
0.0026227378
0.0068354984
0.006044313
0.0010636493
0.0011890543
9.0554226e-05
0.0006518756
0.0033879618
0.008904474
0.0018028975
0.012117427
0.0007302175
0.0044694943
0.0029433714
0.011895209
0.00362

### 5.2.3 LightGBM

In [131]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

smote_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE()],
                                    #['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    smote_lightgbm_performance_nonnormalized_df = pd.concat([smote_lightgbm_performance_nonnormalized_df, new_performance_df])
    

smote_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/smote_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 41.367 seconds
Cross-validation score: 0.6377495786371326
Test score: 0.6395348837209303
Best Hyperparameters: {}
6213.285076618195
302127.985298872
487379.5143995285
55633.233102083206
8123.730497837067
643.2545585632324
446.7536506652832
872.0243356227875
131.94036054611206
95.66547083854675
10283.773772001266
23446.7070248127
6759.341908216476
0.0
2501.422074317932
0.0
1251.0178170204163
610.2478322982788
28698.59246277809
1127.7800636291504
391.1833596229553
58.319281339645386
344.2672300338745
146.68808794021606
334.87835025787354
90.06333041191101
67.52353096008301
1771.6430456638336
1149.3529696464539
1882.9003033638
262.9424719810486
330.7925138473511
81.9145712852478
5191.2612953186035
214.76577639579773
0.0
0.0
0.0
1912.7399892807007
531.4586749076843
2345.134689092636
977.7896711826324
1301.9440512657166
289.40348863601685
314.0163118839264
505.35352993011475
1014.4458131790161
0.0
0.0
0.0
431.0676317214966
1080.2747991085052
238.48856306076



Elapsed time to compute best fit: 41.417 seconds
Cross-validation score: 0.6517320017488175
Test score: 0.580046403712297
Best Hyperparameters: {}
7313.210209131241
297820.9848122597
485371.8884294033
40315.57759284973
30058.114769935608
633.5559220314026
1297.5656077861786
241.69287967681885
540.859696149826
42.08806085586548
8911.434385538101
12725.075277090073
10786.238243579865
0.0
2243.9041719436646
0.0
1404.6411097049713
592.474080324173
12187.527060985565
2584.6514296531677
470.3764681816101
23.797300100326538
1269.6675007343292
584.5985527038574
321.3761978149414
507.03455996513367
530.1421558856964
1850.0462837219238
78.08257031440735
1448.7851860523224
1862.4354882240295
233.24365377426147
58.356401443481445
6708.505774259567
383.3771414756775
0.0
0.0
0.0
2237.7576389312744
139.9134397506714
1335.4953503608704
1072.6369767189026
3105.9904112815857
1085.9702372550964
579.5061411857605
525.733900308609
1220.9266955852509
0.0
0.0
0.0
1660.0828223228455
1732.219398021698
1059.574



Elapsed time to compute best fit: 42.335 seconds
Cross-validation score: 0.6414456554978225
Test score: 0.671462829736211
Best Hyperparameters: {}
8021.819134950638
250873.46304178238
513677.10538339615
41680.818845033646
19753.79964852333
1251.8704013824463
82.22093057632446
158.857008934021
1947.0492072105408
71.59811067581177
9372.861216545105
45181.42422628403
2878.3453736305237
0.0
1742.212233543396
0.0
322.80241870880127
1059.5865879058838
30488.954179286957
324.0862011909485
987.910234451294
59.67561054229736
1422.4403042793274
418.21960258483887
29.427000045776367
144.68962001800537
244.15293097496033
422.0310490131378
137.4281497001648
1415.3048391342163
2408.461753845215
323.6377465724945
85.33442091941833
1143.6345937252045
452.27507162094116
0.0
0.0
0.0
2545.9164006710052
400.2172522544861
759.6403801441193
860.6419770717621
1796.4675691127777
647.1268239021301
291.65781021118164
529.6429142951965
754.6749567985535
0.0
0.0
0.0
1991.549996137619
4197.871527194977
155.5223977



Elapsed time to compute best fit: 41.999 seconds
Cross-validation score: 0.5981029620924186
Test score: 0.6995412844036697
Best Hyperparameters: {}
11467.822226285934
287955.15723228455
485623.58351945877
27362.802679777145
31821.724287986755
189.68852853775024
2118.4197130203247
60.536049365997314
176.17311835289001
0.0
6806.017356157303
30801.659671783447
8480.887920379639
0.0
2315.5041856765747
0.0
4268.1235427856445
4325.85470867157
22884.32399725914
908.9683599472046
317.50946140289307
59.728858947753906
1263.90425491333
226.89128994941711
13.136459827423096
77.73420858383179
686.379909992218
1329.8502748012543
102.11335134506226
1941.005707025528
2613.7314536571503
116.1805784702301
232.1264877319336
3842.3064980506897
487.96513414382935
0.0
0.0
0.0
2436.1199827194214
239.067307472229
2114.7162897586823
1096.8660011291504
354.60148763656616
403.37004804611206
1085.5475533008575
1039.0576829910278
1404.6526803970337
0.0
0.0
0.0
637.7882347106934
1326.4142405986786
524.176070690155



Elapsed time to compute best fit: 43.526 seconds
Cross-validation score: 0.6705408192817259
Test score: 0.671462829736211
Best Hyperparameters: {}
9777.674536705017
306628.0130531788
478683.16573500633
32803.24427008629
24847.925010204315
104.05047988891602
1574.14067196846
114.5563497543335
681.463826417923
167.59424710273743
14866.06683588028
8146.878716945648
14526.266046524048
0.0
2814.4196996688843
0.0
2073.316255569458
3879.710078239441
7128.933227300644
9746.453679323196
313.2484984397888
12.498100280761719
2625.343855381012
361.5793869495392
367.5203056335449
607.7383871078491
1446.6769230365753
533.8016314506531
314.15570402145386
1553.9808340072632
1459.4469437599182
349.2222225666046
55.350289821624756
1838.9730253219604
169.34321117401123
0.0
0.0
0.0
5684.768459320068
460.4282488822937
1793.545416355133
1053.112517118454
767.2606098651886
2405.494110584259
266.15207719802856
873.7556004524231
833.0504851341248
0.0
0.0
0.0
549.4057621955872
645.5771536827087
553.596350193023



Elapsed time to compute best fit: 42.061 seconds
Cross-validation score: 0.6119943798325302
Test score: 0.684931506849315
Best Hyperparameters: {}
5238.6467027664185
245692.72607707977
522172.948043108
37849.01030564308
13818.684676885605
841.3777623176575
346.8545198440552
228.31215071678162
186.68252420425415
108.38089847564697
12907.735828399658
36976.30615091324
2527.0653495788574
0.0
4041.545487880707
0.0
1424.246740579605
616.4675974845886
27990.304732322693
2208.623326778412
293.82285261154175
47.32323980331421
1616.486558675766
625.5472149848938
28.60456085205078
73.98049092292786
389.67084288597107
1153.3035702705383
233.7241497039795
1975.4195017814636
3107.872911453247
209.6228301525116
155.28750944137573
4031.384213209152
306.67504692077637
0.0
0.0
0.0
1301.1305663585663
156.71842765808105
662.845109462738
1883.0010259151459
596.6580681800842
1081.6296801567078
382.59767293930054
675.4453296661377
1303.450445652008
0.0
0.0
0.0
2351.266365289688
2300.8168816566467
148.815750



Elapsed time to compute best fit: 43.683 seconds
Cross-validation score: 0.6257398780042465
Test score: 0.5833333333333334
Best Hyperparameters: {}
17623.83202815056
252138.66660904884
509722.0577466488
32600.65527653694
17391.453001260757
7469.339803218842
260.19716143608093
386.9693121910095
569.8704371452332
26.97107982635498
6935.882700443268
51308.320850372314
2825.3114590644836
0.0
2440.981032848358
0.0
2180.698775291443
359.567129611969
15279.740208148956
452.5517318248749
176.19465947151184
40.72359013557434
986.391713142395
347.9753484725952
119.49870300292969
32.15349054336548
90.0475103855133
806.851868391037
300.51382541656494
1422.9842450618744
1241.946521282196
669.2625110149384
755.5625283718109
3222.0625076293945
219.28924131393433
0.0
0.0
0.0
4196.940429925919
443.39557552337646
5746.044727325439
822.8418340682983
2364.4878470897675
614.3745942115784
250.24670004844666
664.7328431606293
870.3644576072693
0.0
0.0
0.0
76.87572050094604
1111.854255437851
378.6117465496063



Elapsed time to compute best fit: 43.151 seconds
Cross-validation score: 0.6136320293158091
Test score: 0.6455399061032864
Best Hyperparameters: {}
6581.641905069351
263260.1478085518
502169.8123421669
39590.33311891556
28268.46794104576
298.1422197818756
115.62644815444946
39.112109661102295
1797.505077123642
214.1352162361145
9455.14336848259
29682.574989557266
5995.865459918976
0.0
3266.8310284614563
0.0
3546.7000393867493
366.4115047454834
21453.018461227417
1949.827898979187
336.8136250972748
41.08500099182129
1661.3399124145508
729.3372349739075
8.110329627990723
88.53816819190979
176.09121870994568
501.00807189941406
105.41800308227539
2674.7504715919495
1706.6405806541443
88.65472865104675
146.61910247802734
1920.140828371048
419.4345200061798
0.0
0.0
0.0
3469.8202414512634
312.35554003715515
2524.878979921341
1553.5794095993042
1453.3026893138885
704.6167166233063
318.55570220947266
846.937258720398
991.165992975235
0.0
0.0
0.0
1837.187183856964
1830.4650473594666
670.16165614



Elapsed time to compute best fit: 41.193 seconds
Cross-validation score: 0.6245307071108411
Test score: 0.6100478468899522
Best Hyperparameters: {}
5401.81499004364
238944.5545580387
502989.03195762634
33164.39577913284
16873.32187151909
1536.1025595664978
315.9575595855713
174.3969283103943
469.0568633079529
179.42774963378906
15567.346848964691
51635.19012117386
5576.381878614426
0.0
3990.3025941848755
0.0
4800.295205354691
592.1850235462189
42570.0116648674
2724.312408208847
599.11452293396
0.0
1077.3591146469116
110.05397081375122
430.23750495910645
0.0
184.6711802482605
260.0553197860718
143.75571393966675
816.5582113265991
1133.5775260925293
264.9806351661682
345.76170015335083
3483.800537586212
470.39865732192993
0.0
0.0
0.0
2932.208786010742
387.73333501815796
1593.8047580718994
1551.5394327640533
951.8465342521667
771.033194065094
856.9765348434448
1454.9727668762207
1055.0786199569702
0.0
0.0
0.0
2066.349175453186
2161.7158761024475
355.94589829444885
217.27539205551147
2404.



Elapsed time to compute best fit: 43.658 seconds
Cross-validation score: 0.616619326396014
Test score: 0.684931506849315
Best Hyperparameters: {}
5947.01930642128
318004.57527041435
474903.9101731777
41748.09953570366
13880.845113992691
528.1467146873474
286.52505707740784
262.49047684669495
713.4178268909454
209.08312034606934
16907.562187433243
4823.304472446442
1535.0722658634186
0.0
6258.667648792267
0.0
1079.3084115982056
792.7223558425903
10770.464463233948
24661.42603111267
199.55478954315186
17.72007989883423
1524.3532304763794
786.1451632976532
90.80873799324036
29.96819043159485
341.36990308761597
456.9272346496582
359.47403144836426
805.9131262302399
3009.077847957611
562.012264251709
190.52793335914612
2582.8914909362793
216.16539096832275
0.0
0.0
0.0
6141.542804479599
207.32397937774658
3197.1309559345245
1151.36771774292
1228.266497373581
1031.5298020839691
87.90741014480591
1214.430461883545
961.3051316738129
0.0
0.0
0.0
1411.1725873947144
1319.500958442688
276.421558141



Elapsed time to compute best fit: 41.824 seconds
Cross-validation score: 0.6278055922936187
Test score: 0.5957943925233644
Best Hyperparameters: {}
10423.42019534111
324773.8091711998
484027.11053824425
26038.009913682938
31105.2376434803
444.0501847267151
248.6832618713379
242.13565826416016
916.1752667427063
490.41470074653625
13389.687285423279
3684.187973022461
6383.94241309166
0.0
1761.156816482544
0.0
1223.8576474189758
605.0130367279053
20546.830569028854
532.0372414588928
892.5023131370544
36.52279090881348
905.013922214508
1216.0448660850525
8.497929573059082
115.3407974243164
1177.4578013420105
747.1263093948364
140.87539172172546
1859.9843995571136
2352.9760830402374
915.9792499542236
200.58455753326416
2338.3733463287354
499.93246364593506
0.0
0.0
0.0
5281.495334863663
198.20923709869385
3025.497381210327
2782.3495852947235
801.0039112567902
2098.2873191833496
107.15256214141846
265.52349185943604
1689.5276045799255
0.0
0.0
0.0
120.74992108345032
1180.5971574783325
398.5413



Elapsed time to compute best fit: 42.538 seconds
Cross-validation score: 0.6483692638645999
Test score: 0.6407322654462243
Best Hyperparameters: {}
12985.637073755264
227795.61694049835
506921.1460585594
47902.56809544563
20514.67236328125
258.6111581325531
1924.2850325107574
128.4586272239685
144.2528429031372
151.3785161972046
9226.404579877853
60723.13487672806
1662.2418839931488
0.0
1555.5251717567444
0.0
3560.8748269081116
4195.728087425232
17231.813764095306
407.51576256752014
463.1785297393799
89.25912094116211
4677.635933876038
1061.0205025672913
127.56532907485962
40.64147925376892
150.8403525352478
1594.644317150116
228.19261050224304
2276.782704114914
1518.1740090847015
175.62130784988403
90.03030061721802
1695.2048997879028
153.58056902885437
0.0
0.0
0.0
3021.2578859329224
755.8358373641968
5115.92086482048
909.1601228713989
1155.1805827617645
984.1766080856323
177.34144163131714
967.860110282898
1376.5123143196106
0.0
0.0
0.0
78.17335057258606
1015.1692357063293
129.275557



Elapsed time to compute best fit: 43.107 seconds
Cross-validation score: 0.6276918034630203
Test score: 0.6947608200455581
Best Hyperparameters: {}
12732.70288324356
234531.8013138771
511598.5740251541
38608.10690283775
16955.73297572136
1626.0499792099
638.2991442680359
209.02319955825806
239.5862386226654
121.64665794372559
11470.25881767273
45868.31740689278
1564.386911392212
0.0
1464.0004007816315
0.0
2618.47483587265
2366.9086542129517
29615.588849544525
637.54825258255
505.31946444511414
93.55609059333801
100.92195796966553
1641.8497004508972
140.3425488471985
55.79957103729248
185.8554081916809
462.34943413734436
538.3050329685211
1521.1477642059326
3164.8848185539246
98.40823078155518
667.8887979984283
3184.027331829071
1412.3995604515076
0.0
0.0
0.0
3352.54643368721
285.26221799850464
2329.6764059066772
1954.974473953247
793.8693208694458
1687.7673482894897
99.93946719169617
596.3612143993378
1581.5859036445618
0.0
0.0
0.0
3694.5013751983643
1169.3028695583344
201.963869571685



Elapsed time to compute best fit: 42.864 seconds
Cross-validation score: 0.6599900867534847
Test score: 0.6597222222222222
Best Hyperparameters: {}
10939.409690380096
269911.3480718136
487761.4941112995
49001.64982056618
27217.542789936066
1030.0709540843964
571.0815546512604
125.00237917900085
576.9648857116699
120.79310655593872
5180.604522228241
37443.37092876434
2479.90558385849
0.0
2700.193715572357
0.0
2930.280577659607
588.0471105575562
15306.289387226105
430.59955620765686
1651.53329539299
60.199501037597656
924.0829966068268
895.8079223632812
239.00647115707397
89.22470927238464
976.848263502121
2681.072356700897
279.47069215774536
3114.5596470832825
1915.3699848651886
223.8188557624817
147.08685159683228
2972.8502888679504
111.35737085342407
0.0
0.0
0.0
3589.3865492343903
225.89436054229736
3481.515262365341
1346.8026525974274
1371.0100312232971
430.68210220336914
152.26576328277588
491.86870288848877
1301.5671305656433
0.0
0.0
0.0
1271.16104054451
1494.4033930301666
37.10220



Elapsed time to compute best fit: 42.680 seconds
Cross-validation score: 0.6333438169245786
Test score: 0.6557377049180327
Best Hyperparameters: {}
4939.585233926773
258779.7856874466
492438.4005289078
39473.25352215767
11612.295672178268
373.96580481529236
1351.016431570053
389.59741854667664
321.7226209640503
35.75220990180969
11191.634770870209
51483.88949346542
1603.9588613510132
0.0
3293.4024436473846
0.0
2526.9190106391907
523.6868908405304
28469.53993010521
388.9538722038269
667.0908718109131
79.81060218811035
735.6380755901337
491.8430767059326
897.2787089347839
187.0733437538147
247.25104665756226
375.07255959510803
209.32150864601135
1481.0667231082916
2179.7043204307556
185.54647064208984
329.58016562461853
2032.5523171424866
250.2355990409851
0.0
0.0
0.0
3290.588274717331
353.8718523979187
1085.691422700882
1371.604787349701
1054.6086542606354
759.1673288345337
360.13454246520996
386.8317985534668
2131.4218974113464
0.0
0.0
0.0
3759.3655683994293
3684.2266569137573
230.4097



Elapsed time to compute best fit: 43.827 seconds
Cross-validation score: 0.6366497189056038
Test score: 0.6264501160092808
Best Hyperparameters: {}
5605.354379415512
270230.9662165642
541091.1729180813
30541.69877076149
14777.119798660278
1859.6980502605438
157.90375065803528
272.21330094337463
177.1463713645935
244.9256443977356
24895.61840558052
1461.7611272335052
3842.539116859436
0.0
855.0131993293762
0.0
2635.966983795166
271.91583585739136
23688.125886440277
4566.114295005798
440.971786737442
73.29036045074463
541.6249008178711
2495.80881357193
40.53063988685608
32.0113000869751
337.09143114089966
292.88433146476746
662.554790019989
3455.6932303905487
4321.653992652893
323.36303758621216
844.9019594192505
760.9628186225891
413.9268591403961
0.0
0.0
0.0
1756.591168165207
455.39752674102783
2340.29528093338
885.2206721305847
720.2350120544434
631.0645444393158
363.26561307907104
821.3509511947632
1441.612595319748
0.0
0.0
0.0
324.78304982185364
3124.471650123596
457.15003514289856




Elapsed time to compute best fit: 43.080 seconds
Cross-validation score: 0.6149144680050481
Test score: 0.641025641025641
Best Hyperparameters: {}
3656.007924556732
290935.6413695812
483509.0376803875
34074.00401806831
16775.21359395981
121.11256790161133
680.8906562328339
396.2675573825836
460.3589656352997
56.95926094055176
18141.11527633667
13700.228234052658
2133.598838329315
0.0
3443.5319192409515
0.0
1650.203739643097
1216.8143649101257
29240.031133651733
8032.704017162323
495.06067991256714
129.97155928611755
99.01559829711914
700.9776277542114
10.950699806213379
7.2308502197265625
926.1845245361328
447.10586857795715
1255.343889951706
1560.7139356136322
3369.7992193698883
290.0742099285126
189.12144947052002
3565.4458606243134
245.02411770820618
0.0
0.0
0.0
2262.710057258606
356.72243428230286
2053.153710126877
892.361545085907
3470.5124068260193
134.45223808288574
133.06606125831604
1141.6476333141327
812.6824378967285
0.0
0.0
0.0
864.5938572883606
2175.240405321121
161.539649



Elapsed time to compute best fit: 42.284 seconds
Cross-validation score: 0.6213879309671266
Test score: 0.651658767772512
Best Hyperparameters: {}
6021.14421582222
268512.6722404957
535451.6821434498
34928.447078228
16726.630303144455
1599.6505796909332
160.66744947433472
191.79942727088928
330.3596029281616
55.136770486831665
24672.76910853386
7772.658814191818
1382.2304644584656
0.0
2600.3891835212708
0.0
2408.335865497589
897.0686702728271
18988.724347114563
1227.1525082588196
488.4109790325165
137.42223405838013
356.04469871520996
378.07718300819397
312.8708972930908
371.3454895019531
1413.3961608409882
305.07715344429016
276.4839189052582
2378.170110464096
3767.865747451782
1158.7721421718597
171.24997973442078
436.41148591041565
298.26327657699585
0.0
0.0
0.0
7954.011084079742
274.241916179657
1315.4339053630829
2015.8276705741882
823.8290863037109
811.9787378311157
200.97885823249817
726.245602607727
1649.6892278194427
0.0
0.0
0.0
937.5052282810211
2724.2396092414856
528.2300269



Elapsed time to compute best fit: 44.051 seconds
Cross-validation score: 0.6365206275812063
Test score: 0.689252336448598
Best Hyperparameters: {}
8648.363379716873
250980.40879631042
499785.4652593136
51901.527935028076
18468.398983478546
1410.6645753383636
311.155104637146
111.28310108184814
581.4398565292358
159.92181968688965
10050.936584472656
46732.79697418213
3067.4851624965668
0.0
1464.7876715660095
0.0
4496.286222457886
1256.1130874156952
25382.21342945099
1949.7024836540222
343.43420124053955
47.43749952316284
487.28409337997437
498.02794122695923
899.0793881416321
8.737289905548096
69.28379011154175
765.7252743244171
35.91077995300293
3073.809841632843
2029.445434808731
347.4655227661133
223.76157331466675
2004.2724556922913
694.4775857925415
0.0
0.0
0.0
4656.33642244339
192.81574249267578
3022.83442234993
1605.0225319862366
1056.5653038024902
456.97860074043274
147.50683212280273
379.2641611099243
1062.0615410804749
0.0
0.0
0.0
250.1583251953125
2910.321059703827
350.379767



Elapsed time to compute best fit: 42.274 seconds
Cross-validation score: 0.6313938915279815
Test score: 0.6470588235294118
Best Hyperparameters: {}
4871.482068538666
268834.2448647022
522703.99972987175
40209.155349731445
12806.060122013092
1098.8365559577942
518.2798030376434
376.0508463382721
1245.0992951393127
90.14544892311096
20265.650646448135
6863.29439496994
271.48171734809875
0.0
2022.8632035255432
0.0
345.49638652801514
1553.8228673934937
30227.238125562668
2274.897711753845
521.0213112831116
165.52373957633972
691.2201261520386
405.1108298301697
598.8658905029297
102.83583045005798
1430.747445344925
402.39894127845764
841.3955779075623
1422.0949227809906
1749.6733422279358
374.0940148830414
449.48489356040955
1314.2994754314423
839.0343878269196
0.0
0.0
0.0
3068.0220658779144
593.1649372577667
912.138341665268
4140.252416610718
1674.6151390075684
748.0541911125183
691.5875086784363
844.6685271263123
1250.2122285366058
0.0
0.0
0.0
3054.800330400467
1903.3365120887756
764.0898



Elapsed time to compute best fit: 42.300 seconds
Cross-validation score: 0.6351501175325491
Test score: 0.5971896955503512
Best Hyperparameters: {}
6620.620559453964
211438.90947127342
530444.3928563595
30660.76681113243
13744.045049905777
5398.773521184921
994.6879699230194
464.41873931884766
535.2706475257874
177.69821643829346
9642.21091079712
69045.99518156052
1226.2146248817444
0.0
3311.5629024505615
0.0
4702.406491994858
895.7231533527374
32998.74291014671
958.192352771759
1714.0689146518707
414.81055307388306
1944.1934895515442
81.2173101902008
559.0809936523438
22.62487030029297
29.518810510635376
1097.71581864357
171.61158323287964
954.7508292198181
2815.71928191185
249.81267046928406
215.83539581298828
3182.3276693820953
1124.354001045227
0.0
0.0
0.0
1960.9718611240387
148.4853482246399
1578.902652978897
1080.8326234817505
759.5251212120056
540.4186961650848
138.00690078735352
785.2132258415222
1328.0601437091827
0.0
0.0
0.0
357.20087146759033
1739.7019319534302
86.6462607383



Elapsed time to compute best fit: 43.180 seconds
Cross-validation score: 0.6692423655117581
Test score: 0.5903614457831325
Best Hyperparameters: {}
5822.717181682587
300567.969312191
469758.011705637
47358.421882390976
17331.27156805992
721.1369791030884
342.46763038635254
127.5480306148529
275.06338715553284
219.12826871871948
14970.120973587036
19411.17199230194
1585.6403603553772
0.0
1060.3514966964722
0.0
1842.280750989914
733.9952487945557
40213.574424266815
720.5232691764832
729.1385505199432
97.28022980690002
963.260434627533
507.4862606525421
57.65597057342529
54.617900133132935
1374.4339604377747
348.42344331741333
824.5151720046997
1178.7273964881897
5147.912907600403
248.26106190681458
85.8471109867096
505.17478370666504
180.388503074646
0.0
0.0
0.0
2966.9215984344482
133.45515847206116
2834.462189435959
682.9297788143158
1123.748905658722
1206.5899398326874
538.5684397220612
208.59202766418457
1128.5502767562866
0.0
0.0
0.0
586.5980949401855
1503.3495962619781
168.074571847



Elapsed time to compute best fit: 42.503 seconds
Cross-validation score: 0.6439124294667139
Test score: 0.6572769953051643
Best Hyperparameters: {}
6453.470726966858
240503.93354582787
514954.8033616543
40661.13736176491
15933.006079912186
8043.682196140289
786.3591024875641
327.7378034591675
504.2446184158325
223.4606523513794
5715.894711732864
56302.1524438858
2000.2086703777313
0.0
2105.751364707947
0.0
2844.088457107544
479.9982223510742
23506.575275182724
1490.9392790794373
738.328533411026
14.29390001296997
554.1017112731934
299.0804977416992
838.0087914466858
89.00835251808167
777.4852106571198
767.5562105178833
306.6277987957001
1399.7100973129272
3358.4665911197662
162.97911286354065
222.0174698829651
2188.2549018859863
340.6099216938019
0.0
0.0
0.0
6175.265417337418
611.9618539810181
2121.0917773246765
1021.8334784507751
1520.352111339569
261.10545086860657
93.89158248901367
371.2784676551819
1406.293513059616
0.0
0.0
0.0
109.74860072135925
230.74077701568604
510.595106124877



Elapsed time to compute best fit: 43.917 seconds
Cross-validation score: 0.6496648333731024
Test score: 0.684931506849315
Best Hyperparameters: {}
7475.547007799149
293102.39365935326
482315.7057976723
38335.325984716415
18460.35427260399
234.4949197769165
347.7584512233734
222.9093475341797
1807.3008711338043
45.63496017456055
20176.518152952194
14072.31059718132
4218.197725772858
0.0
2421.5405955314636
0.0
1523.3151924610138
2561.7736718654633
27398.812081336975
2710.811002969742
776.1297833919525
39.26275062561035
2581.910116672516
451.20387506484985
6.8518900871276855
4.534780025482178
1782.0860137939453
666.7866382598877
50.03195095062256
1351.1974401474
1311.869707107544
284.44732666015625
72.4193000793457
2744.508144378662
138.24444198608398
0.0
0.0
0.0
6225.205312490463
203.6416881084442
4856.517219305038
2093.6403529644012
669.2114272117615
644.335102558136
433.2761812210083
610.5852317810059
1279.3315136432648
0.0
0.0
0.0
243.85468769073486
684.4101150035858
184.7775199413299



Elapsed time to compute best fit: 41.242 seconds
Cross-validation score: 0.6548114530183532
Test score: 0.6032482598607889
Best Hyperparameters: {}
18311.15731573105
210346.92846632004
538740.1056995392
49205.90703487396
13035.136844873428
3215.7492566108704
427.8766825199127
477.048734664917
168.4846978187561
137.27657842636108
11068.631160259247
62739.89408111572
1638.4256796836853
0.0
2100.51447224617
0.0
463.92462635040283
264.33247208595276
28469.016491651535
827.1719851493835
249.12387704849243
225.6808602809906
871.8572902679443
590.5134325027466
107.73529815673828
175.21431303024292
4496.071005582809
520.6605269908905
609.2130146026611
1105.8688254356384
2379.5128960609436
143.647629737854
129.13321948051453
2439.400425195694
175.9173707962036
0.0
0.0
0.0
1947.3429446220398
351.43262672424316
966.4977271556854
198.13953351974487
2008.8423652648926
250.58215618133545
90.49244213104248
327.55223083496094
1020.0066692829132
0.0
0.0
0.0
2024.5432949066162
1317.291631937027
186.9250

## 5.3 Rebalancing Strategy - UNDER

### 5.3.1 Random Forest

In [132]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

under_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [#['smote', SMOTE()],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    under_randomforest_nonnormalized_performance_df = pd.concat([under_randomforest_nonnormalized_performance_df, new_performance_df])
    
under_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/under_randomforest_nonnormalized_performance_df.csv")



### 5.3.2 XGBoost

In [133]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


under_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    under_xgboost_nonnormalized_performance_df = pd.concat([under_xgboost_nonnormalized_performance_df, new_performance_df])
    

under_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/under_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 3.092 seconds
Cross-validation score: 0.08187101904534172
Test score: 0.09121621621621621
Best Hyperparameters: {}
0.02007926
0.10558209
0.1426677
0.018283622
0.011181448
0.06616544
0.0009087479
0.0010768097
0.0
0.0
0.0273724
0.025428548
0.0023511285
0.0
0.0
0.0
0.010377506
0.005100776
0.011807043
0.025579434
0.0
0.023222117
0.020792212
0.024090366
0.008701964
0.0
0.0021149255
0.03472791
0.0
0.0013025353
0.0038388974
0.042347673
0.0052571134
0.007151521
0.0027232459
0.0
0.0
0.0
0.010757004
0.0
0.014489921
0.0015638437
0.0
0.006820341
0.0
0.0
0.00060888333
0.0
0.0
0.0
0.0076069133
0.0073336614
0.00556647
0.009299065
0.0
0.0
0.0
0.011462108
0.022199448
0.001264637
0.0051673255
0.0026867888
0.00087391346
0.0
0.0
0.0014364087
0.037813373
0.0
0.0014872153
0.0030810114
0.003167987
0.0
0.0036808837
0.0
0.0043826653
0.0
0.0
0.0054444796
0.004146659
0.0
0.0273033
0.0028628248
0.003498483
0.004593967
0.0026418404
0.0077171396
0.0007979749
0.0
0.00080883317
0.001



Elapsed time to compute best fit: 2.800 seconds
Cross-validation score: 0.0793308477636119
Test score: 0.08944346289752651
Best Hyperparameters: {}
0.056514602
0.08045829
0.12833233
0.011918627
0.017324626
0.0034693924
0.028883643
0.0
0.0057548503
0.0
0.018187422
0.024259416
0.0047401725
0.0
0.0079094935
0.0
0.0047584046
0.026074845
0.020458877
0.016278557
0.0
0.011545029
0.017223924
0.008865115
0.002568532
0.0
0.0018186375
0.019662015
0.00065579277
0.0032688899
0.0003457123
0.0042099818
0.0005682455
0.0023260282
0.0032373758
0.0
0.0
0.0
0.0071172817
0.0
0.0048966976
0.013753249
0.0
0.01104991
0.0019060323
0.0
0.0041641537
0.0
0.0
0.0
0.005144088
0.00895777
0.016012568
0.0
0.0
0.0018940701
0.0
0.0014380063
0.003097117
0.01085579
0.010537972
0.0026941076
0.0
0.0027172577
0.028892078
0.0035576299
0.0050632004
0.0
0.0011873374
0.0022933143
0.002885696
0.0
0.0075706993
0.0048213373
0.0033623783
0.0
0.0005428062
0.004792278
0.0046445993
0.051999222
0.020768145
0.0011910343
0.019973865
0.018



Elapsed time to compute best fit: 2.878 seconds
Cross-validation score: 0.08307346927180878
Test score: 0.06665569453587887
Best Hyperparameters: {}
0.008346699
0.12035615
0.15451209
0.01308296
0.008979454
0.015637051
0.0
0.0012052497
0.0
0.0
0.0
0.06062324
0.0048205866
0.0
0.0
0.0
0.0447928
0.028324569
0.030837676
0.0
0.0
0.0005945565
0.0
0.0151856495
0.0025463316
0.0
0.0
0.0068240627
0.0
0.026403716
0.008680328
0.005806302
0.0054078763
0.0008920068
0.0017792782
0.0
0.0
0.0
0.0047362465
0.0
0.012486259
0.004109779
0.0
0.00062980526
0.0
0.008116448
0.0
0.0
0.0
0.0
0.0029834535
0.0037224442
0.016171405
0.0010571892
0.005836672
0.008177033
0.003106946
0.015870374
0.017435206
0.011695891
0.0022116818
0.00051862217
0.0
0.0
0.00062819343
0.0039759837
0.0
0.024811326
0.009202186
0.0024543682
0.0024268893
0.0
0.0
0.002715832
0.0018379855
0.0
0.0
0.0048662275
0.0071769687
0.0
0.0
0.00226438
0.0067516826
0.0033430257
0.0021624926
0.039551705
0.010024646
0.0
0.0
0.0017076089
0.00802329
0.0
0.014



Elapsed time to compute best fit: 2.784 seconds
Cross-validation score: 0.07971816044958403
Test score: 0.08984725965858043
Best Hyperparameters: {}
0.0032501712
0.08140225
0.13237923
0.01404433
0.0044229073
0.018916955
0.0034004962
0.0008070924
0.0
0.0020605724
0.012407721
0.07439489
0.005357952
0.0
0.011566909
0.0
0.0018095798
0.02151702
0.022104973
0.047716774
0.0
0.0021630586
0.02577683
0.0
0.0053553935
0.004899055
0.061860684
0.003320219
0.030758908
0.015155393
0.0
0.0034736653
0.0052882563
0.0012120535
0.0049771946
0.0
0.0
0.0
0.00043917485
0.0
0.007260162
0.025819104
0.0
0.004423056
0.0
0.0
0.0057205893
0.0
0.0
0.0
0.008100691
0.005262393
0.0020691706
0.0
0.0
0.005924521
0.0
0.00579655
0.0037182234
0.00785871
0.004019769
0.004822809
0.0
0.018107902
0.0015845932
0.0011691825
0.0025084224
0.0
0.0054010614
0.0056775184
0.0
0.0
0.025117354
0.014300651
0.011306621
0.0
0.005346616
0.0
0.0031834198
0.0
0.0048368103
0.0053373203
0.0035271998
0.007817141
0.0147607075
0.023234107
0.000980



Elapsed time to compute best fit: 2.804 seconds
Cross-validation score: 0.08145243057718211
Test score: 0.08936261843238587
Best Hyperparameters: {}
0.03831591
0.07255998
0.12341629
0.015408364
0.014929831
0.005773015
0.0
0.0
0.026812749
0.0
0.041288916
0.085076556
0.005670213
0.0
0.010632507
0.0
0.014857474
0.07763494
0.022943502
0.017570619
0.0
0.0005010548
0.0038946932
0.051420916
0.007682042
0.0
0.00030573813
0.0023081119
0.0
0.0014813639
0.015997538
0.0
0.0009249694
0.0
0.0
0.0
0.0
0.0
0.009440812
0.0
0.0061995434
0.0032903145
0.0
0.00067558605
0.008418989
0.0011249001
0.0032520085
0.0
0.0
0.0
0.0
0.013053011
0.0010586411
0.00094018044
0.0
0.012351251
0.0010615552
0.008300192
0.02966237
0.010538799
0.010824042
0.0116047235
0.0
0.0
0.0
0.0
0.0034237416
0.00080090814
0.0004673937
0.02877351
0.0008659082
0.0
0.00059632433
0.0033392853
0.005728974
0.0
0.0
0.0
0.001635859
0.0
0.0002897723
0.0009639382
0.0
0.0074891783
0.010625241
0.021632101
0.0
0.0
0.0011485604
0.0
0.008452666
6.57867



Elapsed time to compute best fit: 2.876 seconds
Cross-validation score: 0.08116474077984719
Test score: 0.08209538702111024
Best Hyperparameters: {}
0.012086683
0.0832247
0.12645508
0.01157493
0.035355963
0.0023423107
0.02124529
0.00222926
0.0016485465
0.0
0.017202612
0.0107479775
0.001523243
0.0
0.005832788
0.0
0.0
0.0
0.011031871
0.03182611
0.0
0.0
0.008452733
0.010303047
0.0
0.0
0.011953797
0.018532382
0.0
0.013071719
0.00050018716
0.0045868917
0.0019718744
0.0059195124
0.0037330445
0.0
0.0
0.0
0.0031169113
0.0
0.002631286
0.014227571
0.0
0.0031360222
0.0003827741
0.0
0.003492345
0.0
0.0
0.0
0.0030541786
0.012849308
0.008257092
0.030320574
0.0
0.0
0.005298782
0.0076990915
0.0
0.01815278
0.002226868
0.0020483702
0.0
0.0
0.011882683
0.0
0.012336338
0.0
0.0012754166
0.0
0.0014457755
0.0
0.011602801
0.00947215
0.004566202
0.0
0.0035954982
0.0057279584
0.01356325
0.014588103
0.0056087626
0.002021153
0.004847028
0.010526353
0.0075194384
0.014505702
0.003763567
0.0
0.0062857294
0.001545295



Elapsed time to compute best fit: 2.738 seconds
Cross-validation score: 0.08229972594953237
Test score: 0.08390705679862305
Best Hyperparameters: {}
0.0101295505
0.1070519
0.122618325
0.018144585
0.028411768
0.03705387
0.0043313326
0.0
0.0
0.0
0.0
0.022740778
0.008036823
0.0
0.0
0.0
0.0009963331
0.0
0.026170557
0.044248346
0.0
0.0
0.015004829
0.0
0.018679138
0.0031531523
0.0003755993
0.0024954625
0.0013102428
0.0032413017
0.0074099503
0.0038890634
0.0048465244
0.0
0.017421883
0.0
0.0
0.0
0.01422997
0.0
0.0017112986
0.007042538
0.0
0.0009497132
0.0
0.0
0.0018051609
0.0
0.0
0.0
0.004821386
0.01029713
0.004499781
0.0056190873
0.018154364
0.0
0.0014066596
0.0034184128
0.024383646
0.011825665
0.011545125
0.0048441943
0.0010384314
0.0
0.0038341323
0.0008574178
0.0045747766
0.02547909
0.0078085377
0.009585592
0.016512405
0.001311816
0.0
0.0045790053
0.0014062875
0.0
0.0062626763
0.02808071
0.010556228
0.010679351
0.0
0.0052984743
0.0
0.011581613
0.011169941
0.029688109
0.008619016
0.0
0.0
0.0



Elapsed time to compute best fit: 2.690 seconds
Cross-validation score: 0.08228508360727665
Test score: 0.08695652173913043
Best Hyperparameters: {}
0.026530946
0.08494355
0.13230309
0.026113426
0.078324236
0.01778495
0.0063481824
0.0
0.0022554414
0.003222252
0.0064332862
0.051159516
0.0015324875
0.0
0.0
0.0
0.032160386
0.029615715
0.001050273
0.002385451
0.0050003454
0.010555377
0.008350949
0.023147581
0.0
0.010518437
0.0
0.017083684
0.00033858686
0.005513837
0.0018356852
0.0044593364
0.0
0.0011089916
0.0049341503
0.0
0.0
0.0
0.002848568
0.0
0.0032839281
0.031382468
0.0011887301
0.022956789
0.0
0.0
0.0021778042
0.0
0.0
0.0
0.0070967977
0.016754955
0.0037444693
0.0
0.0
0.0055181775
0.0017175461
0.0072440305
0.0190779
0.0013667627
0.004534422
0.0027838121
0.0
0.0
0.00358981
0.0042694174
0.0011566858
0.0
0.002646969
0.0007778277
0.0062217135
0.0
0.0056659225
0.0026661893
0.0031050937
0.0
0.0
0.012129334
0.008631829
0.010500779
0.004155919
0.008664468
0.01940478
0.016703265
0.0016153671
0



Elapsed time to compute best fit: 2.906 seconds
Cross-validation score: 0.08384367315293459
Test score: 0.08198380566801618
Best Hyperparameters: {}
0.024500983
0.09758427
0.14457312
0.004686889
0.004406467
0.009058073
0.0057396623
0.0
0.0
0.0012393254
0.008650686
0.01726124
0.014960328
0.0
0.019385885
0.0
0.0042959065
0.057694174
0.024354849
0.0013411469
0.0
0.0
0.014027965
0.0
0.012137053
0.0
0.0023732698
0.010435659
0.0
0.00090206554
0.009755115
0.0007101832
0.0015975866
0.0002319221
0.0021620577
0.0
0.0
0.0
0.02771937
0.0
0.0017091398
0.0075756637
0.0
0.0032566504
0.0
0.0
0.009102961
0.0
0.0
0.0
0.004806641
0.009493845
0.008483436
0.0025664936
0.0
0.0
0.0032610947
0.008107318
0.023577806
0.010116601
0.0023653528
0.00437498
0.0
0.0
0.030031847
0.0
0.0049647777
0.0012174558
0.0
0.002376937
0.0022444166
0.0
0.04554739
0.00092858897
0.0013146305
0.0
0.0003039259
0.0
0.0025792462
0.033323973
0.003909937
0.005075274
0.011606549
0.006126147
0.003974964
0.0024494182
0.0048687295
0.0
0.0
0.



Elapsed time to compute best fit: 2.752 seconds
Cross-validation score: 0.07962728164123614
Test score: 0.09483546617915906
Best Hyperparameters: {}
0.006907721
0.07800329
0.09911262
0.012809204
0.016668051
0.00601786
0.0
0.0
0.0152396
0.0
0.025299512
0.025117785
0.0066434983
0.0
0.022276355
0.0
0.0012485107
0.013458236
0.024121042
0.019357864
0.00390808
0.0
0.02368729
0.006885993
0.009677124
0.0
0.0
0.019902624
0.0
0.0022969993
0.0011102952
0.0033544833
0.0010775948
0.00037515166
0.009546133
0.0
0.0
0.0
0.020362142
0.0
0.0011073212
0.026348861
0.0
0.0011081555
0.0
0.0
0.0044127633
0.0
0.0
0.0
0.029755918
0.002803321
0.0070823766
0.0019736374
0.0
0.0012505202
0.0018299634
0.0069658286
0.009707148
0.018499114
0.0016731105
0.00970445
0.0
0.0
0.0
0.005296522
0.0
0.0
0.004318943
0.0014644796
0.009204399
0.0053926622
0.0
0.00029402488
0.0011424865
0.0
0.0007646179
0.013261799
0.0007011852
0.011342695
0.013809338
0.002688959
0.0023190554
0.0030570223
0.00033847085
0.016469024
0.0009281444
0.



Elapsed time to compute best fit: 2.913 seconds
Cross-validation score: 0.08392400880893675
Test score: 0.0847107438016529
Best Hyperparameters: {}
0.012092454
0.12322545
0.15545362
0.019734466
0.0033700208
0.006722224
0.0017588929
0.010664246
0.0
0.0
0.012364507
0.0363556
0.0129787335
0.0
0.0044786893
0.0
0.0070458897
0.0
0.00065921503
0.044157322
0.0
0.0
0.008787724
0.0
0.0
0.0
0.0
0.0069868057
0.00092082197
0.016286477
0.015367205
0.057258584
0.0009108924
0.0025846537
0.0
0.0
0.0
0.0
0.0036700438
0.0
0.0051565603
0.005985946
0.0
0.0015316599
0.0052345195
0.0
0.022459634
0.0
0.0
0.0
0.0050585154
0.012200055
0.00570188
0.00298113
0.009733742
0.010482548
0.0046423925
0.0028543568
0.015964672
0.00814726
0.0035801234
0.0017676802
0.0
0.0
0.0045200028
0.0009337496
0.000496279
0.0
0.008121319
0.0033200334
0.0
0.0
0.0
0.007612325
0.0026827927
0.0
0.0
0.0072264182
0.0387586
0.009314973
0.001763316
0.0071815406
0.012619143
0.0048032566
0.0040856176
0.0
0.0017603391
0.0013506089
0.031586453
0.



Elapsed time to compute best fit: 2.840 seconds
Cross-validation score: 0.08493966046892244
Test score: 0.0867750439367311
Best Hyperparameters: {}
0.039169557
0.09629051
0.14209297
0.0188703
0.0077101113
0.01070765
0.0
0.020978566
0.0
0.0
0.00690914
0.025229609
0.0022214972
0.0
0.019543879
0.0
0.0
0.0
0.044362433
0.0265005
0.018380428
0.0
0.02612805
0.0046369475
0.0045369035
0.0035761024
0.0
0.002158138
0.0035563854
0.00081678695
0.0
0.00041414532
0.005222709
0.0009242126
0.0027565705
0.0
0.0
0.0
0.00085268356
0.009630036
0.0031566673
0.014889666
0.0
0.0050427876
0.0
0.00033894015
0.014286217
0.0
0.0
0.0
0.0054169865
0.0072806478
0.0067558796
0.0
0.0
0.0025690119
0.01252347
0.005482243
0.0037291057
0.0061865677
0.0
0.0030277213
0.00811819
0.0014834125
0.023430582
0.0022859604
0.0029630195
0.0
0.018039087
0.0011172107
0.014290017
0.0
0.0009108301
0.0022737873
0.0012561718
0.0
0.06435509
0.01460556
0.012647326
0.0
0.018753735
0.0056995964
0.016072996
0.00091399846
0.0015750128
0.0068882



Elapsed time to compute best fit: 2.689 seconds
Cross-validation score: 0.08138452460457232
Test score: 0.09440400363967243
Best Hyperparameters: {}
0.010190222
0.09678367
0.17293324
0.0333409
0.020002889
0.01807296
0.0024413927
0.007791262
0.0048074205
0.0
0.0043281894
0.040442947
0.0012200467
0.0
0.01844157
0.0
0.012081986
0.0050495695
0.014507594
0.0035613286
0.0
0.001962085
0.0027600282
0.0012677833
0.017231135
0.012764578
0.0070131845
0.012539963
0.00319602
0.018229011
0.015214957
0.0020133816
0.001689384
0.0
0.0023015563
0.0
0.0
0.0
0.014534887
0.0029148422
0.0055013695
0.00033859693
0.0
0.0020401904
0.0043531433
0.0
0.012792172
0.0
0.0
0.0
0.029414782
0.020126669
0.0015198715
0.0
0.0
0.00239996
0.0117605645
0.0030638531
0.02844428
0.0028677424
0.0084100785
0.002332029
0.0
0.007880497
0.017821195
0.0
0.00048961607
0.0
0.00073232997
0.003995691
0.0008914807
0.0013127805
0.004351684
0.005562767
0.0011517719
0.0
0.0
0.0079031605
0.005442792
0.0
0.0
0.007821076
0.0067136427
0.0047957



Elapsed time to compute best fit: 2.739 seconds
Cross-validation score: 0.08354331154843188
Test score: 0.08021933387489848
Best Hyperparameters: {}
0.019898003
0.09981379
0.1224445
0.019974865
0.011371006
0.016307315
0.008605514
0.0009828822
0.005185084
0.0
0.010956427
0.026224082
0.016707458
0.0
0.003080253
0.0
0.013796891
0.027860511
0.05073992
0.0022663148
0.002967118
0.0
0.0
0.0031165336
0.0
0.00047206075
0.0
0.0043945145
0.009196079
0.014532402
0.02242484
0.0008880894
0.0018937095
0.0
0.0037665742
0.0
0.0
0.0
0.0051385975
0.0
0.007521996
0.008902577
0.015071032
0.009503117
0.0037897022
0.0
0.0017309524
0.0
0.0
0.0
0.0027387955
0.0055573615
0.0052574216
0.0006293727
0.010933974
0.0022205834
0.005117476
0.013654565
0.010222555
0.0040775724
0.05593901
0.0038255134
0.0
0.0
0.014557171
0.0
0.0065003093
0.0
0.0014159888
0.0
0.023584884
0.007836366
0.0022438357
0.004225959
0.00949488
0.0
0.0
0.00662991
0.018499888
0.0
0.0
0.0020642001
0.0007103426
0.0
0.0041778805
0.0030772246
0.0054997



Elapsed time to compute best fit: 2.818 seconds
Cross-validation score: 0.08277139789810216
Test score: 0.08213141025641024
Best Hyperparameters: {}
0.0063025816
0.091700815
0.12558271
0.008128247
0.008796152
0.007192657
0.03927178
0.00333935
0.0
0.0
0.03522196
0.05252528
0.005587175
0.0
0.012609075
0.0
0.0053100623
0.018654423
0.016936487
0.0090121655
0.0
0.0
0.01543817
0.007308576
0.0069684335
0.0
0.0006959042
0.048803195
0.0002508327
0.0023001763
0.0026773594
0.00080763747
0.0
0.0
0.014915929
0.0
0.0
0.0
0.007354313
0.0
0.0053988593
0.003791567
0.0
0.0024650982
0.00033167904
0.0036886556
0.00071721437
0.0
0.0
0.0
0.0199812
0.008536957
0.0026509317
0.0032854248
0.00068560446
0.01838252
0.0015314353
0.007227171
0.025436979
0.004189166
0.0
0.0051497407
6.4243846e-05
0.0
0.007932339
0.0011824016
0.010234149
0.0
0.001107913
0.0020152917
0.0009430656
0.0
0.0011281521
0.005211186
0.0014312508
0.0
0.0013017185
0.008344854
0.00362485
0.0
0.0049589043
0.0020060742
0.0031332034
0.0043209405
0.



Elapsed time to compute best fit: 2.769 seconds
Cross-validation score: 0.07928420795614438
Test score: 0.08409468438538206
Best Hyperparameters: {}
0.008378141
0.081079245
0.14737967
0.0055768923
0.040761337
0.018935757
0.0028792084
0.002985584
0.006892541
0.033138163
0.013047141
0.009381195
0.01778472
0.0
0.0023606806
0.0
0.02400416
0.0
0.0031291093
0.012164278
0.0
0.0
0.017717712
0.004162418
0.0
0.0034622543
0.0030910352
0.016137475
0.041784637
0.006746271
0.0034919758
0.0016488219
0.0075542056
0.01328516
0.012735581
0.0
0.0
0.0
0.011144543
0.0
0.0050411923
0.010567439
0.0
0.001216719
0.016288713
0.0
0.0
0.0
0.0
0.0
0.008845279
0.0020552005
0.016728977
0.0015486712
0.023245655
0.0
0.0041761254
0.001329707
0.012892852
0.0053776246
0.003921853
0.01374479
0.0
0.0
0.0
0.0039584823
0.0007079029
0.0
0.0006633873
0.01616763
0.003977865
0.0
0.0013273109
0.0
0.001433813
0.0
0.008558718
0.0060710493
0.004309383
0.0022214341
0.002757664
0.0029535193
0.0056030042
0.009907583
0.017786723
0.03320



Elapsed time to compute best fit: 2.492 seconds
Cross-validation score: 0.08097576918269336
Test score: 0.08501259445843827
Best Hyperparameters: {}
0.013038319
0.09320586
0.20822085
0.00512768
0.01048754
0.024592258
0.0
0.029607385
0.0
0.0043358565
0.034748625
0.023767835
0.002696788
0.0
0.018775454
0.0
0.0108755445
0.023572028
0.0029737228
0.011557136
0.0
0.0
0.0050597247
0.0053876727
0.0018590913
0.018223122
0.0
0.0
0.0
0.0025590113
0.0
0.004152109
0.0027605514
0.0
0.009023605
0.0
0.0
0.0
0.0044837203
0.0
0.0
0.011364528
0.0
0.0048551224
0.0
0.0020404228
0.005674842
0.0
0.0
0.0
0.0016143997
0.012910667
0.0005206408
0.0
0.0
0.0
0.0010205256
0.0042382996
0.0057204566
0.014619045
0.010381389
0.0
0.0
0.0028700181
0.068593346
0.0
0.009834061
0.0
0.000978696
0.006800777
0.0056156493
0.0
0.0013287371
0.0
0.0029005546
0.0
0.0007336758
0.005465855
0.007521342
0.0064191697
0.0
0.008729636
0.007560791
0.028842455
0.019726813
0.017231235
0.0024060977
0.0
0.0
0.0011185303
0.0055442434
0.00216259



Elapsed time to compute best fit: 2.247 seconds
Cross-validation score: 0.08469510842584543
Test score: 0.07917656373713382
Best Hyperparameters: {}
0.006533122
0.11495197
0.18819956
0.019529944
0.015872043
0.021392712
0.0
0.0
0.0
0.0
0.0019399442
0.042159334
0.0
0.0
0.0
0.0
0.017288465
0.0077039176
0.035827186
0.01045668
0.0
0.0
0.0048086466
0.012132943
0.0020230308
0.000683912
0.0
0.017019028
0.0
0.00093264907
0.0018695063
0.035363752
0.00465656
0.0
0.00073704193
0.0
0.0
0.0
0.017553259
0.0037586377
0.010589181
0.0
0.0
0.0023035998
0.007903036
0.0
0.0
0.0
0.0
0.0
0.01262212
0.009663238
0.0058628763
0.0
0.0
0.0
0.0019366192
0.0048357495
0.0035684132
0.024996184
0.02853601
0.0019335592
0.0045989244
0.0016114841
0.0
0.0051495363
0.00555931
0.0
0.012726419
0.0
0.003046204
0.0
0.006366937
0.0
0.0
0.0
0.0
0.0024909608
0.0049159173
0.0060597393
0.038263354
0.0014313638
0.0018282669
0.010924517
0.0033929956
0.0036572241
0.0018497831
0.0
0.0006457403
0.0
0.005979243
0.0
0.0030119608
0.0028669



Elapsed time to compute best fit: 2.520 seconds
Cross-validation score: 0.07725358653640821
Test score: 0.07860429447852761
Best Hyperparameters: {}
0.0066931536
0.08571682
0.10810685
0.010895382
0.017084824
0.0106042065
0.0070260568
0.0
0.049038872
0.0
0.005225671
0.090601645
0.000826435
0.0
0.028667144
0.0
0.011939483
0.011990965
0.031522572
0.005677454
0.035227813
0.0
0.0011227916
0.0032725942
0.0006991103
0.0012651591
0.0
0.0059519997
0.009755577
0.0022590107
0.004619709
0.008452484
0.0015828897
0.007727344
0.002618779
0.0
0.0
0.0
0.0070344377
0.0
0.006588987
0.0042274017
0.0
0.00430499
0.005898919
0.0
0.037018117
0.0
0.0
0.0
0.006557506
0.014662809
0.0064270543
0.00031656318
0.0
0.027054021
0.002928114
0.0076710153
0.0052696
0.026062313
0.0
0.020312944
0.006447135
0.0
0.0067173033
0.007709046
0.0012279508
0.0
0.005366207
0.0011036112
0.006011848
0.0
0.004449569
0.0
0.0038258047
0.0007277652
0.0018488376
0.00079473073
0.0017037449
0.0
0.00052225153
0.0036825468
0.001407604
0.012213



Elapsed time to compute best fit: 2.511 seconds
Cross-validation score: 0.0826365058174067
Test score: 0.08745733788395904
Best Hyperparameters: {}
0.010999444
0.06906872
0.13520408
0.008941063
0.012876972
0.004235383
0.0
0.013693392
0.0030640734
0.0
0.015686484
0.010820405
0.0061177397
0.0
0.016400725
0.0
0.0
0.00070715667
0.021292377
0.033813342
0.00074762374
0.0004931736
0.0024186366
0.006128064
0.01208899
0.00043761564
0.0
0.02785063
0.0051343706
0.011062013
0.0
0.0024200846
0.0022128737
0.0
0.00785191
0.0
0.0
0.0
0.0063362503
0.0
0.010677373
0.0019482644
0.0
0.000993118
0.00626176
0.0
0.0004415861
0.0
0.0
0.0
0.006566479
0.009689361
0.009939173
0.0
0.0
0.0033261168
0.0035962726
0.018136118
0.006370513
0.006769934
0.0097644245
0.0047712433
0.0006571298
0.0010320471
0.0010865207
0.0075280485
0.009117022
0.0
0.016000975
0.00981315
0.0076739127
0.0
0.0015426762
0.0
0.0016569712
0.0
0.0
0.0077522076
0.0125157535
0.0007798177
0.0
0.0022253857
0.0036353718
0.00066070625
0.015041996
0.085



Elapsed time to compute best fit: 2.361 seconds
Cross-validation score: 0.0823400125686765
Test score: 0.08114215283483978
Best Hyperparameters: {}
0.02182564
0.09035967
0.14107405
0.0040408345
0.027118204
0.008353062
0.0025490588
0.0
0.0
0.0
0.028007608
0.010005217
0.0056075067
0.0
0.0
0.0
0.01903838
0.008288407
0.009413026
0.0075770216
0.017312381
0.028828617
0.025858434
0.0044533354
0.0
0.0
0.0
0.045690425
0.0
0.0052875075
0.0029231983
0.002253221
0.0062287827
0.005192526
0.0022180716
0.0
0.0
0.0
0.05186822
0.0
0.0074936743
0.0039712754
0.0
0.0021347553
0.0
0.0
0.00032519278
0.0
0.0
0.0
0.004216951
0.010839396
0.0032830571
0.00460351
0.0015624347
0.00814332
0.0026041612
0.0020385173
0.014935224
0.0
0.007934698
0.002683968
0.0
0.0054405867
0.0043355594
0.008735784
0.002762072
0.0
0.0
0.006079742
0.010106399
0.03615736
0.021369398
0.004873855
0.0012488811
0.0
0.00081482244
0.0011915003
0.006493784
0.0
0.0063374313
0.0071178325
0.0037046361
0.004324591
0.0024741688
0.0
0.008455873
0.0




Elapsed time to compute best fit: 2.589 seconds
Cross-validation score: 0.08174591456605226
Test score: 0.10962566844919787
Best Hyperparameters: {}
0.010547344
0.080679245
0.13135043
0.016917268
0.022657454
0.02670296
0.023642948
0.0
0.0075147725
0.0
0.011411371
0.017547397
0.008582268
0.0
0.024458157
0.0
0.0
0.024877446
0.0022092855
0.021611232
0.0023061766
0.00052947784
0.006832394
0.0069403644
0.0019414634
0.0
0.012411156
0.0027764807
0.002236597
0.013212804
0.00067057146
0.017175702
0.002324899
0.011473895
0.003113309
0.0
0.0
0.0
0.009254528
0.0
0.007857583
0.0028982833
0.0
0.0067117806
0.0025638575
0.0027615323
0.0014422933
0.0
0.0
0.0
0.007794764
0.0141848205
0.0
0.013717252
0.0042163176
0.0
0.013403038
0.015387167
0.024776448
0.01401478
0.009665772
0.008385689
0.0
0.0
0.0064215995
0.0023804258
0.0009548012
0.0
0.0029830283
0.00038558722
0.040528517
0.0011222195
0.004377309
0.0020933647
0.0040513747
0.0
0.0
0.0041850796
0.0030436704
0.0070108245
0.013729143
0.0042514047
0.002100



Elapsed time to compute best fit: 2.346 seconds
Cross-validation score: 0.0837839271575553
Test score: 0.07776497695852534
Best Hyperparameters: {}
0.0065853843
0.09496024
0.14805514
0.023736393
0.012787883
0.025663657
0.0112213
0.0
0.0
0.0
0.022915665
0.016452648
0.0024170785
0.0
0.014968534
0.0
0.047207057
0.008424697
0.0021699423
0.0174424
0.006414415
0.0
0.0
0.002692336
0.003815301
0.0060506617
0.0
0.014870931
0.0015234997
0.0053084334
0.0018916697
0.0037141691
0.00086623174
0.0041133617
0.000555391
0.0
0.0
0.0
0.009596417
0.024653325
0.0031960849
0.018138874
0.0
0.0023389105
0.0041231113
0.0
0.010460847
0.0
0.0
0.0
0.004019907
0.011950994
0.0058122906
0.0
0.0
0.0027950774
0.0
0.0096272845
0.011074048
0.04044311
0.0024223232
0.0010821857
2.0435444e-05
0.0
0.0
0.0010630159
0.00230246
0.0
0.010511478
0.0326458
0.0018038293
0.0
0.01401823
0.014710927
0.0009447296
0.0
0.0
0.011647795
0.0023793795
0.0068588606
0.030927323
0.0012127011
0.0021651322
0.012802853
0.0046350816
0.0
0.00101092



Elapsed time to compute best fit: 2.375 seconds
Cross-validation score: 0.0886172164144389
Test score: 0.0960603112840467
Best Hyperparameters: {}
0.042735506
0.07455419
0.11731792
0.015765334
0.05563895
0.012109596
0.003447624
0.0
0.0
0.0
0.010605352
0.02194368
0.0069674384
0.0
0.011340687
0.0
0.032797072
0.007275311
0.0047639464
0.008014283
0.0
0.0
0.036318135
0.016006362
0.0014319416
0.0020317615
0.0018330707
0.0024117117
0.0
0.0074644857
0.005089167
0.0011974493
0.0033725086
0.0
0.002271498
0.0
0.0
0.0
0.004693598
0.036193665
0.0068437536
0.00407451
0.0
0.0011635518
0.0
0.0
0.00066556415
0.0
0.0
0.0
0.0037837778
0.0050895778
0.006397693
0.009403689
0.0072687375
0.008206677
0.0016024943
0.0022133985
0.008868877
0.023253879
0.006679768
0.0018680898
0.0016949162
0.0
0.0
0.018119616
0.0014532122
0.0
0.0
0.007735473
0.0033275732
0.0
0.0017348457
0.0
0.0026168784
0.0
0.0021042528
0.0
0.00137474
0.014626785
0.0019751743
0.0069346307
0.01654509
0.0025266102
0.0036827323
0.0055162674
0.0004



Elapsed time to compute best fit: 2.478 seconds
Cross-validation score: 0.08192915396031888
Test score: 0.07745098039215687
Best Hyperparameters: {}
0.011679349
0.101962216
0.14205027
0.01242729
0.026839823
0.01375507
0.00395626
0.004250374
0.0
0.0
0.008735113
0.001891155
0.0
0.0
0.038332894
0.0
0.028191615
0.025032546
0.012773456
0.01596017
0.033318117
0.005473051
0.021959245
0.0009616076
0.0
0.0052120606
0.003288618
0.0079797525
0.00038776733
0.008222211
0.050540067
0.0012785124
0.0054125204
0.0029451502
0.006756725
0.0
0.0
0.0
0.0023408532
0.0035992567
0.010225672
0.02069062
0.0
0.010345807
0.0
0.0
0.0059115323
0.0
0.0
0.0
0.02331651
0.008991044
4.055529e-05
0.00680358
0.0
0.0
0.0059419805
0.009045035
0.015911441
0.0022518442
0.018982468
0.0046428433
0.0
0.0
0.005657532
0.001823473
0.0046790224
0.0
0.0036655907
0.0021948519
0.007429732
0.0
0.0
8.301896e-07
0.008121776
0.0
0.0011046629
0.0033235527
0.0073635243
0.004680893
0.00081314286
0.004655121
0.0066759414
0.005922462
0.0
0.0233

### 5.3.3 LightGBM

In [134]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

under_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [#['smote', SMOTE()],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    under_lightgbm_performance_nonnormalized_df = pd.concat([under_lightgbm_performance_nonnormalized_df, new_performance_df])
    

under_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/under_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 2.649 seconds
Cross-validation score: 0.27049246697399354
Test score: 0.22814733369983506
Best Hyperparameters: {}
91.0830513251487
1327.8534193158935
2198.7227406890825
38.239953551196855
78.27546706289954
128.91998595962244
0.05071767958670215
42.75941577961112
0.533118998631835
0.5133389830589294
16.96484854412315
29.355954472665925
5.035326375914373
0.0
7.871524052978424
0.0
31.530995099235966
63.025548955657655
26.929479381800114
3.812355293463477
1.145286194257922
1.4088345803320408
1.3643552906635108
2.9122484033612697
0.09200038338127747
0.16430368500095938
1.756469305958568
5.532901661168444
9.896717314534797
13.199101451082466
12.205934032177833
14.780051109823837
12.567811124854359
0.7245283489860013
12.861468152936553
0.0
0.0
0.0
28.66648003249575
0.49772098660469055
9.200478142148029
4.867870638647995
0.054101695598660626
11.914879471786705
3.7430060491706296
0.0
2.5436660797321156
0.0
0.0
0.0
7.554150654271712
10.449341372973768
7.3758078



Elapsed time to compute best fit: 2.592 seconds
Cross-validation score: 0.2636767621573424
Test score: 0.27835051546391754
Best Hyperparameters: {}
32.34265490648996
1369.8522291343609
2164.961197489016
199.12705568473967
21.584210203220916
78.93294165844968
2.098590314492002
18.67766159485904
0.07157819718122482
2.107439968312974e-06
93.06912871482518
26.389336978454928
12.100265301097693
0.0
1.5589510314166546
0.0
5.668016432591114
33.08170598426336
73.60427739547539
92.11388322819819
0.0005829740362628399
0.0004162651212027413
8.536485963630582
17.69528670202155
1.1343869181582704
0.005734353220674088
2.2730060811150277
2.6730872813423576
11.15866176003722
9.63016875503006
14.29177785047799
5.125905064953406
16.664670326475058
1.4778400610666722
13.08529485490429
0.0
0.0
0.0
0.30633616087769866
0.0
3.6639005105469096
12.732621935506078
0.12983789667487144
7.868388627574266
1.2803899157958787
8.207417989680721e-08
2.2489965909833627
0.0
0.0
0.0
1.1588036668216986
7.018002121573488
3.



Elapsed time to compute best fit: 2.625 seconds
Cross-validation score: 0.26077555516931733
Test score: 0.2439759036144578
Best Hyperparameters: {}
76.98182239210306
1426.96964428884
2293.048767882951
97.14993134982336
66.13977681886455
87.03712184017739
2.482601661585022
2.5289883414469614
0.8373999257028117
4.445239057118993
67.90487980970592
71.85417841901325
4.47683579413706
0.0
2.1890743524836545
0.0
4.019100710530287
8.84637739747581
51.55496577552929
83.25034522139197
4.4135896410502085e-05
0.037292939922991764
13.770569030678697
9.342914269390306
0.23030014634605322
1.8017638790303465
0.23480039038146927
16.51974426002268
0.0162406759785792
1.7709065528224759
1.1637382438424961
0.31156783308092595
13.80719288545818
2.7394384028566794
6.759955973648025
0.0
0.0
0.0
13.18056441277399
1.316235835702173e-05
13.201834841815874
8.257347726898429
10.555801345523832
20.184733080993468
0.0027299962418886325
0.0
4.153074309899239
0.0
0.0
0.0
2.413379075770045
2.8821971064792535
22.8315695



Elapsed time to compute best fit: 2.515 seconds
Cross-validation score: 0.24913891020211199
Test score: 0.24217365623154163
Best Hyperparameters: {}
79.95443534208522
1465.1000231006021
2102.9846524338477
95.88590848286242
60.69487235206469
16.364254395314358
0.8901664590318386
1.615571767219052
0.46825385792180896
8.703500057549718e-06
16.929897270258518
133.7044702916617
7.431312976259505
0.0
1.2112580154972932
0.0
16.799141049915516
0.2555212981356352
93.0742348801675
12.35581011987869
57.506173810012065
2.190100430078581
3.630519986717667
2.5218113253829806
0.8580162713195598
0.05472178694617469
7.883654033574672
3.23080296950132
0.6187584478816152
7.690544026230111
0.29602939245344556
13.57164642679391
17.793729701521183
4.910792582949767
19.562435825136788
0.0
0.0
0.0
22.031711799821288
0.13186223038064782
5.026508926940693
1.684242470373594
0.0
3.2705973203093874
4.2064568836431135
0.0
0.8446463271801612
0.0
0.0
0.0
8.060835438427313
36.21423057452895
2.6094190529505426
0.931531



Elapsed time to compute best fit: 2.597 seconds
Cross-validation score: 0.264670610075591
Test score: 0.24892440073755379
Best Hyperparameters: {}
19.040351933848715
1417.240454721677
2339.7065459283167
82.00009313460806
70.05237951064599
13.122686636131078
4.356250149823609
0.01667432116551737
6.012852611547714e-05
0.45212000608444214
42.82150622841147
149.59154414689795
34.81877729758166
0.0
0.5301185826865549
0.0
26.93928020377519
4.495162261713526
12.070011898788866
31.450968175338502
0.0014673077356985864
2.658176554357299
1.9122064893609263
5.130368444454931
6.629927210764436
0.8581773637724994
0.04237969487229265
7.916122544453987
5.8956668214407415
21.936671322423546
0.11998902695834168
5.643944072469596
0.15178825914384933
1.4589223025930425
1.9988890412171512
0.0
0.0
0.0
13.922682373971275
1.2886600494384766
2.041115087889384
7.045805670444984
1.6891285959571007
4.862758101847394
0.04591348834496545
0.0
11.559099544655268
0.0
0.0
0.0
62.03367727612033
12.501719309571833
22.44



Elapsed time to compute best fit: 2.515 seconds
Cross-validation score: 0.25132176216739505
Test score: 0.2644996813256852
Best Hyperparameters: {}
56.221951637677236
1417.5873668136694
2096.799680555453
81.30669779599747
102.44827244028342
43.362904921496536
0.209650127433691
0.19702111546834905
0.0
0.5759170055389404
45.478484928667115
146.488009960128
7.379123389328674
0.0
0.6361120117876382
0.0
64.37824077360253
44.21758104256195
36.498400653588334
117.4450588500466
0.00012001517278648066
0.00012306673175466187
3.326234222150708
2.685452196678947
2.586580881111473
1.7383072738957708
16.007876332212135
21.67151969127268
0.13310041755592072
0.7750684052715906
7.061429315053289
33.48957079591648
7.302555564842704
0.6762285986114875
41.55985063503555
0.0
0.0
0.0
15.40008092387244
3.772260060941335e-05
4.319543515141949
4.2047136932249725
0.0
9.04503797886815
0.5213983075327923
0.020376499742269516
2.16653115865223
0.0
0.0
0.0
34.80212867982944
14.86451702468608
1.5333227101661402
1.294



Elapsed time to compute best fit: 2.531 seconds
Cross-validation score: 0.24965258471616028
Test score: 0.25450031036623216
Best Hyperparameters: {}
52.983790109687085
1281.5637264989978
2414.4059725380444
55.05268521839753
25.883252568426677
2.1319627678639304
0.8628870689516344
2.0553215024177947
9.223333538761758e-09
0.0
162.190961162239
124.84126897516853
15.299627243993314
0.0
3.629200709201222
0.0
0.03131664610179016
0.003582000444083633
39.2787308124911
76.98423842739768
0.03763205220951704
1.843639899401655
0.17537802214251763
0.973880365272862
4.238500110886889e-07
0.9346061138232653
3.3351119643754146e-09
1.6344566484414969
0.24740787320771454
0.5183819442879907
3.340833303319145
5.520432719408439
0.6770041132221842
0.6902529218572477
3.4556838348607144
0.0
0.0
0.0
23.936542817137674
0.0027517787566466723
13.381873482713686
3.5882056583195743
0.0
2.9404594069491377
1.7239472978458916
0.0
6.72857440454607
0.0
0.0
0.0
3.202142795790277
4.687116405002612
1.2093072065554769
0.250



Elapsed time to compute best fit: 2.351 seconds
Cross-validation score: 0.24965528049593652
Test score: 0.27000650618087185
Best Hyperparameters: {}
5.487234133132359
1201.3807058496295
2355.5373910879493
119.37952552106107
58.901840498490515
66.64253463107939
4.244751075220788
34.20383590956496
1.7833325580141057e-08
0.39254780791587507
20.985721147898825
235.67064703066765
25.630512506530366
0.0
1.7241258494771248
0.0
6.607829283465385
3.3406346749104747
62.66293116132972
16.021952085550026
21.966411024089386
4.310540104638705
13.831535610060627
0.3807399142655967
2.081464548238886
0.4485948427081894
0.0775301945400877
0.49731970134011627
1.5590285471625953
1.5358943502504125
7.189370018414683
1.2965533935372293
0.35650825794542956
2.2865531893239677
0.04619348408207713
0.0
0.0
0.0
13.127501084380745
0.0
19.180332573122556
10.15363231501349
0.6911860108375549
1.2696781680069575
1.5668267977183064
7.562178303333056e-06
0.966926641933374
0.0
0.0
0.0
47.74121826373115
16.54023703429824




Elapsed time to compute best fit: 2.301 seconds
Cross-validation score: 0.2609412887542915
Test score: 0.2593295382669196
Best Hyperparameters: {}
24.733393827855178
1365.256825784385
2168.6551906107557
93.67695715706681
18.96421219022626
17.295007780868882
3.92586796409282
14.563450112920627
19.738112238003627
8.036049202132919
70.12956652768747
54.12583230804973
1.4882257725412273
0.0
22.149860783743783
0.0
1.145684963546078
1.109637276051859
48.6537416241116
9.523483812241512
32.15951617513204
0.14795125470749682
7.819401189012574
3.347142785096821
3.8885708115741835
1.3521020672342274
2.7470408442324166
59.81780706124972
1.860735779243301
30.00673870486355
0.4051178641873281
0.853189466849248
22.95855607847804
0.2390913896296496
12.430682878654997
0.0
0.0
0.0
17.046396345187
0.2402493953704834
15.90032589520166
48.62892708075065
1.8187099695205688
8.637808621103853
0.24572518939601373
8.754150182532783e-12
2.0885538747255934
0.0
0.0
0.0
19.16914649005321
31.58123928456441
5.8159654



Elapsed time to compute best fit: 2.600 seconds
Cross-validation score: 0.2563342851141591
Test score: 0.262148337595908
Best Hyperparameters: {}
101.3495961681345
1317.7417642551895
2218.297937148905
48.87654125802153
21.925750245991615
97.959239016857
7.457977066678701
4.988118083309928
0.7201315746218193
0.28681601974931503
13.12508760006188
97.7273007375801
30.349332341830436
0.0
3.0697746658267597
0.0
71.50626856561553
5.7016783040992784
0.10793016886022055
0.981339038037914
0.7239630222320557
1.2732599057490006
12.245902200174388
8.098215667018167
7.511581736588447
7.250693067995371
0.5672943752454058
95.03237948823742
0.6727724079339961
14.284620130590849
16.47025629394954
18.5462549937416
9.735474971743317
27.762287208920455
12.431476504206138
0.0
0.0
0.0
3.0292658280788203
0.5844979882240295
1.259947602931863
23.70549885182271
0.0
7.026267345177333
3.877306126745019
0.008754880167543888
5.442181541542529
0.0
0.0
0.0
39.50650158976519
26.291743544502506
14.78567913593175
2.8782



Elapsed time to compute best fit: 2.629 seconds
Cross-validation score: 0.25036927761940336
Test score: 0.2706270627062706
Best Hyperparameters: {}
20.35106176177223
1498.9134167151594
2278.3274680870786
100.2006441122223
46.754205004981394
68.61378699309927
2.4270195207499956
0.7698939263154565
0.30047884016469084
0.03962859939683776
28.945849181269416
42.81288049053164
0.12933152489375035
0.0
5.873677813409029
0.0
15.918124634575568
8.742589258213119
19.53612727025407
10.650010738517393
4.7473891713131575
0.3626320611380915
2.740812579388148
3.201338638377564
0.04634637819413001
11.675812502841083
3.650688599851754
83.3046529465666
4.726650426650581
5.9651098434681185
21.982314437121122
24.707299035819524
2.97891507099073
0.6149116910660792
1.2444153933942181
0.0
0.0
0.0
5.595631438592761
0.0
15.134555016019462
11.989815750190246
0.0
10.530523426023704
1.1144027499762077
1.9017857439962427e-06
0.11748038682449646
0.0
0.0
0.0
5.078712489621391
8.616851813352923
3.307346395590553
6.802



Elapsed time to compute best fit: 2.441 seconds
Cross-validation score: 0.24330590460405208
Test score: 0.265991133628879
Best Hyperparameters: {}
23.278010892168474
1398.0756307372217
2231.8882031465446
87.78923210346686
70.80107093749643
161.52322591960294
4.370135905986107
2.8818973730241746
4.382595112233753
0.0026180892782576848
20.063070194472214
12.210241921555854
26.548417835773307
0.0
3.7179652951792788
0.0
47.354594053220204
0.2229869916069814
33.89983200806803
109.17104958575476
0.023003409289295673
0.24010155191181903
24.823838736861944
4.330710693847669
16.184172608530332
7.09781300289869e-06
2.845750523688948
23.182480823620722
0.0001357641390244968
0.9031781285387819
0.1601806599779751
6.683082845387958
17.493312247695144
4.220859465659736
50.245090567875614
0.0
0.0
0.0
23.826099978144743
0.0
5.660091093791998
8.832054813768224
1.111041009426117
11.780623228084732
4.963548689325023
0.36659049292183
3.9069319322983347
0.0
0.0
0.0
10.555224186365644
11.784373942643242
13.2



Elapsed time to compute best fit: 2.503 seconds
Cross-validation score: 0.24309388812684993
Test score: 0.22613065326633164
Best Hyperparameters: {}
53.99331156356552
1522.9888193658737
2230.124162966376
55.39683586554435
34.96125987826157
17.32051692754558
0.36843593835834554
0.2890678730377982
0.0018691800069063902
0.2541845079140863
10.50463275770348
63.534654783822894
1.0483327185081421
0.0
1.4903495953640293
0.0
31.787157868809167
24.111155867133363
79.34377947079975
3.0001011441489647
0.038974201217231746
7.337480010519791
2.3604582392683167
1.892844255694798
0.07418678744377871
0.05492698893272774
9.054401308795377
28.7539030380188
9.13796742638462
11.872405421188986
4.409635106137872
4.8375688373079715
10.819532059028054
1.1086269868537784
13.084973487555615
0.0
0.0
0.0
11.015862824490682
0.060519199818372726
19.33845533501088
45.777254701142745
0.06787521461228607
6.22191140392691
0.37921720004805337
0.0
3.8088770300987447
0.0
0.0
0.0
23.022268202368405
38.99945252255585
9.225



Elapsed time to compute best fit: 2.516 seconds
Cross-validation score: 0.2545509085353336
Test score: 0.29985549132947975
Best Hyperparameters: {}
25.953558389520946
1093.3198344359942
2369.0959545409155
150.40643878406107
70.69955167580618
199.98252101692452
10.084817101796595
0.6772965207611121
0.02898675585465682
0.03604372626357133
24.684607505212885
124.65390389228662
37.50854512574122
0.0
2.294545638068677
0.0
34.59737055711851
4.735536240257819
19.419335373634464
12.590958640071511
0.0004917785389853824
0.0
37.635561032137886
3.9538126629935837
0.9776392824830662
0.014611195894303819
0.4760387523826145
6.085112175617525
0.332597267810848
0.9212158554648355
2.468933790441668
7.167796699107342
39.656439058343274
3.023090044962373e-08
2.007074694158433
0.0
0.0
0.0
5.372466427941789
0.11976700276136398
9.655314021437576
4.560351231019874
1.6738818711559134e-05
5.917056855636911
1.6339591910961841
0.0
0.10025915334522528
0.0
0.0
0.0
13.197552020291571
16.94193527533684
16.5880497913



Elapsed time to compute best fit: 2.629 seconds
Cross-validation score: 0.25801442362089577
Test score: 0.26874585268745854
Best Hyperparameters: {}
20.77176206766535
1434.2832751033684
2285.5642353879643
35.94985185010703
99.5493213589104
29.56726356383544
0.6670785220982509
3.5661991858187005
0.36003708084987807
3.487277030959376
13.875516308949763
135.65826286032583
2.7653567016099316
0.0
0.020625676645731172
0.0
31.590197540585635
0.8008199160263536
97.56768138173473
0.3017878150462037
4.210645032993057
0.00028744558881987903
0.07643688366083268
0.5043257214529149
3.212192108359873
1.1443475112318993
5.266964984019321
24.62774394088955
0.8393493390262847
5.914351304449529
1.6786304630436761
10.042007810809137
2.027026832926247
0.39282306621743146
6.631652920512039
0.0
0.0
0.0
21.6549935747988
0.8812699347880653
13.335007493898388
22.0080976061412
0.2776260147220455
7.589419025950685
0.03692332490863848
0.0
0.02345276229731752
0.0
0.0
0.0
65.64239933990973
20.427101284775166
7.01396



Elapsed time to compute best fit: 2.566 seconds
Cross-validation score: 0.25190404528423493
Test score: 0.26500638569604085
Best Hyperparameters: {}
35.086124904686976
1499.3578971374957
2061.191599735883
23.646174752634273
32.341282905873726
24.024681567577556
0.306317017910521
2.736277863803984
0.0017182893396352483
1.250739956049074e-09
102.64280403148979
174.26001967718358
0.811814120367197
0.0
2.347801983356476
0.0
0.27601428447972554
85.08079343757453
20.736299673208165
111.09206027826646
1.1999528010671696
0.08805400133132935
27.433246842186637
1.441683255845276
0.563681000001452
1.0098603628216907e-05
0.8919671928476219
5.876313576030501
14.601641947234322
1.9059174529625782
10.862269965668212
7.534021201705947
12.866677709316956
0.44742135384863857
7.8838953516947665
0.0
0.0
0.0
64.8502188018831
0.0
0.26656506319275586
6.819350233862237
0.14868199825286865
26.085675964625977
3.635415918733896
2.032235612681177e-06
5.0866929569792774e-05
0.0
0.0
0.0
19.80611698200657
10.7473239



Elapsed time to compute best fit: 2.570 seconds
Cross-validation score: 0.25057973575783843
Test score: 0.25549805950840887
Best Hyperparameters: {}
49.11920522452569
1449.1693001518493
2141.5774411915627
53.80696999923805
103.44779341226659
56.07266495292643
3.173541500574764
5.035075250196037
8.522019925294444e-06
1.4561210020053181
44.583808322662925
6.610372610137364
20.543512238399526
0.0
3.7099671583346208
0.0
5.271531197944311
0.10030157039645515
75.25555908919435
72.60700003516061
0.04311990193667892
6.162277608279254e-05
6.550423618715286
2.516691437824875
1.8132114390394278
33.16178813136492
10.45938472041312
24.75097073720112
0.27317284413311427
1.9197157593468646
1.945752419122962
3.895167475788412
9.398141104291193
1.258626472909503
0.36733601391348936
0.0
0.0
0.0
13.866688491613782
4.525099939201027e-07
17.766302272413206
0.9563499184505488
0.07951030135154724
11.761832637803973
0.7622498618006954
0.040366899222135544
21.468933000660396
0.0
0.0
0.0
3.34623544278573
19.383



Elapsed time to compute best fit: 2.531 seconds
Cross-validation score: 0.25324315599103037
Test score: 0.2468768590124926
Best Hyperparameters: {}
34.23177562367971
1698.6241280781676
2102.2918394907133
107.3181373120494
86.26274088415275
71.64582845470524
2.762253077328751
1.5981945644929851
0.026597237923379424
1.1546070157232862
8.978423705326763
52.316940910460275
19.738437397691815
0.0
8.164382817340268
0.0
3.4767524060383
26.247409888924153
84.85561570162302
38.32880382593967
4.054719829582609e-05
1.980067692347598e-05
14.521613105581027
1.6699069337283654
0.10806402607704513
0.38111363885896665
11.021899782132511
1.9160151026956354
0.04380908242975279
14.608982466579437
0.03912136978879671
9.463859564738351
1.1112009799650127
0.197728623529142
23.64861427969529
0.0
0.0
0.0
6.8380435680831155
0.0
7.312706634525143
20.340387342566355
0.0
33.709205526865894
1.0708715715987864
0.000357138839461868
8.742498950481435
0.0
0.0
0.0
19.949061971780377
3.5091111440626648
3.213468302023242



Elapsed time to compute best fit: 2.507 seconds
Cross-validation score: 0.2369828387953307
Test score: 0.2636655948553055
Best Hyperparameters: {}
23.77394046218077
1516.9718008481084
2119.2669138443343
32.194584590292465
39.69342782214685
42.72848951167316
1.222304566278467
1.3954472205027013
1.1792222403273627
0.07252830080688
88.11520426382314
185.58413687752852
3.442887993100504
0.0
0.5235232524792082
0.0
5.00436754855294
5.887128375910894
51.82885663244924
40.09043956298464
2.0648419297705827
0.6082931730338785
0.21810715284983218
9.797308583029888
7.468861561563046
0.572726719499941
1.4845531379557722
16.858987897525108
0.06869373107617305
7.2350875881164995
10.241136789853629
13.369607108428319
4.325023309289993
2.015135416085699
6.910576009739833
0.0
0.0
0.0
26.83068858792332
3.4714796002210733
2.852595418129601
15.768414420308684
1.0182200590329415
1.0497324810297854
1.9722727578076222
0.0
5.390202815193642
0.0
0.0
0.0
37.74721429112782
6.149029372128931
5.7829395920109015
0.0



Elapsed time to compute best fit: 2.415 seconds
Cross-validation score: 0.2591653213016626
Test score: 0.2516881522406384
Best Hyperparameters: {}
56.67699226491552
1616.9992686812454
2069.1883828942086
148.25599759715385
85.3117805964035
116.01215963224527
0.11246793765349139
1.0247637962422398
0.6147510904148561
0.7929392521546106
27.86764570473747
72.40873531039264
28.426974058283747
0.0
7.024110713116372e-05
0.0
10.277024287364963
0.020210803082488793
4.45374866754418
69.50195718312517
0.0
1.0363640954164088e-06
0.6921797854796097
2.2096619580186014
2.417137255929214
2.731614996705738
1.2559927547293315
3.5669241048966933
0.16521091614820171
3.46615249590755
2.7509694272987986
4.540997992661945
4.712928896274283
0.01641469306446197
2.909428734045056
0.0
0.0
0.0
7.346711244205782
0.0
4.42793863411023
1.7367148954147948
0.0
0.24470842450916794
2.280749657511117
0.0
1.6444645718824642
0.0
0.0
0.0
12.436108206270362
23.907850323829337
5.012297410880688
0.055463429951274956
2.1687299067



Elapsed time to compute best fit: 2.600 seconds
Cross-validation score: 0.2531482120395657
Test score: 0.2557544757033249
Best Hyperparameters: {}
22.05321830286881
1243.971129618037
2273.042418564428
17.16626454330941
332.78316177493826
46.741942881465896
1.3015033913104928
0.3136612251224804
1.287345941066178
0.32708815377893075
73.45633709819637
46.367250315803545
0.7738593666753664
0.0
2.8171182774020247
0.0
4.709897662134057
0.471372112590803
54.93830793226687
86.33367613429384
0.0432197446733138
1.2948659957601194e-06
0.242571048719281
2.7491158987166067
0.23758520185947418
0.00018864660387091226
0.062362927357753506
3.411149497473905
39.92391324502186
9.98290449436471
3.2591455701429943
2.60624014853703
3.632627310008047
5.130031059743487
9.050157638163103
0.0
0.0
0.0
7.182997258096716
0.07283270359039307
4.927955504497092
6.4174468073723805
0.0
4.030734274217647
14.62276482771449
0.00021676099277101457
7.341514955725966
0.0
0.0
0.0
7.278809121711416
2.957810787745965
10.5922671



Elapsed time to compute best fit: 2.459 seconds
Cross-validation score: 0.2615513982357677
Test score: 0.25061124694376524
Best Hyperparameters: {}
116.35223659520231
1377.1274446385216
2254.4602789714436
10.03385791616974
67.4857249102411
151.02358212830347
13.800235227845313
0.18625738901914946
3.2387200121775095e-07
0.0
18.576815371054387
8.743902537376309
0.4274514504421316
0.0
18.50468389229104
0.0
30.548256522829963
13.060841991400006
32.9509295414729
16.072907195973205
0.004331556923716562
0.1501251195994462
4.874744714449662
1.9379431491430257
2.4331534799223036
0.5206618502588753
46.45981424726151
26.841040375770515
0.31717988152247045
0.9653766760417606
0.6782872335656323
4.299984444098083
0.5163716289458868
2.7421022511608104
20.12649962435057
0.0
0.0
0.0
0.836125727385924
2.4394299543928355e-05
2.1193817392875163
7.3493263081991245
0.0007867955819662598
83.74599532302679
0.32392108352819093
0.0
5.691646450188211
0.0
0.0
0.0
14.282811364575448
27.618439180194063
9.8759132542



Elapsed time to compute best fit: 2.412 seconds
Cross-validation score: 0.2419757817884754
Test score: 0.2866022099447514
Best Hyperparameters: {}
93.21054951346717
1257.7867656249653
2329.678032333945
88.8417207294915
99.99668378311168
49.22002059357151
0.019911086662524724
2.7703730173650456
3.7190300645306706e-05
0.019175900146365166
14.8212894753689
199.01506818318745
0.6122267551991811
0.0
0.17494050413370132
0.0
6.727769786837956
2.904887538476938
62.57985073602766
6.967264776768647
0.0013874772574009597
0.18307599093827776
9.287093112433155
3.2603909704634066
4.039824041501561
3.350658653281083
11.05576712926343
17.512229093873472
23.35794802427509
1.6543229290527588
6.576564236846299
0.07578137930640144
3.063449869962589
13.28561353402436
5.214159868851581
0.0
0.0
0.0
2.1598187730846137
0.0
3.8322181736894096
5.208023460784318
1.8572666756808758
11.342739088788221
2.600693667994282
0.0
0.3569083200710068
0.0
0.0
0.0
13.017494678277767
43.26271223691201
13.605951687022339
6.1107



Elapsed time to compute best fit: 2.650 seconds
Cross-validation score: 0.24537791858125946
Test score: 0.2468768590124926
Best Hyperparameters: {}
72.00331636712234
1725.778076792223
1929.1165263107196
24.26554372534156
91.98584553510422
21.39541314796532
0.9267894078924549
0.1617402976483504
4.67810545861721
0.009711047462246825
16.908151266052002
38.259172631397846
5.65909394561565
0.0
3.773817050293985
0.0
8.783052391179886
156.206025230678
13.79234699148981
9.104558007367169
0.4227430370379154
0.6985635813489575
19.695304946949975
2.194171519800503
1.011976306883298
2.365067314189085
3.589283028820371
26.81656562044296
6.49715268088858
0.09061891941035365
10.280614111383525
8.975506976974355
17.76095558703703
1.4078019369021604
8.090076561221263
0.0
0.0
0.0
15.377929986596264
0.0
6.728235050591877
7.104130521456158
0.7586899995803833
5.780568836676224
7.79521552045955
1.575785006480146e-06
0.4924987758586905
0.0
0.0
0.0
31.847058271753095
16.20103547260586
6.537152941024177
3.7323



Elapsed time to compute best fit: 2.531 seconds
Cross-validation score: 0.27102037115442534
Test score: 0.2751677852348993
Best Hyperparameters: {}
16.45090239634889
1527.4363790954424
2331.75463542733
46.81657751284959
57.155174963213554
53.928080993695566
5.330796961597245
0.18083015594176932
0.00017032290270435624
0.04410953582862254
31.89396285350813
2.5882435135344446
0.6799856130338806
0.0
0.0
0.0
25.874082748755
1.2369204398203562
4.389332294258165
99.89323263564195
1.1742121559545353
4.946512334048748
14.223584419238875
4.136832373495311
5.085888604815407
0.7330571069186916
1.3240146040916443
5.9538399638599095
15.827650901333172
1.3370271672503973
2.5016251873871056
0.06923045696514563
5.566201263429807
0.5749865513960941
9.035031300303839
0.0
0.0
0.0
45.83025284409902
0.0
10.68561257337828
4.984928853726064
0.0
27.169659434158074
1.249314562180138
1.84007003554143e-05
6.3773135315228044
0.0
0.0
0.0
31.16296610129766
3.0239894637400693
8.697372175032324
3.932797814181235
7.795

## 5.1 Rebalancing Strategy - 5050

### 5.4.1 Random Forest

In [135]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

fiftyfifty_randomforest_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })

for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    pipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                              ['under', RandomUnderSampler()],
                                ['classifier', RandomForestClassifier(n_jobs=-1)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    spaceEmpty = dict() 

    search = RandomizedSearchCV(estimator = pipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring='f1', 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedRFModel = search.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    #print(f"Elapsed time to compute best fit: "
      #f"{elapsed_time:.3f} seconds")
    cv_score = optimizedRFModel.best_score_
    test_score = optimizedRFModel.score(X_test, y_test)
    #print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    #print('Best Hyperparameters: %s' % optimizedRFModel.best_params_)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedRFModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    
    fiftyfifty_randomforest_nonnormalized_performance_df = pd.concat([fiftyfifty_randomforest_nonnormalized_performance_df, new_performance_df])
    
fiftyfifty_randomforest_nonnormalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_randomforest_nonnormalized_performance_df.csv")



### 5.4.2 XGBoost

In [136]:
import time
import numpy as np

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

import xgboost as xgb
from sklearn.metrics import fbeta_score, make_scorer
fhalf_scorer = make_scorer(fbeta_score, beta=0.5)


fiftyfifty_xgboost_nonnormalized_performance_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    GXBoostPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', xgb.XGBClassifier(n_jobs=2)]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

    # define search space
    space = dict()
    space['classifier__learning_rate'] = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
    space['classifier__max_depth'] = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    space['classifier__min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    space['classifier__gamma'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    space['classifier__colsample_bytree'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    spaceEmpty = dict()

    GXBoostSearch = RandomizedSearchCV(estimator = GXBoostPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring=fhalf_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedGXBoostModel = GXBoostSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    
    cv_score = optimizedGXBoostModel.best_score_
    test_score = optimizedGXBoostModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedGXBoostModel.best_params_)
    
    #feature importance
    importances = optimizedGXBoostModel.best_estimator_._final_estimator.feature_importances_
    for i,v in enumerate(importances):
        print(v)

    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedGXBoostModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    print(new_performance_df)
    fiftyfifty_xgboost_nonnormalized_performance_df = pd.concat([fiftyfifty_xgboost_nonnormalized_performance_df, new_performance_df])
    

fiftyfifty_xgboost_nonnormalized_performance_df.to_csv("../data/05_model_output/fiftyfifty_xgboost_nonnormalized_performance_df.csv")




Elapsed time to compute best fit: 308.268 seconds
Cross-validation score: 0.5952363075781056
Test score: 0.6610576923076925
Best Hyperparameters: {}
0.0077696145
0.10384126
0.27160648
0.038073406
0.011233662
0.00928655
0.0018508227
0.0007057764
0.0031234971
0.00066766806
0.03670811
0.06850242
0.0051340736
0.0
0.0051502446
0.0
0.0051244455
0.0011397456
0.049706016
0.0015619018
0.0024018853
0.009645756
0.0017723858
0.0006475898
0.00086721
0.0005522008
0.0030503487
0.0037419135
0.0018719888
0.0035335594
0.008700895
0.00030719012
0.001996974
0.008940921
0.00089271436
0.0
0.0
0.0
0.002812797
0.002721992
0.0018890601
0.027838482
0.007030348
0.0017748752
0.0006441071
0.0029274533
0.009695921
0.0
0.0
0.0
0.0035960858
0.004084203
0.000397802
0.00048938073
0.006853184
0.001458211
0.00058210455
0.010202627
0.0067052585
0.009416319
0.008131092
0.003292423
0.00054764026
0.0023587616
0.0005717755
0.011810204
0.0011940353
0.01006305
0.001053008
0.005416292
0.008730183
0.010536839
0.001405877
0.000966



Elapsed time to compute best fit: 319.769 seconds
Cross-validation score: 0.6046715134756341
Test score: 0.64
Best Hyperparameters: {}
0.006024698
0.13298498
0.20357664
0.04600355
0.031061769
0.002082418
0.0013630165
0.0019151025
0.00742256
0.003630811
0.017279927
0.059080478
0.008253808
0.0
0.005599713
0.0
0.0085043665
0.0019194986
0.08758761
0.018222814
0.00349246
0.001331575
0.003971171
0.0033025672
0.0059712925
0.0017076281
0.0037712248
0.001001926
0.000976551
0.003373774
0.006063503
0.0003616896
0.001702276
0.0018070777
0.0007928131
0.0
0.0
0.0
0.010481119
0.0031632658
0.0029863755
0.0028683979
0.0041521797
0.0004962552
0.0019448557
0.0054954425
0.0056513874
0.0
0.0
0.0
0.0048095565
0.0031008853
0.00027999707
0.00021896357
0.01002328
0.006472551
0.0013825783
0.0059192907
0.0073714075
0.008323734
0.0061194384
0.0017800082
0.001634586
0.0017468267
0.0016596166
0.0032963448
0.0005277819
0.008903855
0.00035645848
0.005718974
0.0033153195
0.007557576
0.00074902596
0.0016024282
0.000547



Elapsed time to compute best fit: 316.571 seconds
Cross-validation score: 0.6111338466483687
Test score: 0.6193693693693694
Best Hyperparameters: {}
0.004709307
0.11072545
0.19081353
0.040501975
0.06318086
0.0030564475
0.0073918183
0.0025491554
0.0031126884
0.00083803234
0.019114597
0.07592422
0.005154085
0.0
0.0042376406
0.0
0.006140976
0.0010791222
0.07506283
0.0062776534
0.001010335
0.001269229
0.014205132
0.0010141636
0.0022706587
0.00044895575
0.0010156123
0.0014036743
0.00047369683
0.0039591957
0.0011449476
0.0010028735
0.00037431164
0.013479041
0.001199616
0.0
0.0
0.0
0.007801811
0.0030243513
0.0038495446
0.0028949599
0.011093425
0.006555068
0.0008497091
0.005821199
0.008471803
0.0
0.0
0.0
0.0007994269
0.002737168
0.00063135865
0.005693434
0.0068351645
0.00091137533
0.0005533649
0.0060466332
0.0030769685
0.008231408
0.008264752
0.0021873168
0.0030559232
0.0009457839
0.00214408
0.0006081168
0.00036792152
0.010319052
0.002001316
0.007495702
0.0031819781
0.01060766
0.0016814189
0.0



Elapsed time to compute best fit: 317.968 seconds
Cross-validation score: 0.5734501258541289
Test score: 0.646186440677966
Best Hyperparameters: {}
0.009627812
0.1047123
0.20953149
0.03847104
0.031094018
0.002687763
0.0016373168
0.0013098107
0.0003183832
0.002511809
0.016583962
0.12164699
0.013151344
0.0
0.006531812
0.0
0.008717847
0.009627054
0.028097728
0.0033430282
0.0038378893
0.0013830343
0.008489165
0.003038411
0.0042950655
0.0002628783
0.007514408
0.0017819676
0.0008796747
0.0062646335
0.007893817
0.0006974539
0.0007576337
0.006126236
0.00074270525
0.0
0.0
0.0
0.00645301
0.0024163479
0.0063842395
0.0034737333
0.009218527
0.0037327989
0.0015849384
0.002477884
0.009358869
0.0
0.0
0.0
0.007340634
0.009567235
0.00089719583
0.040505793
0.0025587964
0.0010230994
0.004800438
0.0052266014
0.00904363
0.0088018505
0.0041709263
0.0037274787
0.00013082413
0.0024524422
0.0037589273
0.0010531509
0.0005765024
0.011600833
0.0004596312
0.007292004
0.0027100414
0.0073507233
0.0010254921
0.0008722



Elapsed time to compute best fit: 317.637 seconds
Cross-validation score: 0.6012467391536925
Test score: 0.6308411214953272
Best Hyperparameters: {}
0.007201355
0.17801005
0.22161773
0.0419443
0.042666372
0.002109359
0.0032782704
0.005216646
0.0014230307
0.0016309847
0.02215382
0.028503703
0.008176803
0.0
0.0016070975
0.0
0.0028400624
0.0015399434
0.040151957
0.033333965
0.0011816667
0.00033866652
0.00036269467
0.0013073137
0.00082723494
0.0037701773
0.001215872
0.0025830674
0.0010859837
0.0059879636
0.00069107726
0.00038974147
0.0012653162
0.014197664
0.00051916717
0.0
0.0
0.0
0.005040168
0.0020164056
0.0039351913
0.012599309
0.0059491135
0.0032741097
0.010080025
0.0049910983
0.011263626
0.0
0.0
0.0
0.0010817172
0.0018734126
0.00088790676
0.0017313785
0.008248776
0.00049923186
0.0005534817
0.0052981535
0.0019825941
0.0098143015
0.0017422284
0.0015976578
0.00015229618
0.0008393528
0.002110393
0.0010822129
0.0008337231
0.009273605
0.00033981228
0.0018383989
0.0056361197
0.010217821
0.00



Elapsed time to compute best fit: 317.147 seconds
Cross-validation score: 0.6080248235581103
Test score: 0.5737704918032788
Best Hyperparameters: {}
0.0070637376
0.14007664
0.24048331
0.048234716
0.018255597
0.0017634579
0.0029178786
0.0012965548
0.0034195923
0.0057345117
0.023652323
0.044252023
0.010802436
0.0
0.0053732167
0.0
0.004674806
0.002419531
0.07927534
0.0016860345
0.0029892703
0.0025602165
0.004068858
0.0018407794
0.0043637976
0.0015164766
0.00055274705
0.002461542
0.0017181024
0.0069978023
0.0034230468
0.0011924536
0.00038016107
0.015976408
0.0016122396
0.0
0.0
0.0
0.010809267
0.002049351
0.00414168
0.008867697
0.0071793897
0.0035440067
0.00090710435
0.002745267
0.0051886844
0.0
0.0
0.0
0.0021263405
0.0040013166
0.0010067719
0.0010346066
0.0067622126
0.004750865
0.0046812613
0.008111926
0.005809466
0.0091792
0.001623674
0.0013307907
0.00027607544
0.00015699222
0.0061880583
0.0008081484
0.0010717243
0.010113296
0.00033177758
0.0014101606
0.004059432
0.01077902
0.0036288365
0



Elapsed time to compute best fit: 336.353 seconds
Cross-validation score: 0.579359964188938
Test score: 0.6145833333333333
Best Hyperparameters: {}
0.0058764713
0.15147683
0.21175124
0.032746255
0.038542543
0.0024925908
0.00075990765
0.0014611611
0.0040267343
0.003072447
0.015535545
0.055464316
0.011729376
0.0
0.0070019355
0.0
0.003703767
0.007073066
0.030901438
0.047719963
0.0028976437
0.0005425188
0.0031934166
0.004016869
0.00034374814
0.0069903415
0.005523395
0.0030205483
0.005038005
0.0029605126
0.0034923258
0.0009334134
0.0010492407
0.009219879
0.0015528605
0.0
0.0
0.0
0.006467277
0.0041909134
0.0048396015
0.005952565
0.0075076697
0.0005886812
0.0010945541
0.0039845705
0.010784137
0.0
0.0
0.0
0.004352075
0.004836622
0.0015072615
0.002437784
0.0077094273
0.0036629033
0.0028343634
0.0030377314
0.0032833028
0.01241908
0.0068630804
0.0013337528
0.0021930472
0.01161084
0.0019187849
0.0008490426
0.0008300337
0.005871186
0.00081811316
0.004878581
0.0050370493
0.015618831
0.0036439402
0.0



Elapsed time to compute best fit: 307.866 seconds
Cross-validation score: 0.6193420424086409
Test score: 0.5720338983050847
Best Hyperparameters: {}
0.007465522
0.11372856
0.2234973
0.035808735
0.0316443
0.0040499303
0.0036972575
0.0018722285
0.0016378318
0.000654286
0.02537249
0.07436772
0.0062320963
0.0
0.0109804785
0.0
0.0054585817
0.0027656371
0.066688925
0.014734562
0.0071112025
0.00010306748
0.0023152286
0.0051552216
0.0038248373
0.0021323813
0.005425837
0.0016756732
0.0038078406
0.0035664425
0.0073454627
0.0011334056
0.00058342825
0.002155846
0.0007302944
0.0
0.0
0.0
0.0022039213
0.0026232088
0.010042244
0.0048612384
0.009224376
0.0014819863
0.003986346
0.007327285
0.007907623
0.0
0.0
0.0
0.00089051377
0.0035771313
0.0005822358
0.008253718
0.012028579
0.004271406
0.017588723
0.005670472
0.005807138
0.0055243415
0.0007359051
0.005413886
0.00027101146
0.0003885282
0.002388888
0.00083937973
0.00079387333
0.011541676
0.0035551367
0.0030147275
0.0031198782
0.01166681
0.00047392337
0.



Elapsed time to compute best fit: 310.857 seconds
Cross-validation score: 0.605411758125185
Test score: 0.5995934959349595
Best Hyperparameters: {}
0.015021874
0.10449459
0.22070324
0.048207328
0.0083959745
0.0023263437
0.0007142705
0.011908842
0.0002812139
0.0019832356
0.020194989
0.09396896
0.0016992935
0.0
0.011209867
0.0
0.013595056
0.006532091
0.06107899
0.00934451
0.00064817286
0.0011312668
0.0027042904
0.0021298826
0.00017981413
0.0025074054
0.0028948616
0.0010678776
0.0034110455
0.0026324934
0.0042461096
0.0014148633
0.00027308706
0.010402783
0.0008605124
0.0
0.0
0.0
0.0046409094
0.0031311095
0.008646909
0.0057503665
0.008641453
0.0019319081
0.009712422
0.0023884347
0.007126113
0.0
0.0
0.0
0.0051074172
0.008439845
0.000629586
0.0015251356
0.010075556
0.013448298
0.003224915
0.005355372
0.005613685
0.012506951
0.0019332772
0.0066105416
0.00018158746
0.0003216316
0.0015147723
0.002204716
0.00080543634
0.012036754
0.0010041384
0.002553748
0.007121932
0.011098967
0.001982737
0.0015



Elapsed time to compute best fit: 322.946 seconds
Cross-validation score: 0.6029737646672276
Test score: 0.552536231884058
Best Hyperparameters: {}
0.004847328
0.118276246
0.21676412
0.042152163
0.029931579
0.003417856
0.0017681803
0.0021509076
0.00086689834
0.002487526
0.025343448
0.069652684
0.03501667
0.0
0.0035405178
0.0
0.015473382
0.011322268
0.060598902
0.00078134227
0.0021464862
0.00025849082
0.003967688
0.0023596555
0.0017466403
0.00683045
0.0026897222
0.0019851523
0.0017203782
0.0038123117
0.0029200513
0.0004927503
0.0006665239
0.005978187
0.001556894
0.0
0.0
0.0
0.008645831
0.0019337576
0.006699718
0.010350756
0.0020285985
0.0020136267
0.0008044986
0.008223782
0.006143951
0.0
0.0
0.0
0.0007355772
0.0025304407
0.00096605334
0.004707607
0.01170751
0.0014231625
0.002355863
0.0039467914
0.003023772
0.008331132
0.008005699
0.0023983931
0.00019929721
0.0006708112
0.0012105067
0.0050486308
0.001126807
0.0077773454
0.0017358428
0.004201088
0.004747586
0.010995392
0.0017639339
0.0008



Elapsed time to compute best fit: 324.378 seconds
Cross-validation score: 0.5844659431328352
Test score: 0.625
Best Hyperparameters: {}
0.0053487048
0.12651293
0.20124413
0.047873642
0.027432447
0.0055018663
0.0037254908
0.0018898348
0.005467015
0.0008844509
0.012292015
0.09325984
0.002629825
0.0
0.0071142274
0.0
0.007133404
0.0022980948
0.04755237
0.0016480223
0.003692357
0.00039699467
0.0019755382
0.0036210823
0.0059665004
0.00023275707
0.0018235308
0.0013913966
0.0023239604
0.003886948
0.008096675
0.0021529368
0.00033804457
0.0076334304
0.0012139765
0.0
0.0
0.0
0.009268913
0.003920189
0.0063504926
0.004866287
0.008711162
0.0033711416
0.00029273625
0.003936942
0.010452507
0.0
0.0
0.0
0.0062970594
0.003761509
0.0005968337
0.00048125492
0.0113666225
0.004558471
0.0022481747
0.005278996
0.008942361
0.006839768
0.0055981535
0.0046331645
0.00014643546
0.00064659124
0.047769178
0.0006812261
0.0008669301
0.0093518
0.00029105024
0.0074402755
0.005652679
0.01036531
0.001660358
0.0004787455
0.



Elapsed time to compute best fit: 312.440 seconds
Cross-validation score: 0.6350957840862167
Test score: 0.6022727272727273
Best Hyperparameters: {}
0.015823783
0.09579355
0.1865742
0.039942306
0.0080049895
0.005155381
0.002241067
0.0015410054
0.000285242
0.00090210664
0.021135364
0.17009191
0.0081572365
0.0
0.004700091
0.0
0.008475741
0.011608011
0.07403655
0.010881734
0.0009286516
0.00023925566
0.002342922
0.0010854818
0.0008413879
0.000970516
0.0006422621
0.00071916287
0.0023359093
0.0026082743
0.0074536856
0.00083383615
0.0014126874
0.0047141737
0.00027543816
0.0
0.0
0.0
0.0049380646
0.001356696
0.0050652823
0.0057231323
0.003685282
0.0023313258
0.0031948674
0.0016012929
0.005035768
0.0
0.0
0.0
0.0006886214
0.00092512206
0.0005602887
0.009395921
0.024543291
0.0015900927
0.0022270952
0.007781782
0.009588904
0.0050426032
0.0030122155
0.0012630117
0.0050863577
0.005634008
0.0029442054
0.0007221124
0.000954075
0.012913079
0.00026946364
0.011118028
0.0011427008
0.008594705
0.00047384767



Elapsed time to compute best fit: 322.304 seconds
Cross-validation score: 0.5901826184777053
Test score: 0.6305309734513275
Best Hyperparameters: {}
0.004428983
0.13390324
0.20372263
0.04285104
0.024366908
0.0035282734
0.0077941166
0.0011279719
0.0011978257
0.001973909
0.015886348
0.028844848
0.027806116
0.0
0.003985249
0.0
0.0056608124
0.0067726304
0.07430548
0.00875822
0.0073411423
0.0012085184
0.0021143518
0.0037918028
0.0016695702
0.000108028216
0.0008428615
0.0021839389
0.00064378406
0.0044820583
0.009772207
0.0018607451
0.0018583261
0.0013888697
0.0030233434
0.0
0.0
0.0
0.0099808
0.0004931633
0.0056269844
0.022049328
0.009430789
0.00024665508
0.0045623677
0.0038716411
0.0064467047
0.0
0.0
0.0
0.0075264173
0.0028152817
0.001350178
0.016396439
0.00751335
0.00072420214
0.0037847755
0.0075473795
0.0042296336
0.008218207
0.009833179
0.0035565905
0.026146894
0.004047664
0.000660237
0.0008504909
0.0006890991
0.006197065
0.0012120649
0.0066872
0.0029710259
0.015406468
0.0014696012
0.0006



Elapsed time to compute best fit: 308.068 seconds
Cross-validation score: 0.5913583816433098
Test score: 0.6623931623931624
Best Hyperparameters: {}
0.0027607426
0.120096795
0.24048054
0.037631474
0.023445277
0.0021523163
0.0010315125
0.001709946
0.0056651435
0.0012005214
0.040542744
0.016051978
0.004788153
0.0
0.0058690417
0.0
0.005631848
0.00572292
0.04882103
0.027394773
0.0011335559
0.0019235464
0.0013044701
0.0051676137
0.0009150829
0.008084652
0.0030257578
0.0013412359
0.0015162409
0.004237263
0.002078122
0.0006710934
0.0010557472
0.02106343
0.000532164
0.0
0.0
0.0
0.009590892
0.00326551
0.0047298265
0.0051730443
0.008660315
0.0016545941
0.0047078035
0.0024111203
0.0059501766
0.0
0.0
0.0
0.0013768628
0.00312164
0.0031095329
0.0014336033
0.009786602
0.001588743
0.006430315
0.0022370564
0.0070462087
0.0067179496
0.017346755
0.004020503
0.00014638713
0.004399279
0.0009039255
0.0038552512
0.0023421377
0.004746508
0.0014139822
0.0016449257
0.00395598
0.014175931
0.0023238654
0.00059847



Elapsed time to compute best fit: 323.657 seconds
Cross-validation score: 0.592091293060618
Test score: 0.6097560975609756
Best Hyperparameters: {}
0.0037240356
0.12766787
0.19823983
0.036644574
0.029731998
0.0062305806
0.0034512207
0.0012008524
0.002437563
0.0017935704
0.01264941
0.050642923
0.029246306
0.0
0.0065549263
0.0
0.007459735
0.00898222
0.060984544
0.0012330374
0.0017292917
0.00048906106
0.018023254
0.000916664
0.0019675381
0.001602989
0.0024416854
0.000650698
0.0009395779
0.0022386475
0.01972839
0.00036970162
0.00058880734
0.0051816
0.0013840431
0.0
0.0
0.0
0.010026899
0.001285345
0.0039332034
0.003000516
0.0058306656
0.0025599673
0.002541698
0.0018280878
0.0115486225
0.0
0.0
0.0
0.006426134
0.0027100658
0.00042904826
0.00067472516
0.0031714442
0.0049220836
0.026353283
0.011354686
0.011560173
0.0063832924
0.0010849492
0.002866184
0.00031257977
0.00269743
0.0037974522
0.00073576905
0.000728536
0.012283731
0.0013365737
0.011838986
0.0019750197
0.011181992
0.0008098581
0.00134



Elapsed time to compute best fit: 320.269 seconds
Cross-validation score: 0.598158835601643
Test score: 0.646186440677966
Best Hyperparameters: {}
0.013214733
0.09840049
0.17994615
0.04107769
0.011929567
0.0029061434
0.0027676157
0.0086191315
0.0064203353
0.0012717454
0.017423235
0.16425088
0.0018706488
0.0
0.011981977
0.0
0.009075584
0.025620723
0.036729023
0.018767625
0.0025980142
0.0010373391
0.0010741089
0.0043839617
0.0031420076
0.0008722828
0.0014969911
0.0005301649
0.0014255743
0.001334479
0.004833054
0.00043538888
0.0006008346
0.006859328
0.001979758
0.0
0.0
0.0
0.0034171569
0.0044465107
0.0039543537
0.00051350077
0.004446014
0.003475069
0.00056910136
0.006138284
0.0060378043
0.0
0.0
0.0
0.0017493418
0.0005229864
0.00064584665
0.00085190375
0.010117924
0.00073782593
0.00076941063
0.007716388
0.0121767325
0.0040771654
0.009847693
0.0014901225
0.00040461242
0.0016544654
0.006760791
0.008003019
0.0007489891
0.008799852
0.0022282088
0.0055075483
0.0025249638
0.009324832
0.001111931



Elapsed time to compute best fit: 320.109 seconds
Cross-validation score: 0.5834586688629623
Test score: 0.7065217391304349
Best Hyperparameters: {}
0.0056685065
0.15706351
0.20938097
0.039695166
0.017632019
0.0123898145
0.0041277017
0.0014606643
0.0022851508
0.0010974654
0.039872896
0.043256007
0.017501308
0.0
0.0065910555
0.0
0.008213452
0.0013882075
0.04278999
0.0045993202
0.0013297404
0.003588547
0.0076837097
0.0010263409
0.0058950344
0.0016869652
0.001067237
0.00085810234
0.0031802412
0.007939716
0.01308668
0.00058185396
0.0019438493
0.0076416945
0.0007520536
0.0
0.0
0.0
0.007273831
0.0008630612
0.0067906496
0.0047003273
0.0034123454
0.0013346832
0.00020912274
0.0049557555
0.0025051017
0.0
0.0
0.0
0.0039728824
0.0038980115
0.0022952443
0.0075784335
0.012449706
0.010206335
0.00084477797
0.0063617304
0.0032419115
0.010070944
0.0049157566
0.0051749526
0.0023285109
0.0056520957
0.0040337364
0.0010261531
0.0005077012
0.011695655
0.0004584752
0.0054571414
0.001704704
0.0136094745
0.0012



Elapsed time to compute best fit: 316.483 seconds
Cross-validation score: 0.5939220002245927
Test score: 0.56
Best Hyperparameters: {}
0.0104118455
0.09507687
0.19257544
0.031537883
0.019814923
0.0053260224
0.0014175315
0.009234424
0.0029563014
0.0035326995
0.014420386
0.12030913
0.006000974
0.0
0.007649014
0.0
0.0021008942
0.015022945
0.0950546
0.0059092483
0.0017256491
0.0001687197
0.014651296
0.004507012
0.0004390264
0.0038283404
0.01144023
0.0019660476
0.008603277
0.0029974997
0.010184343
0.000814746
0.00025016937
0.0016401946
0.0016392415
0.0
0.0
0.0
0.0068567586
0.0019205872
0.0049437187
0.004973055
0.014725418
0.0011680181
0.00072271604
0.0046667587
0.005515176
0.0
0.0
0.0
0.00048959494
0.00061477144
0.00037352447
0.0021042617
0.013885293
0.009414515
0.00035509837
0.004512461
0.0027778079
0.005329716
0.008438783
0.0015043638
0.00026717395
0.001484303
0.0012232682
0.0007284049
0.00031389433
0.009078442
0.0018734365
0.0046902737
0.0039509754
0.007907439
0.002763554
0.00041826942
0



Elapsed time to compute best fit: 308.798 seconds
Cross-validation score: 0.6196312553444396
Test score: 0.6045081967213115
Best Hyperparameters: {}
0.0045197913
0.13523296
0.22524722
0.037170473
0.039557952
0.0019231539
0.0029377441
0.00058008946
0.0012306698
0.0049832035
0.008756851
0.07145957
0.015905887
0.0
0.0064604525
0.0
0.010592417
0.0020815132
0.05853777
0.011920278
0.0014490352
0.007482825
0.0048209094
0.004860655
0.0045323456
0.0025396626
0.0013912084
0.002078336
0.0013609194
0.002760274
0.00842787
0.0007497825
0.0007647869
0.0029401674
0.00052056444
0.0
0.0
0.0
0.011041119
0.0036158387
0.012417646
0.0029817258
0.0029384748
0.00430758
0.0019156158
0.002327761
0.011405732
0.0
0.0
0.0
0.0054026516
0.002779203
0.0019413589
0.0009507492
0.005222102
0.0023499923
0.0039051368
0.001055858
0.007502325
0.007922406
0.0063983654
0.0040102843
0.00036277887
0.011360658
0.008494444
0.011016889
0.0009411783
0.0072267447
0.00037638212
0.0032142838
0.0032722552
0.006928369
0.0014916547
0.001



Elapsed time to compute best fit: 315.686 seconds
Cross-validation score: 0.5706549399208866
Test score: 0.6092436974789917
Best Hyperparameters: {}
0.0041064164
0.10478581
0.21815923
0.039429415
0.025194367
0.007675634
0.0022907597
0.0016713337
0.0012444085
0.0023309654
0.021390239
0.08164228
0.0057690893
0.0
0.0067806193
0.0
0.010162663
0.007040421
0.085624486
0.005976516
0.0029026128
0.00017092522
0.0062942584
0.0030681763
0.029339865
0.0010169204
0.0026061381
0.0035394696
0.0023098274
0.00405533
0.0042896573
0.0010930195
0.00043143315
0.010233307
0.0009183817
0.0
0.0
0.0
0.0133519545
0.00356499
0.0035543412
0.006570741
0.0071234885
0.0064170687
0.0007774307
0.002852369
0.007583326
0.0
0.0
0.0
0.00031164993
0.0053728027
0.0008024435
0.019987337
0.0057138163
0.0009086307
0.004501042
0.004845342
0.0023971584
0.00822151
0.010554567
0.0035797074
0.0020280275
0.0031433806
0.0005360238
0.0011128306
0.0015556159
0.00777371
0.0011929765
0.00483757
0.0019270479
0.007993051
0.0046133487
0.000



Elapsed time to compute best fit: 316.546 seconds
Cross-validation score: 0.6110305352272676
Test score: 0.5921052631578947
Best Hyperparameters: {}
0.0030398031
0.12918165
0.24881463
0.050130464
0.026365474
0.004186713
0.0065136254
0.0038774332
0.0013708004
0.0119835595
0.022400523
0.0052847434
0.012131852
0.0
0.0075886813
0.0
0.0035724328
0.0009057572
0.06468451
0.008450335
0.0027832887
0.00089449785
0.0039103003
0.007897678
0.0011715445
0.0073726596
0.0077167666
0.0013348534
0.0021979266
0.0047053476
0.007652784
0.0008428844
0.00067451305
0.005861937
0.004432593
0.0
0.0
0.0
0.008292881
0.020437872
0.0027235658
0.0043030665
0.0059766066
0.0025420666
0.0070234253
0.0033591099
0.005833222
0.0
0.0
0.0
0.0009757998
0.0032934682
0.0021498534
0.0004590812
0.0042106747
0.017914688
0.0007853449
0.013820354
0.004141756
0.0069501842
0.004558455
0.005609027
0.0014750166
0.0015530797
0.00071677193
0.0003979221
0.0014667856
0.009538312
0.00047680325
0.005568493
0.0017565169
0.007531488
0.00350654



Elapsed time to compute best fit: 319.612 seconds
Cross-validation score: 0.6085856365218123
Test score: 0.5430327868852459
Best Hyperparameters: {}
0.0038361768
0.10851789
0.22220388
0.04226872
0.023226151
0.0013372268
0.0009816323
0.0011411714
0.0014722018
0.003237144
0.024977466
0.084948264
0.0034940522
0.0
0.0066106026
0.0
0.005042361
0.01484406
0.06866546
0.023909293
0.00097321736
0.00020895374
0.0077909683
0.0077425623
6.251717e-05
0.00145293
0.003483053
0.001786171
0.00068189483
0.004163171
0.0043512695
0.0007021871
0.00027907034
0.007366934
0.0014623327
0.0
0.0
0.0
0.010158237
0.0010063346
0.0028370097
0.0026437165
0.01247911
0.0047223866
0.00072970317
0.003183874
0.0037933867
0.0
0.0
0.0
0.001437097
0.010640633
0.001077497
0.009000875
0.0059755356
0.00040190702
0.005365848
0.0032218723
0.0040902775
0.010597433
0.0030156742
0.0034510617
0.0001346338
0.00035825677
0.015872866
0.00073587045
0.0010749428
0.014465156
0.0014169714
0.004806377
0.0010138925
0.008402216
0.0012193134
0.



Elapsed time to compute best fit: 323.941 seconds
Cross-validation score: 0.5959916783018397
Test score: 0.6410256410256411
Best Hyperparameters: {}
0.006513548
0.13937764
0.2319987
0.040164847
0.023892764
0.001639919
0.001500419
0.0019273456
0.004207029
0.0019360656
0.03459204
0.03328203
0.006340999
0.0
0.006935243
0.0
0.0034180519
0.002911234
0.07058053
0.0076699373
0.0038042413
0.002994128
0.010256136
0.001156324
0.00419063
0.007167629
0.005407546
0.0020642895
0.0014118252
0.0037179687
0.008328999
0.0013146844
0.0029940153
0.008142621
0.0017235324
0.0
0.0
0.0
0.005851007
0.005502452
0.0039881025
0.004862192
0.0096940715
0.0012451052
0.0008358192
0.00203604
0.0053656376
0.0
0.0
0.0
0.0026921916
0.0038647738
0.0032929012
0.0017087386
0.004940186
0.00603368
0.0023797348
0.0027695172
0.007625655
0.009833271
0.010744914
0.0033051756
8.477398e-05
0.00037169477
0.0033186981
0.0040393155
0.00096120813
0.010230827
0.00094949134
0.008862491
0.003803507
0.009007107
0.0023949652
0.0006162401
0.



Elapsed time to compute best fit: 317.804 seconds
Cross-validation score: 0.6176162286340222
Test score: 0.5745967741935484
Best Hyperparameters: {}
0.004950905
0.14741725
0.21797904
0.035468686
0.028695602
0.017774437
0.0039846767
0.0018883665
0.005821769
0.003618678
0.04130019
0.030311648
0.0029071253
0.0
0.0052012955
0.0
0.0012895268
0.005009718
0.06525584
0.0059402967
0.0029737896
0.0005821941
0.0006693166
0.002339822
0.0039244597
0.0017685837
0.00891934
0.0011067687
0.0019017537
0.0039519747
0.002173078
0.00089065114
0.0006841493
0.007010009
0.0010862566
0.0
0.0
0.0
0.0090180775
0.004618553
0.0049824966
0.004721353
0.0056384355
0.0015377972
0.01648452
0.0024074914
0.008447292
0.0
0.0
0.0
0.0010663056
0.0034215155
0.00096584304
0.013157439
0.009072309
0.0006316514
0.007088151
0.0034882182
0.0028725553
0.0069542136
0.0050500864
0.0011414975
0.00026678093
0.0006659357
0.007270504
0.0018233876
0.00063621957
0.008426979
0.0003921115
0.006235085
0.0018814089
0.008838525
0.00045262952
0.



Elapsed time to compute best fit: 310.793 seconds
Cross-validation score: 0.590786215848522
Test score: 0.6250000000000001
Best Hyperparameters: {}
0.00601102
0.12643932
0.22596222
0.032261897
0.03457766
0.014594315
0.0010764283
0.0036116315
0.0012919683
0.0015043291
0.020716205
0.078761406
0.004810319
0.0
0.0036993215
0.0
0.0068858955
0.005691725
0.055690262
0.009976945
0.0029046515
0.0018535977
0.004492382
0.0019576002
0.0008645861
0.002157904
0.0009802434
0.0021813323
0.0008317071
0.0020203793
0.006741716
0.0010126275
0.003094014
0.018418359
0.0026886803
0.0
0.0
0.0
0.0094300285
0.0005343623
0.0021689995
0.0025644118
0.00953083
0.004372415
0.0076192417
0.001818554
0.010088636
0.0
0.0
0.0
0.0007380857
0.0038562508
0.0011735015
0.0028239305
0.011252094
0.011826046
0.006298917
0.0027601242
0.002630043
0.008640333
0.009114482
0.0013680139
0.01625433
0.0037086431
0.0017019134
0.0031632683
0.0020646893
0.0129256435
0.00055578526
0.0019748001
0.004004564
0.007653381
0.0011399242
0.00011352

### 5.4.3 LightGBM

In [137]:
import time
import numpy as np
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import fbeta_score, make_scorer


#Import feature selection stuff
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

# Import the model we are using
import lightgbm as lgb

fiftyfifty_lightgbm_performance_nonnormalized_df = pd.DataFrame(
    {
        'Accuracy':  [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'F2': [],
        'F0.5': [],
        'Average Precision': []
    })


for i in range(25):

    ftwo_scorer = make_scorer(fbeta_score, beta=2)

    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels)


    LightGBMPipeline = Pipeline(steps = [['smote', SMOTE(sampling_strategy = 0.5)],
                                    ['under', RandomUnderSampler()],
                                ['classifier', lgb.LGBMClassifier(n_jobs=-1, importance_type='gain')]])

    stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

# define search space
    # define search space
    space = dict()
    spaceEmpty = dict()
    space['classifier__num_leaves'] = [11, 16, 21, 26, 31, 36, 41, 46, 51, 56]
    space['classifier__min_data_in_leaf'] =  [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__max_depth'] = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
    space['classifier__learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]
    space['classifier__max_bin'] = [50, 100, 150, 200, 255, 300, 350, 400, 450, 500]

    LightGBMSearch = RandomizedSearchCV(estimator = LightGBMPipeline, 
                            param_distributions=spaceEmpty, 
                            n_iter=100, 
                            scoring= ftwo_scorer, 
                            n_jobs=-1, 
                            cv = stratified_kfold)

    optimizedLightGBMModel = LightGBMSearch.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print(f"Elapsed time to compute best fit: "
      f"{elapsed_time:.3f} seconds")
    cv_score = optimizedLightGBMModel.best_score_
    test_score = optimizedLightGBMModel.score(X_test, y_test)
    print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
    print('Best Hyperparameters: %s' % optimizedLightGBMModel.best_params_)
    
    #feature importance
    importances = optimizedLightGBMModel.best_estimator_._final_estimator.booster_.feature_importance(importance_type='gain')
    for i,v in enumerate(importances):
        print(v)


    #Display the model performance    
    new_performance_df = showModelPerformance(trainedModel = optimizedLightGBMModel, 
                     testFeatures = X_test, 
                     testLabels = y_test)
    fiftyfifty_lightgbm_performance_nonnormalized_df = pd.concat([fiftyfifty_lightgbm_performance_nonnormalized_df, new_performance_df])
    

fiftyfifty_lightgbm_performance_nonnormalized_df.to_csv("../data/05_model_output/fiftyfifty_lightgbm_performance_nonnormalized_df.csv")




Elapsed time to compute best fit: 24.511 seconds
Cross-validation score: 0.669905574676471
Test score: 0.65121412803532
Best Hyperparameters: {}
3065.8205741643906
115721.48223233223
270639.80736100674
22331.629565238953
4598.5955184698105
342.5653122663498
155.15969014167786
269.82331013679504
270.25729274749756
50.63903093338013
9207.40948843956
18780.082674860954
113.60890114307404
0.0
886.4327862262726
0.0
231.00514042377472
212.45023202896118
17729.81251013279
708.1601014137268
269.57067108154297
65.97796034812927
410.1166595220566
90.03957080841064
26.81481909751892
68.54204058647156
2086.7823321819305
223.46782207489014
15.576610207557678
434.74102222919464
687.2152284383774
168.45824885368347
36.897300243377686
294.02459609508514
28.357160210609436
0.0
0.0
0.0
1679.93785405159
320.81574153900146
779.3232460021973
1363.785492181778
894.5080457925797
335.49490106105804
370.2344901561737
157.0631618499756
800.4587042331696
0.0
0.0
0.0
40.30407893657684
1441.5808236598969
469.61541



Elapsed time to compute best fit: 23.958 seconds
Cross-validation score: 0.6587000468091315
Test score: 0.6629213483146068
Best Hyperparameters: {}
4679.2541518211365
120849.32763922215
251780.24500000477
20267.85897088051
5504.349816560745
270.07259011268616
306.2234505414963
82.52852082252502
104.94018793106079
97.19769954681396
4516.539827108383
28289.190749406815
784.4401167631149
0.0
837.7840029001236
0.0
524.1962153911591
806.3143184185028
17990.307054281235
4077.0853657722473
299.28356778621674
33.28593993186951
525.5496873855591
340.02861988544464
148.81967401504517
4.426939964294434
250.72452187538147
839.277951002121
210.8816430568695
554.8118633031845
1006.4374890327454
358.0942338705063
104.01995897293091
392.0795509815216
169.08429944515228
0.0
0.0
0.0
1720.2781742811203
170.5275056362152
1956.0764156579971
401.3130304813385
671.5850876569748
216.6516396999359
41.42524981498718
108.08000016212463
708.0050188302994
0.0
0.0
0.0
1679.1400861740112
1063.8157914876938
115.18454



Elapsed time to compute best fit: 24.998 seconds
Cross-validation score: 0.6599956834276567
Test score: 0.641891891891892
Best Hyperparameters: {}
3948.9517587423325
146787.2124043703
236624.38794791698
16123.24442255497
15057.263841629028
636.5808633565903
317.1571708917618
257.1836208105087
80.88834953308105
107.03499984741211
5186.034872412682
10399.219059109688
9678.088304042816
0.0
1336.882663011551
0.0
747.4886326789856
520.8514858484268
6300.730584383011
684.6984572410583
449.3686192035675
2.435349941253662
789.8878486156464
355.60916447639465
0.0
5.683129906654358
618.7807997465134
67.60314917564392
170.80663096904755
1133.8326946496964
1276.4707350730896
80.84197950363159
191.11943662166595
1726.7479672431946
104.74380075931549
0.0
0.0
0.0
1560.7186608314514
26.814099311828613
879.7030041217804
567.7461351156235
521.6424701213837
343.63519191741943
141.7339563369751
107.42594850063324
542.5161048173904
0.0
0.0
0.0
3586.460033416748
846.8858672380447
126.72030889987946
229.6642



Elapsed time to compute best fit: 25.314 seconds
Cross-validation score: 0.6554912468098364
Test score: 0.6900212314225053
Best Hyperparameters: {}
3614.892525792122
132345.22207403183
254571.9348757267
21765.823582291603
12582.11554634571
857.7642784118652
6.488669991493225
48.02707052230835
478.67210149765015
65.98619019985199
4693.016420841217
10030.70798254013
1098.0506167411804
0.0
1142.0365328788757
0.0
585.9697558879852
790.0898801088333
12455.089431166649
1362.2058540582657
314.4047920703888
15.663410186767578
257.71610283851624
152.02235889434814
124.70035636425018
7.903540134429932
61.8573899269104
708.8194160461426
24.13913059234619
1119.095895767212
1669.9330682754517
149.2579686641693
162.38440918922424
1466.92877471447
86.66615092754364
0.0
0.0
0.0
691.5964938402176
430.8352475166321
1064.52062189579
603.0961482524872
531.5596899986267
613.7577679157257
249.18454694747925
494.6790418624878
575.9302875995636
0.0
0.0
0.0
623.1035420894623
603.0191252231598
46.61781978607178



Elapsed time to compute best fit: 24.946 seconds
Cross-validation score: 0.6569483339110412
Test score: 0.7222222222222223
Best Hyperparameters: {}
2913.589554667473
151292.40211725235
238815.43718588352
18248.083189606667
5861.877880692482
889.8480242490768
373.86181581020355
240.78637218475342
1115.306603193283
4.57040011882782
7211.138073921204
11981.015842914581
628.2962630987167
0.0
589.0045019388199
0.0
1256.6497068405151
172.6778485774994
17720.251277685165
369.6626912355423
134.99977087974548
56.68860960006714
370.32005500793457
214.63528871536255
3.0329298973083496
150.07714819908142
105.31898295879364
93.12316024303436
389.71411216259
780.5448342561722
1002.5755956172943
259.2217011451721
393.1680740118027
295.2501573562622
107.84428095817566
0.0
0.0
0.0
1809.1600558757782
57.23574995994568
1413.766412138939
428.20326006412506
641.3190870285034
426.0029048919678
116.24347043037415
193.85079324245453
478.8479907512665
0.0
0.0
0.0
242.08571791648865
1011.227478146553
78.4577798



Elapsed time to compute best fit: 23.837 seconds
Cross-validation score: 0.6633172806477103
Test score: 0.6674208144796379
Best Hyperparameters: {}
3450.668680667877
134900.26177811623
246320.11700999737
18260.05816113949
14383.876331210136
331.2158706188202
288.5796480178833
202.89025974273682
822.6163849830627
11.654400110244751
3813.921142578125
18434.661462068558
3146.812151193619
0.0
912.0428513288498
0.0
1531.6241900920868
421.99918377399445
9732.840176463127
779.8614513874054
320.7214324474335
79.919588804245
253.20196425914764
323.87360978126526
242.31800436973572
163.45262837409973
312.1182007789612
296.02457904815674
144.34946167469025
1112.6559740304947
754.8193696737289
380.3165202140808
297.5393627882004
1583.042490720749
37.578540086746216
0.0
0.0
0.0
1533.226368188858
101.67416048049927
1333.7885479927063
369.8293762207031
359.6006724834442
518.6449258327484
155.23454093933105
257.50452733039856
1076.280307173729
0.0
0.0
0.0
201.54965913295746
735.2390073537827
250.98193



Elapsed time to compute best fit: 24.129 seconds
Cross-validation score: 0.6487992459216101
Test score: 0.6981981981981982
Best Hyperparameters: {}
5740.223383426666
119275.21088635921
264777.1491690874
24785.534670472145
8423.169495463371
2911.913065671921
552.7861028909683
109.57628047466278
168.13075351715088
75.36478900909424
5466.991895675659
14045.012198448181
830.4618639945984
0.0
1246.0667102336884
0.0
608.6337050199509
220.13966870307922
13387.757543683052
1325.8727051019669
160.63536190986633
48.318989992141724
678.1533966064453
181.799250125885
253.78459930419922
51.5400915145874
860.8085998296738
139.27472043037415
400.2221782207489
942.3180958032608
561.2649126052856
303.41334080696106
99.20372092723846
1033.8477442264557
214.1761975288391
0.0
0.0
0.0
1407.8046880960464
317.58712005615234
307.58344185352325
395.35042798519135
415.486829161644
349.8075386285782
344.7784948348999
497.6047251224518
837.662269949913
0.0
0.0
0.0
331.9196215867996
1017.4719412326813
128.74870014



Elapsed time to compute best fit: 24.359 seconds
Cross-validation score: 0.6535846341686318
Test score: 0.6783369803063458
Best Hyperparameters: {}
3682.30228972435
150353.86875367165
237315.28250157833
19476.551114320755
11259.049250006676
144.69200038909912
283.74150943756104
25.505269765853882
510.58787727355957
27.59516954421997
6387.89050757885
8925.124127864838
617.856761097908
0.0
522.2795572280884
0.0
1101.706878900528
94.88224172592163
14204.388112306595
1359.3219794034958
387.1172207593918
3.300339937210083
801.4963217973709
27.233709812164307
9.035120129585266
28.617368817329407
432.74547386169434
199.2202911376953
108.31651198863983
945.1114017963409
655.333989739418
279.258868098259
71.54132103919983
670.0572273731232
233.20977771282196
0.0
0.0
0.0
2122.1746814250946
54.78492975234985
960.0333800315857
1151.7643738985062
625.801816701889
433.4945719242096
82.24973034858704
139.20050883293152
690.340135216713
0.0
0.0
0.0
3414.6129162311554
463.2592097520828
158.575509786605



Elapsed time to compute best fit: 24.687 seconds
Cross-validation score: 0.6326773270343115
Test score: 0.7439824945295405
Best Hyperparameters: {}
3827.9563244581223
151665.77243709564
242926.13850140572
20921.70786035061
14935.617159485817
398.3262403011322
284.33660781383514
49.673519253730774
233.12087833881378
33.95003020763397
3569.0202516317368
9358.994644165039
2737.187873363495
0.0
755.4541757106781
0.0
401.85843539237976
699.9383115768433
10466.656769633293
1165.7445590496063
225.53254985809326
57.433690428733826
440.08765959739685
331.0028429031372
0.0
39.89097046852112
192.61885023117065
219.97175085544586
213.09874939918518
772.3270742893219
504.9379560947418
143.54750871658325
46.91006064414978
2509.15036547184
187.049880027771
0.0
0.0
0.0
777.9988243579865
178.3204116821289
1352.7310329675674
576.9758931398392
719.0393929481506
578.3435109853745
1384.221162557602
623.4023902416229
1017.0163655281067
0.0
0.0
0.0
250.77191162109375
1498.008800148964
136.41051065921783
364.



Elapsed time to compute best fit: 23.998 seconds
Cross-validation score: 0.6582619935564178
Test score: 0.6526548672566371
Best Hyperparameters: {}
2020.0217326879501
124197.94665527344
276613.6743899584
24778.159041404724
3109.6794806718826
708.9635375738144
156.67378997802734
112.87904024124146
306.5048339366913
46.80963945388794
8707.808947324753
5237.581553697586
697.4541709423065
0.0
1513.5485887527466
0.0
163.08027029037476
315.00336027145386
10412.577814936638
2035.4195185899734
265.8641265630722
79.35002875328064
294.1018981933594
310.6288158893585
78.51950073242188
112.94764971733093
684.3298888206482
188.60092210769653
202.09462094306946
1031.4367235898972
396.1748526096344
157.43653202056885
90.95791840553284
291.4018212556839
341.98185181617737
0.0
0.0
0.0
1561.738488316536
163.73553955554962
706.7471377849579
1555.635024189949
650.2354265451431
339.99401116371155
100.77075099945068
1540.2611644268036
1373.802124261856
0.0
0.0
0.0
305.17393124103546
708.4672433137894
247.80



Elapsed time to compute best fit: 23.486 seconds
Cross-validation score: 0.6542690909893781
Test score: 0.6644144144144143
Best Hyperparameters: {}
10130.637992501259
128323.21114230156
249111.6965237856
19908.274557352066
9963.574527263641
352.81583046913147
335.4103184938431
55.73941087722778
344.96680450439453
118.06858968734741
4876.084092617035
22366.12491774559
1084.4862806797028
0.0
884.8129296302795
0.0
1519.4710859060287
365.74442052841187
11030.149098873138
599.8510665893555
93.09092080593109
43.100759506225586
258.49598836898804
207.80043041706085
133.9739990234375
231.38349294662476
228.29419136047363
1300.4031128883362
60.0054292678833
981.078155040741
871.5366759300232
314.6663519144058
451.8136900663376
1646.166523694992
238.60762882232666
0.0
0.0
0.0
235.88417768478394
35.33759021759033
1268.077017068863
1258.598923921585
328.2231100797653
415.0371947288513
198.81677746772766
155.47331023216248
786.1935178041458
0.0
0.0
0.0
532.8317304849625
691.3273038864136
168.202508



Elapsed time to compute best fit: 25.271 seconds
Cross-validation score: 0.6508570909299933
Test score: 0.6373626373626373
Best Hyperparameters: {}
2046.3821060657501
156165.3867342472
240784.04500615597
22677.2678027153
4862.713805198669
445.0961289405823
382.2986879348755
241.7077980041504
459.7392489910126
68.84828972816467
9871.331734895706
4780.184259653091
245.67530751228333
0.0
853.1855127811432
0.0
145.51319026947021
510.6766390800476
3899.047342300415
12679.826982736588
369.0166668891907
56.044249296188354
361.7253999710083
349.4624389410019
0.0
34.92064046859741
351.5733312368393
374.53673458099365
197.16006934642792
414.66842901706696
790.8055263757706
364.6935214996338
75.36521172523499
1455.8366470336914
295.3782068490982
0.0
0.0
0.0
1452.2613542079926
129.45289945602417
390.6457087993622
347.2084501981735
465.5151027441025
460.57991886138916
97.21385979652405
354.80063939094543
554.7704457044601
0.0
0.0
0.0
268.65234088897705
1191.7428846359253
117.64685833454132
340.4653



Elapsed time to compute best fit: 24.213 seconds
Cross-validation score: 0.6514534389748142
Test score: 0.6884875846501128
Best Hyperparameters: {}
3202.225294828415
130345.48694121838
272382.4024039507
18270.203966975212
5641.76776599884
640.2324168682098
87.0009696483612
156.1322500705719
87.70023095607758
66.97174978256226
13064.375779032707
2061.515789270401
2319.672952055931
0.0
1149.6874406337738
0.0
194.589200258255
168.4793075323105
6950.6987845897675
2563.884605050087
517.6640971899033
21.299140453338623
155.47191905975342
652.2824554443359
175.76761484146118
22.236860752105713
47.01851046085358
125.67984020709991
93.55357933044434
1072.1519593000412
1119.2111296653748
727.4755408763885
78.01862955093384
1319.9830169677734
598.367419719696
0.0
0.0
0.0
1170.8208216428757
101.35372066497803
470.18413639068604
817.7077802419662
353.9535701274872
438.0341820716858
164.51914644241333
432.49021220207214
621.2758767604828
0.0
0.0
0.0
807.3893021345139
1119.3882200717926
676.586064696



Elapsed time to compute best fit: 24.368 seconds
Cross-validation score: 0.655968260123633
Test score: 0.6774193548387096
Best Hyperparameters: {}
2461.4401832818985
129097.99895203114
285010.1569248438
20222.939078092575
7969.605111002922
72.18995773792267
417.5558989048004
152.02528154850006
323.66053879261017
199.18443048000336
7009.110874295235
1009.8609801530838
592.0391811132431
0.0
766.5293965339661
0.0
341.0325436592102
449.1962374448776
11085.7317070961
2738.320882320404
385.9607357978821
40.67694044113159
978.2537657022476
177.18645071983337
9.214790105819702
56.474751710891724
857.7940567731857
114.32681083679199
81.74010980129242
1279.8691954612732
708.9608013629913
262.7853389978409
76.43405997753143
746.3920657634735
303.5673528909683
0.0
0.0
0.0
602.2977592945099
48.101699471473694
779.1870968341827
445.43890953063965
235.24218940734863
411.63091790676117
91.65039896965027
74.08864033222198
975.7624502182007
0.0
0.0
0.0
116.0137585401535
770.6981302499771
521.73966753482



Elapsed time to compute best fit: 24.545 seconds
Cross-validation score: 0.6625781558468247
Test score: 0.7126948775055679
Best Hyperparameters: {}
2916.8474745750427
138745.48866403103
268321.83186614513
17575.84039604664
6075.68269276619
459.64266538619995
422.30527913570404
145.6091103553772
72.34736931324005
63.43723964691162
10817.503395676613
3043.5994597673416
1012.5904279947281
0.0
949.7245824337006
0.0
274.6296714544296
326.66799771785736
9575.001163721085
853.6274229288101
231.13451290130615
95.4290018081665
429.46609354019165
430.40326833724976
66.04645919799805
105.4408016204834
921.4106262922287
553.5680816173553
504.06511330604553
453.44742572307587
1053.52255320549
103.18435871601105
132.41666173934937
1331.3526186943054
318.5152750015259
0.0
0.0
0.0
1667.1511951684952
144.8253697156906
95.39233040809631
554.8595975637436
387.7598212957382
806.5767682790756
5.424989938735962
207.66921949386597
963.9103214740753
0.0
0.0
0.0
87.0759516954422
119.83622097969055
290.98049652



Elapsed time to compute best fit: 24.337 seconds
Cross-validation score: 0.6599639689298884
Test score: 0.598669623059867
Best Hyperparameters: {}
6381.648015618324
113571.87601828575
264729.3089145422
24416.161705613136
5177.935180783272
1123.4531531333923
335.5261970758438
20.975560188293457
290.88741636276245
69.55518996715546
3308.560069799423
32917.704854130745
3635.9868351221085
0.0
1915.3373510837555
0.0
343.6963143348694
531.6126720905304
10147.5436835289
513.8313772678375
426.12581157684326
19.367289781570435
167.04508018493652
233.81643891334534
69.63508081436157
67.76855993270874
247.20143508911133
197.27238988876343
70.41267895698547
1861.3771716356277
1108.1221442222595
173.63493263721466
146.52730882167816
53.6506906747818
287.3218803405762
0.0
0.0
0.0
1038.550796866417
11.303919792175293
894.9264835119247
607.2782884836197
521.07737159729
310.7588814496994
79.99466896057129
118.60425114631653
393.97835421562195
0.0
0.0
0.0
394.53862833976746
2963.1929775476456
400.208705



Elapsed time to compute best fit: 23.768 seconds
Cross-validation score: 0.6623819992415688
Test score: 0.6703296703296703
Best Hyperparameters: {}
3217.714143395424
131779.98716890812
267887.2763963938
18972.11992394924
9424.136283040047
1697.6475459337234
59.64547038078308
130.20889854431152
689.5072243213654
118.91205084323883
8560.074505209923
1393.4448529481888
1892.6226719617844
0.0
235.37874722480774
0.0
1591.4555716514587
191.6854166984558
11229.530471205711
1259.5866346359253
223.1868315935135
104.09325909614563
350.9601125717163
70.75204968452454
74.55347967147827
42.07959032058716
41.64726996421814
241.4071146249771
103.88354015350342
848.2953256368637
1281.78910446167
174.2887372970581
96.48356878757477
991.6853406429291
139.3658903837204
0.0
0.0
0.0
3332.6785202026367
260.1736704111099
434.7997679710388
784.2101421356201
339.43469071388245
260.4536654949188
199.5909776687622
296.58548641204834
578.7672621011734
0.0
0.0
0.0
161.31176924705505
695.9782695770264
261.604846239



Elapsed time to compute best fit: 24.157 seconds
Cross-validation score: 0.6624179120490962
Test score: 0.6441048034934497
Best Hyperparameters: {}
3167.930928826332
131633.5854665041
276667.30515253544
22361.87057864666
3896.7404539585114
560.967600941658
412.74568343162537
286.9568474292755
79.64693129062653
40.55989992618561
11869.487348675728
152.54904890060425
1228.497966170311
0.0
1442.2901495695114
0.0
165.06804239749908
187.09761214256287
9286.19714307785
1903.325304031372
184.42796874046326
5.712179780006409
681.4547618627548
328.0616192817688
25.403419494628906
57.63928031921387
65.55396044254303
554.321858048439
122.09595930576324
236.1993681192398
1083.1603244543076
83.38783001899719
53.94227862358093
1133.1925189495087
124.43215870857239
0.0
0.0
0.0
845.4211111068726
139.5755307674408
1651.764521241188
897.7352967262268
530.5562019348145
135.3512009382248
14.697319984436035
51.66062927246094
549.4449608325958
0.0
0.0
0.0
686.3585919141769
3474.665672659874
192.087309479713



Elapsed time to compute best fit: 25.307 seconds
Cross-validation score: 0.6297109546669422
Test score: 0.710955710955711
Best Hyperparameters: {}
4688.069385051727
115266.85304963589
257595.3510556221
16819.720991134644
5791.049685120583
13.489189982414246
175.3474702835083
423.11367750167847
138.12881779670715
211.84488034248352
6103.26142513752
27643.032890319824
1375.7876279354095
0.0
763.6518833637238
0.0
1516.287870168686
10289.588239192963
13018.701496839523
168.27736961841583
204.98660898208618
0.0
123.4200006723404
288.6912291049957
150.679292678833
0.0
114.56683778762817
235.1621698141098
50.608728885650635
408.25092017650604
1975.239292383194
271.8572006225586
102.36880803108215
1042.4565992355347
269.12817084789276
0.0
0.0
0.0
1178.668336391449
34.79391002655029
1837.190219283104
1437.3593627214432
1190.4978778362274
545.442668557167
123.41614007949829
370.8107821941376
936.8187322616577
0.0
0.0
0.0
40.405569672584534
1751.2728826999664
64.7829601764679
320.39299631118774
5



Elapsed time to compute best fit: 24.130 seconds
Cross-validation score: 0.6642447966030028
Test score: 0.6206896551724138
Best Hyperparameters: {}
3560.1506626605988
160191.8848158121
237877.85093200207
15384.243628501892
14239.611744761467
154.42617011070251
345.88988649845123
36.76194977760315
468.82546532154083
78.96499907970428
7212.9110362529755
4828.9653170108795
4916.5326628685
0.0
935.7326285839081
0.0
819.0764789581299
1004.1072534322739
12668.560245037079
85.18551075458527
213.76879036426544
29.71880078315735
121.46578025817871
186.27393174171448
384.4575958251953
45.01885986328125
626.3446360826492
139.40624713897705
490.7590947151184
198.00430190563202
1189.662073135376
27.572880268096924
84.86103081703186
1998.7081904411316
260.3115425109863
0.0
0.0
0.0
3646.388638138771
286.7013989686966
1331.175814986229
170.21317923069
1292.6701519489288
125.94164073467255
125.12139940261841
440.99724531173706
953.1394091844559
0.0
0.0
0.0
108.29865205287933
207.06669557094574
86.61642



Elapsed time to compute best fit: 24.239 seconds
Cross-validation score: 0.6631140232063163
Test score: 0.6165919282511211
Best Hyperparameters: {}
2012.512024641037
135941.46870720387
259005.8689248562
20451.338597416878
7935.460236430168
1177.3592953681946
109.82459926605225
179.95831942558289
548.2625218629837
41.08988952636719
9135.135594964027
453.71285569667816
412.0116629600525
0.0
927.3828980922699
0.0
776.6440501213074
119.0048201084137
8107.231496214867
4603.300274372101
190.32425546646118
11.724300384521484
337.44623124599457
170.65693604946136
49.800201416015625
35.99522948265076
139.99797689914703
553.8857228755951
679.7781455516815
430.6633150577545
748.6531789302826
100.01690018177032
382.4942514896393
2247.7294669151306
320.6950001716614
0.0
0.0
0.0
2568.222025871277
224.45473909378052
793.519552230835
1400.8775817155838
802.6753253936768
179.76869344711304
52.80850028991699
638.556563615799
803.3370796442032
0.0
0.0
0.0
339.0275105237961
1046.8840165138245
103.76616907



Elapsed time to compute best fit: 24.168 seconds
Cross-validation score: 0.6430789951947098
Test score: 0.7066381156316915
Best Hyperparameters: {}
4063.1039078235626
154028.16605091095
235681.40906858444
17326.858667969704
14144.424018979073
72.9584412574768
78.64305973052979
81.7079690694809
104.48010957241058
37.42158913612366
5950.508221268654
5529.932804346085
10236.353704452515
0.0
990.7521512508392
0.0
1892.7101955413818
345.45212829113007
6150.217699050903
401.34319400787354
734.2001416683197
51.08722949028015
420.43461894989014
130.73761081695557
666.0258220434189
393.94337940216064
622.8072319030762
337.6912591457367
192.05724096298218
724.914504647255
1921.6730774641037
364.8689032793045
552.2055702209473
616.1119830608368
52.19420027732849
0.0
0.0
0.0
2266.104527235031
130.30941677093506
976.0656156539917
745.3916265964508
624.1046085357666
585.1472980976105
44.37334179878235
1078.0834031105042
660.9894421100616
0.0
0.0
0.0
241.53992986679077
2533.543602347374
84.1903293132



Elapsed time to compute best fit: 24.728 seconds
Cross-validation score: 0.6727209713357204
Test score: 0.7317073170731708
Best Hyperparameters: {}
7510.813710331917
141400.11345922947
237914.55100107193
16181.920970916748
13615.956318616867
1338.1860646009445
69.82121992111206
105.51436305046082
926.5127701759338
29.264339923858643
3825.797256231308
14706.497543811798
1240.0417939424515
0.0
319.92206478118896
0.0
1620.8119858503342
689.9739060401917
15249.40955710411
284.5135608911514
245.20226001739502
33.80213022232056
136.7675085067749
229.9059695005417
176.21524262428284
172.729159116745
353.5142033100128
215.60588836669922
129.8515911102295
1103.4070143699646
858.180905342102
93.56321823596954
240.0597701072693
1058.701688528061
114.98631024360657
0.0
0.0
0.0
2273.1975831985474
391.98634481430054
1376.8467185497284
896.581549167633
1328.309278011322
949.1887774467468
188.52766847610474
587.0405035018921
643.6482973098755
0.0
0.0
0.0
29.594260811805725
684.9876180887222
238.661548



Elapsed time to compute best fit: 25.463 seconds
Cross-validation score: 0.661534104378134
Test score: 0.6772009029345373
Best Hyperparameters: {}
3987.648140668869
120084.3153706789
250733.71435523033
22615.134759902954
8841.97560763359
317.5972958803177
223.78729140758514
265.33182179927826
431.33694887161255
45.72425997257233
3908.025162220001
24160.724893450737
2270.580335021019
0.0
1300.1067433357239
0.0
1996.4054052829742
296.30073952674866
15019.686136722565
641.2034912109375
447.34255850315094
27.755699515342712
890.2360699176788
417.3561294078827
71.72014045715332
72.47376024723053
505.8851275444031
357.624089717865
69.10946035385132
442.7656874656677
803.6847891807556
583.3296303749084
406.79996168613434
1141.381194114685
51.37691009044647
0.0
0.0
0.0
901.6021410226822
375.62306666374207
1647.5916349887848
275.15572917461395
784.2638322114944
403.96686232089996
131.65930128097534
359.7368264198303
466.0140620470047
0.0
0.0
0.0
388.8613168001175
433.6684800386429
249.470923185



Elapsed time to compute best fit: 24.240 seconds
Cross-validation score: 0.6451085786885212
Test score: 0.6621004566210045
Best Hyperparameters: {}
5358.361260771751
140775.7478723526
246329.33164286613
14108.844082593918
13346.584279179573
1011.1921602487564
71.09429025650024
140.47922146320343
290.18168091773987
25.56846022605896
5282.141214728355
14374.52966928482
5560.697996020317
0.0
652.7652189731598
0.0
1584.0387312173843
862.868940114975
10318.108355164528
2413.2647716999054
157.04443335533142
36.369959592819214
541.5767103433609
319.48031628131866
196.87389755249023
8.682570219039917
175.4728103876114
540.3915240764618
391.94984769821167
609.3386254310608
834.1995046138763
257.6520837545395
140.5442202091217
3174.6028941869736
161.43561935424805
0.0
0.0
0.0
1383.0579993724823
101.0883811712265
1289.280622124672
560.6182779073715
968.9638293981552
501.8948372602463
77.35969913005829
292.54773712158203
543.0926362276077
0.0
0.0
0.0
465.2826374769211
1002.1094895601273
181.660141