In [None]:
#Import Python Libraries
import pandas as pd
import pickle
import time
import numpy as np

#Import Self-written Functions
import os
import sys
src_dir = os.path.join(os.getcwd(), '..', 'src')
sys.path.append(src_dir)

from d00_utils.calculateTimeDifference import calculateTimeDifference #Function to calc time difference
from d01_data.loadCommits import loadCommits #Function to load SVN data
from d02_intermediate.cleanCommitData import cleanCommitData #Function to clean commit data
from d02_intermediate.cleanJiraData import cleanJiraData #Function to clean JIRA data

from d03_processing.createFittedTF_IDF import createFittedTF_IDF #Function to create a fitted TF-IDF model from a count vectorizer and a corpus
from d03_processing.createCorpusFromDocumentList import createCorpusFromDocumentList #Function to create a corpus
from d03_processing.checkValidityTrace import checkValidityTrace #Function to see if a trace is valid
from d03_processing.calculateTimeDif import calculateTimeDif #Calculate the time difference between 2 dates in seconds
from d03_processing.checkFullnameEqualsEmail import checkFullnameEqualsEmail #Check if fullName is equal to the email
from d03_processing.calculateCosineSimilarity import calculateCosineSimilarity #Calculate the cos similarity
from d03_processing.calculateDocumentStatistics import *

from d03_processing.calculateQueryQuality import *
from d03_processing.normalize_data import *

from d04_model_evaluation.model_evaluation import *

#Do not truncate long cell values when a DataFrame is displayed
pd.options.display.max_colwidth = None

#Show every column and every row instead of an abbreviated view
pd.options.display.max_columns = None
pd.options.display.max_rows = None

print("done")

# 1. Load Raw Data

In [None]:
#Raw JIRA issues, read from the CSV export into a pandas DataFrame
#(the earlier Excel source '../data/01_raw/JIRA_MXShop.xlsx' is not loaded here)
jira_raw_csv = '../data/01_raw/JIRA_DataProcessing.csv'
jira_df_raw = pd.read_csv(jira_raw_csv)

#Raw SVN commits, loaded from the text dump by loadCommits
svn_raw_dump = '../data/01_raw/data-processing-svn-dump.txt'
svn_df_raw = loadCommits(svn_raw_dump)

# 2. Clean Raw Data
## 2.1 Clean Raw Data - SVN Data
Clean the raw data of the SVN files

In [None]:
#Time the SVN cleaning step
startTime = time.time()

#Clean the raw commits
svn_df_clean = cleanCommitData(svn_df_raw)

#Persist the cleaned commits in both intermediate formats (XLSX for inspection, pickle for reloading)
svn_df_clean.to_excel("../data/02_intermediate/svn_df_clean.xlsx", index=False)
svn_df_clean.to_pickle("../data/02_intermediate/svn_df_clean.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished cleaning after " + timeDifference)

## 2.2 Clean Raw Data - JIRA Data
Clean the raw data of the JIRA files

In [None]:
#The raw export names the key column 'Key'; rename it to 'Issue key'
jira_df_raw = jira_df_raw.rename(columns={'Key': 'Issue key'})

#Clean the JIRA issues (comment cleaning is switched off; commentAmount is passed as 39)
jira_df_clean = cleanJiraData(dataFrame=jira_df_raw,
                              cleanComments=False,
                              commentAmount=39)

#Persist the cleaned issues in both intermediate formats (XLSX for inspection, pickle for reloading)
jira_df_clean.to_excel("../data/02_intermediate/jira_df_clean.xlsx", index=False)
jira_df_clean.to_pickle("../data/02_intermediate/jira_df_clean.pkl")

## 2.3 Clean Raw Data - Create Corpora
Create the corpora for JIRA Issues

In [None]:
#Build one corpus per JIRA text field
jira_corpus_summary = createCorpusFromDocumentList(jira_df_clean.Summary)
jira_corpus_description = createCorpusFromDocumentList(jira_df_clean.Description)

#Combined corpus: summary and description of the same issue, separated by one space
jira_corpus_all = [" ".join((summary_doc, description_doc))
                   for summary_doc, description_doc in zip(jira_corpus_summary,
                                                           jira_corpus_description)]

#Save each corpus as an intermediate pickle
for pickle_path, corpus in (
    ('../data/02_intermediate/jira_corpus_summary.pkl', jira_corpus_summary),
    ('../data/02_intermediate/jira_corpus_description.pkl', jira_corpus_description),
    ('../data/02_intermediate/jira_corpus_all.pkl', jira_corpus_all),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(corpus, f)

Create the corpora for SVN

In [None]:
#Corpus of the commit log messages
svn_corpus_log = createCorpusFromDocumentList(svn_df_clean.Logs)

#Corpus of the unit names touched by each commit
svn_corpus_unitname = createCorpusFromDocumentList(svn_df_clean.Unit_names)

#Corpus of the entire commit: log message and unit names combined with '+'
svn_commit_documents = svn_df_clean.Logs + svn_df_clean.Unit_names
svn_corpus_all = createCorpusFromDocumentList(svn_commit_documents)

#Save each corpus as an intermediate pickle
for pickle_path, corpus in (
    ('../data/02_intermediate/svn_corpus_log.pkl', svn_corpus_log),
    ('../data/02_intermediate/svn_corpus_unitname.pkl', svn_corpus_unitname),
    ('../data/02_intermediate/svn_corpus_all.pkl', svn_corpus_all),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(corpus, f)

# 3. Preprocess Data

In [None]:
#Run this cell after a kernel restart to continue from the previously saved intermediate results.
#Load JIRA corpora
jira_corpus_summary, jira_corpus_description, jira_corpus_all = (
    pd.read_pickle(pickle_path) for pickle_path in (
        "../data/02_intermediate/jira_corpus_summary.pkl",
        "../data/02_intermediate/jira_corpus_description.pkl",
        "../data/02_intermediate/jira_corpus_all.pkl",
    )
)

#Load SVN corpora
svn_corpus_log, svn_corpus_unitname, svn_corpus_all = (
    pd.read_pickle(pickle_path) for pickle_path in (
        "../data/02_intermediate/svn_corpus_log.pkl",
        "../data/02_intermediate/svn_corpus_unitname.pkl",
        "../data/02_intermediate/svn_corpus_all.pkl",
    )
)

#Load clean datasets
jira_df_clean, svn_df_clean = (
    pd.read_pickle(pickle_path) for pickle_path in (
        "../data/02_intermediate/jira_df_clean.pkl",
        "../data/02_intermediate/svn_df_clean.pkl",
    )
)

## 3.0 Preprocess Data - Create cartesian product JIRA x Commits

In [None]:
#Pair every JIRA issue with every commit (cross join)
cartesian_df = jira_df_clean.merge(svn_df_clean, how='cross')

#Causality rule: a pair is dropped when the issue was created after the commit was made
created_after_commit = cartesian_df.Jira_created_date > cartesian_df.Commit_date
cartesian_df = cartesian_df.drop(index=cartesian_df.index[created_after_commit])

#Persist the remaining pairs as a pickle
cartesian_df.to_pickle("../data/03_processed/cartesian_df.pkl")

#Also write a temporary XLSX copy
cartesian_df.to_excel("../data/02_intermediate/cartesian_df.xlsx", index=False)

In [None]:
# Reload the pickled cartesian product of JIRA issues and commits
cartesian_df = pd.read_pickle('../data/03_processed/cartesian_df.pkl')

# Preview the first five rows
cartesian_df.head(n=5)

## 3.1 Preprocess Data - Create Labels

In [None]:
#Create a new DataFrame for the trace labels
labels_df = pd.DataFrame()

#One label per trace: the checkValidityTrace result for the issue key of the JIRA issue and the issue key of the commit
labels_df["is_valid"] = cartesian_df.apply(
    lambda trace: checkValidityTrace(trace.Issue_key_jira, trace.Issue_key_commit),
    axis=1)
print("Finished creating labels for dataProcessing")

#Persist the labels
labels_df.to_pickle("../data/03_processed/labels_df.pkl")

In [None]:
#Show only the traces whose is_valid label equals True
labels_df.loc[labels_df["is_valid"].eq(True)]

## 3.2 Preprocess Data - Create Process-Related Features

In [None]:
#Create a new DataFrame for the process-related features
features_process_related = pd.DataFrame()

#f1: checkFullnameEqualsEmail result for the issue assignee and the commit email
features_process_related['f1_assignee_is_commiter'] = cartesian_df.apply(
    lambda trace: checkFullnameEqualsEmail(trace.Assignee, trace.Email), axis=1)

#f2-f4: calculateTimeDif between an issue date (created / updated / resolved) and the commit date
features_process_related['f2_timedif_issuecreation_and_commitcreation'] = cartesian_df.apply(
    lambda trace: calculateTimeDif(trace.Jira_created_date, trace.Commit_date), axis=1)
features_process_related['f3_timedif_issueupdated_and_commitcreation'] = cartesian_df.apply(
    lambda trace: calculateTimeDif(trace.Jira_updated_date, trace.Commit_date), axis=1)
features_process_related['f4_timedif_issueresolved_and_commitcreation'] = cartesian_df.apply(
    lambda trace: calculateTimeDif(trace.Jira_resolved_date, trace.Commit_date), axis=1)
print("Finished data Processing")

#Persist the process-related features
features_process_related.to_pickle("../data/03_processed/features_process_related.pkl")

## 3.3 Preprocess Data - Create Document Statistics Features

In [None]:
#Time the document statistics computation
startTime = time.time()

#Create a new DataFrame for the document statistics features
features_document_statistics = pd.DataFrame()

#f5 / f6: calculateTotalWordCount of the JIRA text and of the commit text of each trace
features_document_statistics["f5_total_terms_jira"] = cartesian_df.apply(
    lambda trace: calculateTotalWordCount(trace.Jira_natural_text), axis=1)
features_document_statistics["f6_total_terms_svn"] = cartesian_df.apply(
    lambda trace: calculateTotalWordCount(trace.Commit_natural_text), axis=1)

#f7 / f8: calculateUniqueWordCount of the JIRA text and of the commit text of each trace
features_document_statistics["f7_unique_terms_jira"] = cartesian_df.apply(
    lambda trace: calculateUniqueWordCount(trace.Jira_natural_text), axis=1)
features_document_statistics["f8_unique_terms_svn"] = cartesian_df.apply(
    lambda trace: calculateUniqueWordCount(trace.Commit_natural_text), axis=1)

#f9-f11: calculateOverlapBetweenDocuments for the same text pair, in modes 'list1', 'list2' and 'union'
features_document_statistics["f9_overlap_terms_compared_to_jira"] = cartesian_df.apply(
    lambda trace: calculateOverlapBetweenDocuments(trace.Jira_natural_text, trace.Commit_natural_text, 'list1'),
    axis=1)
features_document_statistics["f10_overlap_terms_to_svn"] = cartesian_df.apply(
    lambda trace: calculateOverlapBetweenDocuments(trace.Jira_natural_text, trace.Commit_natural_text, 'list2'),
    axis=1)
features_document_statistics["f11_overlap_terms_to_union"] = cartesian_df.apply(
    lambda trace: calculateOverlapBetweenDocuments(trace.Jira_natural_text, trace.Commit_natural_text, 'union'),
    axis=1)

#Persist the features as a pickle and as a temporary XLSX file
features_document_statistics.to_pickle("../data/03_processed/features_document_statistics.pkl")
features_document_statistics.to_excel("../data/03_processed/features_document_statistics.xlsx", index=False)

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished creating document statistics in " + timeDifference)

## 3.4 Preprocess Data - Create Information Retrieval Features
### 3.4.1 Create tfidf for the corpus

In [None]:
#Create an empty DataFrame that collects the information retrieval (IR) features.
#The IR cells below each assign one column to it; re-running this cell discards columns already added.
features_information_retrieval = pd.DataFrame() 

In [None]:
#Instantiate the count vectorizers and create the fitted TF-IDF for each corpus
from sklearn.feature_extraction.text import CountVectorizer

#One separate CountVectorizer per SVN corpus
svn_all_countvectorizer = CountVectorizer()
svn_log_countvectorizer = CountVectorizer()
svn_unitname_countvectorizer = CountVectorizer()

#One separate CountVectorizer per JIRA corpus (default settings)
jira_all_countvectorizer = CountVectorizer()
jira_summary_countvectorizer = CountVectorizer()
jira_description_countvectorizer = CountVectorizer()

#TF-IDF for the SVN corpora, each built with its own vectorizer
svn_all_tfidf = createFittedTF_IDF(svn_all_countvectorizer, svn_corpus_all)
svn_log_tfidf = createFittedTF_IDF(svn_log_countvectorizer, svn_corpus_log)
svn_unitname_tfidf = createFittedTF_IDF(svn_unitname_countvectorizer, svn_corpus_unitname)

#TF-IDF for the JIRA corpora, each built with its own vectorizer
jira_all_tfidf = createFittedTF_IDF(jira_all_countvectorizer, jira_corpus_all)
jira_summary_tfidf = createFittedTF_IDF(jira_summary_countvectorizer, jira_corpus_summary)
jira_description_tfidf = createFittedTF_IDF(jira_description_countvectorizer, jira_corpus_description)

#### IR Features - Log Message and Summary

In [None]:
#Time this feature computation
startTime = time.time()

#f12: per-trace calculateCosineSimilarity of JIRA summary and SVN log, using the SVN-log vectorizer and TF-IDF
features_information_retrieval["f12_ir_log_and_summary_log_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Summary, trace.Logs, svn_log_countvectorizer, svn_log_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f13: per-trace calculateCosineSimilarity of JIRA summary and SVN log, using the JIRA-summary vectorizer and TF-IDF
features_information_retrieval["f13_ir_log_and_summary_summary_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Summary, trace.Logs, jira_summary_countvectorizer, jira_summary_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Log Message and Description

In [None]:
#Start timer
startTime = time.time() 

#f14: per-trace calculateCosineSimilarity of the JIRA description and the SVN log message,
#using the vectorizer and TF-IDF that were fitted on the SVN log corpus.
#The commit-side text must be x.Logs: this is the "log and description" feature and the
#log vectorizer is used. Passing x.Unit_names here would duplicate the inputs of the
#unit-name/description features (f20/f21) and score them against the wrong vocabulary.
features_information_retrieval["f14_ir_log_and_description_log_as_query"] = cartesian_df.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, 
                                                                                                                        svn_log_countvectorizer, 
                                                                                                                        svn_log_tfidf), axis=1)

#Save results in pickle (overwrites the IR feature file so it includes this column)
features_information_retrieval.to_pickle(path= "../data/03_processed/features_information_retrieval.pkl")

#Report how long the computation took
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f15: per-trace calculateCosineSimilarity of JIRA description and SVN log, using the JIRA-description vectorizer and TF-IDF
features_information_retrieval["f15_ir_log_and_description_description_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Description, trace.Logs, jira_description_countvectorizer, jira_description_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Log Message and JIRA All-Natural Text

In [None]:
#Time this feature computation
startTime = time.time()

#f16: per-trace calculateCosineSimilarity of the full JIRA text and SVN log, using the SVN-log vectorizer and TF-IDF
features_information_retrieval["f16_ir_log_and_jira_all_log_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Jira_natural_text, trace.Logs, svn_log_countvectorizer, svn_log_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f17: per-trace calculateCosineSimilarity of the full JIRA text and SVN log, using the JIRA-all vectorizer and TF-IDF
features_information_retrieval["f17_ir_log_and_jira_all_jira_all_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Jira_natural_text, trace.Logs, jira_all_countvectorizer, jira_all_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Unit Names and Summary

In [None]:
#Time this feature computation
startTime = time.time()

#f18: per-trace calculateCosineSimilarity of JIRA summary and SVN unit names, using the unit-name vectorizer and TF-IDF
features_information_retrieval["f18_ir_unitname_and_summary_unitname_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Summary, trace.Unit_names, svn_unitname_countvectorizer, svn_unitname_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f19: per-trace calculateCosineSimilarity of JIRA summary and SVN unit names, using the JIRA-summary vectorizer and TF-IDF
features_information_retrieval["f19_ir_unitname_and_summary_summary_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Summary, trace.Unit_names, jira_summary_countvectorizer, jira_summary_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Unit Names and Description

In [None]:
#Time this feature computation
startTime = time.time()

#f20: per-trace calculateCosineSimilarity of JIRA description and SVN unit names, using the unit-name vectorizer and TF-IDF
features_information_retrieval["f20_ir_unitname_and_description_unitname_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Description, trace.Unit_names, svn_unitname_countvectorizer, svn_unitname_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f21: per-trace calculateCosineSimilarity of JIRA description and SVN unit names, using the JIRA-description vectorizer and TF-IDF
features_information_retrieval["f21_ir_unitname_and_description_description_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Description, trace.Unit_names, jira_description_countvectorizer, jira_description_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Unit Names and JIRA All-Natural Text

In [None]:
#Time this feature computation
startTime = time.time()

#f22: per-trace calculateCosineSimilarity of the full JIRA text and SVN unit names, using the unit-name vectorizer and TF-IDF
features_information_retrieval["f22_ir_unitname_and_jira_all_unitname_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Jira_natural_text, trace.Unit_names, svn_unitname_countvectorizer, svn_unitname_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f23: per-trace calculateCosineSimilarity of the full JIRA text and SVN unit names, using the JIRA-all vectorizer and TF-IDF
features_information_retrieval["f23_ir_unitname_and_jira_all_jira_all_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Jira_natural_text, trace.Unit_names, jira_all_countvectorizer, jira_all_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Revision All-Natural Text and Summary

In [None]:
#Time this feature computation
startTime = time.time()

#f24: per-trace calculateCosineSimilarity of the full commit text and JIRA summary, using the SVN-all vectorizer and TF-IDF
#NOTE(review): the commit text is passed first here, while the other IR cells pass the JIRA field first -
#confirm calculateCosineSimilarity does not depend on argument order
features_information_retrieval["f24_ir_svn_all_and_summary_svn_all_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Commit_natural_text, trace.Summary, svn_all_countvectorizer, svn_all_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f25: per-trace calculateCosineSimilarity of the full commit text and JIRA summary, using the JIRA-summary vectorizer and TF-IDF
#NOTE(review): the commit text is passed first here, while the other IR cells pass the JIRA field first -
#confirm calculateCosineSimilarity does not depend on argument order
features_information_retrieval["f25_ir_svn_all_and_summary_summary_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Commit_natural_text, trace.Summary, jira_summary_countvectorizer, jira_summary_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Revision All-Natural Text and Description

In [None]:
#Time this feature computation
startTime = time.time()

#f26: per-trace calculateCosineSimilarity of the full commit text and JIRA description, using the SVN-all vectorizer and TF-IDF
#NOTE(review): the commit text is passed first here, while the other IR cells pass the JIRA field first -
#confirm calculateCosineSimilarity does not depend on argument order
features_information_retrieval["f26_ir_svn_all_and_description_svn_all_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Commit_natural_text, trace.Description, svn_all_countvectorizer, svn_all_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f27: per-trace calculateCosineSimilarity of the full commit text and JIRA description, using the JIRA-description vectorizer and TF-IDF
#NOTE(review): the commit text is passed first here, while the other IR cells pass the JIRA field first -
#confirm calculateCosineSimilarity does not depend on argument order
features_information_retrieval["f27_ir_svn_all_and_description_description_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Commit_natural_text, trace.Description, jira_description_countvectorizer, jira_description_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

#### IR Features - Revision All-Natural Text and JIRA All-Natural Text

In [None]:
#Time this feature computation
startTime = time.time()

#f28: per-trace calculateCosineSimilarity of the full JIRA text and full commit text, using the SVN-all vectorizer and TF-IDF
features_information_retrieval["f28_ir_svn_all_and_jira_all_svn_all_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Jira_natural_text, trace.Commit_natural_text, svn_all_countvectorizer, svn_all_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

In [None]:
#Time this feature computation
startTime = time.time()

#f29: per-trace calculateCosineSimilarity of the full JIRA text and full commit text, using the JIRA-all vectorizer and TF-IDF
features_information_retrieval["f29_ir_svn_all_and_jira_all_jira_all_as_query"] = cartesian_df.apply(
    lambda trace: calculateCosineSimilarity(
        trace.Jira_natural_text, trace.Commit_natural_text, jira_all_countvectorizer, jira_all_tfidf),
    axis=1)

#Overwrite the IR feature pickle so it includes this column
features_information_retrieval.to_pickle("../data/03_processed/features_information_retrieval.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished after " + timeDifference)

## 3.7 Query Quality

In [None]:
#Number of documents (rows) in the cleaned JIRA and SVN datasets
jira_documentcount = jira_df_clean.shape[0]
svn_documentcount = svn_df_clean.shape[0]

#### IDF Scores (SVN as Query)

In [None]:
#Time this feature computation
startTime = time.time()

#Create new DataFrames
#NOTE(review): processedData_SVN_dataProcessingFeaturesIDF is not used anywhere in this cell
processedData_SVN_dataProcessingFeaturesIDF = pd.DataFrame()
features_qq_specificity = pd.DataFrame()

#Temporary column: calcIDFList result for the full commit text of each trace (SVN-all vectorizer and TF-IDF)
features_qq_specificity["idf_svn_all_as_query"] = cartesian_df.apply(
    lambda trace: calcIDFList(trace.Commit_natural_text, svn_all_countvectorizer, svn_all_tfidf),
    axis=1)

#f30-f32: calcAvgIDF / calcMaxIDF / calcDevIDF of the temporary column
features_qq_specificity["f30_avgidf_svn_all_as_query"] = features_qq_specificity.apply(
    lambda row: calcAvgIDF(row.idf_svn_all_as_query), axis=1)
features_qq_specificity["f31_maxidf_svn_all_as_query"] = features_qq_specificity.apply(
    lambda row: calcMaxIDF(row.idf_svn_all_as_query), axis=1)
features_qq_specificity["f32_devidf_svn_all_as_query"] = features_qq_specificity.apply(
    lambda row: calcDevIDF(row.idf_svn_all_as_query), axis=1)

#Remove the temporary column again
features_qq_specificity = features_qq_specificity.drop(columns='idf_svn_all_as_query')

#Overwrite the query quality pickle so it includes these columns
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished creating query quality features in " + timeDifference)

#### IDF Scores (SVNLogs as Query)

In [None]:
#Time this feature computation
startTime = time.time()

#Temporary column: calcIDFList result for the SVN log of each trace (SVN-log vectorizer and TF-IDF)
features_qq_specificity["idf_log_as_query"] = cartesian_df.apply(
    lambda trace: calcIDFList(trace.Logs, svn_log_countvectorizer, svn_log_tfidf),
    axis=1)

#f33-f35: calcAvgIDF / calcMaxIDF / calcDevIDF of the temporary column
features_qq_specificity["f33_avgidf_log_as_query"] = features_qq_specificity.apply(
    lambda row: calcAvgIDF(row.idf_log_as_query), axis=1)
features_qq_specificity["f34_maxidf_log_as_query"] = features_qq_specificity.apply(
    lambda row: calcMaxIDF(row.idf_log_as_query), axis=1)
features_qq_specificity["f35_devidf_log_as_query"] = features_qq_specificity.apply(
    lambda row: calcDevIDF(row.idf_log_as_query), axis=1)

#Remove the temporary column again
features_qq_specificity = features_qq_specificity.drop(columns='idf_log_as_query')

#Overwrite the query quality pickle so it includes these columns
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished creating query quality features in " + timeDifference)

#### IDF Scores (SVNUnitNames as Query)

In [None]:
#Time this feature computation
startTime = time.time()

#Temporary column: calcIDFList result for the SVN unit names of each trace (unit-name vectorizer and TF-IDF)
features_qq_specificity["idf_unitname_as_query"] = cartesian_df.apply(
    lambda trace: calcIDFList(trace.Unit_names, svn_unitname_countvectorizer, svn_unitname_tfidf),
    axis=1)

#f36-f38: calcAvgIDF / calcMaxIDF / calcDevIDF of the temporary column
features_qq_specificity["f36_avgidf_unitname_as_query"] = features_qq_specificity.apply(
    lambda row: calcAvgIDF(row.idf_unitname_as_query), axis=1)
features_qq_specificity["f37_maxidf_unitname_as_query"] = features_qq_specificity.apply(
    lambda row: calcMaxIDF(row.idf_unitname_as_query), axis=1)
features_qq_specificity["f38_devidf_unitname_as_query"] = features_qq_specificity.apply(
    lambda row: calcDevIDF(row.idf_unitname_as_query), axis=1)

#Remove the temporary column again
features_qq_specificity = features_qq_specificity.drop(columns='idf_unitname_as_query')

#Overwrite the query quality pickle so it includes these columns
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished creating query quality features in " + timeDifference)

#### IDF Scores (JIRA as Query)

In [None]:
#Time this feature computation
startTime = time.time()

#Temporary column: calcIDFList result for the full JIRA text of each trace (JIRA-all vectorizer and TF-IDF)
features_qq_specificity["idf_jira_all_as_query"] = cartesian_df.apply(
    lambda trace: calcIDFList(trace.Jira_natural_text, jira_all_countvectorizer, jira_all_tfidf),
    axis=1)

#f39-f41: calcAvgIDF / calcMaxIDF / calcDevIDF of the temporary column
features_qq_specificity["f39_avgidf_jira_all_as_query"] = features_qq_specificity.apply(
    lambda row: calcAvgIDF(row.idf_jira_all_as_query), axis=1)
features_qq_specificity["f40_maxidf_jira_all_as_query"] = features_qq_specificity.apply(
    lambda row: calcMaxIDF(row.idf_jira_all_as_query), axis=1)
features_qq_specificity["f41_devidf_jira_all_as_query"] = features_qq_specificity.apply(
    lambda row: calcDevIDF(row.idf_jira_all_as_query), axis=1)

#Remove the temporary column again
features_qq_specificity = features_qq_specificity.drop(columns='idf_jira_all_as_query')

#Overwrite the query quality pickle so it includes these columns
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

#Report the elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(endTime=endTime, startTime=startTime)
print("Finished creating query quality features in " + timeDifference)

##### IDF Scores (JIRA Summaries as Query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcIDFList output for the Summary of every row in cartesian_df.
features_qq_specificity["idf_jira_summary_as_query"] = cartesian_df.apply(
    lambda pair: calcIDFList(pair["Summary"], jira_summary_countvectorizer, jira_summary_tfidf),
    axis=1,
)

# f42-f44: one feature per statistic (calcAvgIDF, calcMaxIDF, calcDevIDF) of the temporary column.
for stat_column, stat_func in [
    ("f42_avgidf_jira_summary_as_query", calcAvgIDF),
    ("f43_maxidf_jira_summary_as_query", calcMaxIDF),
    ("f44_devidf_jira_summary_as_query", calcDevIDF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["idf_jira_summary_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["idf_jira_summary_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

##### IDF Scores (JIRA Descriptions as Query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcIDFList output for the Description of every row in cartesian_df.
features_qq_specificity["idf_jira_description_as_query"] = cartesian_df.apply(
    lambda pair: calcIDFList(pair["Description"], jira_description_countvectorizer, jira_description_tfidf),
    axis=1,
)

# f45-f47: one feature per statistic (calcAvgIDF, calcMaxIDF, calcDevIDF) of the temporary column.
for stat_column, stat_func in [
    ("f45_avgidf_jira_description_as_query", calcAvgIDF),
    ("f46_maxidf_jira_description_as_query", calcMaxIDF),
    ("f47_devidf_jira_description_as_query", calcDevIDF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["idf_jira_description_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["idf_jira_description_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### ICTF Scores (SVN as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcICTFList output for the Commit_natural_text of every row in cartesian_df.
features_qq_specificity["ictf_svn_all_as_query"] = cartesian_df.apply(
    lambda pair: calcICTFList(pair["Commit_natural_text"], svn_all_countvectorizer, svn_documentcount),
    axis=1,
)

# f48-f50: one feature per ICTF statistic; only calcAvgICTF also takes the SVN document count.
for stat_column, stat_func in [
    ("f48_avgictf_svn_all_as_query", lambda ictf_values: calcAvgICTF(ictf_values, svn_documentcount)),
    ("f49_maxictf_svn_all_as_query", calcMaxICTF),
    ("f50_devictf_svn_all_as_query", calcDevICTF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["ictf_svn_all_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["ictf_svn_all_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### ICTF Scores (SVNLogs as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcICTFList output for the Logs of every row in cartesian_df.
features_qq_specificity["ictf_svn_log_as_query"] = cartesian_df.apply(
    lambda pair: calcICTFList(pair["Logs"], svn_log_countvectorizer, svn_documentcount),
    axis=1,
)

# f51-f53: one feature per ICTF statistic; only calcAvgICTF also takes the SVN document count.
for stat_column, stat_func in [
    ("f51_avgictf_svn_log_as_query", lambda ictf_values: calcAvgICTF(ictf_values, svn_documentcount)),
    ("f52_maxictf_svn_log_as_query", calcMaxICTF),
    ("f53_devictf_svn_log_as_query", calcDevICTF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["ictf_svn_log_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["ictf_svn_log_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### ICTF Scores (SVNUnitNames as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcICTFList output for the Unit_names of every row in cartesian_df.
features_qq_specificity["ictf_svn_unitname_as_query"] = cartesian_df.apply(
    lambda pair: calcICTFList(pair["Unit_names"], svn_unitname_countvectorizer, svn_documentcount),
    axis=1,
)

# f54-f56: one feature per ICTF statistic; only calcAvgICTF also takes the SVN document count.
for stat_column, stat_func in [
    ("f54_avgictf_svn_unitname_as_query", lambda ictf_values: calcAvgICTF(ictf_values, svn_documentcount)),
    ("f55_maxictf_svn_unitname_as_query", calcMaxICTF),
    ("f56_devictf_svn_unitname_as_query", calcDevICTF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["ictf_svn_unitname_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["ictf_svn_unitname_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### ICTF Scores (JIRA as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcICTFList output for the Jira_natural_text of every row in cartesian_df.
features_qq_specificity["ictf_jira_all_as_query"] = cartesian_df.apply(
    lambda pair: calcICTFList(pair["Jira_natural_text"], jira_all_countvectorizer, jira_documentcount),
    axis=1,
)

# f57-f59: one feature per ICTF statistic; only calcAvgICTF also takes the JIRA document count.
for stat_column, stat_func in [
    ("f57_avgictf_jira_all_as_query", lambda ictf_values: calcAvgICTF(ictf_values, jira_documentcount)),
    ("f58_maxictf_jira_all_as_query", calcMaxICTF),
    ("f59_devictf_jira_all_as_query", calcDevICTF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["ictf_jira_all_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["ictf_jira_all_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### ICTF Scores (JIRA Summaries as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcICTFList output for the Summary of every row in cartesian_df.
features_qq_specificity["ictf_jira_summary_as_query"] = cartesian_df.apply(
    lambda pair: calcICTFList(pair["Summary"], jira_summary_countvectorizer, jira_documentcount),
    axis=1,
)

# f60-f62: one feature per ICTF statistic; only calcAvgICTF also takes the JIRA document count.
for stat_column, stat_func in [
    ("f60_avgictf_jira_summary_as_query", lambda ictf_values: calcAvgICTF(ictf_values, jira_documentcount)),
    ("f61_maxictf_jira_summary_as_query", calcMaxICTF),
    ("f62_devictf_jira_summary_as_query", calcDevICTF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["ictf_jira_summary_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["ictf_jira_summary_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### ICTF Scores (JIRA Descriptions as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcICTFList output for the Description of every row in cartesian_df.
features_qq_specificity["ictf_jira_description_as_query"] = cartesian_df.apply(
    lambda pair: calcICTFList(pair["Description"], jira_description_countvectorizer, jira_documentcount),
    axis=1,
)

# f63-f65: one feature per ICTF statistic; only calcAvgICTF also takes the JIRA document count.
for stat_column, stat_func in [
    ("f63_avgictf_jira_description_as_query", lambda ictf_values: calcAvgICTF(ictf_values, jira_documentcount)),
    ("f64_maxictf_jira_description_as_query", calcMaxICTF),
    ("f65_devictf_jira_description_as_query", calcDevICTF),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["ictf_jira_description_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["ictf_jira_description_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Entropy (SVN as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcEntropyList output for each row's Commit_natural_text, given the
# Commit_natural_text column of svn_df_clean.
features_qq_specificity["entropy_svn_all_as_query"] = cartesian_df.apply(
    lambda pair: calcEntropyList(
        pair["Commit_natural_text"],
        svn_all_countvectorizer,
        svn_documentcount,
        svn_df_clean["Commit_natural_text"],
    ),
    axis=1,
)

# f66-f69: one feature per entropy statistic (avg, med, max, dev) of the temporary column.
for stat_column, stat_func in [
    ("f66_avgentropy_svn_all_as_query", calcAvgEntropy),
    ("f67_medentropy_svn_all_as_query", calcMedEntropy),
    ("f68_maxentropy_svn_all_as_query", calcMaxEntropy),
    ("f69_deventropy_svn_all_as_query", calcDevEntropy),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["entropy_svn_all_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["entropy_svn_all_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Entropy (SVNLogs as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcEntropyList output for each row's Logs, given the Logs column
# of svn_df_clean. (A leftover debug print, "Time to stop", was removed from this cell.)
features_qq_specificity["entropy_svn_log_as_query"] = cartesian_df.apply(
    lambda pair: calcEntropyList(
        pair["Logs"],
        svn_log_countvectorizer,
        svn_documentcount,
        svn_df_clean["Logs"],
    ),
    axis=1,
)

# f70-f73: one feature per entropy statistic (avg, med, max, dev) of the temporary column.
for stat_column, stat_func in [
    ("f70_avgentropy_svn_log_as_query", calcAvgEntropy),
    ("f71_medentropy_svn_log_as_query", calcMedEntropy),
    ("f72_maxentropy_svn_log_as_query", calcMaxEntropy),
    ("f73_deventropy_svn_log_as_query", calcDevEntropy),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["entropy_svn_log_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["entropy_svn_log_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Entropy (SVNUnitNames as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcEntropyList output for each row's Unit_names, given the
# Unit_names column of svn_df_clean.
features_qq_specificity["entropy_svn_unitname_as_query"] = cartesian_df.apply(
    lambda pair: calcEntropyList(
        pair["Unit_names"],
        svn_unitname_countvectorizer,
        svn_documentcount,
        svn_df_clean["Unit_names"],
    ),
    axis=1,
)

# f74-f77: one feature per entropy statistic (avg, med, max, dev) of the temporary column.
for stat_column, stat_func in [
    ("f74_avgentropy_svn_unitname_as_query", calcAvgEntropy),
    ("f75_medentropy_svn_unitname_as_query", calcMedEntropy),
    ("f76_maxentropy_svn_unitname_as_query", calcMaxEntropy),
    ("f77_deventropy_svn_unitname_as_query", calcDevEntropy),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["entropy_svn_unitname_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["entropy_svn_unitname_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Entropy (JIRA as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcEntropyList output for each row's Jira_natural_text, given the
# Jira_natural_text column of jira_df_clean.
features_qq_specificity["entropy_jira_all_as_query"] = cartesian_df.apply(
    lambda pair: calcEntropyList(
        pair["Jira_natural_text"],
        jira_all_countvectorizer,
        jira_documentcount,
        jira_df_clean["Jira_natural_text"],
    ),
    axis=1,
)

# f78-f81: one feature per entropy statistic (avg, med, max, dev) of the temporary column.
for stat_column, stat_func in [
    ("f78_avgentropy_jira_all_as_query", calcAvgEntropy),
    ("f79_medentropy_jira_all_as_query", calcMedEntropy),
    ("f80_maxentropy_jira_all_as_query", calcMaxEntropy),
    ("f81_deventropy_jira_all_as_query", calcDevEntropy),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["entropy_jira_all_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["entropy_jira_all_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Entropy (JIRA Summaries as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcEntropyList output for each row's Summary, given the
# Summary column of jira_df_clean.
features_qq_specificity["entropy_jira_summary_as_query"] = cartesian_df.apply(
    lambda pair: calcEntropyList(
        pair["Summary"],
        jira_summary_countvectorizer,
        jira_documentcount,
        jira_df_clean["Summary"],
    ),
    axis=1,
)

# f82-f85: one feature per entropy statistic (avg, med, max, dev) of the temporary column.
for stat_column, stat_func in [
    ("f82_avgentropy_jira_summary_as_query", calcAvgEntropy),
    ("f83_medentropy_jira_summary_as_query", calcMedEntropy),
    ("f84_maxentropy_jira_summary_as_query", calcMaxEntropy),
    ("f85_deventropy_jira_summary_as_query", calcDevEntropy),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["entropy_jira_summary_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["entropy_jira_summary_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Entropy (JIRA Descriptions as query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcEntropyList output for each row's Description, given the
# Description column of jira_df_clean.
features_qq_specificity["entropy_jira_description_as_query"] = cartesian_df.apply(
    lambda pair: calcEntropyList(
        pair["Description"],
        jira_description_countvectorizer,
        jira_documentcount,
        jira_df_clean["Description"],
    ),
    axis=1,
)

# f86-f89: one feature per entropy statistic (avg, med, max, dev) of the temporary column.
for stat_column, stat_func in [
    ("f86_avgentropy_jira_description_as_query", calcAvgEntropy),
    ("f87_medentropy_jira_description_as_query", calcMedEntropy),
    ("f88_maxentropy_jira_description_as_query", calcMaxEntropy),
    ("f89_deventropy_jira_description_as_query", calcDevEntropy),
]:
    features_qq_specificity[stat_column] = features_qq_specificity.apply(
        lambda pair: stat_func(pair["entropy_jira_description_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_specificity.drop(columns=["entropy_jira_description_as_query"], inplace=True)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

##### Query Scope (SVN as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f90: calcQueryScope of each row's Commit_natural_text against the
# Commit_natural_text column of svn_df_clean.
features_qq_specificity["f90_queryscope_svn_all_as_query"] = cartesian_df.apply(
    lambda pair: calcQueryScope(pair["Commit_natural_text"], svn_df_clean["Commit_natural_text"]),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



##### Query Scope (SVNLogs as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f91: calcQueryScope of each row's Logs against the Logs column of svn_df_clean.
features_qq_specificity["f91_queryscope_svn_log_as_query"] = cartesian_df.apply(
    lambda pair: calcQueryScope(pair["Logs"], svn_df_clean["Logs"]),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



##### Query Scope (SVNUnitNames as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f92: calcQueryScope of each row's Unit_names against the Unit_names column of svn_df_clean.
features_qq_specificity["f92_queryscope_svn_unitname_as_query"] = cartesian_df.apply(
    lambda pair: calcQueryScope(pair["Unit_names"], svn_df_clean["Unit_names"]),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)


##### Query Scope (JIRA as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f93: calcQueryScope of each row's Jira_natural_text against the
# Jira_natural_text column of jira_df_clean.
features_qq_specificity["f93_queryscope_jira_all_as_query"] = cartesian_df.apply(
    lambda pair: calcQueryScope(pair["Jira_natural_text"], jira_df_clean["Jira_natural_text"]),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



##### Query Scope (JIRA Summaries as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f94: calcQueryScope of each row's Summary against the Summary column of jira_df_clean.
features_qq_specificity["f94_queryscope_jira_summary_as_query"] = cartesian_df.apply(
    lambda pair: calcQueryScope(pair["Summary"], jira_df_clean["Summary"]),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



##### Query Scope (JIRA Descriptions as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f95: calcQueryScope of each row's Description against the Description column of jira_df_clean.
features_qq_specificity["f95_queryscope_jira_description_as_query"] = cartesian_df.apply(
    lambda pair: calcQueryScope(pair["Description"], jira_df_clean["Description"]),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



#### Kullback-Leibler divergence (SVN as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f96: calcSCS of each row's Commit_natural_text, using the SVN count vectorizer
# and the SVN document count.
features_qq_specificity["f96_scs_svn_all_as_query"] = cartesian_df.apply(
    lambda pair: calcSCS(pair["Commit_natural_text"], svn_all_countvectorizer, svn_documentcount),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)



#### Kullback-Leibler divergence (SVNLogs as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f97: calcSCS of each row's Logs, using the SVN-log count vectorizer
# and the SVN document count.
features_qq_specificity["f97_scs_svn_log_as_query"] = cartesian_df.apply(
    lambda pair: calcSCS(pair["Logs"], svn_log_countvectorizer, svn_documentcount),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Kullback-Leibler divergence (SVNUnitNames as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f98: calcSCS of each row's Unit_names, using the unit-name count vectorizer
# and the SVN document count.
features_qq_specificity["f98_scs_svn_unitname_as_query"] = cartesian_df.apply(
    lambda pair: calcSCS(pair["Unit_names"], svn_unitname_countvectorizer, svn_documentcount),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Kullback-Leibler divergence (JIRA as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f99: calcSCS of each row's Jira_natural_text, using the JIRA count vectorizer
# and the JIRA document count.
features_qq_specificity["f99_scs_jira_all_as_query"] = cartesian_df.apply(
    lambda pair: calcSCS(pair["Jira_natural_text"], jira_all_countvectorizer, jira_documentcount),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### Kullback-Leibler divergence (JIRA Summaries as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f100: calcSCS of each row's Summary, using the JIRA-summary count vectorizer
# and the JIRA document count.
features_qq_specificity["f100_scs_jira_summary_as_query"] = cartesian_df.apply(
    lambda pair: calcSCS(pair["Summary"], jira_summary_countvectorizer, jira_documentcount),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

##### Kullback-Leibler divergence (JIRA Description as query)

In [None]:
# Time this feature so its cost can be reported when the cell finishes.
startTime = time.time()

# f101: calcSCS of each row's Description, using the JIRA-description count vectorizer
# and the JIRA document count.
features_qq_specificity["f101_scs_jira_description_as_query"] = cartesian_df.apply(
    lambda pair: calcSCS(pair["Description"], jira_description_countvectorizer, jira_documentcount),
    axis=1,
)

# Checkpoint the feature frame to disk.
features_qq_specificity.to_pickle("../data/03_processed/features_qq_specificity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

### Query Quality Similarity

#### SCQ (SVN as Query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Start a fresh frame for the similarity features; re-running this cell resets it.
features_qq_similarity = pd.DataFrame()

# Temporary column: calcSCQList output for each row's Commit_natural_text, given the
# Commit_natural_text column of svn_df_clean.
features_qq_similarity["scq_svn_all_as_query"] = cartesian_df.apply(
    lambda pair: calcSCQList(
        pair["Commit_natural_text"],
        svn_df_clean["Commit_natural_text"],
        svn_all_countvectorizer,
        svn_all_tfidf,
        svn_documentcount,
    ),
    axis=1,
)

# f102-f104: avg / max / sum SCQ statistics; only calcAvgSCQ also takes the SVN document count.
# NOTE(review): these column names do not follow the fNN_<stat>_<source>_as_query pattern used
# by the other features; they are kept as-is in case downstream code selects on them.
for stat_column, stat_func in [
    ("f102_SvnAsQuery_avgSCQ", lambda scq_values: calcAvgSCQ(scq_values, svn_documentcount)),
    ("f103_SvnAsQuery_maxSCQ", calcMaxSCQ),
    ("f104_SvnAsQuery_sumSCQ", calcSumSCQ),
]:
    features_qq_similarity[stat_column] = features_qq_similarity.apply(
        lambda pair: stat_func(pair["scq_svn_all_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_similarity.drop(columns=["scq_svn_all_as_query"], inplace=True)

# Checkpoint the similarity feature frame to disk.
features_qq_similarity.to_pickle("../data/03_processed/features_qq_similarity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### SCQ (SVNLogs as Query)

In [None]:
# Time this feature group so its cost can be reported when the cell finishes.
startTime = time.time()

# Temporary column: calcSCQList output for each row's Logs, given the Logs column of svn_df_clean.
features_qq_similarity["scq_svn_log_as_query"] = cartesian_df.apply(
    lambda pair: calcSCQList(
        pair["Logs"],
        svn_df_clean["Logs"],
        svn_log_countvectorizer,
        svn_log_tfidf,
        svn_documentcount,
    ),
    axis=1,
)

# f105-f107: avg / max / sum SCQ statistics; only calcAvgSCQ also takes the SVN document count.
for stat_column, stat_func in [
    ("f105_avgscq_svn_log_as_query", lambda scq_values: calcAvgSCQ(scq_values, svn_documentcount)),
    ("f106_maxscq_svn_log_as_query", calcMaxSCQ),
    ("f107_sumscq_svn_log_as_query", calcSumSCQ),
]:
    features_qq_similarity[stat_column] = features_qq_similarity.apply(
        lambda pair: stat_func(pair["scq_svn_log_as_query"]), axis=1
    )

# The temporary column is not needed once the statistics are stored.
features_qq_similarity.drop(columns=["scq_svn_log_as_query"], inplace=True)

# Checkpoint the similarity feature frame to disk.
features_qq_similarity.to_pickle("../data/03_processed/features_qq_similarity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### SCQ (SVNUnitNames as Query)

In [None]:
#Start timer
startTime = time.time()

#Calculate the SCQ values for each row of cartesian_df, using the SVN unit names as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_similarity to exist already (it is created in the "SVN as Query" cell).
scq_svn_unitname_as_query = cartesian_df.apply(lambda x: calcSCQList(x.Unit_names,
                                                                     svn_df_clean.Unit_names,
                                                                     svn_unitname_countvectorizer,
                                                                     svn_unitname_tfidf,
                                                                     svn_documentcount), axis=1)

#Aggregate the SCQ values per row into the avg / max / sum features
features_qq_similarity["f108_avgscq_svn_unitname_as_query"] = scq_svn_unitname_as_query.apply(lambda scq: calcAvgSCQ(scq, svn_documentcount))
features_qq_similarity["f109_maxscq_svn_unitname_as_query"] = scq_svn_unitname_as_query.apply(lambda scq: calcMaxSCQ(scq))
features_qq_similarity["f110_sumscq_svn_unitname_as_query"] = scq_svn_unitname_as_query.apply(lambda scq: calcSumSCQ(scq))

#Release the intermediate SCQ values, only the aggregated features are kept
del scq_svn_unitname_as_query

#Save results in pickle
features_qq_similarity.to_pickle(path= "../data/03_processed/features_qq_similarity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### SCQ (JIRA as Query)

In [None]:
#Start timer
startTime = time.time()

#Calculate the SCQ values for each row of cartesian_df, using the JIRA natural text as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_similarity to exist already (it is created in the "SVN as Query" cell).
scq_jira_all_as_query = cartesian_df.apply(lambda x: calcSCQList(x.Jira_natural_text,
                                                                 jira_df_clean.Jira_natural_text,
                                                                 jira_all_countvectorizer,
                                                                 jira_all_tfidf,
                                                                 jira_documentcount), axis=1)

#Aggregate the SCQ values per row into the avg / max / sum features
features_qq_similarity["f111_avgscq_jira_all_as_query"] = scq_jira_all_as_query.apply(lambda scq: calcAvgSCQ(scq, jira_documentcount))
features_qq_similarity["f112_maxscq_jira_all_as_query"] = scq_jira_all_as_query.apply(lambda scq: calcMaxSCQ(scq))
features_qq_similarity["f113_sumscq_jira_all_as_query"] = scq_jira_all_as_query.apply(lambda scq: calcSumSCQ(scq))

#Release the intermediate SCQ values, only the aggregated features are kept
del scq_jira_all_as_query

#Save results in pickle
features_qq_similarity.to_pickle(path= "../data/03_processed/features_qq_similarity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### SCQ (JIRA Summaries as Query)

In [None]:
#Start timer
startTime = time.time()

#Calculate the SCQ values for each row of cartesian_df, using the JIRA summaries as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_similarity to exist already (it is created in the "SVN as Query" cell).
scq_jira_summary_as_query = cartesian_df.apply(lambda x: calcSCQList(x.Summary,
                                                                     jira_df_clean.Summary,
                                                                     jira_summary_countvectorizer,
                                                                     jira_summary_tfidf,
                                                                     jira_documentcount), axis=1)

#Aggregate the SCQ values per row into the avg / max / sum features
features_qq_similarity["f114_avgscq_jira_summary_as_query"] = scq_jira_summary_as_query.apply(lambda scq: calcAvgSCQ(scq, jira_documentcount))
features_qq_similarity["f115_maxscq_jira_summary_as_query"] = scq_jira_summary_as_query.apply(lambda scq: calcMaxSCQ(scq))
features_qq_similarity["f116_sumscq_jira_summary_as_query"] = scq_jira_summary_as_query.apply(lambda scq: calcSumSCQ(scq))

#Release the intermediate SCQ values, only the aggregated features are kept
del scq_jira_summary_as_query

#Save results in pickle
features_qq_similarity.to_pickle(path= "../data/03_processed/features_qq_similarity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### SCQ (JIRA Descriptions as Query)

In [None]:
#Start timer
startTime = time.time()

#Calculate the SCQ values for each row of cartesian_df, using the JIRA descriptions as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_similarity to exist already (it is created in the "SVN as Query" cell).
scq_jira_description_as_query = cartesian_df.apply(lambda x: calcSCQList(x.Description,
                                                                         jira_df_clean.Description,
                                                                         jira_description_countvectorizer,
                                                                         jira_description_tfidf,
                                                                         jira_documentcount), axis=1)

#Aggregate the SCQ values per row into the avg / max / sum features
features_qq_similarity["f117_avgscq_jira_description_as_query"] = scq_jira_description_as_query.apply(lambda scq: calcAvgSCQ(scq, jira_documentcount))
features_qq_similarity["f118_maxscq_jira_description_as_query"] = scq_jira_description_as_query.apply(lambda scq: calcMaxSCQ(scq))
features_qq_similarity["f119_sumscq_jira_description_as_query"] = scq_jira_description_as_query.apply(lambda scq: calcSumSCQ(scq))

#Release the intermediate SCQ values, only the aggregated features are kept
del scq_jira_description_as_query

#Save results in pickle
features_qq_similarity.to_pickle(path= "../data/03_processed/features_qq_similarity.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

### Query Quality - Term Relatedness

#### PMI (SVN as query)

In [None]:
#Start timer
startTime = time.time()

#Create pairs and find frequencies (based on the SVN natural text corpus)
termPairs = createTermPairs(svn_all_countvectorizer)
termFrequencies = findTermFrequencies(svn_all_countvectorizer, svn_df_clean.Commit_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, svn_df_clean.Commit_natural_text)

#Create new dataFrame
features_qq_termrelatedness = pd.DataFrame()

#Calculate the PMI values for each row of cartesian_df, using the SVN natural text as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#NOTE(review): calcPMIList presumably returns a list of PMI scores - confirm in calculateQueryQuality
pmi_svn_all_as_query = cartesian_df.apply(lambda x: calcPMIList(x.Commit_natural_text,
                                                                termFrequencies,
                                                                termPairFrequencies,
                                                                svn_df_clean.Commit_natural_text), axis=1)

#Aggregate the PMI values per row into the avg / max features
features_qq_termrelatedness["f120_avgpmi_svn_all_as_query"] = pmi_svn_all_as_query.apply(lambda pmi: calcAvgPMI(pmi))
features_qq_termrelatedness["f121_maxpmi_svn_all_as_query"] = pmi_svn_all_as_query.apply(lambda pmi: calcMaxPMI(pmi))

#Release the intermediate PMI values, only the aggregated features are kept
del pmi_svn_all_as_query

#Save results in pickle
features_qq_termrelatedness.to_pickle(path= "../data/03_processed/features_qq_termrelatedness.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### PMI (SVNLogs as query)

In [None]:
#Start timer
startTime = time.time()

#Create pairs and find frequencies (based on the SVN log corpus)
termPairs = createTermPairs(svn_log_countvectorizer)
termFrequencies = findTermFrequencies(svn_log_countvectorizer, svn_df_clean.Logs)
termPairFrequencies = findTermPairFrequencies(termPairs, svn_df_clean.Logs)

#Calculate the PMI values for each row of cartesian_df, using the SVN logs as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_termrelatedness to exist already (it is created in the "SVN as query" cell).
pmi_svn_log_as_query = cartesian_df.apply(lambda x: calcPMIList(x.Logs,
                                                                termFrequencies,
                                                                termPairFrequencies,
                                                                svn_df_clean.Logs), axis=1)

#Aggregate the PMI values per row into the avg / max features
features_qq_termrelatedness["f122_avgpmi_svn_log_as_query"] = pmi_svn_log_as_query.apply(lambda pmi: calcAvgPMI(pmi))
features_qq_termrelatedness["f123_maxpmi_svn_log_as_query"] = pmi_svn_log_as_query.apply(lambda pmi: calcMaxPMI(pmi))

#Release the intermediate PMI values, only the aggregated features are kept
del pmi_svn_log_as_query

#Save results in pickle
features_qq_termrelatedness.to_pickle(path= "../data/03_processed/features_qq_termrelatedness.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### PMI (SVNUnitNames as query)

In [None]:
#Start timer
startTime = time.time()

#Create pairs and find frequencies (based on the SVN unit name corpus)
termPairs = createTermPairs(svn_unitname_countvectorizer)
termFrequencies = findTermFrequencies(svn_unitname_countvectorizer, svn_df_clean.Unit_names)
termPairFrequencies = findTermPairFrequencies(termPairs, svn_df_clean.Unit_names)

#Calculate the PMI values for each row of cartesian_df, using the SVN unit names as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_termrelatedness to exist already (it is created in the "SVN as query" cell).
pmi_svn_unitname_as_query = cartesian_df.apply(lambda x: calcPMIList(x.Unit_names,
                                                                     termFrequencies,
                                                                     termPairFrequencies,
                                                                     svn_df_clean.Unit_names), axis=1)

#Aggregate the PMI values per row into the avg / max features
features_qq_termrelatedness["f124_avgpmi_svn_unitname_as_query"] = pmi_svn_unitname_as_query.apply(lambda pmi: calcAvgPMI(pmi))
features_qq_termrelatedness["f125_maxpmi_svn_unitname_as_query"] = pmi_svn_unitname_as_query.apply(lambda pmi: calcMaxPMI(pmi))

#Release the intermediate PMI values, only the aggregated features are kept
del pmi_svn_unitname_as_query

#Save results in pickle
features_qq_termrelatedness.to_pickle(path= "../data/03_processed/features_qq_termrelatedness.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### PMI (JIRA as query)

In [None]:
#Start timer
startTime = time.time()

#Create pairs and find frequencies (based on the JIRA natural text corpus)
termPairs = createTermPairs(jira_all_countvectorizer)
termFrequencies = findTermFrequencies(jira_all_countvectorizer, jira_df_clean.Jira_natural_text)
termPairFrequencies = findTermPairFrequencies(termPairs, jira_df_clean.Jira_natural_text)

#Calculate the PMI values for each row of cartesian_df, using the JIRA natural text as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_termrelatedness to exist already (it is created in the "SVN as query" cell).
pmi_jira_all_as_query = cartesian_df.apply(lambda x: calcPMIList(x.Jira_natural_text,
                                                                 termFrequencies,
                                                                 termPairFrequencies,
                                                                 jira_df_clean.Jira_natural_text), axis=1)

#Aggregate the PMI values per row into the avg / max features
features_qq_termrelatedness["f126_avgpmi_jira_all_as_query"] = pmi_jira_all_as_query.apply(lambda pmi: calcAvgPMI(pmi))
features_qq_termrelatedness["f127_maxpmi_jira_all_as_query"] = pmi_jira_all_as_query.apply(lambda pmi: calcMaxPMI(pmi))

#Release the intermediate PMI values, only the aggregated features are kept
del pmi_jira_all_as_query

#Save results in pickle
features_qq_termrelatedness.to_pickle(path= "../data/03_processed/features_qq_termrelatedness.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### PMI (JIRA Summaries as query)

In [None]:
#Start timer
startTime = time.time()

#Create pairs and find frequencies (based on the JIRA summary corpus)
termPairs = createTermPairs(jira_summary_countvectorizer)
termFrequencies = findTermFrequencies(jira_summary_countvectorizer, jira_df_clean.Summary)
termPairFrequencies = findTermPairFrequencies(termPairs, jira_df_clean.Summary)

#Calculate the PMI values for each row of cartesian_df, using the JIRA summaries as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_termrelatedness to exist already (it is created in the "SVN as query" cell).
pmi_jira_summary_as_query = cartesian_df.apply(lambda x: calcPMIList(x.Summary,
                                                                     termFrequencies,
                                                                     termPairFrequencies,
                                                                     jira_df_clean.Summary), axis=1)

#Aggregate the PMI values per row into the avg / max features
features_qq_termrelatedness["f128_avgpmi_jira_summary_as_query"] = pmi_jira_summary_as_query.apply(lambda pmi: calcAvgPMI(pmi))
features_qq_termrelatedness["f129_maxpmi_jira_summary_as_query"] = pmi_jira_summary_as_query.apply(lambda pmi: calcMaxPMI(pmi))

#Release the intermediate PMI values, only the aggregated features are kept
del pmi_jira_summary_as_query

#Save results in pickle
features_qq_termrelatedness.to_pickle(path= "../data/03_processed/features_qq_termrelatedness.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

#### PMI (JIRA Descriptions as query)

In [None]:
#Start timer
startTime = time.time()

#Create pairs and find frequencies (based on the JIRA description corpus)
termPairs = createTermPairs(jira_description_countvectorizer)
termFrequencies = findTermFrequencies(jira_description_countvectorizer, jira_df_clean.Description)
termPairFrequencies = findTermPairFrequencies(termPairs, jira_df_clean.Description)

#Calculate the PMI values for each row of cartesian_df, using the JIRA descriptions as query.
#The result is kept in a local Series instead of a temporary column of the feature frame,
#so the frame that gets pickled never contains the raw values (not even when this cell fails halfway).
#Requires features_qq_termrelatedness to exist already (it is created in the "SVN as query" cell).
pmi_jira_description_as_query = cartesian_df.apply(lambda x: calcPMIList(x.Description,
                                                                         termFrequencies,
                                                                         termPairFrequencies,
                                                                         jira_df_clean.Description), axis=1)

#Aggregate the PMI values per row into the avg / max features
features_qq_termrelatedness["f130_avgpmi_jira_description_as_query"] = pmi_jira_description_as_query.apply(lambda pmi: calcAvgPMI(pmi))
features_qq_termrelatedness["f131_maxpmi_jira_description_as_query"] = pmi_jira_description_as_query.apply(lambda pmi: calcMaxPMI(pmi))

#Release the intermediate PMI values, only the aggregated features are kept
del pmi_jira_description_as_query

#Save results in pickle
features_qq_termrelatedness.to_pickle(path= "../data/03_processed/features_qq_termrelatedness.pkl")

endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished creating query quality features in " + timeDifference)

## 3.8 Preprocess Data - Load and transform feature families needed for training
Load features and create a normalized set of them.

In [None]:
#Read all persisted feature families back from the processed-data folder.
#The pickles are read in the order listed: process-related, IR-related,
#document statistics and then the three query quality families.
(features_process_related,
 features_information_retrieval,
 features_document_statistics,
 features_qq_specificity,
 features_qq_similarity,
 features_qq_termrelatedness) = (
    pd.read_pickle(pickle_path) for pickle_path in (
        r'../data/03_processed/features_process_related.pkl',
        r'../data/03_processed/features_information_retrieval.pkl',
        r'../data/03_processed/features_document_statistics.pkl',
        r'../data/03_processed/features_qq_specificity.pkl',
        r'../data/03_processed/features_qq_similarity.pkl',
        r'../data/03_processed/features_qq_termrelatedness.pkl',
    )
)

In [None]:
#Run normalize_data over every feature family, one call per family.
#The calls happen in the order listed: process-related, IR-related,
#document statistics and then the three query quality families.
(features_process_related_normalized,
 features_information_retrieval_normalized,
 features_document_statistics_normalized,
 features_qq_specificity_normalized,
 features_qq_similarity_normalized,
 features_qq_termrelatedness_normalized) = (
    normalize_data(feature_family) for feature_family in (
        features_process_related,
        features_information_retrieval,
        features_document_statistics,
        features_qq_specificity,
        features_qq_similarity,
        features_qq_termrelatedness,
    )
)

Put all features in a single data frame

In [None]:
#Create a single data frame for the non-normalized features (families are placed side by side, column-wise)
features_all_df = pd.concat([features_process_related,
                             features_document_statistics,
                             features_information_retrieval,
                             features_qq_specificity,
                             features_qq_similarity,
                             features_qq_termrelatedness], axis=1)

#Create a single data frame for the normalized features (same family order as above)
features_all_normalized_df = pd.concat([features_process_related_normalized,
                                        features_document_statistics_normalized,
                                        features_information_retrieval_normalized,
                                        features_qq_specificity_normalized,
                                        features_qq_similarity_normalized,
                                        features_qq_termrelatedness_normalized], axis=1)

#Make sure the output folder exists, to_excel does not create missing directories
results_dir = "../results/1. Trace Link Feature Data"
os.makedirs(results_dir, exist_ok=True)

#Save into xlsx files (the row index is not written)
features_all_df.to_excel(excel_writer = results_dir + "/features_non-normalized.xlsx", index = False)
features_all_normalized_df.to_excel(excel_writer = results_dir + "/features_normalized.xlsx", index = False)