In [1]:
#Import Python Libraries
import pandas as pd
import pickle
import time
import numpy as np

#Import Self-written Functions
import os
import sys
src_dir = os.path.join(os.getcwd(), '..', 'src')
sys.path.append(src_dir)

from d00_utils.calculateTimeDifference import calculateTimeDifference #Function to calc time difference
from d01_data.loadCommits import loadCommits #Function to load SVN data
from d02_intermediate.cleanCommitData import cleanCommitData #Function to clean commit data
from d02_intermediate.cleanJiraData import cleanJiraData #Function to clean JIRA data

from d03_processing.createFittedTF_IDF import createFittedTF_IDF #Function to see if a trace is valid
from d03_processing.createCorpusFromDocumentList import createCorpusFromDocumentList #Function to create a corpus
from d03_processing.checkValidityTrace import checkValidityTrace #Function to see if a trace is valid
from d03_processing.calculateTimeDif import calculateTimeDif #Calculate the time difference between 2 dates in seconds
from d03_processing.checkFullnameEqualsEmail import checkFullnameEqualsEmail #Check if fullName is equal to the email
from d03_processing.calculateCosineSimilarity import calculateCosineSimilarity #Calculate the cos similarity
from d03_processing.calculateDocumentStatistics import *

from d03_processing.calculateQueryQuality import *
from d03_processing.normalize_data import *

from d04_model_evaluation.model_evaluation import *

#Lift pandas' display limits: show complete cell values, every column and every row
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

#Signal that the setup cell ran to completion
print("done")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rande\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  from pandas import MultiIndex, Int64Index


done


## 3.1 Load Cartesian and Labels

In [2]:
#Load the Cartesian product and the labels from the processed-data folder
#NOTE: unpickling can execute arbitrary code - only load pickle files produced by this project
cartesian_df = pd.read_pickle(r'../data/03_processed/cartesian_df.pkl')
labels_df = pd.read_pickle(r'../data/03_processed/labels_df.pkl')

## 3.2 Recalculate non-LC-specific features

In [3]:
#Load the previously stored Document Statistics features
#(the SVN-related columns are recalculated and the file is overwritten further down in this notebook)
features_document_statistics = pd.read_pickle(r'../data/03_processed/features_document_statistics.pkl')

In [4]:
cartesian_df.shape

(1149853, 28)

In [5]:
features_document_statistics.shape

(51415, 7)

In [6]:
features_document_statistics.head()

Unnamed: 0,f5_total_terms_jira,f6_total_terms_svn,f7_unique_terms_jira,f8_unique_terms_svn,f9_overlap_terms_compared_to_jira,f10_overlap_terms_to_svn,f11_overlap_terms_to_union
1578,62,,43,,,,
1579,62,,43,,,,
1580,62,,43,,,,
1581,62,,43,,,,
1582,62,,43,,,,


In [7]:
#Calculate total terms SVN (commit natural text) for each trace
#NOTE(review): the Series returned by cartesian_df.apply is assigned by index label, so rows of
#features_document_statistics whose index label does not occur in cartesian_df stay NaN. The head()
#output after this cell still shows empty SVN columns - confirm that both frames share the same index.
features_document_statistics["f6_total_terms_svn"] = cartesian_df.apply(lambda x: calculateTotalWordCount(x.Commit_natural_text), 
                                                            axis=1)

#Calculate unique terms SVN (commit natural text) for each trace
features_document_statistics["f8_unique_terms_svn"] = cartesian_df.apply(lambda x: calculateUniqueWordCount(x.Commit_natural_text), 
                                                            axis=1)


#Calculate the term overlap between the JIRA text and the commit text for each trace.
#The third argument selects the variant: 'list1', 'list2' or 'union'
#(presumably the set the overlap is measured against: JIRA terms, SVN terms, or both - verify in calculateOverlapBetweenDocuments)
features_document_statistics["f9_overlap_terms_compared_to_jira"] = cartesian_df.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list1'),
                                                            axis=1)
features_document_statistics["f10_overlap_terms_to_svn"] = cartesian_df.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'list2'),
                                                            axis=1)
features_document_statistics["f11_overlap_terms_to_union"] = cartesian_df.apply(lambda x: calculateOverlapBetweenDocuments(x.Jira_natural_text, x.Commit_natural_text, 'union'),
                                                            axis=1)

In [8]:
features_document_statistics.head()

Unnamed: 0,f5_total_terms_jira,f6_total_terms_svn,f7_unique_terms_jira,f8_unique_terms_svn,f9_overlap_terms_compared_to_jira,f10_overlap_terms_to_svn,f11_overlap_terms_to_union
1578,62,,43,,,,
1579,62,,43,,,,
1580,62,,43,,,,
1581,62,,43,,,,
1582,62,,43,,,,


In [9]:
#Save results in pickle
#NOTE: this overwrites the same file that was loaded earlier in this notebook, so a re-run starts from the recalculated features
features_document_statistics.to_pickle(path= "../data/03_processed/features_document_statistics.pkl")

## 3.3 Remove LC-specific features

In [10]:
#Load Information Retrieval Features
features_information_retrieval = pd.read_pickle(r'../data/03_processed/features_information_retrieval.pkl')

#Load Query Quality Features (three families: specificity, similarity and term relatedness)
features_qq_specificity = pd.read_pickle(r'../data/03_processed/features_qq_specificity.pkl')
features_qq_similarity = pd.read_pickle(r'../data/03_processed/features_qq_similarity.pkl')
features_qq_termrelatedness = pd.read_pickle(r'../data/03_processed/features_qq_termrelatedness.pkl')

In [11]:
# All features which need to be removed from the final feature set.
# The list is assembled per feature family; the overall order is ascending by feature number.
features_to_be_removed_list = (
    # f18-f29: "ir" features built on the unit name or the full SVN text
    [
        "f18_ir_unitname_and_summary_unitname_as_query",
        "f19_ir_unitname_and_summary_summary_as_query",
        "f20_ir_unitname_and_description_unitname_as_query",
        "f21_ir_unitname_and_description_description_as_query",
        "f22_ir_unitname_and_jira_all_unitname_as_query",
        "f23_ir_unitname_and_jira_all_jira_all_as_query",
        "f24_ir_svn_all_and_summary_svn_all_as_query",
        "f25_ir_svn_all_and_summary_summary_as_query",
        "f26_ir_svn_all_and_description_svn_all_as_query",
        "f27_ir_svn_all_and_description_description_as_query",
        "f28_ir_svn_all_and_jira_all_svn_all_as_query",
        "f29_ir_svn_all_and_jira_all_jira_all_as_query",
    ]
    # f30-f38: "idf" features
    + [
        "f30_avgidf_svn_all_as_query",
        "f31_maxidf_svn_all_as_query",
        "f32_devidf_svn_all_as_query",
        "f36_avgidf_unitname_as_query",
        "f37_maxidf_unitname_as_query",
        "f38_devidf_unitname_as_query",
    ]
    # f48-f56: "ictf" features
    + [
        "f48_avgictf_svn_all_as_query",
        "f49_maxictf_svn_all_as_query",
        "f50_devictf_svn_all_as_query",
        "f54_avgictf_svn_unitname_as_query",
        "f55_maxictf_svn_unitname_as_query",
        "f56_devictf_svn_unitname_as_query",
    ]
    # f66-f77: "entropy" features
    + [
        "f66_avgentropy_svn_all_as_query",
        "f67_medentropy_svn_all_as_query",
        "f68_maxentropy_svn_all_as_query",
        "f69_deventropy_svn_all_as_query",
        "f74_avgentropy_svn_unitname_as_query",
        "f75_medentropy_svn_unitname_as_query",
        "f76_maxentropy_svn_unitname_as_query",
        "f77_deventropy_svn_unitname_as_query",
    ]
    # f90-f98: "queryscope" and "scs" features
    + [
        "f90_queryscope_svn_all_as_query",
        "f92_queryscope_svn_unitname_as_query",
        "f96_scs_svn_all_as_query",
        "f98_scs_svn_unitname_as_query",
    ]
    # f102-f110: "SCQ" features
    + [
        "f102_SvnAsQuery_avgSCQ",
        "f103_SvnAsQuery_maxSCQ",
        "f104_SvnAsQuery_sumSCQ",
        "f108_avgscq_svn_unitname_as_query",
        "f109_maxscq_svn_unitname_as_query",
        "f110_sumscq_svn_unitname_as_query",
    ]
    # f120-f125: "pmi" features
    + [
        "f120_avgpmi_svn_all_as_query",
        "f121_maxpmi_svn_all_as_query",
        "f124_avgpmi_svn_unitname_as_query",
        "f125_maxpmi_svn_unitname_as_query",
    ]
)

In [12]:
#Remove LC-specific features: keep only the columns that are not named in features_to_be_removed_list
#NOTE(review): Index.difference returns the remaining labels sorted by default, so the column order of the
#filtered frames may differ from the stored order - confirm that nothing downstream relies on the original order
features_information_retrieval = features_information_retrieval[features_information_retrieval.columns.difference(features_to_be_removed_list)]
features_qq_specificity = features_qq_specificity[features_qq_specificity.columns.difference(features_to_be_removed_list)]
features_qq_similarity = features_qq_similarity[features_qq_similarity.columns.difference(features_to_be_removed_list)]
features_qq_termrelatedness = features_qq_termrelatedness[features_qq_termrelatedness.columns.difference(features_to_be_removed_list)]

In [13]:
features_qq_termrelatedness.shape

(51415, 8)

In [14]:
#Save results in pickle
#NOTE: each file is written back to the path it was loaded from, so the unfiltered versions are replaced on disk
features_information_retrieval.to_pickle(path= "../data/03_processed/features_information_retrieval.pkl")
features_qq_specificity.to_pickle(path= "../data/03_processed/features_qq_specificity.pkl")
features_qq_similarity.to_pickle(path= "../data/03_processed/features_qq_similarity.pkl")
features_qq_termrelatedness.to_pickle(path= "../data/03_processed/features_qq_termrelatedness.pkl")

## 3.8 Preprocess Data - Load and transform feature families needed for training
Load features and create a normalized set of them.

In [15]:
#Load Process-Related Features
features_process_related = pd.read_pickle(r'../data/03_processed/features_process_related.pkl')

#Load IR-Related Features
#(this and the families below are re-read from the files written earlier in this notebook,
# replacing the in-memory frames of the same name)
features_information_retrieval = pd.read_pickle(r'../data/03_processed/features_information_retrieval.pkl')

#Load Document Statistics Features
features_document_statistics = pd.read_pickle(r'../data/03_processed/features_document_statistics.pkl')

#Load Query Quality Features
features_qq_specificity = pd.read_pickle(r'../data/03_processed/features_qq_specificity.pkl')
features_qq_similarity = pd.read_pickle(r'../data/03_processed/features_qq_similarity.pkl')
features_qq_termrelatedness = pd.read_pickle(r'../data/03_processed/features_qq_termrelatedness.pkl')

In [16]:
#Each feature family is normalized separately with normalize_data (star-imported from d03_processing.normalize_data);
#the non-normalized frames are kept under their original names
#NOTE(review): the normalization method (e.g. scaling range) is defined in normalize_data and not visible here

#Normalize Process-Related Features
features_process_related_normalized = normalize_data(features_process_related)

#Normalize IR-Related Features
features_information_retrieval_normalized = normalize_data(features_information_retrieval)

#Normalize Document Statistics Features
features_document_statistics_normalized = normalize_data(features_document_statistics)

#Normalize Query Quality Features
features_qq_specificity_normalized = normalize_data(features_qq_specificity)
features_qq_similarity_normalized = normalize_data(features_qq_similarity)
features_qq_termrelatedness_normalized = normalize_data(features_qq_termrelatedness)

Put all features in a single data frame

In [17]:
#Create a single data frame for the non-normalized features
#(axis=1 places the families side by side as columns; rows are matched on the index labels)
features_all_df = pd.concat([features_process_related,
                             features_document_statistics,
                             features_information_retrieval,
                             features_qq_specificity,
                             features_qq_similarity,
                             features_qq_termrelatedness], axis=1)

#Create a single data frame for the normalized features (same family order as above)
features_all_normalized_df = pd.concat([features_process_related_normalized,
                                        features_document_statistics_normalized,
                                        features_information_retrieval_normalized,
                                        features_qq_specificity_normalized,
                                        features_qq_similarity_normalized,
                                        features_qq_termrelatedness_normalized], axis=1)

#Save into xlsx files (index = False: the row index is not written to the sheet)
features_all_df.to_excel(excel_writer = "../results/1. Trace Link Feature Data/features_non-normalized.xlsx", index = False)
features_all_normalized_df.to_excel(excel_writer = "../results/1. Trace Link Feature Data/features_normalized.xlsx", index = False)

In [18]:
features_all_df.shape

(51415, 85)

Perform additional preprocessing

In [19]:
#Set the NaN to 0
#(for the normalized frame the replacement happens after normalization, so 0 is inserted on the normalized scale)
features_all_df = features_all_df.fillna(0)
features_all_normalized_df = features_all_normalized_df.fillna(0)

#Saving feature names for later use
#NOTE: despite its name, feature_name_df is a plain Python list of column names
feature_name_df = list(features_all_df.columns)

#Transform pandas data frame into numpy arrays
features_all_array = np.array(features_all_df)
features_all_normalized_array = np.array(features_all_normalized_df)

#Load labels (re-read from disk; the same file was already loaded at the start of the notebook)
#NOTE(review): assumes the label rows are in the same order as the feature rows - confirm
labels_df = pd.read_pickle(r'../data/03_processed/labels_df.pkl')
labels_array = np.array(labels_df["is_valid"])

# 4.1 Evaluation - Non-normalized
## Random Forests

In [20]:
#Import Python Libraries
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

#Method to show the different model evaluation metrics
def showModelPerformance(trainedModel, testFeatures, testLabels):
    """Evaluate a fitted binary classifier on a held-out test set.

    Parameters
    ----------
    trainedModel : fitted estimator
        Must expose ``predict``. When it also exposes ``predict_proba``, the
        positive-class probabilities are used for the average precision.
    testFeatures : array-like
        Feature matrix, passed unchanged to the model.
    testLabels : array-like with an ``astype`` method
        True labels; they are converted to ``bool`` before scoring.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns 'Accuracy', 'Precision', 'Recall',
        'F1', 'F2', 'F0.5' and 'Average Precision'.
    """
    #Convert the true labels once instead of once per metric
    expectedLabels = testLabels.astype(bool)

    # Use the fitted model to predict the labels of the test set
    predictionLabels = trainedModel.predict(testFeatures)

    #Average precision summarises the precision-recall curve and therefore needs ranking scores.
    #Hard 0/1 predictions collapse that curve to a single threshold, so prefer the positive-class
    #probability when the model offers it (column 1 = second class, i.e. True for bool / 0-1 labels).
    if hasattr(trainedModel, "predict_proba"):
        positiveScores = trainedModel.predict_proba(testFeatures)[:, 1]
    else:
        positiveScores = predictionLabels

    #Calculate the different metrics for the test vs predicted labels
    accuracyValue = accuracy_score(expectedLabels, predictionLabels)
    precisionValue = precision_score(expectedLabels, predictionLabels, average='binary')
    recallValue = recall_score(expectedLabels, predictionLabels)
    f1Value = f1_score(expectedLabels, predictionLabels)
    f2Value = fbeta_score(expectedLabels, predictionLabels, beta=2.0)
    f05Value = fbeta_score(expectedLabels, predictionLabels, beta=0.5)
    averagePrecisionValue = average_precision_score(expectedLabels, positiveScores)

    #Create a dataframe to output all evaluation metrics in
    performanceData = {'Accuracy':  [accuracyValue],
                       'Precision': [precisionValue],
                       'Recall': [recallValue],
                       'F1': [f1Value],
                       'F2': [f2Value],
                       'F0.5': [f05Value],
                       'Average Precision': [averagePrecisionValue]
                      }
    return pd.DataFrame(performanceData)

#Method to define the Pipeline steps based on the given rebalancing strategy and classification algorithm
def define_steps(rebalancing_strategy, classification_algorithm):
    """Build the list of pipeline steps for one strategy/algorithm combination.

    Parameters
    ----------
    rebalancing_strategy : str
        'none' (no resampling), 'over' (SMOTE), 'under' (random undersampling)
        or '5050' (SMOTE with sampling_strategy 0.5 followed by random undersampling).
    classification_algorithm : str
        'random_forests', 'xg_boost' or 'light_gbm'.

    Returns
    -------
    list
        ``[name, object]`` pairs: the resampling steps (if any) followed by a
        final step named 'classifier'.

    Raises
    ------
    ValueError
        If the strategy or the algorithm is not one of the supported values
        (previously this silently produced ``None``).
    """
    #Factories instead of instances, so that every call returns fresh, unfitted objects
    classifier_factories = {
        'random_forests': lambda: RandomForestClassifier(n_jobs=-1),
        'xg_boost': lambda: xgb.XGBClassifier(n_jobs=-1),
        'light_gbm': lambda: lgb.LGBMClassifier(n_jobs=-1, importance_type='gain'),
    }
    resampling_factories = {
        'none': lambda: [],
        'over': lambda: [['smote', SMOTE()]],
        'under': lambda: [['under', RandomUnderSampler()]],
        '5050': lambda: [['smote', SMOTE(sampling_strategy = 0.5)],
                         ['under', RandomUnderSampler()]],
    }

    #Fail early with a clear message instead of handing None to the Pipeline constructor
    if rebalancing_strategy not in resampling_factories:
        raise ValueError("Unknown rebalancing_strategy %r, expected one of %s"
                         % (rebalancing_strategy, sorted(resampling_factories)))
    if classification_algorithm not in classifier_factories:
        raise ValueError("Unknown classification_algorithm %r, expected one of %s"
                         % (classification_algorithm, sorted(classifier_factories)))

    #Resampling steps first, the classifier always last
    steps = resampling_factories[rebalancing_strategy]()
    steps.append(['classifier', classifier_factories[classification_algorithm]()])
    return steps

#Method to generate the f1, f2, f0.5, accuracy, precision, recall, and average precision
def generate_evaluation_metrics(rebalancing_strategy, classification_algorithm, data, labels, is_normalized, n_runs, feature_names):
    """Train and evaluate one strategy/algorithm combination over several random splits.

    For every run the data is split 80/20 (stratified on the labels), the pipeline
    returned by ``define_steps`` is fitted on the training part and scored on the
    test part with ``showModelPerformance``. Two CSV files are written:

    * ``../results/<2. Non-Normalised Results | 3. Normalised Results>/<algorithm>/<strategy>_results.csv``
      with one row of evaluation metrics per run;
    * ``../results/4. Feature Importance Results/<algorithm>/<strategy>_results.csv``
      with one row of feature importances per run.

    Parameters
    ----------
    rebalancing_strategy : str
        Forwarded to ``define_steps``; also used in the output file name.
    classification_algorithm : str
        Forwarded to ``define_steps``; also used as the output sub-directory.
    data : array-like
        Feature matrix with one column per entry in ``feature_names``.
    labels : array-like
        Labels, used for stratification and training.
    is_normalized : bool
        Selects the results directory (normalised vs. non-normalised).
    n_runs : int
        Number of independent split/train/evaluate repetitions.
    feature_names : list
        Column names for the feature-importance output.

    Returns
    -------
    None
    """
    metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'F0.5', 'Average Precision']

    #Collect the one-row evaluation frame of every run; they are concatenated once after the loop
    run_evaluations = []

    #One row of importances per run; the width follows the supplied feature names instead of a fixed 85
    importance_array = np.empty(shape=(n_runs, len(feature_names)))

    #Perform the described pipeline steps to produce the results for the defined number of runs
    for i in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(data,
                                                            labels,
                                                            test_size=0.2,
                                                            stratify=labels)

        #Set the pipeline steps according to the defined rebalancing strategy and classification algorithm
        steps = define_steps(rebalancing_strategy, classification_algorithm)

        #Create the pipeline
        model_pipeline = Pipeline(steps=steps)

        #Empty search space: the search evaluates only the default configuration
        space_empty = dict()

        stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)

        #Create a model
        model = RandomizedSearchCV(estimator = model_pipeline,
                                   param_distributions = space_empty,
                                   n_iter=1,
                                   n_jobs=-1,
                                   cv = stratified_kfold)

        #Fit the model on the training data
        fitted_model = model.fit(X_train, y_train)

        #Evaluate the fitted model and keep the result of the current run
        run_evaluations.append(showModelPerformance(trainedModel = fitted_model,
                                                    testFeatures = X_test,
                                                    testLabels = y_test))

        #Find the feature importances of the fitted model; the classifier is always the last pipeline step
        classifier = fitted_model.best_estimator_.steps[-1][1]
        if(classification_algorithm == "light_gbm"):
            current_importances = classifier.booster_.feature_importance(importance_type='gain')
        else:
            current_importances = classifier.feature_importances_

        #Add the feature importances of the current fitted model to the results of the previous runs
        importance_array[i] = current_importances

    if is_normalized:
        dir_string = "3. Normalised Results"
    else:
        dir_string = "2. Non-Normalised Results"

    #Combine the runs; with zero runs an empty frame with the metric columns is written
    if run_evaluations:
        evaluation_df = pd.concat(run_evaluations, ignore_index=True)
    else:
        evaluation_df = pd.DataFrame(columns=metric_columns)

    #Set the index as the run number (1-based)
    evaluation_df.index += 1
    evaluation_df.index.name = "run"

    #Output the evaluation data to a csv file; create the folder first so a long run is not lost on a missing directory
    evaluation_path = "../results/" + dir_string + "/" + classification_algorithm + "/" + rebalancing_strategy + "_results.csv"
    os.makedirs(os.path.dirname(evaluation_path), exist_ok=True)
    evaluation_df.to_csv(evaluation_path)

    #Transform the importance array to a data frame
    importance_df = pd.DataFrame(data=importance_array,
                                 columns= feature_names,
                                 index=list(range(1, n_runs +1)))

    #Set the index as the run number
    importance_df.index.name = "run"

    #Output the importance data to a csv file
    importance_path = "../results/4. Feature Importance Results/" + classification_algorithm + "/" + rebalancing_strategy + "_results.csv"
    os.makedirs(os.path.dirname(importance_path), exist_ok=True)
    importance_df.to_csv(importance_path)

## XG Boost

In [21]:
generate_evaluation_metrics(rebalancing_strategy = 'none', 
                            classification_algorithm = 'xg_boost', 
                            data = features_all_array, 
                            labels = labels_array, 
                            feature_names = feature_name_df,
                            is_normalized = False,
                            n_runs = 25)





































































































## LightGBM

In [22]:
generate_evaluation_metrics(rebalancing_strategy = '5050', 
                            classification_algorithm = 'light_gbm', 
                            data = features_all_array, 
                            labels = labels_array, 
                            feature_names = feature_name_df,
                            is_normalized = False,
                            n_runs = 25)