# Introduction
This notebook performs the data preparation of the TechDebt dataset. Concretely, it processes the following tables:
- GIT_COMMITS
- GIT_COMMITS_CHANGES
- JIRA_ISSUES
- SONAR_ANALYSIS
- SONAR_ISSUES
- SONAR_MEASURES

## Library Packages

In [1]:
# Import libraries and packages, create the Spark session and make the
# project-local helper modules importable.
# Miscellaneous libraries
import numpy as np
import pandas as pd
import os
from datetime import datetime
import collections


from pyspark.sql import SparkSession
# Spark session; used further down to read GIT_COMMITS_CHANGES.csv.
# NOTE(review): the session is created before the chdir below; presumably this
# is why the Spark read path ("../data/raw/") differs from the pandas one
# ("../../data/raw/") -- confirm before reordering these statements.
spark = SparkSession.builder.getOrCreate()
# The return value is discarded, so this call has no visible effect here.
os.getcwd()
# Switch the working directory to src/features; presumably so that the
# project-local modules imported below can be found -- TODO confirm.
os.chdir( '../src/features')
# Project-local helpers (not part of this notebook).
from tracking import track
from preparation_data import delete_na, analyse_categorical_variables, one_hot_encoding, message_length

## Data Preparation

#### Define the path of the files

In [2]:
track("Defining path of the data files")

# Base directories of the raw CSV files.
# NOTE(review): path2 is one level shallower than path1 and is only used for
# GIT_COMMITS_CHANGES, the single file that is read through Spark; presumably
# Spark resolves it from a different working directory -- confirm before
# unifying the two.
path1 = '../../data/raw/'
path2 = "../data/raw/"

path_git_commits_changes = path2 + 'GIT_COMMITS_CHANGES.csv'
(
    path_git_commits,
    path_jira_issues,
    path_sonar_analysis,
    path_sonar_issues,
    path_sonar_measures,
) = (
    path1 + csv_name
    for csv_name in (
        'GIT_COMMITS.csv',
        'JIRA_ISSUES.csv',
        'SONAR_ANALYSIS.csv',
        'SONAR_ISSUES.csv',
        'SONAR_MEASURES.csv',
    )
)

# Fail early when an input file is missing. Each entry pairs the path that is
# actually tested with the path shown in the message; they only differ for
# GIT_COMMITS_CHANGES, whose path is tested with an extra "../" prefix.
for checked_path, reported_path in (
    (path_git_commits, path_git_commits),
    ("../" + path_git_commits_changes, path_git_commits_changes),
    (path_jira_issues, path_jira_issues),
    (path_sonar_analysis, path_sonar_analysis),
    (path_sonar_issues, path_sonar_issues),
    (path_sonar_measures, path_sonar_measures),
):
    assert os.path.isfile(checked_path), f'{reported_path} not found. Is it a file?'
track("Finishing defining path of the data files")

#### Read the files

In [3]:
track("Reading files")
# GIT_COMMITS_CHANGES goes through Spark and is then converted to pandas.
# NOTE(review): no schema / inferSchema is passed, so Spark presumably returns
# every column as a string -- confirm the dtypes of LINES_ADDED/LINES_REMOVED.
git_commits_changes = (
    spark.read
    .csv(path_git_commits_changes, header=True)
    .toPandas()
)
# The remaining tables are read directly with pandas, in this order.
git_commits, jira_issues, sonar_analysis, sonar_issues, sonar_measures = map(
    pd.read_csv,
    (
        path_git_commits,
        path_jira_issues,
        path_sonar_analysis,
        path_sonar_issues,
        path_sonar_measures,
    ),
)
track("Finishing reading files")

#### Define selected variables
In the following section we select only the variables that are useful for the project. The selection process was studied previously, in the Data Understanding step.

In [4]:
# Columns kept from each table (one list per dataframe).
git_commits_changes_names = [
    'COMMIT_HASH',
    'DATE',
    'LINES_ADDED',
    'LINES_REMOVED',
]
git_commits_names = [
    'PROJECT_ID',
    'COMMIT_HASH',
    'AUTHOR',
    'AUTHOR_DATE',
    'AUTHOR_TIMEZONE',
    'COMMIT_MESSAGE',
]
jira_issues_names = ['HASH']
sonar_analysis_names = [
    'PROJECT_ID',
    'ANALYSIS_KEY',
    'REVISION',
]
sonar_issues_names = [
    'CREATION_ANALYSIS_KEY',
    'SEVERITY',
    'STATUS',
    'EFFORT',
    'MESSAGE',
    'START_LINE',
    'END_LINE',
    'CLOSE_ANALYSIS_KEY',
]
# Note: the SONAR_MEASURES column names are lower case, unlike the other tables.
sonar_measures_names = [
    'analysis_key',
    'complexity',
    'cognitive_complexity',
    'coverage',
    'duplicated_blocks',
    'duplicated_files',
    'duplicated_lines_density',
    'violations',
    'blocker_violations',
    'critical_violations',
    'major_violations',
    'minor_violations',
    'info_violations',
    'false_positive_issues',
    'open_issues',
    'reopened_issues',
    'confirmed_issues',
    'sqale_debt_ratio',
    'code_smells',
    'bugs',
    'reliability_rating',
    'vulnerabilities',
    'security_rating',
    'files',
    'comment_lines_density',
]

In [5]:
# Select variables of interest: each dataframe is replaced by the subset of
# columns listed in its *_names list (a missing column raises a KeyError).
# Every table is rebound as soon as it has been reduced, so the full-width
# original is no longer referenced by these names afterwards.
git_commits_changes = git_commits_changes[git_commits_changes_names]
git_commits = git_commits[git_commits_names]
jira_issues = jira_issues[jira_issues_names]
sonar_analysis = sonar_analysis[sonar_analysis_names]
sonar_issues = sonar_issues[sonar_issues_names]
sonar_measures = sonar_measures[sonar_measures_names]
track("Finishing selecting variables of interest for each dataframe")

In [6]:
track("Starting defining numercial types")
# dtype names that are handed to delete_na() together with the tables.
dtypes = [
    'uint8',
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
    'object',
]
track("Finishing defining numercial types")

### NA values
Delete all NA values from the tables using the delete_na() function imported from the preparation_data module.

In [7]:
track("Starting analysing NA values from all tables")
# The result of delete_na() is unpacked positionally, so it is assumed to
# return the cleaned tables in the same order in which they are passed.
tables = [
    sonar_measures,
    sonar_issues,
    sonar_analysis,
    jira_issues,
    git_commits,
    git_commits_changes,
]
(
    sonar_measures,
    sonar_issues,
    sonar_analysis,
    jira_issues,
    git_commits,
    git_commits_changes,
) = delete_na(tables, dtypes)

Moreover, in the GIT_COMMITS table we also find rows that contain the value "No Author" in the AUTHOR column.
As we cannot know whether all those commits come from a single unidentified person or from several, we decided to eliminate such rows; as seen in the Data Quality task, they represent a minor percentage of the table length.

In [8]:
# Delete the rows whose author is missing ("No Author") and reset the
# DataFrame index.
# reset_index() returns a new DataFrame instead of modifying the original, so
# its result must be assigned back; otherwise the index keeps the gaps left by
# the removed rows.
git_commits = git_commits[git_commits.AUTHOR != "No Author"].reset_index(drop=True)
track("Finishing analysing NA values from all tables")

### Categorical Values
The next step is to analyse the categorical variables and encode them.
The SONAR_MEASURES, JIRA_ISSUES and SONAR_ANALYSIS tables (add more if necessary) do not contain categorical variables.

In [9]:
track("Starting analysing categorical variables")
# Only SONAR_ISSUES is inspected, for its SEVERITY and STATUS columns.
table_names, variable_names, dataframes = (
    ["SONAR_ISSUES"],
    ["SEVERITY", "STATUS"],
    [sonar_issues],
)
analyse_categorical_variables(table_names, variable_names, dataframes)

+--------------+---------------+------------------+-----------------------------------------------+
|  Table name  | Variable name | Number of levels |                     Types                     |
+--------------+---------------+------------------+-----------------------------------------------+
| SONAR_ISSUES |    SEVERITY   |        5         | ['INFO' 'MINOR' 'MAJOR' 'CRITICAL' 'BLOCKER'] |
| SONAR_ISSUES |     STATUS    |        1         |                   ['CLOSED']                  |
+--------------+---------------+------------------+-----------------------------------------------+


As can be seen in the output above, the SEVERITY and STATUS variables have 5 and 1 levels respectively. In our case, we perform one-hot encoding on the SEVERITY variable. For the STATUS variable, before deleting all NA values there was also the OPENED level. However, all rows with an OPENED status contained NA values, which means that for this variable only the CLOSED level remains. 
When joining the tables, we will calculate the mean of each type that each author has.

In [10]:
# Encode SEVERITY with the project helper one_hot_encoding(); its return value
# replaces sonar_issues.
# NOTE(review): presumably the SEVERITY column is replaced by one indicator
# column per level -- confirm in preparation_data.
sonar_issues = one_hot_encoding(sonar_issues, "SEVERITY")

In [11]:
# Apply the same helper to STATUS. According to the analysis table printed
# earlier in this notebook, STATUS has a single level ('CLOSED') at this point.
sonar_issues = one_hot_encoding(sonar_issues, "STATUS")
track("Finishing analysing categorical variables from all tables")

### MESSAGE and COMMIT_MESSAGE variables
In the following section, we will encode the MESSAGE and COMMIT_MESSAGE variables of the SONAR_ISSUES and GIT_COMMITS tables respectively. For those variables, we will calculate the length of the message of each issue/commit and replace the original message text in the column with that value.

In [12]:
track("Starting codifying MESSAGE and COMMIT_MESSAGE variables using message_length() function")
# The return values are discarded, so message_length() presumably rewrites the
# given column of the passed dataframe in place -- TODO confirm in
# preparation_data.
message_length(sonar_issues,"MESSAGE")
message_length(git_commits,"COMMIT_MESSAGE")
track("Finishing codifying MESSAGE and COMMIT_MESSAGE variables using message_length() function")

### ISSUE_CODE_LENGTH variable

In the following cells we will compute the code length of each issue as the difference between the END_LINE and START_LINE variables.

In [13]:
track("Starting creating ISSUE_CODE_LENGTH variable for SONAR_ISSUES table")
# Length of each issue, END_LINE - START_LINE, computed for the whole column at
# once rather than row by row with iterrows(), which builds a Series per row
# and is very slow on large tables.
# The result is kept as a plain list so that assigning it as a new column is
# positional and does not depend on index alignment.
issue_length = (sonar_issues['END_LINE'] - sonar_issues['START_LINE']).tolist()

In [14]:
# START_LINE and END_LINE are replaced by the single ISSUE_CODE_LENGTH column
# (values taken from the issue_length list).
sonar_issues = sonar_issues.drop(columns=['START_LINE', 'END_LINE'])
sonar_issues['ISSUE_CODE_LENGTH'] = issue_length
# Closing log entry: it previously repeated the "Starting ..." message, so the
# end of this step was never recorded as such.
track("Finishing creating ISSUE_CODE_LENGTH variable for SONAR_ISSUES table")

## Joins

In [17]:
track("Starting joining SONAR tables")
# Join SONAR_ISSUES with SONAR_ANALYSIS twice: once through the analysis that
# created the issue and once through the analysis that closed it. Both issue
# key columns are removed from each result; ANALYSIS_KEY is kept.
issue_key_columns = ['CREATION_ANALYSIS_KEY', 'CLOSE_ANALYSIS_KEY']
sonar_complete_1, sonar_complete_2 = (
    sonar_issues
    .merge(sonar_analysis, left_on=issue_key, right_on='ANALYSIS_KEY', how='inner')
    .drop(columns=issue_key_columns)
    for issue_key in issue_key_columns
)

# Stack both join results on top of each other.
sonar_complete = pd.concat([sonar_complete_1, sonar_complete_2])

In [18]:
# Remove duplicated rows and renumber the index from 0.
sonar_complete = sonar_complete.drop_duplicates().reset_index(drop=True)

In [20]:
# Add the SONAR_MEASURES columns through the analysis key, then remove the
# upper-case ANALYSIS_KEY column (the lower-case analysis_key remains).
sonar_complete = (
    sonar_complete
    .merge(sonar_measures, left_on='ANALYSIS_KEY', right_on='analysis_key', how='inner')
    .drop(columns='ANALYSIS_KEY')
)

In [21]:
# Remove duplicated rows and renumber the index from 0.
sonar_complete = sonar_complete.drop_duplicates().reset_index(drop=True)
track("Finishing joining SONAR tables")

In [23]:
track("Starting joining COMMIT tables")
# Inner join of GIT_COMMITS and GIT_COMMITS_CHANGES on the shared COMMIT_HASH column.
git_complete = git_commits.merge(git_commits_changes, on='COMMIT_HASH', how='inner')

In [28]:
# In order to execute the groupby functions, each commit first needs to be
# associated with an author in the sonar_complete table. To do so, a
# commit -> author dictionary is built from git_complete.
# The two columns are paired positionally, so this does not depend on the
# DataFrame index; when a commit hash appears several times the last row wins.
commit_author_dict = dict(zip(git_complete["COMMIT_HASH"], git_complete["AUTHOR"]))

In [35]:
# Once the dictionary is complete, the REVISION variable of the sonar_complete
# table will be substituted by the respective author.

# First, we obtain the values: the author of each revision, or NaN when the
# revision is not a known commit hash. The default of .get() is used rather
# than a truthiness test, so only a genuinely missing key yields NaN.
authors_measures = [
    commit_author_dict.get(revision, np.nan)
    for revision in sonar_complete["REVISION"]
]