# Introduction
Data preparation of the TechDebt dataset. Concretely, from the following tables:
- GIT_COMMITS
- GIT_COMMITS_CHANGES
- JIRA_ISSUES
- SONAR_ANALYSIS
- SONAR_ISSUES
- SONAR_MEASURES

## Library Packages

In [63]:
# Import libraries and packages
# Miscellaneous libraries
import numpy as np
import pandas as pd
import os
from datetime import datetime
import collections
from prettytable import PrettyTable
import dask.dataframe as dd

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

## Data Preparation

#### Define the path of the files

In [64]:
# Define the path of the data files
path = '../data/raw/'
path_git_commits = path + 'GIT_COMMITS.csv'
path_git_commits_changes = path + 'GIT_COMMITS_CHANGES.csv'
path_jira_issues = path + 'JIRA_ISSUES.csv'
path_sonar_analysis = path + 'SONAR_ANALYSIS.csv'
path_sonar_issues = path + 'SONAR_ISSUES.csv'
path_sonar_measures = path + 'SONAR_MEASURES.csv'

# Ensure the input file exist
assert os.path.isfile(path_git_commits), f'{path_git_commits} not found. Is it a file?'
assert os.path.isfile(path_git_commits_changes), f'{path_git_commits_changes} not found. Is it a file?'
assert os.path.isfile(path_jira_issues), f'{path_jira_issues} not found. Is it a file?'
assert os.path.isfile(path_sonar_analysis), f'{path_sonar_analysis} not found. Is it a file?'
assert os.path.isfile(path_sonar_issues), f'{path_sonar_issues} not found. Is it a file?'
assert os.path.isfile(path_sonar_measures), f'{path_sonar_measures} not found. Is it a file?'

#### Read the files

In [65]:
# Read the files
git_commits_changes = spark.read.csv(path_git_commits_changes,header=True).toPandas()
git_commits = pd.read_csv(path_git_commits)
jira_issues = pd.read_csv(path_jira_issues)
sonar_analysis = pd.read_csv(path_sonar_analysis)
sonar_issues = pd.read_csv(path_sonar_issues)
sonar_measures = pd.read_csv(path_sonar_measures)

#### Define selected variables
In the following section we are only selecting the useful variables for the project. The election process has been studied previusly, in the Data Understanding step.

In [66]:
# Define variables of interest for each dataframe
git_commits_changes_names = ['COMMIT_HASH','DATE','LINES_ADDED','LINES_REMOVED']
git_commits_names = ['PROJECT_ID','COMMIT_HASH','AUTHOR','AUTHOR_DATE','AUTHOR_TIMEZONE','COMMIT_MESSAGE']
jira_issues_names = ['HASH']
sonar_analysis_names = ['PROJECT_ID','ANALYSIS_KEY','REVISION']
sonar_issues_names = ['CREATION_ANALYSIS_KEY','SEVERITY','STATUS','EFFORT','MESSAGE','START_LINE','END_LINE','CLOSE_ANALYSIS_KEY']
sonar_measures_names = ['analysis_key','complexity' ,'cognitive_complexity', 'coverage', 'duplicated_blocks', 'duplicated_files', 
                        'duplicated_lines_density', 'violations','blocker_violations','critical_violations','major_violations','minor_violations','info_violations','false_positive_issues','open_issues','reopened_issues','confirmed_issues', 'sqale_debt_ratio','code_smells','bugs','reliability_rating','vulnerabilities','security_rating','files', 'comment_lines_density']

In [67]:
# Select variables of interest
git_commits_changes = git_commits_changes[git_commits_changes_names]
git_commits = git_commits[git_commits_names]
jira_issues = jira_issues[jira_issues_names]
sonar_analysis = sonar_analysis[sonar_analysis_names]
sonar_issues = sonar_issues[sonar_issues_names]
sonar_measures = sonar_measures[sonar_measures_names]

In [68]:
# Select columns of interest
dtypes = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'object']

## Global functions

In [69]:
def delete_na(dataframes, dtypes):
    '''
    Objective:
        - Delete all NA's from the dataframe passed
    Input:
        - Dataframe : String of the tables and their selected columns
        - Numerical : Numerical types
        
    Output: 
        - Dataframe with the deleted values.
    '''
    for i in range(len(dataframes)):
        dataframe_numerical = dataframes[i].select_dtypes(include=dtypes)
        total_rows = dataframe_numerical.shape[0]
        # Delete rows that contain na's
        dataframe_numerical = dataframe_numerical.dropna()
        dataframes[i] = dataframe_numerical
    return dataframes

In [70]:
def analyse_categorical_variables(table_names, variable_names, dataframes):
    '''
    Objective:
        - Analyse the categorical variables to be encoded
    Input:
        - Table_names : String of the names of the tables
        - Variable_names : String of the categorical variables corresponding to the table
        - Dataframes : String of the tables and their selected columns
        
    Output: 
        - Table with the categorical variables levels
        ["Table name", "Variable name", "Number of levels", "Types"]
    '''
    table = PrettyTable()
    table.field_names = ["Table name", "Variable name", "Number of levels", "Types"]
    for i in range(len(table_names)):
        for j in range(len(variable_names)):
            table.add_row([table_names[i], variable_names[j], len(dataframes[i][variable_names[j]].unique()), dataframes[i][variable_names[j]].unique()])
    print(table)    

In [71]:
def one_hot_encoding(table, variable):
    '''
    Objective:
        - Encode the categorical variable passed from the table to one-hot encoding.
          If the categorical variable only has one level, the column is deleted.
    Input:
        - Table : String of the table name
        - Variable : String of the categorical variable corresponding to the table 
        
    Output:
        For categorical variables with more than one level:
            - Table with the categorical variable encoded to one-hot 
            ["Table name", "Variable name", "Number of levels", "Types"]
        For categorical variables with less than one level:
            -String indicating so.
    '''
    # The column contain more than one level.
    if len(table[variable].unique()) > 1:
        variable_dummies = pd.get_dummies(table[variable])
        table = table.drop(variable, axis=1)
        table = table.join(variable_dummies)
        return table
    # The column contain only one value and will be deleted.
    else:
        return table.drop(variable, axis=1)

In [72]:
def message_length(table,column):
    '''
    Objective:
        - Generate a column containgthe length of different messages and delete the column
          that contains the orignal text.
    Input:
        - table : Dataframe that wants to be used.
        - column : Name of the variable that contains the messages.
    Output:
        - None
    '''
    message_length = []
    for msg in table[column]:
        message_length.append(len(msg))
    # Reassign the MESSAGE variable to its length instead of the initial string \n",
    table[column] = message_length

### NA values
Deleting all NA values from the tables by using the global function implemented above delete_na().

In [73]:
tables = [sonar_measures, sonar_issues, sonar_analysis, jira_issues,git_commits, git_commits_changes]
[sonar_measures, sonar_issues, sonar_analysis, jira_issues,git_commits, git_commits_changes] = delete_na(tables, dtypes)

Moreover, in the GIT_COMMITS table, we also find rows that contain the value "No Author" in the AUTHOR column.
As we cannot know if all those commits come from an unique unidentified person or from multiple ones, we decided to eliminate such rows, as seen in the Data Quality task, they represent a minor percentage of the table length.

In [74]:
# Delete rows that contain missing authors and reseting the DF index.",
git_commits = git_commits.drop(git_commits[git_commits.AUTHOR == "No Author"].index)
git_commits.reset_index(drop= True)

Unnamed: 0,PROJECT_ID,COMMIT_HASH,AUTHOR,AUTHOR_DATE,AUTHOR_TIMEZONE,COMMIT_MESSAGE
0,org.apache:batik,b1ff4af6abfec32fc710d77795bb20a612a82126,James Duncan Davidson,2000-10-01T07:37:01Z,0,Initial revision\n\n\ngit-svn-id: https://svn....
1,org.apache:batik,c8d7a13470987f892f7466d55c10e3cee34de31d,James Duncan Davidson,2000-10-01T07:40:39Z,0,Update\nPR:\n\n\ngit-svn-id: https://svn.apach...
2,org.apache:batik,93a16402b48ae1cf70ea4bd030479170749bd10a,James Duncan Davidson,2000-10-01T08:15:04Z,0,Added question line (more a test of list/forwa...
3,org.apache:batik,fcaecb541edc03f36b6ec7a792e97dfeabf26117,Dean Jackson,2000-10-02T13:33:11Z,0,testing commit\n\n\ngit-svn-id: https://svn.ap...
4,org.apache:batik,2ecc354fa4f3209adad11560abad29ca3fb9b95d,Dean Jackson,2000-10-02T13:39:12Z,0,undoing the test commit\n\n\ngit-svn-id: https...
...,...,...,...,...,...,...
81015,org.apache:thrift,53d9c0c20bd5af65676928b9b7a73dcb2cad3d78,Mark Slee,2007-11-26 21:15:40+00:00,0,Merging EOFException changes from Ben Maurer ...
81016,org.apache:thrift,5ab570558f55d73472fbf6c0e66e6e165093c7d8,Mark Slee,2007-11-27 08:38:16+00:00,0,Fix writeContainerEnd call being inside loop i...
81017,org.apache:thrift,844ac12489600d7647f01ab4f9b99d9e1b81e69e,Mark Slee,2007-11-27 08:38:52+00:00,0,TJSONProtocol writing support in Java Summary...
81018,org.apache:thrift,256bdc444866b90bbdccfb5343e9c9ea8c22603c,Mark Slee,2007-11-27 08:42:19+00:00,0,IPv6 tweaks for Thrift Summary: Need to pass ...


### Categorical Values
The next step is to analyse the categorical variables and encoding them.
For the SONAR_MEASURES, JIRA_ISSUES, SONAR_ANALYSIS table (add more if necessary) there are not categorical varibles.

In [75]:
table_names = ["SONAR_ISSUES"]
variable_names = ["SEVERITY", "STATUS"]
dataframes = [sonar_issues]
analyse_categorical_variables(table_names, variable_names, dataframes)

+--------------+---------------+------------------+-----------------------------------------------+
|  Table name  | Variable name | Number of levels |                     Types                     |
+--------------+---------------+------------------+-----------------------------------------------+
| SONAR_ISSUES |    SEVERITY   |        5         | ['INFO' 'MINOR' 'MAJOR' 'CRITICAL' 'BLOCKER'] |
| SONAR_ISSUES |     STATUS    |        1         |                   ['CLOSED']                  |
+--------------+---------------+------------------+-----------------------------------------------+


As can be seen in the chunk above, the SEVERITY and STATUS variables have 5 and 1 levels respectively. In our case, we have performed the One-hot encoding for the SEVERITY variable. For the STATUS variable, efore deleting all NA, there was the OPENED level. However, all rows with an OPENED status contained NA, which means that for this variable we only have the CLOSED level. 
When joining the tables, we will calulate the mean of each types each author has.

In [76]:
sonar_issues = one_hot_encoding(sonar_issues, "SEVERITY")

In [77]:
sonar_issues = one_hot_encoding(sonar_issues, "STATUS")

### MESSAGE and COMMIT_MESSAGE variables
In the following section, we will encode the MESSAGE and COMMIT_MESSAGE variables for the SONAR_ISSUES table and GIT_COMMITS table respectively. For those variables, we will calulate the length of the message for each issue/commit, and reassigning the column with that new value instead of the text from the original message.

In [78]:
message_length(sonar_issues,"MESSAGE")
message_length(git_commits,"COMMIT_MESSAGE")

### ISSUE_CODE_LENGTH variable

In the following cells we will proceed to computate the length mean per issue with the START_LINE and END_LINE variables.

In [79]:
issue_length = []
for index, row in sonar_issues.iterrows():
    diff = row['END_LINE'] - row['START_LINE']
    issue_length.append(diff)

In [80]:
sonar_issues = sonar_issues.drop('START_LINE', axis=1)
sonar_issues = sonar_issues.drop('END_LINE', axis=1)
sonar_issues['ISSUE_CODE_LENGTH'] = issue_length
sonar_issues.head()

Unnamed: 0,CREATION_ANALYSIS_KEY,EFFORT,MESSAGE,CLOSE_ANALYSIS_KEY,BLOCKER,CRITICAL,INFO,MAJOR,MINOR,ISSUE_CODE_LENGTH
41,AWd5_psxC4KKKThc-qK6,10.0,53,AWeDrnWEC4KKKThcAtWf,0,0,1,0,0,0.0
198,AWd5_psxC4KKKThc-qK6,10.0,53,AWeMr1K3C4KKKThcB9hi,0,0,1,0,0,0.0
199,AWd5_psxC4KKKThc-qK6,10.0,53,AWeMr1K3C4KKKThcB9hi,0,0,1,0,0,0.0
200,AWd5_psxC4KKKThc-qK6,10.0,53,AWeMr1K3C4KKKThcB9hi,0,0,1,0,0,0.0
201,AWd5_psxC4KKKThc-qK6,10.0,53,AWeMr1K3C4KKKThcB9hi,0,0,1,0,0,0.0


## Joins

Resum:
- 66711 rows in sonar analysis
- 719186 rows un cop fent el join amb la variable creation_Analysis key
- 728409 rows un cop fent el join amb el CLOSE_ANALYSIS_KEY
- 1447595 rows dels dos joints 
- 1200310 unique rows en total dels dos joints
- 1197656 rows un cop fent el join del SONRA_MEASURES i SONAR_COMPLETE (sonar_analysis + sonar_issues)
- 1083655 unique rows en total dels dos joints

(ho he calulat abans amb datafrme.shape[0] però els chuncks els he eliminat per netejar el codi)

In [81]:
# Joining SONAR_ANALYSIS with SONAR_ISSUES
sonar_complete_1 = pd.merge(sonar_issues, sonar_analysis, left_on='CREATION_ANALYSIS_KEY', right_on='ANALYSIS_KEY', how='inner')
sonar_complete_2 = pd.merge(sonar_issues, sonar_analysis, left_on='CLOSE_ANALYSIS_KEY', right_on='ANALYSIS_KEY', how='inner')
sonar_complete_1 = sonar_complete_1.drop('CREATION_ANALYSIS_KEY', axis=1)
sonar_complete_1 = sonar_complete_1.drop('CLOSE_ANALYSIS_KEY', axis=1)
sonar_complete_2 = sonar_complete_2.drop('CREATION_ANALYSIS_KEY', axis=1)
sonar_complete_2 = sonar_complete_2.drop('CLOSE_ANALYSIS_KEY', axis=1)

sonar_complete = pd.concat([sonar_complete_1, sonar_complete_2])

In [82]:
# deleting duplicated rows
sonar_complete = sonar_complete[sonar_complete.duplicated()]

In [83]:
# Joining SONAR_ANALYSIS with SONAR_MEASURES
sonar_complete = pd.merge(sonar_complete, sonar_measures, left_on='ANALYSIS_KEY', right_on='analysis_key', how='inner')
sonar_complete = sonar_complete.drop('ANALYSIS_KEY', axis=1)

In [84]:
# deleting duplicated rows
sonar_complete = sonar_complete[sonar_complete.duplicated()]
sonar_complete.shape[0]

1083655

In [85]:
git_complete =  pd.merge(git_commits, git_commits_changes, left_on='COMMIT_HASH', right_on='COMMIT_HASH', how='inner')

In [86]:
dd_git_complete = dd.from_pandas(git_complete)
'''
final_table = pd.merge(git_complete, sonar_complete, left_on='COMMIT_HASH', right_on='REVISION', how='inner')
final_table.shape[0]
'''

ValueError: Exactly one of npartitions and chunksize must be specified.