In [None]:
# Build the datasets used to predict software refactorings and their types from technical-debt (TD) data

In [1]:
import pandas as pd
import ast
from collections import Counter

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the technical-debt issue export from the mounted Drive; the next cell filters it on its 'project' column.
df = pd.read_excel('/content/gdrive/MyDrive/maintainability/all_technical_debt_issues.xlsx')

In [None]:
# Keep only the rows whose project is jEdit.
df = df[df['project']=='jEdit']

In [None]:
# Drop the columns that are not used below.
# Not re-runnable on its own: a second run raises KeyError because the columns are already gone.
columns_to_drop = ['project', 'key', 'rule', 'resolution', 'status', 'message',  'creationDate', 'updateDate', 'closeDate', 'tags']
df = df.drop(columns=columns_to_drop)

In [None]:
# Inspect the remaining columns (severity, component, debt, type).
print(df)

       severity                         component  debt           type
16952  CRITICAL  src/org/gjt/sp/jedit/Buffer.java   8.0     CODE_SMELL
16953     MAJOR  src/org/gjt/sp/jedit/Buffer.java   5.0     CODE_SMELL
16954     MAJOR  src/org/gjt/sp/jedit/Buffer.java   5.0     CODE_SMELL
16955     MAJOR  src/org/gjt/sp/jedit/Buffer.java  10.0     CODE_SMELL
16956     MINOR  src/org/gjt/sp/jedit/Buffer.java  15.0  VULNERABILITY
...         ...                               ...   ...            ...
42970     MINOR      src/org/gjt/sp/util/Log.java   2.0     CODE_SMELL
42971     MAJOR      src/org/gjt/sp/util/Log.java  10.0     CODE_SMELL
42972     MAJOR      src/org/gjt/sp/util/Log.java  10.0     CODE_SMELL
42973     MINOR      src/org/gjt/sp/util/Log.java   5.0     CODE_SMELL
42974  CRITICAL      src/org/gjt/sp/util/Log.java   5.0     CODE_SMELL

[26023 rows x 4 columns]


In [None]:
# Load the refactorings recorded for jEdit; the cells below use its 'Class' and 'Refactoring Description' columns.
df_ref = pd.read_excel('/content/gdrive/MyDrive/maintainability/Refs/JEditRefactorings.xlsx')

In [None]:
def transform_string(original_string):
    """Turn the package part of a dotted name into a slash-separated path.

    Only the text before the last '.' is rewritten ('.' becomes '/'); the last
    '.' and whatever follows it are kept, e.g. 'a.b.C' -> 'a/b.C'. A string
    that contains no '.' is returned unchanged.
    """
    package, dot, simple_name = original_string.rpartition('.')
    if not dot:
        # No separator at all: nothing to rewrite.
        return original_string
    return package.replace('.', '/') + dot + simple_name

In [None]:
# Convert the package part of each dotted class name to a slash-separated path.
# The 'src/' prefix is added once, together with the '.java' suffix, two cells
# below; prefixing here as well produced 'src/src/...' paths on a clean
# top-to-bottom run, which can never match the issues' component paths.
df_ref['Class'] = df_ref['Class'].apply(transform_string)

In [None]:
# Replace every remaining '.' with '/' (this also turns nested-class separators into path segments).
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
df_ref['Class'] = df_ref['Class'].replace(r'\.', '/', regex=True)

In [None]:
# Wrap the slash-separated class path as a source file path ('src/<path>.java');
# the column is later renamed to 'component' and used as the merge key with the issue table.
df_ref['Class'] = 'src/' + df_ref['Class']
df_ref['Class'] = df_ref['Class'] + '.java'

In [None]:
# Inspect the refactoring table with the rewritten 'Class' paths.
print(df_ref)

                  Refactoring Type  \
0                       Move Class   
1                 Rename Parameter   
2     Remove Thrown Exception Type   
3       Replace Loop With Pipeline   
4            Change Parameter Type   
...                            ...   
1416                Move Attribute   
1417       Extract And Move Method   
1418       Extract And Move Method   
1419       Extract And Move Method   
1420       Extract And Move Method   

                                Refactoring Description  \
0     org.gjt.sp.jedit.browser.VFSFileChooserDialog....   
1     clazz : String to className : String in method...   
2     IOException in method private processClass(out...   
3     for(int i=0; i < classes.length; i++) with inc...   
4     pkg : PackageDoc to pkg : PackageElement in me...   
...                                                 ...   
1416  public GZIP_MAGIC_2 : int from class org.gjt.s...   
1417  public setLabel(label String) : void extracted...   
1418  public

In [None]:
# Remove the free-text description so the grouping below only aggregates what is left besides 'Class'.
df_ref = df_ref.drop(columns=['Refactoring Description'])

In [None]:
# Collect, for every class path, the values of the remaining column(s) into a list
# (one entry per refactoring row, duplicates kept).
grouped_df = df_ref.groupby('Class').agg(lambda x: list(x)).reset_index()

# Rename the columns (the two-name assignment only works if exactly one non-key column is left)
grouped_df.columns = ['Class', 'Types']
print(grouped_df)

                                                 Class  \
0                       src/doclet/GenerateTocXML.java   
1                  src/org/gjt/sp/jedit/ActionSet.java   
2                   src/org/gjt/sp/jedit/Autosave.java   
3            src/org/gjt/sp/jedit/BeanShellAction.java   
4    src/org/gjt/sp/jedit/BeanShellAction/CachedBsh...   
..                                                 ...   
392         src/org/jedit/io/Native2ASCIIEncoding.java   
393  src/org/jedit/io/Native2ASCIIEncoding/Native2A...   
394       src/org/jedit/migration/CheckFileStatus.java   
395  src/org/jedit/migration/OneTimeMigrationServic...   
396         src/org/jedit/options/OptionGroupPane.java   

                                                 Types  
0    [Rename Parameter, Remove Thrown Exception Typ...  
1    [Add Method Annotation, Add Parameter Annotati...  
2    [Replace Loop With Pipeline, Add Method Annota...  
3    [Add Attribute Modifier, Add Attribute Modifie...  
4                 

In [None]:
# Use the same key name as the issue table so the two frames can be merged on 'component'.
grouped_df = grouped_df.rename(columns={'Class': 'component'})
print(grouped_df)

                                             component  \
0                       src/doclet/GenerateTocXML.java   
1                  src/org/gjt/sp/jedit/ActionSet.java   
2                   src/org/gjt/sp/jedit/Autosave.java   
3            src/org/gjt/sp/jedit/BeanShellAction.java   
4    src/org/gjt/sp/jedit/BeanShellAction/CachedBsh...   
..                                                 ...   
392         src/org/jedit/io/Native2ASCIIEncoding.java   
393  src/org/jedit/io/Native2ASCIIEncoding/Native2A...   
394       src/org/jedit/migration/CheckFileStatus.java   
395  src/org/jedit/migration/OneTimeMigrationServic...   
396         src/org/jedit/options/OptionGroupPane.java   

                                                 Types  
0    [Rename Parameter, Remove Thrown Exception Typ...  
1    [Add Method Annotation, Add Parameter Annotati...  
2    [Replace Loop With Pipeline, Add Method Annota...  
3    [Add Attribute Modifier, Add Attribute Modifie...  
4                 

In [None]:
# Inner join: keeps only the issues whose component has at least one recorded refactoring.
# NOTE(review): merged_df is reassigned by the left join in the next cell, so this result is never used.
merged_df = pd.merge(df, grouped_df, on='component')

In [None]:
# Left join: every issue row is kept, including components with no recorded refactoring.
merged_df = pd.merge(df, grouped_df, on='component', how='left')

# Components without a match get NaN in 'Types'; give each such row its own empty list.
# The column is assigned back instead of calling fillna(..., inplace=True) on the
# column selection, which does not update merged_df when pandas Copy-on-Write is
# active. Building the list per row also avoids sharing one list object between rows.
merged_df['Types'] = merged_df['Types'].apply(lambda refs: refs if isinstance(refs, list) else [])

In [None]:
# Inspect the merged issue/refactoring table.
print(merged_df)

       severity                         component  debt           type  \
0      CRITICAL  src/org/gjt/sp/jedit/Buffer.java   8.0     CODE_SMELL   
1         MAJOR  src/org/gjt/sp/jedit/Buffer.java   5.0     CODE_SMELL   
2         MAJOR  src/org/gjt/sp/jedit/Buffer.java   5.0     CODE_SMELL   
3         MAJOR  src/org/gjt/sp/jedit/Buffer.java  10.0     CODE_SMELL   
4         MINOR  src/org/gjt/sp/jedit/Buffer.java  15.0  VULNERABILITY   
...         ...                               ...   ...            ...   
26018     MINOR      src/org/gjt/sp/util/Log.java   2.0     CODE_SMELL   
26019     MAJOR      src/org/gjt/sp/util/Log.java  10.0     CODE_SMELL   
26020     MAJOR      src/org/gjt/sp/util/Log.java  10.0     CODE_SMELL   
26021     MINOR      src/org/gjt/sp/util/Log.java   5.0     CODE_SMELL   
26022  CRITICAL      src/org/gjt/sp/util/Log.java   5.0     CODE_SMELL   

                                                   Types  
0      [Encapsulate Attribute, Encapsulate Attribute

In [None]:
# Persist the merged jEdit table (one row per issue, with the component's refactoring list in 'Types').
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/JEditRefactoringsAndTD.xlsx'
merged_df.to_excel(excel_file_path, index=False)

In [None]:
# Compress all issues to only one per component

In [None]:
# NOTE(review): this loads the TuxGuitar merge, whereas the cells above wrote the jEdit one
# (JEditRefactoringsAndTD.xlsx); the TuxGuitar file must already exist on the Drive.
df_refactorings = pd.read_excel('/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTD.xlsx')

In [12]:
def is_float(value):
    """Return True when ``float(value)`` succeeds, False otherwise.

    Values that ``float`` rejects by type (``None``, lists, ...) are reported
    as not-a-float instead of letting the TypeError escape, so the function
    can be used as a plain predicate on arbitrary cell contents.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True

def adjust_debt(debt):
    """Map a missing debt value to 0 and return any other value unchanged.

    Missing means ``None``, the text ``'n/a'`` or a float NaN. NaN is included
    because pandas represents empty spreadsheet cells (and, with the default
    ``read_excel`` settings, the text 'n/a') as NaN.
    """
    import math

    if debt is None or debt == 'n/a':
        return 0
    if isinstance(debt, float) and math.isnan(debt):
        return 0
    return debt

In [None]:
def adjust_excel(data):
    """Collapse per-issue technical-debt rows into one row per component.

    When the first ``severity`` value is not numeric, the rows are treated as
    raw issues: every row whose debt is greater than zero is counted for its
    component, severity and issue type are mapped to numeric ranks, and the
    ranks and the debt are averaged per component. The refactoring lists in
    ``Types`` are concatenated across the counted rows of a component.

    When the first ``severity`` value is already numeric, the data is taken to
    be aggregated already and only its columns are re-projected.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``component``, ``severity``, ``debt``, ``type`` and
        ``Types`` (a list, or the string repr of one); the numeric path also
        reads ``FrequencyClass``.

    Returns
    -------
    pandas.DataFrame
        Columns ``component``, ``severity``, ``debt``, ``type``, ``RTypes``
        and ``FrequencyClass`` (number of counted issues per component).

    Raises
    ------
    KeyError
        If a counted row has a severity or type missing from the rank tables.
    """
    severity_rank = {'MINOR': 2,  'MAJOR': 3,  'INFO': 1, 'CRITICAL': 4,  'BLOCKER': 5}
    type_rank = {'BUG': 2, 'VULNERABILITY': 3, 'CODE_SMELL': 1}

    # Already-aggregated input: pass the columns through under the output names.
    if len(data) > 0 and is_float(data['severity'].iloc[0]):
        return pd.DataFrame({
            "component": data['component'], "severity": data['severity'], "debt": data['debt'],
            "type": data['type'], "RTypes": data['Types'],
            "FrequencyClass": data['FrequencyClass'],
        })

    def parse_types(value):
        # Accept both an in-memory list and the text form read back from a spreadsheet.
        if isinstance(value, list):
            return list(value)
        return ast.literal_eval(value)

    frequency = {}
    severities = {}
    debts = {}
    types = {}
    ref_types = {}

    # Positional iteration, so the frame does not need a 0..n-1 index.
    rows = zip(data['component'], data['severity'], data['debt'], data['type'], data['Types'])
    for component, issue_severity, debt, issue_type, raw_types in rows:
        # Rows without a positive debt (0, missing, 'n/a') do not count.
        if not (adjust_debt(debt) > 0):
            continue
        if component in frequency:
            frequency[component] += 1
            debts[component] += debt
            severities[component] += severity_rank[issue_severity]
            types[component] += type_rank[issue_type]
            # NOTE(review): if every row of a component carries the same list,
            # this repeats it once per counted issue - confirm that is intended.
            ref_types[component] += parse_types(raw_types)
        else:
            frequency[component] = 1
            debts[component] = debt
            severities[component] = severity_rank[issue_severity]
            types[component] = type_rank[issue_type]
            ref_types[component] = parse_types(raw_types)

    # Turn the per-component sums into means.
    for component, count in frequency.items():
        severities[component] /= count
        debts[component] /= count
        types[component] /= count

    # All dicts were filled under the same condition, so their orders agree.
    return pd.DataFrame({
        "component": list(frequency),
        "severity": list(severities.values()),
        "debt": list(debts.values()),
        "type": list(types.values()),
        "RTypes": list(ref_types.values()),
        "FrequencyClass": list(frequency.values()),
    })

In [None]:
# Reduce the per-issue rows to one averaged row per component.
df_refactorings = adjust_excel(df_refactorings)

In [None]:
# Inspect the per-component averages.
print(df_refactorings)

                                              component  severity       debt  \
0      src/org/herac/tuxguitar/app/TGMainSingleton.java  2.709677  10.645161   
1     src/org/herac/tuxguitar/app/action/impl/settin...  2.000000   5.000000   
2     src/org/herac/tuxguitar/app/system/config/TGCo...  3.750000   8.500000   
3     src/org/herac/tuxguitar/app/system/icons/TGCol...  2.250000   5.500000   
4     src/org/herac/tuxguitar/app/system/variables/T...  2.000000   1.000000   
...                                                 ...       ...        ...   
1293  src/org/herac/tuxguitar/play/models/midiplayer...  3.000000   5.000000   
1294  src/org/herac/tuxguitar/song/models/Instrument...  3.500000  25.000000   
1295   src/org/herac/tuxguitar/song/models/Silence.java  3.500000  25.000000   
1296  src/org/herac/tuxguitar/song/models/TimeSignat...  3.500000  25.000000   
1297   src/org/herac/tuxguitar/song/models/Tupleto.java  3.500000  25.000000   

          type                         

In [None]:
# Save the one-row-per-component dataset.
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTDSingleInst.xlsx'
df_refactorings.to_excel(excel_file_path, index=False)

In [None]:
# Create a second dataset: instead of one row per component, each issue row is repeated once for every refactoring performed on its component

In [None]:
# Reload the per-issue TuxGuitar table; the 'Types' column is parsed from text into lists in the cells below.
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTD.xlsx'
df_refs_all = pd.read_excel(excel_file_path)

In [None]:
def prepare_for_explosion(data):
  """Parse the text in ``data['Types']`` into Python lists, ready for ``explode``.

  A cell that does not end with ']' is treated as a truncated list
  (presumably cut off by the spreadsheet's cell-size limit - TODO confirm):
  everything from the last comma on is dropped and the list is closed. If
  there is no comma, no complete element survived and the cell becomes an
  empty list.

  Cells that already hold a list are kept as they are, so the cell can be
  re-run; non-string cells such as NaN become an empty list.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a ``Types`` column. The column is replaced on this frame.

  Returns
  -------
  pandas.DataFrame
      The same ``data`` object, with ``Types`` holding lists.
  """
  def parse(raw):
    if isinstance(raw, list):
      return raw
    if not isinstance(raw, str) or not raw:
      return []
    if raw[-1] != "]":
      last_comma_index = raw.rfind(',')
      # Drop the incomplete trailing element; without a comma nothing complete is left.
      raw = raw[:last_comma_index] + "]" if last_comma_index != -1 else "[]"
    return ast.literal_eval(raw)

  parsed = [parse(raw) for raw in data['Types']]
  # Work on the frame that was passed in (not on a notebook global) and replace
  # the column in one step; dtype=object keeps each list as a single cell value.
  data['Types'] = pd.Series(parsed, index=data.index, dtype=object)
  return data

In [None]:
# Parse 'Types' into real lists (the frame is modified in place and also returned).
df_refs_all = prepare_for_explosion(df_refs_all)

In [None]:
# Sanity check: the first row's 'Types' cell should now be a Python list.
print(type(df_refs_all.loc[0, 'Types']))

<class 'list'>


In [None]:
# One row per (issue, refactoring type): explode repeats the issue columns for every list element;
# an empty list becomes a single row with NaN in 'Types'.
df2 = df_refs_all.explode('Types').reset_index(drop=True)
print(df2)

      severity                                          component  debt  \
0        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
1        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
2        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
3        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
4        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
...        ...                                                ...   ...   
12762    MINOR  src/org/herac/tuxguitar/song/util/UndoableUtil...   5.0   
12763    MAJOR  src/org/herac/tuxguitar/song/util/UndoableUtil...  15.0   
12764    MAJOR  src/org/herac/tuxguitar/song/util/UndoableUtil...  15.0   
12765    MINOR  src/org/herac/tuxguitar/song/util/UndoableUtil...   5.0   
12766    MINOR  src/org/herac/tuxguitar/song/util/UndoableUtil...   5.0   

             type                          Types  
0      CODE_SMELL                  Rename Method

In [None]:
# Give the rows that have no refactoring type (NaN after the explode) an explicit label.
df2['Types'] = df2['Types'].fillna("Empty_list")

In [None]:
# Inspect the exploded table after labelling the empty rows.
print(df2)

      severity                                          component  debt  \
0        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
1        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
2        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
3        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
4        MAJOR   src/org/herac/tuxguitar/app/TGMainSingleton.java  20.0   
...        ...                                                ...   ...   
12762    MINOR  src/org/herac/tuxguitar/song/util/UndoableUtil...   5.0   
12763    MAJOR  src/org/herac/tuxguitar/song/util/UndoableUtil...  15.0   
12764    MAJOR  src/org/herac/tuxguitar/song/util/UndoableUtil...  15.0   
12765    MINOR  src/org/herac/tuxguitar/song/util/UndoableUtil...   5.0   
12766    MINOR  src/org/herac/tuxguitar/song/util/UndoableUtil...   5.0   

             type                          Types  
0      CODE_SMELL                  Rename Method

In [None]:
# Save the one-row-per-refactoring dataset.
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTDRefAll.xlsx'
df2.to_excel(excel_file_path, index=False)

In [None]:
# Create a dataset in which each instance carries a single refactoring type: the most predominant one or the least common one

In [16]:
# Reload the per-issue TuxGuitar table; this rebinds df, which held the jEdit issues earlier in the notebook.
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTD.xlsx'
df = pd.read_excel(excel_file_path)

In [17]:
# Shared tally read and updated by the find_refactoring* helpers below to record which
# refactoring types have already been handed out. It is module-level state: re-run this
# cell before re-running the assignment loop, otherwise earlier picks influence the result.
ref_assigned = {}

In [18]:
from os import minor
def find_equal_index(array):
  """Return how many adjacent pairs at the start of ``array`` are equal.

  Equivalently, the index of the last element of the leading run of equal
  values: [3, 3, 3, 1] -> 2. Empty and one-element inputs give 0.
  """
  matches = 0
  for previous, current in zip(array, array[1:]):
    if not (previous == current):
      # The leading run of equal values ends here.
      return matches
    matches += 1
  return matches

def find_refactoring_general(ref_array):
  """Pick one representative refactoring type for a list of refactorings.

  The candidates are the types that share the highest frequency in
  ``ref_array``. Among them, the first one that has never been handed out
  (not yet a key of the module-level ``ref_assigned`` dict) wins; if all of
  them were handed out before, the one handed out least often wins (first
  candidate on equal tallies). The winner's tally in ``ref_assigned`` is
  updated either way.

  Fixes relative to the previous version: the tie is measured on the
  frequency-sorted counts rather than on insertion order, a unique winner or
  a single distinct type is returned instead of 'Empty_list', every tied
  candidate is considered, and the fallback pick is recorded.

  Parameters
  ----------
  ref_array : list
      Refactoring type names, possibly with repetitions.

  Returns
  -------
  The chosen refactoring type, or the string 'Empty_list' when ``ref_array``
  is empty.
  """
  ranked = Counter(ref_array).most_common()
  if not ranked:
    return 'Empty_list'

  # most_common() sorts by frequency, so the tied leaders come first.
  top_frequency = ranked[0][1]
  candidates = [name for name, frequency in ranked if frequency == top_frequency]

  # Prefer a type that has not been handed out yet.
  for name in candidates:
    if name not in ref_assigned:
      ref_assigned[name] = 1
      return name

  # Otherwise take the candidate handed out least often so far.
  chosen = min(candidates, key=lambda name: ref_assigned[name])
  ref_assigned[chosen] += 1
  return chosen

In [19]:
def find_refactoring(ref_array):
  """Pick one refactoring type from the two most frequent ones in ``ref_array``.

  When there is a single most frequent type it is chosen. When the two most
  frequent types tie, the first is chosen unless it was already handed out;
  in that case the second is chosen if it was never handed out or was handed
  out no more often than the first. Every pick is tallied in the
  module-level ``ref_assigned`` dict, keyed by the refactoring type name
  (the same keys ``find_refactoring_general`` uses).

  Fixes relative to the previous version: the tally was keyed by
  ``(name, count)`` tuples, the tie-break compared the two (equal)
  frequencies instead of the tallies, and a first-time unique winner was
  never recorded.

  Parameters
  ----------
  ref_array : list
      Refactoring type names, possibly with repetitions.

  Returns
  -------
  The chosen refactoring type, or the string 'Empty_list' when ``ref_array``
  is empty.
  """
  ranked = Counter(ref_array).most_common(2)
  if not ranked:
    return 'Empty_list'

  first = ranked[0][0]
  chosen = first
  tied = len(ranked) == 2 and ranked[0][1] == ranked[1][1]
  if tied and first in ref_assigned:
    second = ranked[1][0]
    # Switch to the runner-up only when that spreads the picks more evenly.
    if second not in ref_assigned or ref_assigned[second] <= ref_assigned[first]:
      chosen = second

  ref_assigned[chosen] = ref_assigned.get(chosen, 0) + 1
  return chosen

In [20]:
# Replace each row's stringified refactoring list with the single type chosen by
# find_refactoring_general; rows holding the literal "[]" are left untouched.
# Rows are visited in order because each choice depends on what was handed out before.
for row_label in df.index:
  raw_types = df.at[row_label, 'Types']
  if raw_types == "[]":
    continue
  df.at[row_label, 'Types'] = find_refactoring_general(ast.literal_eval(raw_types))

In [21]:
# Save the per-issue table in which 'Types' now holds a single refactoring type (or the untouched "[]").
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTDOneRef.xlsx'
df.to_excel(excel_file_path, index=False)

In [None]:
# Apply the reduction method

In [22]:
# Read back the single-refactoring-type table written above.
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTDOneRef.xlsx'
df = pd.read_excel(excel_file_path)

In [23]:
def adjust_excel_with_avg_mean(data):
    """Collapse per-issue rows into one averaged row per component, keeping one label.

    When the first ``severity`` value is not numeric, every row whose debt is
    greater than zero is counted for its component; severity and issue type
    are mapped to numeric ranks, and the ranks and the debt are averaged per
    component. The component's ``RTypes`` value is the ``Types`` value of its
    last counted row.

    When the first ``severity`` value is already numeric, the data is taken to
    be aggregated already and only its columns are re-projected.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``component``, ``severity``, ``debt``, ``type`` and
        ``Types``.

    Returns
    -------
    pandas.DataFrame
        Columns ``component``, ``severity``, ``debt``, ``type`` and ``RTypes``.

    Raises
    ------
    KeyError
        If a counted row has a severity or type missing from the rank tables.
    """
    severity_rank = {'MINOR': 2,  'MAJOR': 3,  'INFO': 1, 'CRITICAL': 4,  'BLOCKER': 5}
    type_rank = {'BUG': 2, 'VULNERABILITY': 3, 'CODE_SMELL': 1}

    # Already-aggregated input: pass the columns through under the output names.
    if len(data) > 0 and is_float(data['severity'].iloc[0]):
        return pd.DataFrame({
            "component": data['component'], "severity": data['severity'], "debt": data['debt'],
            "type": data['type'], "RTypes": data['Types'],
        })

    frequency = {}
    severities = {}
    debts = {}
    types = {}
    refs = {}

    # Positional iteration, so the frame does not need a 0..n-1 index.
    rows = zip(data['component'], data['severity'], data['debt'], data['type'], data['Types'])
    for component, issue_severity, debt, issue_type, ref_label in rows:
        # Rows without a positive debt (0, missing, 'n/a') do not count.
        if not (adjust_debt(debt) > 0):
            continue
        if component in frequency:
            frequency[component] += 1
            debts[component] += debt
            severities[component] += severity_rank[issue_severity]
            types[component] += type_rank[issue_type]
        else:
            frequency[component] = 1
            debts[component] = debt
            severities[component] = severity_rank[issue_severity]
            types[component] = type_rank[issue_type]
        # The label of the last counted row wins.
        refs[component] = ref_label

    # Turn the per-component sums into means.
    for component, count in frequency.items():
        severities[component] /= count
        debts[component] /= count
        types[component] /= count

    # All dicts were filled under the same condition, so their orders agree.
    return pd.DataFrame({
        "component": list(frequency),
        "severity": list(severities.values()),
        "debt": list(debts.values()),
        "type": list(types.values()),
        "RTypes": list(refs.values()),
    })

In [24]:
# Collapse to one averaged row per component, keeping a single refactoring label in 'RTypes'.
df = adjust_excel_with_avg_mean(df)

In [25]:
# Components whose 'Types' was never replaced still carry the literal text '[]';
# give them the same 'Empty_list' label used for empty refactoring lists elsewhere in the notebook.
df['RTypes'] = df['RTypes'].replace('[]', 'Empty_list')

In [26]:
# Save the final dataset: one row per component with a single refactoring label.
excel_file_path = '/content/gdrive/MyDrive/maintainability/Refs/TuxGuitarRefactoringsAndTDOneRefOneComp.xlsx'
df.to_excel(excel_file_path, index=False)