##  This notebook aggregates all code elements and edit_actions by bug_id and file, producing association rules through pattern mining.
## This notebook outputs two CSV files: one containing the association rules for bugs labelled 'Quantum' and the other containing the association rules for bugs labelled 'Classical'.

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpmax
from mlxtend.frequent_patterns import fpgrowth
import datetime
import warnings
warnings.filterwarnings("ignore")

### Data cleaning and merging

In [None]:
# Load the fixed-code elements. header=None keeps the first line of the file
# (presumably the CSV's own header) as data row 0; a later cell assigns the
# column names and drops that row.
# Forward slashes are used because a backslash-separated relative path only
# resolves on Windows, while this form works on Windows, Linux and macOS.
fixElements = pd.read_csv('../data/generated/fixed-code-data.csv', header=None, sep=',')

In [None]:
# Load the buggy-code elements. header=None keeps the first line of the file
# (presumably the CSV's own header) as data row 0; a later cell assigns the
# column names and drops that row.
# Forward slashes are used because a backslash-separated relative path only
# resolves on Windows, while this form works on Windows, Linux and macOS.
BuggyElements = pd.read_csv('../../buggy-code/data/generated/buggy-code-data.csv', header=None, sep=',')

In [None]:
# Preview the first rows of the fixed-code data as loaded from disk.
fixElements.head(n=5)

In [None]:
# Give the positional columns meaningful names.
fixElements.columns = [
    "project_full_name",
    "fix_commit_hash",
    "buggy_commit_hash",
    "bug_id",
    "bug_type",
    "file_path",
    "line_number",
    "component",
    "edit_action",
]
# Remove the first row, then discard the columns that the pattern mining
# below does not use.
fixElements = (
    fixElements
    .drop(index=fixElements.index[0])
    .drop(columns=['project_full_name', 'fix_commit_hash', 'buggy_commit_hash', 'line_number'])
)


In [None]:
# Preview the first rows of the fixed-code data at this point in the notebook.
fixElements.head(n=5)

In [None]:
# Order the rows by bug and then by file, and renumber the index from 0.
fixElements = (
    fixElements
    .sort_values(by=['bug_id', 'file_path'])
    .reset_index(drop=True)
)

In [None]:
# Display the (rows, columns) dimensions of the fixed-code data.
fixElements.shape

In [None]:
# Preview the first rows of the buggy-code data as loaded from disk.
BuggyElements.head(n=5)

In [None]:
# Give the positional columns meaningful names.
BuggyElements.columns = [
    "project_full_name",
    "fix_commit_hash",
    "buggy_commit_hash",
    "bug_id",
    "bug_type",
    "file_path",
    "line_number",
    "component",
    "edit_action",
]
# Remove the first row of this frame. The label must come from BuggyElements'
# own index: taking it from another frame's index only works while both
# indexes happen to start with the same label.
BuggyElements = BuggyElements.drop(index=BuggyElements.index[0])
# Discard the columns that the pattern mining below does not use.
BuggyElements = BuggyElements.drop(
    columns=['project_full_name', 'fix_commit_hash', 'buggy_commit_hash', 'line_number']
)

In [None]:
# Preview the first rows of the buggy-code data at this point in the notebook.
BuggyElements.head(n=5)

In [None]:
# Order the rows by bug and then by file, and renumber the index from 0.
BuggyElements = (
    BuggyElements
    .sort_values(by=['bug_id', 'file_path'])
    .reset_index(drop=True)
)

In [None]:
# Display the (rows, columns) dimensions of the buggy-code data.
BuggyElements.shape

In [None]:
# Stack the buggy-code rows followed by the fixed-code rows into one frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# ignore_index=True renumbers the rows so the combined frame has no
# duplicate index labels.
Elements = pd.concat([BuggyElements, fixElements], ignore_index=True)

In [None]:
# Preview the first rows of the combined buggy + fixed data.
Elements.head(n=5)

In [None]:
# Display the (rows, columns) dimensions of the combined data.
Elements.shape

### Replacing M and U edit actions by E edit action and merging component with edit action to ease mining algorithm

In [None]:
# Collapse the "M" and "U" edit actions into the single action "E".
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection acts on a temporary object and leaves the DataFrame
# unchanged when pandas Copy-on-Write is active.
Elements["edit_action"] = Elements["edit_action"].replace({"M": "E", "U": "E"})
# Fuse component and edit action into one "<component>-<action>" item so each
# row contributes a single token to the mining algorithm.
Elements["component-edit_action"] = Elements["component"].astype(str) + "-" + Elements["edit_action"]
# The two source columns are now redundant.
Elements = Elements.drop(columns=["component", "edit_action"])

In [None]:
# Preview the first rows of the combined data at this point in the notebook.
Elements.head(n=5)

### Splitting dataframe into bug_type == Quantum and bug_type == Classical

In [None]:
# Keep only the rows whose bug_type is 'Quantum', renumbering the index from 0.
quantum_elements = (
    Elements
    .loc[Elements['bug_type'].eq('Quantum')]
    .reset_index(drop=True)
)

In [None]:
# Keep only the rows whose bug_type is 'Classical', renumbering the index from 0.
classical_elements = (
    Elements
    .loc[Elements['bug_type'].eq('Classical')]
    .reset_index(drop=True)
)

### Creating list of transactions to feed to mining algorithm

In [None]:
# Number each distinct (bug_id, file_path) pair; rows of the same pair share
# the same 'id' value.
quantum_elements = quantum_elements.assign(
    id=quantum_elements.groupby(by=['bug_id', 'file_path']).ngroup()
)

In [None]:
# Number each distinct (bug_id, file_path) pair; rows of the same pair share
# the same 'id' value.
classical_elements = classical_elements.assign(
    id=classical_elements.groupby(by=['bug_id', 'file_path']).ngroup()
)

In [None]:
# Preview the first rows of the quantum subset.
quantum_elements.head(n=5)

In [None]:
# Build one transaction (list of "component-edit_action" items) per 'id',
# i.e. per (bug_id, file_path) pair.
# Rows that share an id are not necessarily adjacent in quantum_elements
# (the fixed-code rows follow all buggy-code rows), so the items are grouped
# by 'id' rather than appended to the most recently created transaction,
# which would put them in the wrong transaction.
# Transactions are ordered by id; items keep their original row order.
quantum_transactions = [
    items.tolist()
    for _, items in quantum_elements.groupby('id')['component-edit_action']
]

In [None]:
# Build one transaction (list of "component-edit_action" items) per 'id',
# i.e. per (bug_id, file_path) pair.
# Rows that share an id are not necessarily adjacent in classical_elements
# (the fixed-code rows follow all buggy-code rows), so the items are grouped
# by 'id' rather than appended to the most recently created transaction,
# which would put them in the wrong transaction.
# Transactions are ordered by id; items keep their original row order.
classical_transactions = [
    items.tolist()
    for _, items in classical_elements.groupby('id')['component-edit_action']
]

In [None]:
# One-hot encode the quantum transactions: one row per transaction and one
# boolean column per distinct item.
tr_enc = TransactionEncoder()
tr_enc.fit(quantum_transactions)
trans_array = tr_enc.transform(quantum_transactions)
binary_database_quantum = pd.DataFrame(data=trans_array, columns=tr_enc.columns_)
binary_database_quantum

In [None]:
# One-hot encode the classical transactions: one row per transaction and one
# boolean column per distinct item.
tr_enc = TransactionEncoder()
tr_enc.fit(classical_transactions)
trans_array = tr_enc.transform(classical_transactions)
binary_database_classical = pd.DataFrame(data=trans_array, columns=tr_enc.columns_)
binary_database_classical

### Metrics for association rules and/or frequent itemsets

#### Support:
Probability of itemset A occurring, i.e., the fraction of all transactions that contain A - P(A)
#### Confidence:
Likelihood that the consequent B occurs given that the antecedent A occurs - P(A and B) / P(A)
#### Lift:
Confidence(A -> B) / Support(B), i.e., P(A and B) / (P(A) * P(B))

Lift is the strongest metric to assess associations between consequent and antecedent: Lift > 1 means that they occur together more often than would be expected if they were independent.

### Create list of frequent itemsets for quantum and classical binary databases

In [None]:
# Mine the itemsets that appear in at least 20% of the quantum transactions.
frequent_itemsets_quantum = fpgrowth(binary_database_quantum, min_support=0.2, use_colnames=True)
# Display the itemsets with the most frequent first. The fpgrowth result has
# the columns 'support' and 'itemsets'; sorting by any other column name
# raises a KeyError.
frequent_itemsets_quantum.sort_values(by='support', ascending=False)

In [None]:
# Mine the itemsets that appear in at least 20% of the classical transactions.
frequent_itemsets_classical = fpgrowth(
    binary_database_classical,
    min_support=0.2,
    use_colnames=True,
)
frequent_itemsets_classical

### Computing Association rules using lift metric

In [None]:
# Derive rules from the quantum frequent itemsets, keeping those with a lift
# of at least 1.2.
quantum_association_rules = association_rules(
    frequent_itemsets_quantum,
    metric="lift",
    min_threshold=1.2,
)
# Display the rules from lowest to highest lift.
quantum_association_rules.sort_values(by='lift', ascending=True)

In [None]:
# Derive rules from the classical frequent itemsets, keeping those with a lift
# of at least 1.2.
classical_association_rules = association_rules(
    frequent_itemsets_classical,
    metric="lift",
    min_threshold=1.2,
)
# Display the rules from lowest to highest lift.
classical_association_rules.sort_values(by='lift', ascending=True)

## Sending dataframes to csv files

In [None]:
# Write each rule set to its own CSV file in the working directory, with the
# highest-lift rules first and without the index column.
(
    classical_association_rules
    .sort_values(by='lift', ascending=False)
    .to_csv('classical-patterns.csv', index=False)
)
(
    quantum_association_rules
    .sort_values(by='lift', ascending=False)
    .to_csv('quantum-patterns.csv', index=False)
)