In [1]:
import pandas as pd
import os
import xml.etree.ElementTree as ET

In [2]:
def get_xml_data(reviews, idx, data):
    review = ET.SubElement(reviews, 'review')
    review.set('id', str(idx))
    
    app_ver = ET.SubElement(review, 'app_version')
    app_ver.text = '0.0'
    
    user = ET.SubElement(review, 'user')
    user.text = 'NA'
    
    date = ET.SubElement(review, 'date')
    date.text = '1970-01-01'
    
    review_title = ET.SubElement(review, 'review_title')
    review_title.text = ''
    
    review_text = ET.SubElement(review, 'review_text')
    review_text.text = data["text"]

In [3]:

# create the file structure
reviews = ET.Element('reviews')

# Get the central repo that holds all the raw feedback data
raw_data_dir = "./data/raw"

# Get the central repo that will hold all the XML
xml_data_dir = "./data/xml"

os.mkdir(xml_data_dir)

# Read the files one by one, exporting them to xml
for dataset in os.listdir(raw_data_dir):
    df = pd.read_csv(os.path.join(raw_data_dir, dataset), index_col = 0)
    df.labels = df.labels.str.strip("']").str.strip("['").str.split(",")
    
    for idx, data in df.to_dict("index").items():
        get_xml_data(reviews, idx, data)
        
    review_xml_str = ET.tostring(reviews)
    
    with open(os.path.join(xml_data_dir, f"{dataset[:-4]}.xml"), "w") as xml_file:
        xml_file.write(review_xml_str.decode("utf-8"))

# Make a script to run this in SURF

In [6]:
cmd_str = lambda x: f"""java -classpath "C:\\Users\\pdev438\\shared\\panichella-SURF-tool-adcc79b\\SURF-Tool\\SURF-Tool\\lib\\*;C:\\Users\\pdev438\\shared\\panichella-SURF-tool-adcc79b\\SURF-Tool\\SURF-Tool\\SURF.jar" org.surf.Main C:\\Users\\pdev438\\projects\\unsupervised-classification-benchmark\\data\\xml\\{x}.xml {x}.xml \n"""

In [7]:
with open("run_surf.bat", "w") as f:
    cmd_str_text = ""
    for dataset_name in os.listdir(raw_data_dir):
        cmd_str_text += cmd_str(dataset_name[:-4])
    f.write(cmd_str_text)

# Run the SURF model over this xml file

In [65]:
xml_dir = "C:\\Users\\pdev438\\shared\\panichella-SURF-tool-adcc79b\\SURF-Tool\\SURF-Tool\\demo-data-set\\outputFile2.xml"

with open(xml_dir, "r") as f:
    root = ET.fromstring(f.read())

In [67]:
root.find("reviews_summary")

In [73]:
labelled_sentences = []

[labelled_sentences.extend(x.find("sentences").findall("sentence")) for x in root.findall("topic")]

[<Element 'sentence' at 0x0000026D9A9DB638>,
 <Element 'sentence' at 0x0000026D9A9DB778>,
 <Element 'sentence' at 0x0000026D9A9DB908>,
 <Element 'sentence' at 0x0000026D9A9DBA48>,
 <Element 'sentence' at 0x0000026D9A9DBB88>,
 <Element 'sentence' at 0x0000026D9A9DBCC8>,
 <Element 'sentence' at 0x0000026D9A9DBE08>,
 <Element 'sentence' at 0x0000026D9A9DBF48>,
 <Element 'sentence' at 0x0000026D9A9DD0E8>,
 <Element 'sentence' at 0x0000026D9A9DD228>,
 <Element 'sentence' at 0x0000026D9A9DD368>,
 <Element 'sentence' at 0x0000026D9A9DD4A8>,
 <Element 'sentence' at 0x0000026D9A9DD5E8>,
 <Element 'sentence' at 0x0000026D9A9DD728>,
 <Element 'sentence' at 0x0000026D9A9DD868>,
 <Element 'sentence' at 0x0000026D9A9DD9A8>,
 <Element 'sentence' at 0x0000026D9A9DDB88>,
 <Element 'sentence' at 0x0000026D9A9DDCC8>,
 <Element 'sentence' at 0x0000026D9A9DDEA8>,
 <Element 'sentence' at 0x0000026D9A9DE048>,
 <Element 'sentence' at 0x0000026D9A9DE188>,
 <Element 'sentence' at 0x0000026D9A9DE2C8>,
 <Element 

In [77]:
feedback_labels = {}

def extract_labels_for_sentences(sentence_tag):
    label = sentence_tag.find("sentence_type").text
    origin_id = sentence_tag.find("from_review").text
    
    if origin_id in feedback_labels.keys():
        feedback_labels[origin_id].append(label)
    else:
        feedback_labels[origin_id] = [label]

In [78]:
[extract_labels_for_sentences(x) for x in labelled_sentences]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [85]:
feedback_labels = {k: list(set(v)) for k,v in feedback_labels.items()}

In [98]:
df["surf_labels"] = None

In [99]:
for index, labels in feedback_labels.items():
    df.loc[int(index), "surf_labels"] = labels

In [100]:
df["surf_labels"] = df["surf_labels"].apply(lambda x: ["OTHER"] if x is None else x)

In [112]:
df.surf_labels.apply(lambda x: x[0]).unique()

array(['INFO', 'BUG', 'REQUEST', 'OTHER', 'QUESTION'], dtype=object)

In [122]:
df.labels.apply(lambda x: x[0]).unique()

array(['oth', 'bug', 'fea'], dtype=object)

In [141]:
true_bugs = df.labels.apply(lambda x: ["bug" in x, "fea" in x, "oth" in x])

In [143]:
pred_bugs = df.surf_labels.apply(lambda x: ["BUG" in x, "REQUEST" in x, "OTHER" in x or "INFO" in x or "QUESTION" in x])

In [139]:
from sklearn.metrics import roc_auc_score

In [153]:
roc_auc_score(np.asarray(true_bugs.values.reshape(-1).tolist()), np.asarray(pred_bugs.values.reshape(-1).tolist()))

0.5756305249538035

In [147]:
import numpy as np



array([[False, False,  True],
       [ True, False,  True],
       [False,  True, False],
       ...,
       [False, False,  True],
       [False, False,  True],
       [False, False,  True]])