In [1]:
import pandas as pd
import os
import xml.etree.ElementTree as ET

In [2]:
def get_xml_data(reviews, idx, data):
    """Append one ``<review>`` element describing a feedback row to ``reviews``.

    Only the id and the review text come from the input; the other children
    (``app_version``, ``user``, ``date``, ``review_title``) are filled with
    fixed placeholder values.

    Parameters
    ----------
    reviews : xml.etree.ElementTree.Element
        Parent element that receives the new ``<review>`` child.
    idx : object
        Row identifier; stored as the ``id`` attribute via ``str(idx)``.
    data : Mapping
        Row record; only ``data["text"]`` is read.

    Returns
    -------
    xml.etree.ElementTree.Element
        The newly created ``<review>`` element.
    """
    review = ET.SubElement(reviews, 'review')
    review.set('id', str(idx))

    # Placeholder metadata, emitted in the same order as before.
    placeholders = (
        ('app_version', '0.0'),
        ('user', 'NA'),
        ('date', '1970-01-01'),
        ('review_title', ''),
    )
    for tag, value in placeholders:
        ET.SubElement(review, tag).text = value

    text = data["text"]
    if text is None or (isinstance(text, float) and text != text):
        # Missing text (None / NaN): ElementTree cannot serialise a float,
        # so emit an empty review body instead of failing at tostring().
        text = ''
    elif not isinstance(text, str):
        # Any other non-string value is serialised through its str() form.
        text = str(text)

    review_text = ET.SubElement(review, 'review_text')
    review_text.text = text

    return review

In [3]:
# Directory that holds all the raw feedback CSV files
raw_data_dir = "./data/raw"

# Directory that receives one XML file per raw dataset
xml_data_dir = "./data/xml"

os.makedirs(xml_data_dir, exist_ok=True)

# Read the files one by one, exporting them to xml
for dataset in os.listdir(raw_data_dir):
    if not dataset.endswith(".csv"):
        # Only CSV exports can be read below; skip sub-directories and other files.
        continue

    # Only the index and the "text" column end up in the XML, so the labels
    # column is deliberately not parsed here.
    df = pd.read_csv(os.path.join(raw_data_dir, dataset), index_col=0)

    # Build the <reviews> document: one <review> child per row
    reviews = ET.Element('reviews')
    for idx, data in df.to_dict("index").items():
        get_xml_data(reviews, idx, data)

    review_xml_str = ET.tostring(reviews)

    # Drop the ".csv" suffix and replace characters that are awkward in file
    # names / command lines.
    dataset_name = dataset[:-4].replace(" ", "_").replace("#", "_")

    xml_path = os.path.join(xml_data_dir, f"{dataset_name}.xml")
    # Explicit encoding so the result does not depend on the platform default.
    with open(xml_path, "w", encoding="utf-8") as xml_file:
        xml_file.write(review_xml_str.decode("utf-8"))

# Make a script to run this in SURF

In [4]:
def cmd_str(
    x,
    surf_dir="C:\\Users\\pdev438\\shared\\panichella-SURF-tool-adcc79b\\SURF-Tool\\SURF-Tool",
    xml_dir="C:\\Users\\pdev438\\projects\\unsupervised-classification-benchmark\\data\\xml",
):
    """Build the Windows command line that runs SURF on one dataset.

    Parameters
    ----------
    x : str
        Dataset name without extension; ``{xml_dir}\\{x}.xml`` is passed as the
        first program argument and ``{x}.xml`` as the second.
    surf_dir : str
        SURF installation directory containing ``lib`` and ``SURF.jar``.
        Defaults to the path originally hard-coded in this notebook.
    xml_dir : str
        Directory holding the exported XML files. Defaults to the path
        originally hard-coded in this notebook.

    Returns
    -------
    str
        A single command line terminated by a newline.
    """
    # ';' is the classpath separator used by the original (Windows) command.
    classpath = f"{surf_dir}\\lib\\*;{surf_dir}\\SURF.jar"
    return f'java -classpath "{classpath}" org.surf.Main {xml_dir}\\{x}.xml {x}.xml \n'

In [5]:
with open("run_surf.bat", "w") as f:
    # One SURF command per raw CSV. The name sanitisation has to match the one
    # used when the XML files were written (spaces AND '#' become '_');
    # otherwise the batch file points at XML files that do not exist.
    f.write("".join(
        cmd_str(dataset_name[:-4].replace(" ", "_").replace("#", "_"))
        for dataset_name in os.listdir(raw_data_dir)
        if dataset_name.endswith(".csv")
    ))

# Run the SURF model over this xml file

In [26]:
# #pd.read_csv(os.path.join("./data/raw", "williams_2017_@snapchat.csv"), index_col = 0).labels.str.split(",").apply(lambda x: x[0]).unique()

# df = pd.read_csv(os.path.join("./data/raw", "tizard_2019_features.csv"), index_col = 0)
# df.labels = df.labels.str.strip("]").str.strip("[").str.split(",")
# df.labels = df.labels.apply(lambda labels: [label.strip("\"").strip("'") for label in labels])
# print(df.shape)
# df.labels.apply(lambda x: x[0]).unique()

In [27]:
# Per dataset family: a list of (ground-truth labels, SURF labels) pairs.
# Each pair becomes one class column during evaluation: a row is positive for
# the class if it carries any label from the corresponding side of the pair.
label_maps = dict(
    chen_2014=[
        (["informative"], ["BUG", "REQUEST", "INFO", "QUESTION"]),
    ],
    ciurumelea_2017=[
        (["OTHER"], ["OTHER"]),
    ],
    di_sorbo_2016=[
        (["[INFO]"], ["INFO"]),
        (["[BUG]"], ["BUG"]),
        (["[REQUEST]"], ["REQUEST"]),
        (["[QUESTION]"], ["QUESTION"]),
    ],
    guzman_2015=[
        (["Bug report"], ["BUG"]),
        (["User request"], ["REQUEST"]),
    ],
    maalej_2016=[
        (["Bug"], ["BUG"]),
        (["Feature"], ["REQUEST"]),
    ],
    scalabrino_2017=[
        (["BUG"], ["BUG"]),
        (["FEATURE"], ["REQUEST"]),
    ],
    tizard_2019=[
        (["apparent bug"], ["BUG"]),
        (["feature request"], ["REQUEST"]),
        (
            [
                "question on application",
                "help seeking",
                "requesting more information",
                "question on background",
            ],
            ["QUESTION"],
        ),
        (
            [
                "application guidance",
                "user setup",
                "praise for application",
                "dispraise for application",
                "application usage",
                "attempted solution",
                "acknowledgement of problem resolution",
            ],
            ["INFO"],
        ),
    ],
    williams_2017=[
        (["bug"], ["BUG"]),
        (["fea"], ["REQUEST"]),
        (["oth"], ["OTHER"]),
    ],
)

In [53]:
from sklearn.metrics import roc_auc_score
import numpy as np

results = {}
XML_dir = "C:\\Users\\pdev438\\shared\\panichella-SURF-tool-adcc79b\\SURF-Tool\\SURF-Tool\\demo-data-set\\"

# Get the central repo that holds all the raw feedback data
raw_data_dir = "./data/raw"


def _surf_labels_by_review(root):
    """Collect the distinct ``sentence_type`` values per ``from_review`` id.

    Walks every ``topic/sentences/sentence`` element below ``root`` and returns
    a dict mapping the ``from_review`` text to a de-duplicated list of the
    ``sentence_type`` texts seen for it.
    """
    labels_by_review = {}
    for topic in root.findall("topic"):
        for sentence in topic.find("sentences").findall("sentence"):
            origin_id = sentence.find("from_review").text
            label = sentence.find("sentence_type").text
            labels_by_review.setdefault(origin_id, set()).add(label)
    return {origin_id: list(labels) for origin_id, labels in labels_by_review.items()}


def _multi_hot(label_lists, label_groups):
    """Return a boolean array of shape (rows, groups).

    Cell (i, j) is True when row i of ``label_lists`` contains at least one
    label from ``label_groups[j]``.
    """
    rows = [
        [any(label in labels for label in group) for group in label_groups]
        for labels in label_lists
    ]
    return np.array(rows, dtype=bool).reshape(len(rows), len(label_groups))


for file in os.listdir(XML_dir):

    if not file.endswith(".xml"):
        continue
    print(file)

    xml_file_dir = os.path.join(XML_dir, file)

    with open(xml_file_dir, "r", encoding="utf-8") as f:
        root = ET.fromstring(f.read())

    # review id (as text) -> distinct SURF sentence types found for that review
    feedback_labels = _surf_labels_by_review(root)

    # NOTE(review): this assumes the SURF output file name maps back to the raw
    # CSV name unchanged; names that were sanitised (spaces / '#') when the XML
    # was exported would not be found here - confirm against the data set.
    csv_file_name = file[:-4] + ".csv"

    df = pd.read_csv(os.path.join(raw_data_dir, csv_file_name), index_col=0)

    # The labels column is stored as text; drop the surrounding brackets, split
    # on commas, then remove every quote character and surrounding whitespace.
    df.labels = df.labels.str.strip("]").str.strip("[").str.split(",")
    df.labels = df.labels.apply(
        lambda labels: [label.replace("'", "").replace("\"", "").strip() for label in labels]
    )

    # Pick the label map whose dataset-family key occurs in the file name.
    matching_maps = [dataset for dataset in label_maps.keys() if dataset in file]
    if not matching_maps:
        raise KeyError(f"No label map defined for SURF output file {file!r}")
    dataset_name = matching_maps[0]
    label_map = label_maps[dataset_name]

    true_label_set = [labels[0] for labels in label_map]
    prediction_label_set = [labels[1] for labels in label_map]

    # Attach the SURF labels to their rows. Rows SURF produced nothing for get
    # ["OTHER"]. Review ids that are absent from the CSV index are reported
    # explicitly instead of silently appending label-less rows to the frame.
    labels_by_row = {int(review_id): labels for review_id, labels in feedback_labels.items()}
    row_ids = df.index.tolist()
    unknown_ids = set(labels_by_row) - set(row_ids)
    if unknown_ids:
        raise ValueError(
            f"{file}: SURF output references review ids missing from "
            f"{csv_file_name}: {sorted(unknown_ids)[:10]}"
        )
    df["surf_labels"] = pd.Series(
        [labels_by_row.get(row_id, ["OTHER"]) for row_id in row_ids],
        index=df.index,
        dtype=object,
    )

    true_lab_arr = _multi_hot(df.labels, true_label_set)
    pred_lab_arr = _multi_hot(df.surf_labels, prediction_label_set)

    # Only find ROC AUC value of labels that are contained in true labels
    class_has_true_values = true_lab_arr.any(axis=0)

    true_lab_arr = true_lab_arr[:, class_has_true_values]
    pred_lab_arr = pred_lab_arr[:, class_has_true_values]

    roc_auc_score_val = roc_auc_score(true_lab_arr, pred_lab_arr)

    results[file[:-4]] = roc_auc_score_val

chen_2014_facebook.xml
chen_2014_swiftkey.xml
chen_2014_tapfish.xml
chen_2014_templerun2.xml
ciurumelea_2017_2048.xml
ciurumelea_2017_Calculator.xml
ciurumelea_2017_CatLog.xml
ciurumelea_2017_Wally.xml
ciurumelea_2017_Xabber.xml
di_sorbo_2016_blinq_summary.xml
di_sorbo_2016_cstp_summary.xml
di_sorbo_2016_doodlePairs_summary.xml
di_sorbo_2016_karaokeFree_summary.xml
di_sorbo_2016_lifelog_summary.xml
di_sorbo_2016_minesweeperReloaded_summary.xml
di_sorbo_2016_movieCreator_summary.xml
di_sorbo_2016_picturexAndroid_summary.xml
di_sorbo_2016_picturexWindowsPhone_summary.xml
di_sorbo_2016_sheepOblock_summary.xml
di_sorbo_2016_sketch_summary.xml
di_sorbo_2016_stoneFlood_summary.xml
di_sorbo_2016_trackID_summary.xml
di_sorbo_2016_video_summary.xml
di_sorbo_2016_weightTrack_summary.xml
di_sorbo_2016_wifiFileTransfer_summary.xml
guzman_2015_Evernote.xml
guzman_2015_Picsart.xml
guzman_2015_Pininterest.xml
guzman_2015_Tripadvisor.xml
guzman_2015_Whatsapp.xml
maalej_2016_310947683.xml
maalej_2016_4

In [54]:
# Display the ROC AUC score stored per SURF output file by the evaluation loop
results

{'chen_2014_facebook': 0.5694599231371626,
 'chen_2014_swiftkey': 0.5865985756093035,
 'chen_2014_tapfish': 0.5872852355468396,
 'chen_2014_templerun2': 0.5922563541777374,
 'ciurumelea_2017_2048': 0.5625,
 'ciurumelea_2017_Calculator': 0.6017374517374516,
 'ciurumelea_2017_CatLog': 0.6254901960784314,
 'ciurumelea_2017_Wally': 0.6071428571428571,
 'ciurumelea_2017_Xabber': 0.5,
 'di_sorbo_2016_blinq_summary': 0.8130252485609628,
 'di_sorbo_2016_cstp_summary': 0.8571428571428572,
 'di_sorbo_2016_doodlePairs_summary': 0.9285714285714286,
 'di_sorbo_2016_karaokeFree_summary': 0.9131944444444444,
 'di_sorbo_2016_lifelog_summary': 0.8535632183908046,
 'di_sorbo_2016_minesweeperReloaded_summary': 0.8870214752567693,
 'di_sorbo_2016_movieCreator_summary': 0.8902207288835196,
 'di_sorbo_2016_picturexAndroid_summary': 0.9285714285714286,
 'di_sorbo_2016_picturexWindowsPhone_summary': 0.875,
 'di_sorbo_2016_sheepOblock_summary': 0.921875,
 'di_sorbo_2016_sketch_summary': 0.8791666666666667,
 'd

In [141]:
# Scratch check: show which CSV file name is currently bound in the kernel
csv_file_name

'chen_2014_swiftkey.csv'

In [34]:
# Scratch re-run: (re)create the surf_labels column with None in every row
df["surf_labels"] = None

In [35]:
# Write each review's SURF labels into the row whose index equals the review id
for review_id, surf_types in feedback_labels.items():
    row_key = int(review_id)
    df.loc[row_key, "surf_labels"] = surf_types

In [36]:
# Rows that received no SURF label (still None) fall back to ["OTHER"]
df["surf_labels"] = df["surf_labels"].map(
    lambda surf_types: surf_types if surf_types is not None else ["OTHER"]
)

In [37]:
# Distinct values of the first SURF label of every row
df["surf_labels"].map(lambda surf_types: surf_types[0]).unique()

array(['OTHER', 'BUG', 'REQUEST', 'INFO', 'QUESTION'], dtype=object)

In [38]:
# Distinct values of the first ground-truth label of every row
df["labels"].map(lambda row_labels: row_labels[0]).unique()

array(['non-informative', 'informative'], dtype=object)

In [59]:
# One-element boolean list per row: does the row carry the "informative" label?
true_bugs = df["labels"].map(lambda row_labels: ["informative" in row_labels])

In [60]:
# One-element boolean list per row: True unless SURF's labels include "OTHER"
pred_bugs = df["surf_labels"].map(lambda surf_types: ["OTHER" not in surf_types])

In [61]:
from sklearn.metrics import roc_auc_score

In [62]:
import numpy as np

In [63]:
# ROC AUC of the single informative-vs-rest column built in the cells above
roc_auc_score(
    np.asarray(true_bugs.tolist()),
    np.asarray(pred_bugs.tolist()),
)

0.5694599231371626