In [9]:
import pandas as pd
import os
import xml.etree.ElementTree as ET

In [10]:
def get_xml_data(reviews, idx, data):
    review = ET.SubElement(reviews, 'review')
    review.set('id', str(idx))
    
    app_ver = ET.SubElement(review, 'app_version')
    app_ver.text = '0.0'
    
    user = ET.SubElement(review, 'user')
    user.text = 'NA'
    
    date = ET.SubElement(review, 'date')
    date.text = '1970-01-01'
    
    review_title = ET.SubElement(review, 'review_title')
    review_title.text = ''
    
    review_text = ET.SubElement(review, 'review_text')
    review_text.text = data["text"]

In [3]:
# Get the central repo that holds all the raw feedback data
raw_data_dir = "./data/raw"

# Get the central repo that will hold all the XML
xml_data_dir = "./data/xml"

os.makedirs(xml_data_dir, exist_ok=True)

# Read the files one by one, exporting them to xml
for dataset in os.listdir(raw_data_dir):
    df = pd.read_csv(os.path.join(raw_data_dir, dataset), index_col = 0)
    df.labels = df.labels.str.strip("']").str.strip("['").str.split(",")
    
    # create the file structure
    reviews = ET.Element('reviews')
    
    for idx, data in df.to_dict("index").items():
        get_xml_data(reviews, idx, data)
        
    review_xml_str = ET.tostring(reviews)
    
    dataset_name = dataset[:-4]
        
    dataset_name = dataset_name.replace(" ", "_").replace("#", "_")

    with open(os.path.join(xml_data_dir, f"{dataset_name}.xml"), "w") as xml_file:
        xml_file.write(review_xml_str.decode("utf-8"))

# Make a script to run this in SURF

Donload SURF from https://zenodo.org/record/165128

In [15]:
surf_unzip_dir = "D:\\peter_devine_projects\\SURF-tool-SURF-v1.0\\panichella-SURF-tool-adcc79b"
xml_absolute_data_dir = "D:\\peter_devine_projects\\unsupervised-classification-benchmark\\data\\xml"

cmd_str = lambda x: f"""java -classpath "{surf_unzip_dir}\\SURF-Tool\\SURF-Tool\\lib\\*;{surf_unzip_dir}\\SURF-Tool\\SURF-Tool\\SURF.jar" org.surf.Main {xml_absolute_data_dir}\\{x}.xml {x}.xml \n"""

In [14]:
with open("run_surf.bat", "w") as f:
    cmd_str_text = ""
    for dataset_name in os.listdir(raw_data_dir):
        
        dataset_name = dataset_name[:-4].replace(" ", "_").replace("#", "_")
        
        cmd_str_text += cmd_str(dataset_name)
    f.write(cmd_str_text)

Run the run_surf.bat file that is now in your cwd

# Run the SURF model over this xml file

In [26]:
# #pd.read_csv(os.path.join("./data/raw", "williams_2017_@snapchat.csv"), index_col = 0).labels.str.split(",").apply(lambda x: x[0]).unique()

# df = pd.read_csv(os.path.join("./data/raw", "tizard_2019_features.csv"), index_col = 0)
# df.labels = df.labels.str.strip("]").str.strip("[").str.split(",")
# df.labels = df.labels.apply(lambda labels: [label.strip("\"").strip("'") for label in labels])
# print(df.shape)
# df.labels.apply(lambda x: x[0]).unique()

In [1]:
label_maps = {
    "chen_2014": [(["informative"], ["BUG", 'REQUEST', 'INFO', 'QUESTION'])],
    "ciurumelea_2017": [(["OTHER"], ["OTHER"])],
    "di_sorbo_2016": [(["[INFO]"], ["INFO"]), (["[BUG]"], ["BUG"]), (["[REQUEST]"], ["REQUEST"]), (["[QUESTION]"], ["QUESTION"])],
    "guzman_2015": [(["Bug report"], ["BUG"]), (["User request"], ["REQUEST"])],
    "maalej_2016": [(["Bug"], ["BUG"]), (["Feature"], ["REQUEST"])],
    "scalabrino_2017": [(["BUG"], ["BUG"]), (["FEATURE"], ["REQUEST"])],
    "tizard_2019": [(["apparent bug"], ["BUG"]), (["feature request"], ["REQUEST"]), (["question on application", "help seeking", "requesting more information", "question on background"], ["QUESTION"]), (["application guidance", "user setup", "praise for application", "dispraise for application", "application usage", "attempted solution", "acknowledgement of problem resolution"], ["INFO"])],
    "williams_2017": [(["bug"], ["BUG"]), (["fea"], ["REQUEST"]), (["oth"], ["OTHER"])],
}

In [5]:
raw_data_dir = "./data/raw"

sanitized_dataset_names = [x.replace(" ", "_").replace("#", "_") for x in os.listdir(raw_data_dir)]

sanitized_name_dict = {sanitized: original for original, sanitized in zip(os.listdir(raw_data_dir), sanitized_dataset_names)}

In [8]:
from sklearn.metrics import roc_auc_score
import numpy as np

results = {}
XML_dir = "D:\\peter_devine_projects\\unsupervised-classification-benchmark\\data\\ar_doc_output"
comparison_dir = "D:\\peter_devine_projects\\unsupervised-classification-benchmark\\data\\supervised_cls_preds"

for file in os.listdir(XML_dir):
    
    if file[-4:] != ".xml":
        continue
    print(file)

    xml_file_dir = os.path.join(XML_dir, file)

    with open(xml_file_dir, "r", encoding="utf-8") as f:
        xml_str_data = f.read()
        root = ET.fromstring(xml_str_data)

    labelled_sentences = []

    [labelled_sentences.extend(x.find("sentences").findall("sentence")) for x in root.findall("topic")]

    feedback_labels = {}

    def extract_labels_for_sentences(sentence_tag):
        label = sentence_tag.find("sentence_type").text
        origin_id = sentence_tag.find("from_review").text

        if origin_id in feedback_labels.keys():
            feedback_labels[origin_id].append(label)
        else:
            feedback_labels[origin_id] = [label]

    [extract_labels_for_sentences(x) for x in labelled_sentences]

    feedback_labels = {k: list(set(v)) for k,v in feedback_labels.items()}

    # Get the central repo that holds all the raw feedback data
    raw_data_dir = "./data/raw"
    
    sanitized_csv_file_name = file[:-4] + ".csv"
    real_csv_name = sanitized_name_dict[sanitized_csv_file_name]

    df = pd.read_csv(os.path.join(raw_data_dir, real_csv_name), index_col = 0)
    df.labels = df.labels.str.strip("]").str.strip("[").str.split(",")
    df.labels = df.labels.apply(lambda labels: [label.strip("\"").strip("'") for label in labels])
    df.labels = df.labels.apply(lambda labels: [label.replace("'", "").replace("\"", "").strip() for label in labels])
    print(df.labels.apply(lambda x: x[0]).unique())
    
    dataset_name = [dataset for dataset in label_maps.keys() if dataset in file][0]
    label_map = label_maps[dataset_name]
    
    true_label_set = [labels[0] for labels in label_map]
    prediction_label_set = [labels[1] for labels in label_map]
    
    df["surf_labels"] = None
    
    for index, labels in feedback_labels.items():
        df.loc[int(index), "surf_labels"] = labels
        
    df["surf_labels"] = df["surf_labels"].apply(lambda x: ["OTHER"] if x is None else x)
    
    df.to_csv(os.path.join(comparison_dir, file[:-4]+".csv"))
    
    true_labels = df.labels.apply(lambda x: [any([true_label in x for true_label in true_label_list]) for true_label_list in true_label_set])
    pred_labels = df.surf_labels.apply(lambda x: [any([pred_label in x for pred_label in prediction_label_list]) for prediction_label_list in prediction_label_set])
    
    true_lab_arr = np.asarray(true_labels.values.reshape(-1).tolist())
    pred_lab_arr = np.asarray(pred_labels.values.reshape(-1).tolist())
    
    # Only find ROC AUC value of labels that are contained in true labels
    class_has_true_values = true_lab_arr.any(axis=0)
    
    if class_has_true_values.sum() < 1:
        continue
    
    true_lab_arr = true_lab_arr[:, class_has_true_values]
    pred_lab_arr = pred_lab_arr[:, class_has_true_values]
    
    roc_auc_score_val = roc_auc_score(true_lab_arr, pred_lab_arr)
    
    results[file[:-4]] = roc_auc_score_val

chen_2014_facebook.xml


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa8 in position 231476: invalid start byte

In [22]:
results

{'chen_2014_facebook': 0.5694599231371626,
 'chen_2014_swiftkey': 0.5865985756093035,
 'chen_2014_tapfish': 0.5872852355468396,
 'chen_2014_templerun2': 0.5922563541777374,
 'ciurumelea_2017_2048': 0.5625,
 'ciurumelea_2017_Abstract_Art': 0.6605894105894106,
 'ciurumelea_2017_AcDisplay': 0.48886174890083045,
 'ciurumelea_2017_Adblock_Plus': 0.596153846153846,
 'ciurumelea_2017_Amaze_File_Manager': 0.48897058823529416,
 'ciurumelea_2017_Autostarts': 0.4858585858585859,
 'ciurumelea_2017_A_Comic_Viewer': 0.4950809592129535,
 'ciurumelea_2017_BatteryBot_Battery_Indicator': 0.6522988505747127,
 'ciurumelea_2017_Calculator': 0.6017374517374516,
 'ciurumelea_2017_CatLog': 0.6254901960784314,
 'ciurumelea_2017_Duck_Duck_GO': 0.652139037433155,
 'ciurumelea_2017_Financius_-_Expense_Manager': 0.4375,
 'ciurumelea_2017_Muzei_Live_Wallpaper': 0.65,
 'ciurumelea_2017_Turbo_Editor_(_Text_Editor_)': 0.16666666666666669,
 'ciurumelea_2017_Tweet_Lanes': 0.375,
 'ciurumelea_2017_Wally': 0.6071428571428

In [23]:
results_cos = pd.read_csv("./results/roc_auc_cosine.csv", index_col=0)

In [24]:
dataset_names = [x[:-3] for x in os.listdir("./data/downloaders") if x[-3:] == ".py"]

In [26]:
def get_per_dataset_results(results_df):    
    return [results_df[[col for col in results_df.columns if dataset_name in col]].T.mean() for dataset_name in dataset_names]

In [36]:
bert_mean = pd.DataFrame(get_per_dataset_results(results_cos), index=dataset_names)["bert_large_nli_mean_tokens.csv"]

In [43]:
ar_doc = pd.DataFrame(get_per_dataset_results(pd.DataFrame({"AR-Doc": results}).T), index=dataset_names)["AR-Doc"]

In [45]:
pd.DataFrame({"s-bert": bert_mean.values, "AR-doc": ar_doc.values}, index=dataset_names)

Unnamed: 0,s-bert,AR-doc
chen_2014,0.669321,0.5839
ciurumelea_2017,0.681986,0.532117
di_sorbo_2016,0.66121,0.878831
guzman_2015,0.758384,0.675908
maalej_2016,0.78587,0.523947
scalabrino_2017,0.66658,0.616997
tizard_2019,0.649695,0.576109
williams_2017,0.587251,0.596712


In [41]:
bert_mean.values

array([0.66932053, 0.68198576, 0.66121034, 0.75838379, 0.7858703 ,
       0.66657966, 0.64969485, 0.58725089])

In [42]:
ar_doc.values.resa

array([[0.58390002],
       [0.53211707],
       [0.87883127],
       [0.67590774],
       [0.52394739],
       [0.61699735],
       [0.57610921],
       [0.59671184]])

In [48]:
di_sorbos = [x for x in os.listdir("./data/raw") if "di_sorbo" in x]

In [50]:
sizes = [pd.read_csv(os.path.join(raw_data_dir, di), index_col = 0).shape for di in di_sorbos]

In [52]:
sum([x[0] for x in sizes])

1370