In [9]:
import pandas as pd
import os
import xml.etree.ElementTree as ET

In [10]:
def get_xml_data(reviews, idx, data):
    """Append one ``<review>`` element for a feedback row to ``reviews``.

    Only the review id and the review text come from the arguments; the
    app version, user, date and title children are filled with fixed
    placeholder values.

    Parameters
    ----------
    reviews : xml.etree.ElementTree.Element
        Parent element that receives the new ``<review>`` child.
    idx : object
        Review identifier; stored as ``str(idx)`` in the ``id`` attribute.
    data : mapping
        Row contents. Only ``data["text"]`` is read.

    Returns
    -------
    xml.etree.ElementTree.Element
        The newly created ``<review>`` element.

    Raises
    ------
    KeyError
        If ``data`` has no ``"text"`` entry.
    """
    review = ET.SubElement(reviews, 'review')
    review.set('id', str(idx))

    # Children whose content is a constant placeholder, in document order.
    placeholders = (
        ('app_version', '0.0'),
        ('user', 'NA'),
        ('date', '1970-01-01'),
        ('review_title', ''),
    )
    for tag, value in placeholders:
        ET.SubElement(review, tag).text = value

    text = data["text"]
    # ElementTree can only serialize string text. A missing value (None, or
    # the float NaN that pandas uses for empty cells) becomes an empty string
    # and any other non-string value is converted, so that ET.tostring() does
    # not fail later on.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)
    ET.SubElement(review, 'review_text').text = text

    return review

In [3]:
# Directory that holds the raw feedback datasets (one CSV file per dataset)
raw_data_dir = "./data/raw"

# Directory that receives one XML file per dataset
xml_data_dir = "./data/xml"

os.makedirs(xml_data_dir, exist_ok=True)

# Convert every raw CSV dataset into a <reviews> XML document
for dataset in os.listdir(raw_data_dir):
    dataset_stem, extension = os.path.splitext(dataset)
    # Skip directory entries that are not CSV files (sub-directories, stray
    # files); they cannot be loaded as a dataset.
    if extension.lower() != ".csv":
        continue

    df = pd.read_csv(os.path.join(raw_data_dir, dataset), index_col=0)

    # Root element collecting one <review> child per row; get_xml_data only
    # reads the "text" entry of each row.
    reviews = ET.Element('reviews')
    for idx, data in df.to_dict("index").items():
        get_xml_data(reviews, idx, data)

    # ET.tostring() returns bytes when no encoding is given
    review_xml_str = ET.tostring(reviews)

    # Spaces and '#' are replaced in the name of the output file
    dataset_name = dataset_stem.replace(" ", "_").replace("#", "_")

    xml_path = os.path.join(xml_data_dir, f"{dataset_name}.xml")
    with open(xml_path, "w", encoding="utf-8") as xml_file:
        xml_file.write(review_xml_str.decode("utf-8"))

# Make a script to run this in SURF

Download SURF from https://zenodo.org/record/165128

In [15]:
# NOTE: both paths are absolute and specific to one machine; adjust them to
# the local SURF unpack location and XML directory before running.
surf_unzip_dir = "D:\\peter_devine_projects\\SURF-tool-SURF-v1.0\\panichella-SURF-tool-adcc79b"
xml_absolute_data_dir = "D:\\peter_devine_projects\\unsupervised-classification-benchmark\\data\\xml"


def cmd_str(x):
    """Return the batch-file line that runs SURF on dataset ``x``.

    Parameters
    ----------
    x : str
        Dataset name without extension. ``<x>.xml`` inside
        ``xml_absolute_data_dir`` is passed as the first program argument and
        the bare file name ``<x>.xml`` as the second one (presumably the
        output file - confirm against the SURF documentation).

    Returns
    -------
    str
        A single ``java`` command line terminated by a newline. The classpath
        and both file arguments are wrapped in double quotes so that paths
        containing spaces or shell-special characters stay one argument.
    """
    surf_tool_dir = f"{surf_unzip_dir}\\SURF-Tool\\SURF-Tool"
    classpath = f"{surf_tool_dir}\\lib\\*;{surf_tool_dir}\\SURF.jar"
    input_xml = f"{xml_absolute_data_dir}\\{x}.xml"
    return f'java -classpath "{classpath}" org.surf.Main "{input_xml}" "{x}.xml"\n'

In [14]:
# Write one SURF invocation per raw CSV dataset into run_surf.bat
with open("run_surf.bat", "w") as f:
    command_lines = []
    # sorted() keeps the order of the commands stable between runs
    for raw_file_name in sorted(os.listdir(raw_data_dir)):
        dataset_stem, extension = os.path.splitext(raw_file_name)
        # Directory entries that are not CSV files are not datasets
        if extension.lower() != ".csv":
            continue

        # Same replacement of spaces and '#' that is used for the XML file names
        dataset_name = dataset_stem.replace(" ", "_").replace("#", "_")

        command_lines.append(cmd_str(dataset_name))

    cmd_str_text = "".join(command_lines)
    f.write(cmd_str_text)

Run the `run_surf.bat` file that is now in your current working directory.

# Run the SURF model over this xml file

In [26]:
# Disabled scratch code, kept for reference: lists the distinct first label of a raw dataset.
# #pd.read_csv(os.path.join("./data/raw", "williams_2017_@snapchat.csv"), index_col = 0).labels.str.split(",").apply(lambda x: x[0]).unique()

# df = pd.read_csv(os.path.join("./data/raw", "tizard_2019_features.csv"), index_col = 0)
# df.labels = df.labels.str.strip("]").str.strip("[").str.split(",")
# df.labels = df.labels.apply(lambda labels: [label.strip("\"").strip("'") for label in labels])
# print(df.shape)
# df.labels.apply(lambda x: x[0]).unique()

In [5]:
raw_data_dir = "./data/raw"

# List the directory once so that the names and the mapping below are built
# from the same snapshot instead of pairing two separate listings.
raw_file_names = os.listdir(raw_data_dir)

# File names (extension included) with spaces and '#' replaced by underscores
sanitized_dataset_names = [name.replace(" ", "_").replace("#", "_") for name in raw_file_names]

# Maps every sanitized file name back to the file name found on disk
sanitized_name_dict = dict(zip(sanitized_dataset_names, raw_file_names))

In [39]:
from sklearn.metrics import roc_auc_score
import numpy as np

# Nothing in this cell adds entries to `results`; it is only (re)initialised here.
results = {}
# NOTE: absolute, machine-specific locations of the SURF output XML files and
# of the directory that receives the comparison CSV files.
XML_dir = "D:\\peter_devine_projects\\unsupervised-classification-benchmark\\data\\ar_doc_output"
comparison_dir = "D:\\peter_devine_projects\\unsupervised-classification-benchmark\\data\\supervised_cls_preds\\ar_doc"

# Directory that holds the raw feedback datasets
raw_data_dir = "./data/raw"

# to_csv() does not create missing directories
os.makedirs(comparison_dir, exist_ok=True)


def _collect_surf_labels(root):
    """Group the sentence types of a parsed SURF document by originating review.

    Every ``<sentence>`` found under ``<topic>/<sentences>`` contributes the
    text of its ``<sentence_type>`` child to the review named by the text of
    its ``<from_review>`` child.

    Returns a dict mapping ``int(review id)`` to the list of distinct
    sentence types of that review, in order of first appearance.
    """
    labels_by_review = {}
    for topic in root.findall("topic"):
        sentences = topic.find("sentences")
        if sentences is None:
            # A topic without a <sentences> child contributes no labels
            continue
        for sentence in sentences.findall("sentence"):
            label = sentence.find("sentence_type").text
            origin_id = sentence.find("from_review").text
            labels_by_review.setdefault(origin_id, []).append(label)

    # dict.fromkeys() removes duplicates while keeping a reproducible order;
    # the iteration order of a set of strings can differ between interpreter runs.
    return {
        int(origin_id): list(dict.fromkeys(labels))
        for origin_id, labels in labels_by_review.items()
    }


for file in os.listdir(XML_dir):

    if not file.endswith(".xml"):
        continue
    print(file)

    dataset_stem = file[:-len(".xml")]
    xml_file_dir = os.path.join(XML_dir, file)

    with open(xml_file_dir, "r", encoding="utf-8") as f:
        root = ET.fromstring(f.read())

    feedback_labels = _collect_surf_labels(root)

    # The XML files are named after the sanitized dataset name; look up the
    # file name of the matching raw CSV.
    sanitized_csv_file_name = dataset_stem + ".csv"
    real_csv_name = sanitized_name_dict[sanitized_csv_file_name]

    df = pd.read_csv(os.path.join(raw_data_dir, real_csv_name), index_col=0)
    # Split the bracketed, comma-separated labels string into a list and
    # remove every quote character and the surrounding whitespace of each label.
    df.labels = df.labels.str.strip("]").str.strip("[").str.split(",")
    df.labels = df.labels.apply(lambda labels: [label.replace("'", "").replace("\"", "").strip() for label in labels])

    # Explicit column names keep the frame well-formed (and set_index valid)
    # even when SURF produced no labelled sentence for this dataset.
    surf_df = pd.DataFrame(list(feedback_labels.items()), columns=[0, "pred_labels"]).set_index(0)
    df = df.join(surf_df, how="outer")

    # Rows without a SURF prediction hold NaN after the join; label them "OTHER"
    df["pred_labels"] = df["pred_labels"].apply(lambda x: x if isinstance(x, list) else ["OTHER"])

    df.to_csv(os.path.join(comparison_dir, dataset_stem + ".csv"))

chen_2014_facebook.xml
['non-informative' 'informative']
chen_2014_swiftkey.xml
['informative' 'non-informative']
chen_2014_tapfish.xml
['non-informative' 'informative']
chen_2014_templerun2.xml
['non-informative' 'informative']
ciurumelea_2017_2048.xml
['PRICING' 'USAGE' 'OTHER' 'COMPATIBILITY' 'RESSOURCES']
ciurumelea_2017_Abstract_Art.xml
['OTHER' 'USAGE' 'PRICING' 'COMPATIBILITY' 'RESSOURCES' 'PROTECTION']
ciurumelea_2017_AcDisplay.xml
['OTHER' 'PROTECTION' 'COMPATIBILITY' 'USAGE' 'RESSOURCES' 'PRICING']
ciurumelea_2017_Adblock_Plus.xml
['OTHER' 'PROTECTION' 'COMPATIBILITY' 'RESSOURCES' 'USAGE' 'PRICING']
ciurumelea_2017_Amaze_File_Manager.xml
['OTHER' 'RESSOURCES' 'USAGE' 'PRICING' 'COMPATIBILITY']
ciurumelea_2017_Autostarts.xml
['OTHER' 'PROTECTION' 'COMPATIBILITY' 'RESSOURCES' 'USAGE' 'PRICING']
ciurumelea_2017_A_Comic_Viewer.xml
['OTHER' 'RESSOURCES' 'PRICING' 'USAGE' 'COMPATIBILITY']
ciurumelea_2017_BatteryBot_Battery_Indicator.xml
['RESSOURCES' 'PROTECTION' 'PRICING' 'USAGE' 

['fea' 'oth' 'bug']
williams_2017_@windows.xml
['oth' 'bug' 'fea']


In [38]:
# NOTE(review): the visible cells only initialise `results` as an empty dict and never
# add entries to it, and this cell's execution count precedes that cell's - the output
# shown below presumably comes from an earlier version of the code; confirm.
results

{'chen_2014_facebook': 0.5694599231371626,
 'chen_2014_swiftkey': 0.5413841343920979,
 'chen_2014_tapfish': 0.5195734498428121,
 'chen_2014_templerun2': 0.5243267558360264,
 'ciurumelea_2017_2048': 0.5625,
 'ciurumelea_2017_Abstract_Art': 0.6605894105894106,
 'ciurumelea_2017_AcDisplay': 0.48886174890083045,
 'ciurumelea_2017_Adblock_Plus': 0.596153846153846,
 'ciurumelea_2017_Amaze_File_Manager': 0.48897058823529416,
 'ciurumelea_2017_Autostarts': 0.4858585858585859,
 'ciurumelea_2017_A_Comic_Viewer': 0.4950809592129535,
 'ciurumelea_2017_BatteryBot_Battery_Indicator': 0.6522988505747127,
 'ciurumelea_2017_Calculator': 0.6017374517374516,
 'ciurumelea_2017_CatLog': 0.6254901960784314,
 'ciurumelea_2017_Duck_Duck_GO': 0.652139037433155,
 'ciurumelea_2017_Financius_-_Expense_Manager': 0.4375,
 'ciurumelea_2017_Muzei_Live_Wallpaper': 0.65,
 'ciurumelea_2017_Turbo_Editor_(_Text_Editor_)': 0.16666666666666669,
 'ciurumelea_2017_Tweet_Lanes': 0.375,
 'ciurumelea_2017_Wally': 0.6071428571428