### In this module, we transform the label texts generated using GPT-3 Turbo model  into list of candidate labels 

The text generated from the GPT model contains following format:

```
{label} : {description}
```

To select the labels that best can be used as rate feature for the apps, we use a simple approach in which we simply count the occurrence of the candidate labels. 

We use the top five most frequent labels as the final rate features.

In [3]:
GPT_LABEL_DIR = "../data/gpt labels/davinci/"
domains = ["ride", "health", "investing"]

# 6: two to three words labels
# 7: adjective labels
# 9 : NFR labels
# indices = {6: "two to three words", 7: "adjective", 9: "NFRs "} # chatgpt model
indices = {0: "two to three words", 1: "adjective", 2: "NFRs "}

In [4]:
# 1. get names of all csv files
import os, pandas as pd

domains_labels_dfs = {}

for domain in domains:
    dfs = []
    print("----domain: ", domain, "-----")
    for i in indices.keys():
        print("fetching labels i: ", indices[i])
        label_file = GPT_LABEL_DIR + domain + "_"+str(i)+"_gpt_labels.csv"
        df = pd.read_csv(label_file)
        dfs.append(df)
    domains_labels_dfs[domain] = dfs

----domain:  ride -----
fetching labels i:  two to three words
fetching labels i:  adjective
fetching labels i:  NFRs 
----domain:  health -----
fetching labels i:  two to three words
fetching labels i:  adjective
fetching labels i:  NFRs 
----domain:  investing -----
fetching labels i:  two to three words
fetching labels i:  adjective
fetching labels i:  NFRs 


In [5]:
domains_labels_dfs["ride"][0].head()

Unnamed: 0,params,input,prompt,generator choices,labels
0,{'prefix': '\nGenerate minimum of five labels ...,i am afairlynew lyftriderand iheardheard that ...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7fc669beeef0> JSON: {\n ""...",['Patient Drivers: Drivers wait patiently and ...
1,{'prefix': '\nGenerate minimum of five labels ...,usually cheaper than uber not cheaper than a c...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7fc669c14f40> JSON: {\n ""...","['Cheaper than Uber: Lower cost than Uber.', '..."
2,{'prefix': '\nGenerate minimum of five labels ...,well it's been a roller coaster riding with ly...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7fc669c14900> JSON: {\n ""...",['Unsafe Driver: Unsafe driving and reckless b...
3,{'prefix': '\nGenerate minimum of five labels ...,also will it would be nice if there was an opt...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7fc669c14c70> JSON: {\n ""...",['Time-Flexibility: Allow changing time of rid...
4,{'prefix': '\nGenerate minimum of five labels ...,i hate via they take way to long they take you...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7fc669c14ef0> JSON: {\n ""...",['Inefficient_Service: Poor customer service a...


In [14]:
from collections import Counter
import ast, re
import json
from functools import reduce

def rm_punc(string):
    return re.sub(r'[^\w\s]', "", string)

def str_arr(label_str):
    return label_str.split(",")

def format_labels(labels):
    _labels = []
    for label in labels:
        formatted = label
        if ":" in label:
            formatted = label.split(":")[0]
        elif "-" in label:
            formatted = label.split("-")[0]
        formatted = re.sub(r'[1-9.]', "", formatted) #^a-zA-Z
        formatted = re.sub("and", "", formatted).strip()
        if "," in formatted:
            _labels.extend([rm_punc(item).strip().lower() for item in formatted.split(",")])
        else:
            _labels.append(rm_punc(formatted).strip().lower())
    return list(filter(lambda x: len(x) > 0, _labels))


def format_desc(labels, top_labels):
    label_desc = {}
    for label in labels:
        format_label_res = format_labels([label])
        if format_label_res:
            formatted_label = format_label_res[0]
            desc = label
            if ":" in desc:
                desc = desc.split(":")[1]
            elif "-" in desc:
                desc = desc.split("-")[1]
            label_desc[formatted_label] = desc
    return label_desc

def load_label_counter(df, max_num=10, print_top = True):
    df["label_arr"] = df["labels"].apply(lambda x: ast.literal_eval(x))
    df["label_arr_formatted"] = df["label_arr"].apply(lambda x: format_labels(x))
    all_labels = list(reduce(lambda x,y: x+y, df["label_arr_formatted"].tolist()))
    counter = Counter(all_labels)
    if print_top:
        print("\ntop common labels: ", counter.most_common(max_num))
    return df, counter

def load_label_desc(df, output_file):
    df, counter = load_label_counter(df, 10, False)
    top_labels = [item[0] for item in counter.most_common(10)]
    df["label_desc_formatted"] = df["labels"].apply(lambda x: format_desc(ast.literal_eval(x), top_labels))
    print("\nsaving desc to file: ", output_file, "------\n")
    df.to_csv(output_file, header=True, index=False)
    
    all_labels_dict_arr = df["label_desc_formatted"].tolist()
    print("\ndf desc formatted[0]: ", all_labels_dict_arr[0])
    top_label_merged_descs = {}
    for label_desc_dict in all_labels_dict_arr:
        for label, desc in label_desc_dict.items():
            if label in top_labels:
                if label in list(top_label_merged_descs.keys()):
                    top_label_merged_descs[label].append(desc)
                else:
                    top_label_merged_descs[label] = [desc]
    sorted_top_label_merged_descs = {label: top_label_merged_descs[label] for label in top_labels if label in top_label_merged_descs}
    print("\n\nshowing desc for top labels: \n", sorted_top_label_merged_descs)

    top_dict_desc_file = output_file.split(".")[0] + "_top_only.csv"
#     top_desc_df = pd.DataFrame.from_dict(sorted_top_label_merged_descs, orient='index', columns=['Description'])
#     print("\nsaving top descriptions merged to file: ", top_dict_desc_file)
#     top_desc_df.to_csv(top_dict_desc_file, index=False, header=True)
    return df, sorted_top_label_merged_descs

In [11]:
# TWO TO THREE WORDS LABELS
    
for domain in domains:
    dfs = []
    i = 0
    print("\n\n-------------- DOMAIN: ", domain, "------------------------")
#     print("label type: ", indices[i+6]) # chatgpt
    print("label type: ", indices[i])

    df = domains_labels_dfs[domain][i]
    load_label_counter(df)
    print("\n----loading descriptions for top labels: ---\n")
    desc_file = GPT_LABEL_DIR + "desc/" + domain + "_" + str(i) + "_gpt_label_desc.csv"
    load_label_desc(df, desc_file)



-------------- DOMAIN:  ride ------------------------
label type:  two to three words

top common labels:  [('price gouging', 3), ('unreliable', 2), ('patient drivers', 1), ('good service', 1), ('flexible tips', 1), ('switch drivers', 1), ('cancel rides', 1), ('cheaper than uber', 1), ('reliable', 1), ('wait  save', 1)]

----loading descriptions for top labels: ---


saving desc to file:  ../data/gpt labels/davinci/desc/ride_0_gpt_label_desc.csv ------


df desc formatted[0]:  {'patient drivers': ' Drivers wait patiently and do not charge extra for late pickups.', 'good service': ' Clean cars and excellent service.', 'flexible tips': ' Drivers are flexible with tips.', 'switch drivers': ' Drivers can switch in the middle of a trip.', 'cancel rides': ' Rides can be cancelled without notice.'}


showing desc for top labels: 
 {'price gouging': [' Overcharging excessively.', ' Charging more than advertised.', ' Prices increase during high demand.'], 'unreliable': [' Unreliable service w

In [12]:
# ADJECTIVE LABELS

for domain in domains:
    dfs = []
    i = 1
    print("\n\n----domain: ", domain, "-----")
    print("label type: ", indices[i]) # +6 for chatgpt labels
    df = domains_labels_dfs[domain][i]
    load_label_counter(df)
    print("\n----loading descriptions for top labels: ---\n")
    desc_file = GPT_LABEL_DIR + "desc/" + domain + "_" + str(i) + "_gpt_label_desc.csv"
    load_label_desc(df, desc_file)



----domain:  ride -----
label type:  adjective

top common labels:  [('unreliable', 4), ('unsafe', 2), ('unresponsive', 2), ('unhelpful', 2), ('expensive', 2), ('reliable', 1), ('affordable', 1), ('flexible', 1), ('unpredictable', 1), ('inconsistent', 1)]

----loading descriptions for top labels: ---


saving desc to file:  ../data/gpt labels/davinci/desc/ride_1_gpt_label_desc.csv ------




showing desc for top labels: 


----domain:  health -----
label type:  adjective

top common labels:  [('misleading', 3), ('unhelpful', 2), ('compassionate', 2), ('motivating', 2), ('expensive', 2), ('unfair', 2), ('unprofessional', 1), ('discriminatory', 1), ('inconvenient', 1), ('informative', 1)]

----loading descriptions for top labels: ---


saving desc to file:  ../data/gpt labels/davinci/desc/health_1_gpt_label_desc.csv ------


df desc formatted[0]:  {'unhelpful': " concise description of customer service's lack of assistance", 'unprofessional': " concise description of counselor's inappr

In [15]:
# NFR LABELS

for domain in domains:
    dfs = []
    i = 2
    print("\n\n----domain: ", domain, "-----")
    print("label type: ", indices[i]) # 9 chatgpt
    df = domains_labels_dfs[domain][i]
    load_label_counter(df)
    print("\n----loading descriptions for top labels: ---\n")
    desc_file = GPT_LABEL_DIR + "desc/" + domain + "_" + str(i) + "_gpt_label_desc.csv"
    load_label_desc(df, desc_file)



----domain:  ride -----
label type:  NFRs 

top common labels:  [('reliability', 4), ('unreliable service', 3), ('security', 2), ('inadequate support', 2), ('customer service', 2), ('timeliness', 1), ('communication', 1), ('driver switching', 1), ('cancellations', 1), ('price gouging', 1)]

----loading descriptions for top labels: ---


saving desc to file:  ../data/gpt labels/davinci/desc/ride_2_gpt_label_desc.csv ------


df desc formatted[0]:  {'reliability': ' Drivers are patient and reliable, but app can be unreliable.', 'timeliness': ' Drivers are usually on time, but app can be late.', 'communication': ' App has issues with communication and messaging.', 'driver switching': ' App can switch drivers while waiting for pickup.', 'cancellations': ' App can cancel rides while waiting for pickup.'}


showing desc for top labels: 
 {'reliability': [' Drivers are patient and reliable, but app can be unreliable.', ' Usually arrives within 5-10 minutes, never more than 15.', ' Ensure co