### In this module, we transform the label texts generated using the GPT-3 Turbo model into a list of candidate labels

The text generated by the GPT model has the following format:

```
{label} : {description}
```

To select the labels that can best be used as rate features for the apps, we use a simple approach: we count the occurrences of each candidate label.

We use the top five most frequent labels as the final rate features.

In [18]:
# Directory that holds the GPT label CSV files. The trailing slash matters:
# file paths are built from it by plain string concatenation.
GPT_LABEL_DIR = "../data/gpt labels/"

# Domains for which label files are loaded.
domains = [
    "ride",
    "health",
    "investing",
]

# Numeric index embedded in each label file name -> the kind of label it holds:
#   6 -> two to three words labels
#   7 -> adjective labels
#   9 -> NFR labels
indices = {
    6: "two to three words",
    7: "adjective",
    9: "NFRs ",
}

In [24]:
# 1. load the GPT label CSV file of every domain / label type into a DataFrame
import os, pandas as pd

# domain name -> list of DataFrames, one per entry of `indices`, stored in the
# iteration order of `indices` (so position 0 is index 6, 1 is 7, 2 is 9).
domains_labels_dfs = {}

for domain in domains:
    print("----domain: ", domain, "-----")
    frames = []
    for file_idx, label_kind in indices.items():
        print("fetching labels i: ", label_kind)
        csv_path = f"{GPT_LABEL_DIR}{domain}_{file_idx}_gpt_labels.csv"
        frames.append(pd.read_csv(csv_path))
    domains_labels_dfs[domain] = frames

----domain:  ride -----
fetching labels i:  two to three words
fetching labels i:  adjective
fetching labels i:  NFRs 
----domain:  health -----
fetching labels i:  two to three words
fetching labels i:  adjective
fetching labels i:  NFRs 
----domain:  investing -----
fetching labels i:  two to three words
fetching labels i:  adjective
fetching labels i:  NFRs 


In [25]:
# Preview the first rows of the first DataFrame loaded for the "ride" domain
# to inspect the columns of a label CSV.
domains_labels_dfs["ride"][0].head()

Unnamed: 0,params,input,prompt,generator choices,labels
0,{'prefix': '\nGenerate minimum of five labels ...,i am afairlynew lyftriderand iheardheard that ...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7f0336fe1860> JSON: {\n ""...",['1. Patient Drivers: Lyft drivers are excelle...
1,{'prefix': '\nGenerate minimum of five labels ...,usually cheaper than uber not cheaper than a c...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7f0336ff1d60> JSON: {\n ""...",['1. Cheaper than Uber: Lyft is usually cheape...
2,{'prefix': '\nGenerate minimum of five labels ...,well it's been a roller coaster riding with ly...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7f0336fe1b30> JSON: {\n ""...",['1. Unsafe Driver: Reckless driving and dange...
3,{'prefix': '\nGenerate minimum of five labels ...,also will it would be nice if there was an opt...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7f0336ff1f90> JSON: {\n ""...",['1. Customizable Rides: Option to change time...
4,{'prefix': '\nGenerate minimum of five labels ...,i hate via they take way to long they take you...,\nGenerate minimum of five labels (two to thre...,"[<OpenAIObject at 0x7f0336fd0590> JSON: {\n ""...",['1. Poor Customer Service: Long wait times an...


In [98]:
from collections import Counter
import ast, re
import json
from functools import reduce

def rm_punc(string):
    """Return ``string`` without punctuation.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is dropped; everything else is kept as-is.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub("", string)

def str_arr(label_str):
    """Split a comma-separated string into a list of its pieces.

    Pieces are returned untouched: surrounding whitespace is kept, and an
    empty input yields ``[""]``.
    """
    separator = ","
    pieces = label_str.split(separator)
    return pieces

# Leading enumeration marker such as "1. " or "12) ".
_ENUM_PREFIX_RE = re.compile(r'^\s*\d+\s*[.)]\s*')
# The standalone word "and" (not the letters inside "brand", "handling", ...).
_AND_WORD_RE = re.compile(r'\band\b')
# Anything that is neither a word character nor whitespace.
_NON_WORD_RE = re.compile(r'[^\w\s]')


def format_labels(labels):
    """Normalise raw ``"{n}. {label}: {description}"`` strings to bare labels.

    For every entry of ``labels``:

    1. keep the text before the first ``":"`` (or, when there is no colon,
       before the first ``"-"``);
    2. drop a leading enumeration marker (``"1."``, ``"10)"``, ...);
    3. drop the standalone lowercase word ``and``;
    4. split on commas, so one entry can yield several labels;
    5. strip punctuation, collapse whitespace and lower-case each piece.

    Args:
        labels: iterable of raw label strings.

    Returns:
        A list of cleaned, non-empty label strings in input order.
    """
    cleaned = []
    for label in labels:
        name = label
        if ":" in label:
            name = label.split(":")[0]
        elif "-" in label:
            # NOTE: a hyphenated label without a colon ("User-friendly") is cut
            # at the hyphen; kept as in the original parsing rule.
            name = label.split("-")[0]

        # Only the leading list number is removed. Stripping every digit (and
        # only 1-9) would mangle "10. X" into "0 x" and "24/7" into "/".
        name = _ENUM_PREFIX_RE.sub("", name)
        # Word-boundary match so that "brand" / "handling" stay intact.
        name = _AND_WORD_RE.sub(" ", name)

        for piece in name.split(","):
            # split()/join collapses the gap left by a removed "and" so that
            # "safety and security" always counts as the same label.
            piece = " ".join(_NON_WORD_RE.sub("", piece).lower().split())
            if piece:
                cleaned.append(piece)
    return cleaned


def format_desc(labels, top_labels):
    """Map each cleaned label to the description that follows it.

    Each raw entry is expected to look like ``"{label}: {description}"`` (or
    ``"{label} - {description}"``). The label part is cleaned with
    ``format_labels``; the description is everything after the *first*
    separator, returned verbatim (leading whitespace included).

    Args:
        labels: iterable of raw label strings.
        top_labels: unused; kept so existing callers keep working.

    Returns:
        dict of cleaned label -> description. When an entry has no separator
        the whole entry is used as its own description. Entries whose label
        part cleans to nothing are skipped. When the label part contains
        several comma-separated labels, each of them gets the description.
    """
    label_desc = {}
    for label in labels:
        names = format_labels([label])
        if not names:
            # e.g. "" or ": text" - no usable label, and indexing [0] would fail.
            continue

        # maxsplit=1 keeps descriptions that themselves contain ":" or "-".
        if ":" in label:
            desc = label.split(":", 1)[1]
        elif "-" in label:
            desc = label.split("-", 1)[1]
        else:
            desc = label

        # Every label produced by this entry is counted by the label counter,
        # so every one of them must be able to find its description.
        for name in names:
            label_desc[name] = desc
    return label_desc

def load_label_counter(df, max_num=10, print_top = True):
    """Parse the ``labels`` column of ``df`` and count cleaned labels.

    Adds two columns to ``df`` in place:

    * ``label_arr``: the ``labels`` cell parsed with ``ast.literal_eval``;
    * ``label_arr_formatted``: ``label_arr`` cleaned with ``format_labels``.

    Args:
        df: DataFrame with a ``labels`` column holding Python-literal lists
            of raw label strings.
        max_num: how many of the most frequent labels to print.
        print_top: whether to print the ``max_num`` most frequent labels.

    Returns:
        ``(df, counter)`` where ``counter`` is a ``collections.Counter`` of
        every cleaned label over all rows (empty when ``df`` has no rows).
    """
    df["label_arr"] = df["labels"].apply(lambda raw: ast.literal_eval(raw))
    df["label_arr_formatted"] = df["label_arr"].apply(lambda arr: format_labels(arr))

    # Feed rows into the counter one by one: linear time, and an empty frame
    # simply gives an empty counter instead of a failing reduce().
    counter = Counter()
    for row_labels in df["label_arr_formatted"]:
        counter.update(row_labels)

    if print_top:
        print("\ntop common labels: ", counter.most_common(max_num))
    return df, counter

def load_label_desc(df, output_file, top_n=10):
    """Attach label descriptions to ``df``, save it, and group the
    descriptions of the most frequent labels.

    Steps:

    1. count cleaned labels with ``load_label_counter`` (without printing);
    2. add a ``label_desc_formatted`` column: for each row, the dict returned
       by ``format_desc`` for the parsed ``labels`` cell;
    3. write ``df`` to ``output_file`` as CSV (no index column);
    4. for the ``top_n`` most frequent labels, collect every description
       found for them across all rows.

    Args:
        df: DataFrame with a ``labels`` column of Python-literal lists.
        output_file: path of the CSV file to write; its directory is created
            when missing.
        top_n: number of most frequent labels to keep (default 10).

    Returns:
        ``(df, descs)`` where ``descs`` maps each top label (most frequent
        first) to the list of its descriptions. Top labels for which no
        description was found are omitted.
    """
    import os  # local import so the function does not depend on another cell

    df, counter = load_label_counter(df, top_n, False)
    top_labels = [label for label, _count in counter.most_common(top_n)]
    df["label_desc_formatted"] = df["labels"].apply(
        lambda raw: format_desc(ast.literal_eval(raw), top_labels)
    )

    print("\nsaving desc to file: ", output_file, "------\n")
    out_dir = os.path.dirname(output_file)
    if out_dir:
        # to_csv does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_file, header=True, index=False)

    all_labels_dict_arr = df["label_desc_formatted"].tolist()
    if all_labels_dict_arr:
        print("\ndf desc formatted[0]: ", all_labels_dict_arr[0])

    wanted = set(top_labels)
    top_label_merged_descs = {}
    for label_desc_dict in all_labels_dict_arr:
        for label, desc in label_desc_dict.items():
            if label in wanted:
                top_label_merged_descs.setdefault(label, []).append(desc)

    # Re-key in frequency order (the order of top_labels).
    sorted_top_label_merged_descs = {
        label: top_label_merged_descs[label]
        for label in top_labels
        if label in top_label_merged_descs
    }
    print("\n\nshowing desc for top labels: \n", sorted_top_label_merged_descs)

    # NOTE: a separate "<stem>_top_only.csv" export of the merged descriptions
    # was sketched here but disabled. If it is revived, derive the stem with
    # os.path.splitext(output_file)[0]: output_file.split(".")[0] is "" for
    # relative paths that start with "..".
    return df, sorted_top_label_merged_descs

In [99]:
# TWO TO THREE WORDS LABELS

# Position of this label type inside each domain's DataFrame list. The frames
# were appended in the iteration order of `indices`, so the same position in
# `indices` gives the matching file index (6) without an `i + 6` offset.
i = 0
label_idx = list(indices)[i]

for domain in domains:
    print("\n\n-------------- DOMAIN: ", domain, "------------------------")
    print("label type: ", indices[label_idx])
    df = domains_labels_dfs[domain][i]
    # Prints the most frequent labels for this domain.
    load_label_counter(df)
    print("\n----loading descriptions for top labels: ---\n")
    desc_file = GPT_LABEL_DIR + "desc/" + domain + "_" + str(label_idx) + "_gpt_label_desc.csv"
    # Saves the frame with descriptions and prints those of the top labels.
    load_label_desc(df, desc_file)



-------------- DOMAIN:  ride ------------------------
label type:  two to three words

top common labels:  [('poor customer service', 3), ('safety concerns', 2), ('patient drivers', 1), ('best prices', 1), ('unreliable morning rides', 1), ('clean cars', 1), ('communication issues', 1), ('cheaper than uber', 1), ('more reliable than cabs', 1), ('inconsistent pricing', 1)]

----loading descriptions for top labels: ---


saving desc to file:  ../data/gpt labels/desc/ride_6_gpt_label_desc.csv ------


df desc formatted[0]:  {'patient drivers': ' Lyft drivers are excellent and very patient, even when the rider is running late.', 'best prices': ' Lyft still has the best prices per ride compared to competitors.', 'unreliable morning rides': ' Lyft can be unreliable for morning rides due to driver switching and cancellations.', 'clean cars': ' Lyft drivers have clean cars and provide great service.', 'communication issues': ' Lyft needs to improve their app for better communication and messa

In [100]:
# ADJECTIVE LABELS

# Position of this label type inside each domain's DataFrame list. The frames
# were appended in the iteration order of `indices`, so the same position in
# `indices` gives the matching file index (7) without an `i + 6` offset.
i = 1
label_idx = list(indices)[i]

for domain in domains:
    print("\n\n----domain: ", domain, "-----")
    print("label type: ", indices[label_idx])
    df = domains_labels_dfs[domain][i]
    # Prints the most frequent labels for this domain.
    load_label_counter(df)
    print("\n----loading descriptions for top labels: ---\n")
    desc_file = GPT_LABEL_DIR + "desc/" + domain + "_" + str(label_idx) + "_gpt_label_desc.csv"
    # Saves the frame with descriptions and prints those of the top labels.
    load_label_desc(df, desc_file)



----domain:  ride -----
label type:  adjective

top common labels:  [('unreliable', 7), ('inconsistent', 3), ('expensive', 3), ('convenient', 2), ('frustrating', 2), ('patient', 1), ('affordable', 1), ('helpful', 1), ('pricey', 1), ('overcharging', 1)]

----loading descriptions for top labels: ---


saving desc to file:  ../data/gpt labels/desc/ride_7_gpt_label_desc.csv ------


df desc formatted[0]:  {'patient': ' Drivers are patient and understanding of delays.', 'inconsistent': ' App can switch drivers mid-trip and cause confusion.', 'affordable': ' Prices are competitive and reasonable.', 'helpful': ' Drivers are helpful and accommodating.', 'unreliable': ' App can fail to notify of driver cancellations, causing issues for riders.'}


showing desc for top labels: 
 {'unreliable': [' App can fail to notify of driver cancellations, causing issues for riders.', ' Inconsistent and unpredictable service quality', ' App did not work as expected and caused inconvenience.', " App's servi

In [101]:
# NFR LABELS

# Position of this label type inside each domain's DataFrame list. The frames
# were appended in the iteration order of `indices`, so the same position in
# `indices` gives the matching file index (9) instead of a hard-coded literal.
i = 2
label_idx = list(indices)[i]

for domain in domains:
    print("\n\n----domain: ", domain, "-----")
    print("label type: ", indices[label_idx])
    df = domains_labels_dfs[domain][i]
    # Prints the most frequent labels for this domain.
    load_label_counter(df)
    print("\n----loading descriptions for top labels: ---\n")
    desc_file = GPT_LABEL_DIR + "desc/" + domain + "_" + str(label_idx) + "_gpt_label_desc.csv"
    # Saves the frame with descriptions and prints those of the top labels.
    load_label_desc(df, desc_file)



----domain:  ride -----
label type:  NFRs 

top common labels:  [('reliability', 8), ('security', 5), ('customer support', 5), ('usability', 2), ('accessibility', 2), ('payment', 2), ('user experience', 2), ('transparency', 2), ('customer service', 2), ('performance', 1)]

----loading descriptions for top labels: ---


saving desc to file:  ../data/gpt labels/desc/ride_9_gpt_label_desc.csv ------


df desc formatted[0]:  {'reliability': ' The app should ensure that assigned drivers show up on time and do not cancel rides unexpectedly.', 'usability': ' The app should have clear and easy-to-use communication features for riders to contact drivers and vice versa.', 'accessibility': ' The app should be accessible to riders with disabilities, and drivers should be patient and accommodating towards them.', 'performance': ' The app should have fast response times and minimize wait times for riders.', 'security': ' The app should ensure the safety of riders by verifying driver identities and