### Load the sample review dataset and extract the top five features (by review rate) that we manually identified in each domain

In [1]:
%run lib.ipynb import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is

In [20]:
# Domains under study; every per-domain container below is keyed by these names.
domains = ["ride", "health", "investing"]

# Directory holding one "<domain>_reviews.csv" file per domain.
SAMPLE_DATA_DIR = "../data/sample reviews/"

# Names of the two annotation columns that carry comma-separated feature labels.
genres = ["Positive genre", "Negative genre"]

# domain -> DataFrame of sampled reviews (filled by load_reviews).
domains_reviews = {}
# domain -> list of distinct feature labels (filled by generate_unique_genres).
unique_genres = {domain: [] for domain in domains}
# domain -> {feature label: number of reviews} (filled by extract_top_categories).
domains_categories = {domain: {} for domain in domains}
# domain -> summary DataFrame (filled by extract_top_categories).
domains_categories_top_percent = {}

In [21]:
# import sampled reviews

def load_reviews():
    """Read each domain's sampled-review CSV into ``domains_reviews``.

    For every name in ``domains`` the file
    ``SAMPLE_DATA_DIR + "<domain>_reviews.csv"`` is read (ISO-8859-1 encoded)
    and only the six columns listed below are kept. The resulting DataFrame
    is stored under ``domains_reviews[domain]``; nothing is returned.
    """
    wanted_columns = ["Domain", "Name", "Title", "Review", "Positive genre", "Negative genre"]
    for domain in domains:
        csv_path = f"{SAMPLE_DATA_DIR}{domain}_reviews.csv"
        domains_reviews[domain] = pd.read_csv(
            csv_path,
            encoding='iso-8859-1',
            usecols=wanted_columns,
        )
    
# Populate domains_reviews for every configured domain.
load_reviews()

In [22]:
# Report how many sampled reviews were loaded for each domain.
for domain in domains:
    print(domain, len(domains_reviews[domain]))

ride 500
health 467
investing 466


In [23]:
# Preview the first rows of the "ride" domain reviews.
domains_reviews["ride"].head()

Unnamed: 0,Domain,Name,Title,Review,Positive genre,Negative genre
0,RideHailing,via-low-cost-ride-sharing,Great music!,great driver ! great choice of music !,,
1,RideHailing,rapido-indias-bike-taxi,Insensitive,some drivers are good but some are insensitive...,,ride experience
2,RideHailing,ola-cabs,Harassers,they keep calling you to make the payments for...,,payment reliability
3,RideHailing,lyft,Slower service,"lately takes forever to get a ride if at all, ...",,"app reliability, affordable, ride experience"
4,RideHailing,bolt-fast-affordable-rides,Deductions,i was debited unaware and yet to be reimbursed...,,payment reliability


In [24]:
# Preview the first rows of the "investing" domain reviews.
domains_reviews["investing"].head()

Unnamed: 0,Domain,Name,Title,Review,Positive genre,Negative genre
0,Investing,gemini-buy-bitcoin-crypto,Customer service,where is the support people? i've been trying ...,,customer service
1,Investing,marketsim,App's Currently Broken,"currently, the stock ticker search function is...",,usability
2,Investing,robinhood-investing-for-all,Some Nice Features But Traditional Brokerage C...,the good:\n1. recurring investments in stocks ...,usability,reliability
3,Investing,robinhood-investing-for-all,Great app but not sure about the Company (BAD ...,first things first bad customer support! \n\ng...,usability,"customer service, security, reliability"
4,Investing,stash-invest-build-wealth,Hate this app so much,"not a reliable app, wat too much info that the...",,reliability


In [25]:
# Preview the first rows of the "health" domain reviews.
domains_reviews["health"].head()

Unnamed: 0,Domain,Name,Title,Review,Positive genre,Negative genre
0,Mental Health,aura-meditation-sleep,Great app,really enjoy all of the features within this app!,,
1,Mental Health,betterhelp-online-counseling,Off to a good start,i just had my first live session and the app m...,"professional counseling, reliable",
2,Mental Health,betterhelp-online-counseling,Helpful and Affordable,i don't know what else to say. it has been hel...,"professional counseling, usable, accessible",
3,Mental Health,ibreathe-relax-and-breathe,Recommend,it's easy to use and practical,usable,
4,Mental Health,woebot-your-self-care-expert,Grateful for this Woebot,i was curious about cbt and i found this app o...,effective,


In [26]:
# store the unique features identified in each domain
# IMPORTANT: we later renamed some features into different names
# (without affecting the underlying semantic meaning) 
# for example
# effective -> helpfulness
# affordable -> affordability
# app reliability -> reliability

def generate_unique_genres():
    """Collect the distinct feature labels annotated in each domain.

    For every domain and every column named in ``genres``, each non-null
    cell is split on commas; every piece has all characters other than
    ASCII letters and spaces removed and is then stripped of surrounding
    whitespace. Non-empty labels not yet recorded are appended, in
    first-seen order, to ``unique_genres[domain]``. Nothing is returned.
    """
    for domain in domains:
        seen_labels = unique_genres[domain]  # same list object, extended in place
        frame = domains_reviews[domain]
        for genre in genres:
            for cell in frame[genre].dropna().unique():
                for raw_label in cell.split(","):
                    # remove punctuation (anything but letters/spaces), then trim
                    label = re.sub(r'[^a-zA-Z ]', '', raw_label).strip()
                    if len(label) > 0 and label not in seen_labels:
                        seen_labels.append(label)
# Populate unique_genres from the loaded reviews.
generate_unique_genres()

In [27]:
# Show, per domain, how many distinct feature labels were found and what they are.
for domain in domains:
    categories = unique_genres[domain]
    print("\ndomain: ", domain, "\n", len(categories), categories)


domain:  ride 
 13 ['usable', 'ride experience', 'app reliability', 'affordable', 'customer service', 'service availability', 'income', 'personal safety', 'payment reliability', 'app availability', 'fraud', 'data privacy', 'security']

domain:  health 
 11 ['professional counseling', 'reliable', 'usable', 'accessible', 'effective', 'professional therapy', 'affordable', 'privacy', 'inclusivity', 'customer service', 'fraud']

domain:  investing 
 14 ['usability', 'reliability', 'customer service', 'affordable', 'profitability', 'security', 'customizability', 'performance', 'fradulent activity', 'compatibility', 'data safety', 'service availability', 'accessibility', 'environmental safety']


In [28]:
# extract top five categories in each domain, compute their percentage 

def _split_genre_labels(cell):
    """Return the set of cleaned feature labels contained in one genre cell.

    A cell is a comma-separated string of labels. Each piece has every
    character other than ASCII letters and spaces removed and is stripped,
    mirroring the cleaning done in ``generate_unique_genres``. Non-string
    cells (e.g. missing values) yield an empty set.
    """
    if not isinstance(cell, str):
        return set()
    cleaned = (re.sub(r'[^a-zA-Z ]', '', part).strip() for part in cell.split(","))
    return {label for label in cleaned if label}


def extract_top_categories(save_to_file = True, top_n = 5):
    """Count reviews per feature label and summarise the most frequent ones.

    For each domain in ``domains``:

    * every label in ``unique_genres[domain]`` is matched against the labels
      found in the "Positive genre" and "Negative genre" columns of
      ``domains_reviews[domain]``; a review counts for a label when either
      column lists exactly that label (after the same cleaning), so a label
      such as "reliability" no longer also counts "app reliability" rows;
    * ``domains_categories[domain]`` is set to ``{label: review count}`` for
      all labels with at least one review, sorted by count, descending;
    * ``domains_categories_top_percent[domain]`` is set to a DataFrame with
      columns ``category``, ``num reviews`` and ``percent`` (share of the
      domain's reviews) holding the ``top_n`` most frequent labels.

    Parameters
    ----------
    save_to_file : bool, default True
        When true, the reviews matching each label are written to
        ``output_dir + domain + "/" + domain + "_" + label + ".csv"``.
        The target directory is not created here.
    top_n : int or None, default 5
        Number of labels kept in the summary frame; ``None`` keeps all.

    Returns
    -------
    None
        Results are stored in the module-level dictionaries named above.
    """
    for domain in domains:
        reviews_df = domains_reviews[domain]
        domain_size = len(reviews_df)

        # Tokenise each review's annotations once, not once per category.
        row_labels = [
            _split_genre_labels(positive) | _split_genre_labels(negative)
            for positive, negative in zip(reviews_df["Positive genre"], reviews_df["Negative genre"])
        ]

        counts = {}
        for category in unique_genres[domain]:
            category = re.sub(r'[^a-zA-Z ]', '', category).strip()
            if not category:
                continue
            # Exact label membership instead of a regex substring search.
            mask = pd.Series(
                [category in labels for labels in row_labels],
                index=reviews_df.index,
                dtype=bool,
            )
            reviews_df_filtered = reviews_df[mask]
            if len(reviews_df_filtered) > 0:
                counts[category] = len(reviews_df_filtered)
                if save_to_file:
                    # Build the path only when it is needed.
                    output = output_dir + domain + "/" + domain + "_" + category + ".csv"
                    reviews_df_filtered.to_csv(output, header=True, index=False)

        # sorted() is stable, so ties keep their first-seen order.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        domains_categories[domain] = dict(ranked)

        # Slicing with None keeps every category.
        data = [
            (category, category_size, 100 * (category_size / domain_size))
            for category, category_size in ranked[:top_n]
        ]
        domains_categories_top_percent[domain] = pd.DataFrame(
            data, columns=["category", "num reviews", "percent"]
        )
            
# Compute the per-category counts without writing the per-category CSV files (save_to_file=False).
extract_top_categories(False)