## Load reviews

In [None]:
%run lib.ipynb import *

In [35]:
# Apps under study, grouped by domain: each domain name maps to its app names.
domains_apps = dict(
    ridehailing=["uber", "lyft"],
    dating=["tinder", "bumble"],
    investing=["robinhood", "acorn"],
    mentalhealth=["calm", "headspace"],
)

In [36]:
# Load each app's raw review CSV into a frame keyed "<domain>_<app>" and
# record (domain, app, number of rows) for the overview table.
domains_apps_df = {}
domains_apps_df_stats = []

domain_app_pairs = [
    (domain, app)
    for domain, apps in domains_apps.items()
    for app in apps
]

for domain, app in domain_app_pairs:
    input_file = f"./data/reviews/raw/{domain}/{app}.csv"
    df = pd.read_csv(input_file)
    domains_apps_df["_".join((domain, app))] = df
    domains_apps_df_stats.append((domain, app, len(df)))

In [37]:
# Tabulate how many raw reviews were loaded per app.
pd.DataFrame.from_records(domains_apps_df_stats, columns=["domain", "app", "count"])

Unnamed: 0,domain,app,count
0,ridehailing,uber,5018
1,ridehailing,lyft,8662
2,dating,tinder,5004
3,dating,bumble,3245
4,investing,robinhood,5009
5,investing,acorn,3843
6,mentalhealth,calm,5004
7,mentalhealth,headspace,4940


## Apply preprocessing at review/sentence level

In [48]:
# Language filter settings: the minimum detection score a text must reach,
# and the target language code.
lng_threshold = 0.85
lng = "en"

def check_lng_label(row, lng=("en",)):
    """Flag a row as English based on its stored "english score" label.

    The "english score" entry is expected to be a "<language>,<score>" string
    (the format ``preprocess`` writes). The row is marked English when the
    language is one of ``lng`` and the score is at least the module-level
    ``lng_threshold``.

    Args:
        row: Mapping-like record (e.g. a pandas Series) holding an
            "english score" entry.
        lng: Accepted language code(s): a single string or a container of
            strings.

    Returns:
        The same ``row`` with its "isEnglish" entry set to True or False.

    Raises:
        KeyError: If ``row`` has no "english score" entry.
    """
    # A bare string would otherwise be matched by substring ("e" in "en").
    accepted = (lng,) if isinstance(lng, str) else lng

    label = row["english score"]
    row["isEnglish"] = False

    # Non-string labels (NaN, or the False placeholder preprocess writes for
    # missing text) are left as non-English.
    if not isinstance(label, str):
        return row

    parts = label.split(",")
    try:
        lng_check = parts[0] in accepted
        score_check = float(parts[1]) >= lng_threshold
    except (IndexError, ValueError) as e:
        # Malformed label (missing or non-numeric score). `.get` keeps the
        # report itself from failing when the descriptive columns are absent.
        print("error lng", row.get("app"), row.get("uuid"), row.get("title"), row.get("review"), e)
        return row

    row["isEnglish"] = lng_check and score_check
    return row

def check_empty(arr):
    """Return the items of ``arr`` that have a non-zero length, in order."""
    return list(filter(len, arr))

In [54]:
def preprocess(row, col="review", show_logs=False):
    """Clean, tokenize and lemmatize ``row[col]`` and label its language.

    Output entries are written back onto ``row``. For ``col == "review"`` the
    entry names are unprefixed; for any other column they are prefixed with
    "<col>_" so review-level and sentence-level results can coexist.

    Entries written (token lists are stored as comma-joined strings):
        cleaned / cleaned length: tokens after cleaning, punctuation removal
            and non-alphabetic removal.
        tokenized / tokenized length: cleaned tokens minus stop words.
        lemma / lemma length: lemmatized tokens.
        english score: "<language>,<score>" from ``check_lang`` (False when
            the text is missing).
        isEnglish: True when the detected language is "en" and the score is
            at least ``lng_threshold``.
        <col>_uuid: a fresh UUID, only for non-review columns that lack one.

    Args:
        row: Record (e.g. a pandas Series) containing ``col``.
        col: Name of the text entry to process.
        show_logs: When True, print the detection result and token lists.

    Returns:
        The same ``row`` with the entries above set. Rows whose text is
        null keep the default values.
    """
    text = row[col]
    # Distinguish review cleaning from sentence cleaning.
    prefix = "" if col == "review" else f"{col}_"

    # Defaults, so rows with missing text still carry every output entry.
    row[prefix + "cleaned"] = ""
    row[prefix + "tokenized"] = ""
    row[prefix + "lemma"] = ""
    row[prefix + "cleaned length"] = 0
    row[prefix + "tokenized length"] = 0
    row[prefix + "lemma length"] = 0
    row[prefix + "english score"] = False

    uuid_col = prefix + "uuid"
    if uuid_col not in row and col != "review":
        row[uuid_col] = str(uuid.uuid4())

    # Previously only set for non-null text, which left null-text rows
    # without this entry.
    row[prefix + "isEnglish"] = False

    if pd.isnull(text):
        return row

    # Arguments: text, language, should check threshold, threshold.
    detected = check_lang(text, "en", True, lng_threshold)
    detected_lng = detected["language"]
    score = detected["score"]

    if show_logs:
        print(detected_lng, score, text)
    row[prefix + "english score"] = ",".join([detected_lng, str(score)])
    row[prefix + "isEnglish"] = detected_lng == "en" and score >= lng_threshold

    tokens = nltk_tokenize(clean_text(text.lower()))
    tokens = remove_punctuation(tokens)
    tokens = remove_non_alphabetic(tokens)
    row[prefix + "cleaned"] = ",".join(tokens)
    row[prefix + "cleaned length"] = len(tokens)

    tokens = check_empty(nltk_remove_stopwords(tokens, custom_stop_words))
    row[prefix + "tokenized"] = ",".join(tokens)
    row[prefix + "tokenized length"] = len(tokens)

    lemma = check_empty(nltk_lemmatize_post_tag_rev_words(tokens).split(","))

    if show_logs:
        print("cleaned: ", tokens, "\nlemma", lemma)

    row[prefix + "lemma"] = ",".join(lemma)
    row[prefix + "lemma length"] = len(lemma)
    return row

In [85]:
def break_into_sentences(df, review_index, show_logs=False):
    """Expand a review-level frame into one row per sentence.

    Each review is split with ``nltk.sent_tokenize``. Every resulting row
    repeats all of the review's original column values and adds the sentence
    text in a new "sent" column. Reviews whose text is null produce no rows.

    Args:
        df: Review-level DataFrame.
        review_index: Positional index of the review-text column in ``df``.
        show_logs: When True, print each review before it is split.

    Returns:
        A new DataFrame with the columns of ``df`` followed by "sent".
    """
    app_sents = []

    for record in df.to_records(index=False):
        review_text = record[review_index]
        if pd.isnull(review_text):
            continue

        if show_logs:
            print(review_text)

        # All columns of the review row, followed by the sentence itself.
        review_values = list(record)
        app_sents.extend(
            review_values + [sent] for sent in nltk.sent_tokenize(review_text)
        )

    return pd.DataFrame(app_sents, columns=[*df.columns, "sent"])

In [None]:
# Review-level preprocessing: run preprocess over every row of each app's
# frame and store the enriched frame back under the same key.
for key, df in domains_apps_df.items():
    domains_apps_df[key] = df = df.apply(preprocess, axis=1)
    # df.to_csv(f"./data/{key}.csv", index=False, header=True)

## Apply relevant filters 

- **recency**: reviews posted after 2020
- **language**: reviews must be in English language
- **review length**: reviews' lemmatized length must be >= 10 words

### 1. Filter reviews posted in or before 2020 from analysis

In [None]:
# Keep only the reviews whose posting year is later than 2020.
for key, df in domains_apps_df.items():
    df["date"] = pd.to_datetime(df["date"])
    posted_after_2020 = df["date"].dt.year > 2020
    df = df.loc[posted_after_2020]
    domains_apps_df[key] = df
    print(key, len(df))

### 2. Filter reviews with language label other than English 
### 3. Filter reviews with length < 10 words from analysis

In [None]:
# Keep only reviews labelled English whose lemmatized length is at least 10.
for key, df in domains_apps_df.items():
    is_english = df["isEnglish"] == True
    long_enough = df["lemma length"] >= 10
    df = df[is_english & long_enough]
    domains_apps_df[key] = df
    print(key, len(df))

## Sample size for summary generation

In [None]:
# One record per app: domain, app name, total reviews, then the number of
# reviews for each star rating 1..5.
apps_count_stats = []

for key, df in domains_apps_df.items():
    domain_app = key.split("_")
    per_rating_counts = [len(df[df["rating"] == rate]) for rate in range(1, 6)]
    apps_count_stats.append(
        [domain_app[0], domain_app[1], len(df), *per_rating_counts]
    )

apps_count_stats_df = pd.DataFrame(
    apps_count_stats,
    columns=["domain", "app", "total", "#1", "#2", "#3", "#4", "#5"],
)
apps_count_stats_df

In [None]:
# Sample budget per app, split across star ratings by sample_size_rating.
#
# Original note:
#   sample size: 355
#   Margin of error: 5.24%
# NOTE(review): the note says 355 but the value used is 353 — confirm which
# one is intended.
total_samples = 353

def sample_size_rating(row):
    """Split the sample budget across star ratings in proportion to their counts.

    For each rating 1..5 the share ``row["#<rating>"] / row["total"]`` of the
    module-level ``total_samples`` is truncated to an integer and stored in
    ``row["#<rating>_samples"]``. Because of the truncation, the sum stored
    in ``row["#samples"]`` can be lower than ``total_samples``.

    Args:
        row: Record with "total" and "#1".."#5" count entries.

    Returns:
        The same ``row`` with "#<rating>_samples" and "#samples" added.
    """
    total = row["total"]
    samples = 0
    for rating in range(1, 6):
        rating_count = row[f"#{rating}"]
        # An app with no reviews gets no samples instead of dividing by zero.
        rate_size = int(total_samples * float(rating_count / total)) if total else 0
        row[f"#{rating}_samples"] = rate_size
        samples += rate_size
    row["#samples"] = samples
    return row

# Add the per-rating sample sizes to every app's count record.
count_stats = apps_count_stats_df.apply(sample_size_rating, axis=1)
count_stats

## Compute TF-IDF to estimate a relevance score for the reviews

In [89]:
# matrix (corpus_size, num_features) : term frequency of unique words in each document
# df also known as document frequency (num_features) : counts number of doc that contains the given word

import numpy as np

class TfidfTransformer:
    """Scale a term matrix and weight it by a log inverse document frequency."""

    def __init__(self, num_unique_words, corpus_size):
        """Store the two corpus-level constants used by ``transform``.

        Args:
            num_unique_words: Divisor applied to the term matrix.
            corpus_size: Numerator of the inverse-document-frequency ratio.
        """
        self.corpus_size = corpus_size
        self.num_unique_words = num_unique_words

    def transform(self, matrix, df):
        """Return ``(matrix / num_unique_words) * log(corpus_size / df)``.

        Args:
            matrix: Term matrix (corpus_size, num_features).
            df: Document frequency per feature (num_features).

        Returns:
            The product of the scaled matrix and the log-IDF vector.
        """
        # NOTE(review): the meaning of `*` depends on the type of `matrix`
        # (element-wise for ndarrays; presumably a matrix-vector product
        # for SciPy sparse matrices) — confirm against the caller.
        inverse_doc_freq = np.log(self.corpus_size / df)
        scaled_tf = matrix / self.num_unique_words
        return scaled_tf * inverse_doc_freq
        
def compute_tfidf(uuids_sents, num_reviews=None):
    """Score sentences with the hybrid TF-IDF weighting.

    The sentences are vectorized with ``TfidfVectorizer(norm=None)`` and the
    resulting matrix is re-weighted by ``TfidfTransformer`` using:
      - corpus_size: the number of space-separated words over all sentences;
      - num_unique_words: the total count of non-zero entries in the matrix;
      - doc_freq: the number of sentences containing each feature.

    Args:
        uuids_sents: Sequence of ``(sentence_id, sentence_text)`` pairs, where
            the text is a space-separated string of words.
        num_reviews: Unused; kept so existing callers keep working.

    Returns:
        A numpy array of scores in the order of ``uuids_sents``. When there
        are no sentences, or every sentence is blank, an array of zeros with
        one entry per input sentence is returned.
    """
    sentences = [item[1] for item in uuids_sents]

    # Nothing to score. This avoids fitting the vectorizer on empty text and
    # the old fall-through that returned None.
    if not any(sentence.strip() for sentence in sentences):
        return np.zeros(len(sentences))

    # Do not apply L2 normalization to the TF-IDF scores.
    vectorizer = TfidfVectorizer(norm=None)
    tfidf_matrix = vectorizer.fit_transform(sentences)

    corpus_size = len(" ".join(sentences).split(" "))  # number of words
    num_unique_words = tfidf_matrix.getnnz(axis=1).sum()
    doc_freq = np.array((tfidf_matrix != 0).sum(axis=0)).flatten()

    hybrid_transformer = TfidfTransformer(num_unique_words, corpus_size)
    # NOTE(review): with the sparse matrix returned by the vectorizer this is
    # presumably one score per sentence — confirm.
    sentence_scores = hybrid_transformer.transform(tfidf_matrix, doc_freq)
    return np.array(sentence_scores)

In [90]:
# compute avg score of review from its sentences

def assign_zero_score(row):
    """Replace a null "sent_hybrid tfidf score" on ``row`` with 0 and return the row."""
    score_col = "sent_hybrid tfidf score"
    if pd.isnull(row[score_col]):
        row[score_col] = 0
    return row

def avg_sents_scores(review_id, _df):
    """Return the mean "sent_hybrid tfidf score" over the rows of ``_df`` whose "uuid" equals ``review_id``."""
    belongs_to_review = _df["uuid"] == review_id
    review_scores = _df.loc[belongs_to_review, "sent_hybrid tfidf score"]
    return np.mean(review_scores)

In [91]:
# Sentence-level frames, one per "<domain>_<app>" key:
#   - reviews are split into sentences and stored as a separate dataframe;
#   - preprocessing is applied and a TF-IDF score is computed per sentence;
#   - the sentence-level scores are averaged into a review-level TF-IDF score.
domains_apps_sents_df = dict()

In [None]:
for key, reviews_df in domains_apps_df.items():
    # Split the reviews (text taken from column position 1) into one row per
    # sentence, then run the sentence-level preprocessing on the "sent" column.
    df = break_into_sentences(reviews_df, 1).apply(preprocess, axis=1, args=("sent",))
    domains_apps_sents_df[key] = df

In [None]:
# Score every preprocessed sentence with the hybrid TF-IDF weighting and keep
# the scores, the scored input and the annotated frame per app.
tfidf_sents_scores = {}

for key, sents_df in domains_apps_sents_df.items():
    # The frames in domains_apps_sents_df are already split into sentences and
    # preprocessed. Splitting and preprocessing them again would append a
    # second "sent" column and duplicate rows. Work on a copy so the stored
    # frame is left untouched.
    df = sents_df.copy()

    # (sentence id, space-separated lemmas) for every sentence with lemmas.
    spaced_non_empty_sents = [
        (sent_id, sent.replace(",", " "))
        for sent_id, sent in zip(df["sent_uuid"].tolist(), df["sent_lemma"].tolist())
        if isinstance(sent, str) and sent
    ]

    if len(df) != len(spaced_non_empty_sents):
        print("sentences filtered umatched: ", len(df), len(spaced_non_empty_sents))

    if spaced_non_empty_sents:
        sentence_scores = compute_tfidf(spaced_non_empty_sents)
    else:
        # Nothing to score for this app.
        sentence_scores = np.array([])

    # Attach scores through a single id lookup instead of rescanning the whole
    # frame for every sentence. Sentences without lemmas get NaN.
    score_by_id = {
        item[0]: score for item, score in zip(spaced_non_empty_sents, sentence_scores)
    }
    sent_by_id = dict(spaced_non_empty_sents)
    df["sent_hybrid tfidf score"] = df["sent_uuid"].map(score_by_id)
    df["sent_hybrid tfidf score sentence"] = df["sent_uuid"].map(sent_by_id)

    tfidf_sents_scores[key] = {
        "scores": sentence_scores,
        "input": spaced_non_empty_sents,
        "df": df,
    }

In [None]:
# For every app, average the sentence scores per review and rank the
# sentence rows of each star rating by that review-level average.
apps_ratings_scores = {}

for app, items in tfidf_sents_scores.items():
    # Sentences that received no score count as 0 in the review average.
    df = items["df"].apply(assign_zero_score, axis=1)

    for rev_id in set(df["uuid"].tolist()):
        avg_score = avg_sents_scores(rev_id, df)
        df.loc[df["uuid"] == rev_id, "sent_avg_hybrid tfidf score"] = avg_score

    # Key by this loop's `app`. The previous code used `key`, a variable left
    # over from an earlier cell, so every app overwrote the same entry.
    ratings_scores = apps_ratings_scores.setdefault(app, {})

    for rate in range(1, 6):
        rate_df = df[df["rating"] == rate]
        rate_df = rate_df.sort_values("sent_avg_hybrid tfidf score", ascending=False)
        ratings_scores[rate] = rate_df
        print(app, rate, len(rate_df.groupby("uuid")))