## Imports

In [None]:
%run lib.ipynb import *

In [2]:
import numpy as np, pandas as pd, json
import nltk
import pandas as pd
import matplotlib.pyplot as plt, re

## Simple text preprocessing

In [3]:
def check_empty(arr):
    return [item for item in arr if len(item) > 0]
    
def preprocess(text, apps, should_ignore=False, should_stopwords_rm = False, should_lemma = False):
    cleaned_text = re.sub(r"'(\w)", "", text)
    # remove noise terms like html, nan, emojis
    cleaned_text = clean_text(cleaned_text.lower()) 
    cleaned_text = re.sub("-", " ", cleaned_text)
    tokens = nltk_tokenize(cleaned_text)
    tokens = remove_punctuation(tokens)
    tokens = remove_non_alphabetic(tokens)
    tokens = [item for item in tokens if len(item.strip()) > 0]
    return tokens

## Entity Extraction from labelled summaries

In [4]:
def extract_annotated_labels(row, col="Denser_Summary", pos_label="pos", neg_label="neg", neu_label="neutral"):
    label_json = json.loads(row["label"])
    summary = row[col]
    pos_ents = []
    neg_ents = []
    neu_ents = []
    app = row["apps"]
    for item in label_json:
        ent_text = item["text"]
        ent_ann_labels = item["labels"]
        if pos_label in ent_ann_labels: # positive
            pos_ents.append(ent_text)
        elif neg_label in ent_ann_labels: # negative
            neg_ents.append(ent_text)
        elif neu_label in ent_ann_labels:
            neu_ents.append(ent_text)

    tokens = preprocess(summary, [])
    print(f"\n\n--------------{app}:--------------")
    print("positive ents: ", pos_ents)
    print("negative ents: ", neg_ents)
    print("neutral ents: ", neu_ents)
    
    total_tokens = len(tokens)
    row["#raw_tokens"] = len(nltk.word_tokenize(summary))
    row["#tokens"] = total_tokens
    row["pos_ents"] = ";".join(pos_ents)
    row["#pos_ents"] = len(pos_ents)
    row["neg_ents"] = ";".join(neg_ents)
    row["#neg_ents"] = len(neg_ents)
    row["neu_ents"] = ";".join(neu_ents)
    row["#neu_ents"] = len(neu_ents)
    total_ents = len(pos_ents) + len(neg_ents) + len(neu_ents)
    row["%pos_ents"] = float(len(pos_ents) / total_ents)
    row["%neg_ents"] = float(len(neg_ents) / total_ents)
    row["%neu_ents"] = float(len(neu_ents) / total_ents)
    row["%pos_neg_ents"] = float(len(pos_ents) / len(neg_ents))
    row["entity density"] = float(total_ents / total_tokens)
    row["#entities"] = total_ents
    return row

## Compute entity quantity & entity density in CoD summaries

In [None]:
annotated_dir = "../data/summaries"
annotated_filenames = ["all_apps_codr.csv", 
                    "all_apps_cod.csv"
            ]

cols = ["apps", "Iteration turn", "Denser_Summary", "Missing_Entities", 
        "pos_ents", "neg_ents", "neu_ents", "#pos_ents", "#neg_ents", "#neu_ents",
       "#entities", "entity density", "#raw_tokens", "#tokens"]

cod_df_reviews = pd.read_csv(f"{annotated_dir}/{annotated_filenames[0]}") 
cod_df_reviews.rename(columns={"app": "apps"}, inplace=True)
cod_df_reviews = cod_df_reviews.apply(lambda row: extract_annotated_labels(row), axis=1)
cod_df_reviews = cod_df_reviews[cols]
print("CoD_R #rows: ", len(cod_df_reviews))

cod_df = pd.read_csv(f"{annotated_dir}/{annotated_filenames[1]}") 
cod_df.rename(columns={"app": "apps"}, inplace=True)
cod_df = cod_df.apply(lambda row: extract_annotated_labels(row, "Denser_Summary", "positive", "negative"), axis=1)
cod_df = cod_df[cols]
print("CoD #rows: ", len(cod_df))

In [44]:
# cod_df_reviews.to_csv(f"./data/summaries/{annotated_filenames[0]}", index=False, header=True)
# cod_df.to_csv(f"./data/summaries/{annotated_filenames[1]}", index=False, header=True)

In [36]:
cod_df.head(3)

Unnamed: 0,apps,Iteration turn,Denser_Summary,Missing_Entities,pos_ents,neg_ents,neu_ents,#pos_ents,#neg_ents,#neu_ents,#entities,entity density,#raw_tokens,#tokens
0,bumble,1,The app in question seems to have a variety of...,safety concerns;photo verification issues;expe...,beneficial for creating meaningful connections,"photo verification issues,;cost associated wit...",,1,3,0,4,0.034188,132,117
1,bumble,2,Users are encountering safety concerns and exp...,limited swipes;fake profiles;gender options,success in connecting with others,safety concerns;expensive subscriptions;limite...,,1,6,0,7,0.06422,124,109
2,bumble,3,The app's user base reports persistent safety ...,unresponsive matches;unclear guidelines;non-bi...,,safety concerns;expensive subscription model;f...,,0,9,0,9,0.092784,111,97


In [37]:
cod_df_reviews.head(3)

Unnamed: 0,apps,Iteration turn,Denser_Summary,Missing_Entities,pos_ents,neg_ents,neu_ents,#pos_ents,#neg_ents,#neu_ents,#entities,entity density,#raw_tokens,#tokens
0,bumble,1,"This application, known as Bumble, has been th...",expensive subscription;photo moderation;fake p...,,cost associated with the subscription model;mo...,,0,3,0,3,0.029703,114,101
1,bumble,2,"Bumble's users report mixed experiences, highl...",gender options;limited swipes;customer service,,costly subscription;photo moderation;limited n...,,0,5,0,5,0.060976,94,82
2,bumble,3,Bumble's user experience is marred by an expen...,match algorithm;notification spam,,subscription fee;photo moderation;restrictive ...,,0,7,0,7,0.0875,91,80


In [38]:
view_cols = ["apps", "entity density", "#entities",  "#raw_tokens", "#tokens"]
cod_df[cod_df["apps"] == "bumble"][view_cols].head()

Unnamed: 0,apps,entity density,#entities,#raw_tokens,#tokens
0,bumble,0.034188,4,132,117
1,bumble,0.06422,7,124,109
2,bumble,0.092784,9,111,97
3,bumble,0.11,11,112,100
4,bumble,0.122642,13,119,106


In [39]:
cod_df_reviews[cod_df_reviews["apps"] == "bumble"][view_cols].head()

Unnamed: 0,apps,entity density,#entities,#raw_tokens,#tokens
0,bumble,0.029703,3,114,101
1,bumble,0.060976,5,94,82
2,bumble,0.0875,7,91,80
3,bumble,0.104651,9,98,86
4,bumble,0.153846,12,91,78


In [40]:
apps = ["uber", "lyft", "tinder", "bumble", "robinhood", "acorn", "calm", "headspace"]

def format_ents_view(apps_cod_df, apps_codr_df, itr_col = "Iteration turn", ents_cols = "#entities"):
    apps_ents_stats = []

    for app in apps:
        app_ents_stats = [app]
        cod_df = apps_cod_df[apps_cod_df["apps"] == app]
        codr_df = apps_codr_df[apps_codr_df["apps"] == app]
        for itr in range(1, 6):
            df = cod_df[cod_df[itr_col] == itr]
            app_ents_stats.append(df[ents_cols].tolist()[0])
        for itr in range(1, 6):
            df = codr_df[codr_df[itr_col] == itr]
            app_ents_stats.append(df[ents_cols].tolist()[0])
        apps_ents_stats.append(app_ents_stats)
    
    
    cols = ["app"]
    cols.extend(["cod_"+str(i) for i in range(1, 6)])
    cols.extend(["cod_r_"+str(i) for i in range(1, 6)])
    apps_ents_stats_df = pd.DataFrame(apps_ents_stats, columns=cols)
    return apps_ents_stats_df

In [41]:
format_ents_view(cod_df, cod_df_reviews)

Unnamed: 0,app,cod_1,cod_2,cod_3,cod_4,cod_5,cod_r_1,cod_r_2,cod_r_3,cod_r_4,cod_r_5
0,uber,3,6,9,13,15,4,9,11,12,15
1,lyft,5,7,12,12,14,5,10,10,11,11
2,tinder,5,7,9,6,8,5,9,12,13,13
3,bumble,4,7,9,11,13,3,5,7,9,12
4,robinhood,3,4,5,4,2,6,8,10,12,14
5,acorn,5,6,5,8,6,9,9,10,11,11
6,calm,4,6,6,7,8,5,9,10,10,8
7,headspace,3,3,5,6,7,6,7,9,10,10


In [42]:
format_ents_view(cod_df, cod_df_reviews, itr_col = "Iteration turn", ents_cols = "entity density")

Unnamed: 0,app,cod_1,cod_2,cod_3,cod_4,cod_5,cod_r_1,cod_r_2,cod_r_3,cod_r_4,cod_r_5
0,uber,0.034091,0.077922,0.111111,0.164557,0.182927,0.037383,0.102273,0.146667,0.164384,0.192308
1,lyft,0.037037,0.063636,0.111111,0.108108,0.118644,0.04902,0.111111,0.123457,0.139241,0.1375
2,tinder,0.04902,0.093333,0.121622,0.082192,0.111111,0.05618,0.101124,0.131868,0.141304,0.156627
3,bumble,0.034188,0.06422,0.092784,0.11,0.122642,0.029703,0.060976,0.0875,0.104651,0.153846
4,robinhood,0.031579,0.051282,0.066667,0.067797,0.037736,0.058824,0.090909,0.111111,0.141176,0.155556
5,acorn,0.052632,0.083333,0.068493,0.123077,0.089552,0.09375,0.105882,0.12987,0.150685,0.148649
6,calm,0.040816,0.064516,0.068182,0.076923,0.082474,0.05,0.097826,0.119048,0.126582,0.093023
7,headspace,0.029412,0.033333,0.058824,0.071429,0.08642,0.058252,0.076087,0.105882,0.116279,0.113636


In [43]:
# compute mean for each cols:

for col in ["#entities", "entity density"]:
    print("....", col, ".....")
    df = format_ents_view(cod_df, cod_df_reviews, itr_col = "Iteration turn", ents_cols = col)
    cols = [item for item in df.columns if item != "app"]
    for c in cols:
        print(c, np.mean(df[c]))

.... #entities .....
cod_1 4.0
cod_2 5.75
cod_3 7.5
cod_4 8.375
cod_5 9.125
cod_r_1 5.375
cod_r_2 8.25
cod_r_3 9.875
cod_r_4 11.0
cod_r_5 11.75
.... entity density .....
cod_1 0.038596775713925204
cod_2 0.06644710066987368
cod_3 0.087349064242958
cod_4 0.10051025406917566
cod_5 0.1039381981703867
cod_r_1 0.054138935948423755
cod_r_2 0.09327344137176027
cod_r_3 0.11942535020353648
cod_r_4 0.13553779111665915
cod_r_5 0.14389302197905798


## Compute entity quantity & entity density of baseline summaries

In [None]:
annotated_dir = "../data/summaries"

baselines = {}
types = ["vanilla", "tfidf"]

for type in types:
    df = pd.read_csv(f"{annotated_dir}/all_apps_{type}_old.csv")
    df = df.apply(lambda row: extract_annotated_labels(row, "summary", "positive", "negative"), axis=1)
    baselines[type] = df
    print("mean: ", np.mean(df["#entities"]), np.mean(df["entity density"]))

In [30]:
df = baselines["vanilla"]
print("mean: ", np.mean(df["#entities"]), np.mean(df["entity density"]))
df[view_cols]

mean:  9.5 0.08583842254645581


Unnamed: 0,apps,entity density,summary,pos_ents,neg_ents,neu_ents
0,Uber,0.084746,Users report mixed experiences with Uber's app...,convenience of the service;payment options;saf...,app's glitches;customer service;safety concern...,
1,Lyft,0.081967,The Lyft app receives mixed reviews. Some user...,positive interactions with drivers;reliability...,dynamic pricing;app's navigation;customer serv...,
2,Tinder,0.084906,"Many users express frustration with Tinder, ci...",success in meeting people through Tinder,banned without explanation;fake profiles;bots;...,
3,Bumble,0.117117,The Bumble app receives mixed reviews. Some us...,"unique approach, where women initiate conversa...",24-hour message limit;limited swipes;fake prof...,
4,Robinhood,0.077586,Users have mixed feelings about Robinhood. Som...,user-friendly platform;educational;instant dep...,customer service;technical issues during high-...,
5,Acorn,0.09375,Users appreciate Acorn's app for its easy savi...,automatic round-ups;recurring investments,customer service;account verification;withdraw...,
6,Calm,0.068966,"The Calm app, designed to aid with anxiety and...","helpful for relaxation, meditation, and sleep;...",subscription cost;previously free features now...,call for more diversity in authors for sleep s...
7,Headspace,0.07767,Headspace app receives mixed reviews. Many use...,benefits for meditation and sleep;sleepcasts;v...,requiring payment for most features;substantia...,


In [31]:
df = baselines["tfidf"]
print("mean: ", np.mean(df["#entities"]), np.mean(df["entity density"]))
df[view_cols]

mean:  6.25 0.051172816529636164


Unnamed: 0,apps,entity density,summary,pos_ents,neg_ents,neu_ents
0,Uber,0.045455,Uber drivers are getting lazy to accepts rides...,,accepts rides;driver was fantastic very person...,
1,Lyft,0.035088,"20 minute waits for a 5 minute ride, then driv...",,drivers cancel 10 minutes into waiting;prices ...,
2,Tinder,0.071429,"They made it so when you get banned, you can’t...",,banned;same 12 matches for over a month;countl...,
3,Bumble,0.080645,"You don’t get to see all your likes, you only ...",,"see all your likes,;more results;more swipes;t...",
4,Robinhood,0.034483,"They made it easy to trade stocks, buy stocks....",easy to trade stocks;make money on crypto;layo...,lost money and,
5,Acorn,0.05,"Now, they are taking money out of my account e...",beginner investors and long term investments,money out of my account;verify with the email ...,
6,Calm,0.04918,I also love the sleep meditations and stories ...,sleep meditations and stories;dailies;soundscape,unwanted stories;daily check-in font;playlists,
7,Headspace,0.043103,I understand they need to make money off the a...,,guided meditation;scripts for the meditations;...,30 min - 45 min meditations;practice


In [32]:
view_cols = ["apps", "entity density", "summary", "pos_ents", "neg_ents", "neu_ents", "#entities", "entity density", "#raw_tokens", "#tokens"]

baselines["tfidf"][view_cols].to_csv(f"./data/summaries/all_apps_tfidf.csv", index=False, header=True)
baselines["vanilla"][view_cols].to_csv(f"./data/summaries/all_apps_vanilla.csv", index=False, header=True)

## Run statistical analysis on entity count and density

- use multiple paired t-test to determine the significance of difference in statistic of two groups.
- use one way ANOVA to compare the average statistics of more than two groups.

### Normality test

- Null hypothesis H0: the distribution is normal.
- Alternate hypothesis H1: the distribution is not normal.
- Significance level = 0.01 and 0.05 which suggest confidence of 99% and 95% repectively.
- If the observed p-value from the Shapiro-Wilk test is above the significance level, we fail to reject the null hypothesis. In other words, we found no sufficient evidence to claim that the distribution of the values is not normal.
  

In [17]:
from scipy.stats import shapiro

def shapiro_test(data):
    stat, p_value = shapiro(data)
    if p_value > 0.05:
        print("..***failed to reject H0***..")
    return stat, p_value

In [18]:
for _column in ["#entities", "entity density"]:
    print("\n\n....col" , _column, ".....")
    vanilla = baselines["vanilla"][_column]
    print("vanilla: ", shapiro_test(vanilla))

    for itr in range(1, 6):
        print("\niteration: ", itr)
       
        codr = cod_df_reviews[cod_df_reviews["Iteration turn"] == itr][_column]
        cod = cod_df[cod_df["Iteration turn"] == itr][_column]

        print("cod: ", shapiro_test(cod))
        print("codr: ", shapiro_test(codr))



....col #entities .....
vanilla:  (0.8162244558334351, 0.04258103668689728)

iteration:  1
cod:  (0.8015025854110718, 0.029704909771680832)
..***failed to reject H0***..
codr:  (0.9004141092300415, 0.29144734144210815)

iteration:  2
cod:  (0.8104381561279297, 0.036982372403144836)
..***failed to reject H0***..
codr:  (0.8395451307296753, 0.07451588660478592)

iteration:  3
..***failed to reject H0***..
cod:  (0.8454927206039429, 0.0857318639755249)
..***failed to reject H0***..
codr:  (0.8995161652565002, 0.2860732674598694)

iteration:  4
..***failed to reject H0***..
cod:  (0.9270010590553284, 0.4891917109489441)
..***failed to reject H0***..
codr:  (0.9650728702545166, 0.8567925095558167)

iteration:  5
..***failed to reject H0***..
cod:  (0.930121660232544, 0.5171833038330078)
..***failed to reject H0***..
codr:  (0.9542165398597717, 0.75356125831604)


....col entity density .....
..***failed to reject H0***..
vanilla:  (0.8612016439437866, 0.12342841923236847)

iteration:  1
.

### Paired t-test
---

- The sampled source reviews used for summarization is the same for all methods: CoDr, CoD, and Vanilla.
- In a paired t-test, we compute difference between two set of observed data, and assess whether the average difference is significant or not.
- First, we perform Shapiro-Wilk's test on the set of difference values.
    - Null hypothesis H0: the distribution is normal.
    - Alternate hypothesis H1: the distribution is not normal.
    - Significance level = 0.01 and 0.05 which suggest confidence of 99% and 95% repectively.
    - If the observed p-value from the Shapiro-Wilk test is above the significance level, we fail to reject the null hypothesis. In other words, we found no sufficient evidence to claim that the distribution of the difference values is not normal.
- Use paired t-test on the set of difference values.
    - Null hypothesis H0: the change reflected by the average difference is significant.
    - Alternate hypothesis H1: the change reflected by the average difference is not significant.
    - Significance level = 0.01 and 0.05 which suggest confidence of 99% and 95% repectively.
    - If the observed p-value from the Shapiro-Wilk test is below the significance level, we reject the null hypothesis. In other words, the observed change in values betwene the two groups is significant.

In [18]:
from scipy.stats import f_oneway, ttest_rel

def paired_ttest(col1, col2):
    statistic, p_value = ttest_rel(col1, col2)
    if p_value < 0.05:
        print("***significant***")
    return statistic, p_value

In [21]:
# let's run paired t-test for cod_5, codr_5 and vanilla. 
# Since, this is a paired test (only two groups) we will use combinations of pairs. 

for _column in ["#entities", "entity density"]:
    print("\n\n....col" , _column, ".....")
    vanilla = baselines["vanilla"][_column]
    tfidf = baselines["tfidf"][_column]
    
    for itr in range(1, 6):
        print("\niteration: ", itr)
        
        codr = cod_df_reviews[cod_df_reviews["Iteration turn"] == itr][_column]
        print("vanilla vs codr: ", paired_ttest(vanilla, codr))
        
        cod = cod_df[cod_df["Iteration turn"] == itr][_column]
        print("vanilla vs cod", paired_ttest(vanilla, cod))

        print("tfidf vs codr", paired_ttest(tfidf, codr))
        print("tfidf vs cod", paired_ttest(tfidf, cod))

        for itr_2 in range(1, 6):
            cod_2 = cod_df[cod_df["Iteration turn"] == itr_2][_column]
            print(f"cod[{str(itr_2)}] vs codr[{str(itr)}]: ", paired_ttest(codr, cod_2))



....col #entities .....

iteration:  1
***significant***
vanilla vs codr:  (8.602889557460568, 5.7153950148406615e-05)
***significant***
vanilla vs cod (11.0, 1.1376151194592588e-05)
tfidf vs codr (1.5071573172061195, 0.1754994493585011)
***significant***
tfidf vs cod (2.5528888301902897, 0.03794328796316367)
cod[1] vs codr[1]:  (2.2, 0.06373101530263678)
cod[2] vs codr[1]:  (-0.39735970711951313, 0.7029400234562209)
cod[3] vs codr[1]:  (-1.5240136747505373, 0.1713281412298464)
cod[4] vs codr[1]:  (-1.9540168418367887, 0.09162379533890302)
cod[5] vs codr[1]:  (-1.8364615450436153, 0.10890866499940696)

iteration:  2
vanilla vs codr:  (1.4883513944689681, 0.18026237269037185)
***significant***
vanilla vs cod (5.16671222870379, 0.0012997914254367544)
tfidf vs codr (-2.0, 0.08561932856297597)
tfidf vs cod (0.4605661864718383, 0.6590761036459647)
***significant***
cod[1] vs codr[2]:  (8.078246376295356, 8.56523155312282e-05)
***significant***
cod[2] vs codr[2]:  (3.668996928526714, 0.007