In [2]:
import pandas as pd
from os.path import join
import numpy as np

from scipy.stats import spearmanr

# parallelisation functionality
from multiprocess import Pool
import psutil
from tqdm import tqdm

# US Tweets

In [2]:
# Output directory for the bootstrap result CSV files written below.
dst = "../../data/twitter/bootstrapping_US"

In [31]:
# Folder and file name of the US politician tweet corpus.
src = "../../data/twitter"
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"

# The file is gzip-compressed; "created_at" is parsed into datetimes on load.
tweets = pd.read_csv(
    join(src, fname),
    compression="gzip",
    parse_dates=["created_at"],
)

In [32]:
# Number of rows in the raw tweet corpus.
tweets.shape[0]

2588559

In [33]:
# How many rows are flagged as retweets vs. not.
tweets["retweeted"].value_counts()

False    2035667
True      552892
Name: retweeted, dtype: int64

In [34]:
# How many rows are flagged as quote tweets vs. not.
tweets["quoted"].value_counts()

False    2321822
True      266737
Name: quoted, dtype: int64

In [35]:
# How many rows are flagged as replies vs. not.
tweets["reply"].value_counts()

False    2340048
True      248511
Name: reply, dtype: int64

In [36]:
# Number of tweets that are neither a quote, a reply nor a retweet.
tweets.loc[
    tweets["quoted"].eq(False)
    & tweets["reply"].eq(False)
    & tweets["retweeted"].eq(False)
].shape[0]

1523050

In [37]:
# Keep only rows whose "retweeted" flag is False.
tweets = tweets.loc[tweets["retweeted"].eq(False)]

In [38]:
# Rows left after removing retweets.
tweets.shape[0]

2035667

In [39]:
# Keep only tweets that carry both honesty-component labels, i.e. rows where
# neither "belief" nor "truth" is missing (distill RoBERTa filtering).
tweets = tweets.loc[tweets[["belief", "truth"]].notna().all(axis=1)]

In [40]:
# Rows left after removing tweets without honesty-component labels.
tweets.shape[0]

1824800

In [46]:
# Tweets removed by the label filter: count before minus count after
# (both numbers are copied from the cell outputs above).
2_035_667 - 1_824_800

210867

In [42]:
# set tweet creation date as index for easier sampling and aggregation
# (the bootstrap functions below group on tweets.index.year / .month).
# Note: re-running this cell fails because "created_at" is then no longer a column.
tweets = tweets.set_index("created_at")

In [44]:
# Discard everything created in 2010 or earlier.
tweets = tweets.loc[tweets.index.year >= 2011]

In [45]:
# Rows left after the year filter.
tweets.shape[0]

1824022

In [47]:
# Tweets removed by the year filter: count before minus count after
# (both numbers are copied from the cell outputs above).
1_824_800 - 1_824_022

778

In [48]:
# Sum of the "belief" flags over the remaining tweets.
tweets["belief"].sum()

131626.0

In [49]:
# Sum of the "truth" flags over the remaining tweets.
tweets["truth"].sum()

273192.0

In [50]:
# Number of tweets flagged with both components (belief == 1 and truth == 1).
tweets.loc[tweets["belief"].eq(1) & tweets["truth"].eq(1)].shape[0]

19653

## Honesty components

In [55]:
def run_bootstrap_belief(i, data=None):
    """Compute one bootstrap replicate of the monthly "belief" share per party.

    The frame is resampled with replacement (same size as the input) and the
    "belief" column is summed and counted per (year, month, party), where year
    and month are taken from the frame's datetime index.

    Parameters
    ----------
    i : int
        Run number. Written to the "run" column and used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs a datetime index and the columns "belief"
        and "party". Defaults to the notebook-level ``tweets`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns "year", "month", "party", "belief_sum", "belief_count",
        "belief_share" (sum / count) and "run".
    """
    if data is None:
        data = tweets

    # Seed every run explicitly. Without a seed pandas draws from NumPy's
    # global RNG; forked pool workers start with identical copies of that
    # state and would therefore repeat each other's resamples. Seeding with
    # the run number also makes each replicate reproducible.
    tweet_sample = data.sample(frac=1, replace=True, random_state=i)

    belief = (
        tweet_sample[["belief", "party"]]
        .groupby(by=[tweet_sample.index.year, tweet_sample.index.month, "party"])
        .agg(["sum", "count"])
    )

    belief.index = belief.index.set_names(["year", "month", "party"])
    belief = belief.reset_index()
    # Flatten the (column, aggregation) MultiIndex into plain names.
    belief.columns = ["year", "month", "party", "belief_sum", "belief_count"]
    belief["belief_share"] = belief["belief_sum"] / belief["belief_count"]
    belief["run"] = i
    return belief

In [12]:
fname = "belief"
N_bootstrap = 1000

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    belief_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_belief,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

belief_bootstrap = pd.concat(belief_runs).reset_index(drop=True)
belief_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [13:00<00:00,  1.28it/s]


In [13]:
def run_bootstrap_truth(i, data=None):
    """Compute one bootstrap replicate of the monthly "truth" share per party.

    The frame is resampled with replacement (same size as the input) and the
    "truth" column is summed and counted per (year, month, party), where year
    and month are taken from the frame's datetime index.

    Parameters
    ----------
    i : int
        Run number. Written to the "run" column and used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs a datetime index and the columns "truth"
        and "party". Defaults to the notebook-level ``tweets`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns "year", "month", "party", "truth_sum", "truth_count",
        "truth_share" (sum / count) and "run".
    """
    if data is None:
        data = tweets

    # Seed every run explicitly. Without a seed pandas draws from NumPy's
    # global RNG; forked pool workers start with identical copies of that
    # state and would therefore repeat each other's resamples. Seeding with
    # the run number also makes each replicate reproducible.
    tweet_sample = data.sample(frac=1, replace=True, random_state=i)

    truth = (
        tweet_sample[["truth", "party"]]
        .groupby(by=[tweet_sample.index.year, tweet_sample.index.month, "party"])
        .agg(["sum", "count"])
    )

    truth.index = truth.index.set_names(["year", "month", "party"])
    truth = truth.reset_index()
    # Flatten the (column, aggregation) MultiIndex into plain names.
    truth.columns = ["year", "month", "party", "truth_sum", "truth_count"]
    truth["truth_share"] = truth["truth_sum"] / truth["truth_count"]
    truth["run"] = i
    return truth

In [14]:
fname = "truth"
N_bootstrap = 1000

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    truth_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_truth,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

truth_bootstrap = pd.concat(truth_runs).reset_index(drop=True)
truth_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [06:34<00:00,  2.53it/s]


In [56]:
# Release the tweet frame before the URL data is loaded.
del tweets

# US domains

In [67]:
# NOTE(review): `tweets_with_urls` is only assigned in the "Code graveyard"
# section further down, and `tweets` (its source) has already been deleted at
# this point. Run top-to-bottom on a fresh kernel this cell raises NameError.
len(tweets_with_urls)

1339442

In [57]:
# Output directory for the bootstrap result CSV files written below.
dst = "../../data/twitter/bootstrapping_US"

In [58]:
# Folder and file name of the URL table.
src = "../../data/twitter"
fname = "US_URLs_2010-11-06_to_2022-03-16.csv.gzip"

# The file is gzip-compressed; "created_at" is parsed into datetimes on load.
urls = pd.read_csv(
    join(src, fname),
    compression="gzip",
    parse_dates=["created_at"],
)

In [59]:
# Drop retweets and entries without URLs: keep rows where "retweeted" is
# False and "has_url" is True.
urls = urls.loc[urls["retweeted"].eq(False) & urls["has_url"].eq(True)]

In [66]:
# Number of URL rows that carry both honesty-component labels.
urls.dropna(subset=["belief", "truth"]).shape[0]

1437973

In [28]:
# Boolean coverage flags: True where the respective rating column has a value.
# "Score" feeds the NG (presumably NewsGuard) flag, "unreliable" the
# independent-list flag.
urls["has_NG_score"] = urls["Score"].notna()
urls["has_independent_score"] = urls["unreliable"].notna()

In [29]:
# Use the creation date as index; the coverage bootstraps below group on
# index.year / index.month.
urls = urls.set_index("created_at")

## NewsGuard coverage timelines

In [30]:
# Remove all entries whose URL points to a large social media site (twitter,
# facebook, youtube, instagram) or a search site (google, yahoo).
# NOTE(review): the original comment also named e-commerce (amazon) sites, but
# no amazon domain is in the list below — confirm whether it should be added.
excluded_domains = [
    "twitter.com",
    "youtube.com",
    "facebook.com",
    "instagram.com",
    "cards.twitter.com",
    "google.com",
    "yahoo.com",
]
urls_clean = urls.loc[~urls["domain"].isin(excluded_domains)]

In [18]:
def run_bootstrap_NG_coverage(i, data=None):
    """Compute one bootstrap replicate of the monthly NG-score coverage per party.

    The frame is resampled with replacement (same size as the input) and the
    boolean "has_NG_score" column is summed and counted per
    (year, month, party), where year and month come from the datetime index.

    Parameters
    ----------
    i : int
        Run number. Written to the "run" column and used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs a datetime index and the columns
        "has_NG_score" and "party". Defaults to the notebook-level
        ``urls_clean`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns "year", "month", "party", "has_NG_score_sum",
        "has_NG_score_count", "NG_coverage" (sum / count) and "run".
    """
    if data is None:
        data = urls_clean

    # Seed every run explicitly. Without a seed pandas draws from NumPy's
    # global RNG; forked pool workers start with identical copies of that
    # state and would therefore repeat each other's resamples. Seeding with
    # the run number also makes each replicate reproducible.
    url_sample = data.sample(frac=1, replace=True, random_state=i)

    coverage = (
        url_sample[["has_NG_score", "party"]]
        .groupby(by=[url_sample.index.year, url_sample.index.month, "party"])
        .agg(["sum", "count"])
    )

    coverage.index = coverage.index.set_names(["year", "month", "party"])
    coverage = coverage.reset_index()
    # Flatten the (column, aggregation) MultiIndex into plain names.
    coverage.columns = ["year", "month", "party", "has_NG_score_sum", "has_NG_score_count"]
    coverage["NG_coverage"] = coverage["has_NG_score_sum"] / coverage["has_NG_score_count"]
    coverage["run"] = i
    return coverage

In [19]:
fname = "NG_coverage"
N_bootstrap = 1000

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    NG_coverage_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_NG_coverage,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

NG_coverage_bootstrap = pd.concat(NG_coverage_runs).reset_index(drop=True)
NG_coverage_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:43<00:00,  6.12it/s]


## Independent list coverage

In [31]:
def run_bootstrap_independent_coverage(i, data=None):
    """Compute one bootstrap replicate of the monthly independent-list coverage per party.

    The frame is resampled with replacement (same size as the input) and the
    boolean "has_independent_score" column is summed and counted per
    (year, month, party), where year and month come from the datetime index.

    Parameters
    ----------
    i : int
        Run number. Written to the "run" column and used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs a datetime index and the columns
        "has_independent_score" and "party". Defaults to the notebook-level
        ``urls_clean`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns "year", "month", "party", "has_independent_score_sum",
        "has_independent_score_count", "independent_coverage" (sum / count)
        and "run".
    """
    if data is None:
        data = urls_clean

    # Seed every run explicitly. Without a seed pandas draws from NumPy's
    # global RNG; forked pool workers start with identical copies of that
    # state and would therefore repeat each other's resamples. Seeding with
    # the run number also makes each replicate reproducible.
    url_sample = data.sample(frac=1, replace=True, random_state=i)

    coverage = (
        url_sample[["has_independent_score", "party"]]
        .groupby(by=[url_sample.index.year, url_sample.index.month, "party"])
        .agg(["sum", "count"])
    )

    coverage.index = coverage.index.set_names(["year", "month", "party"])
    coverage = coverage.reset_index()
    # Flatten the (column, aggregation) MultiIndex into plain names.
    coverage.columns = ["year", "month", "party", "has_independent_score_sum", "has_independent_score_count"]
    coverage["independent_coverage"] = coverage["has_independent_score_sum"] / coverage["has_independent_score_count"]
    coverage["run"] = i
    return coverage

In [32]:
fname = "independent_coverage"
N_bootstrap = 1000

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    independent_coverage_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_independent_coverage,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

independent_coverage_bootstrap = pd.concat(independent_coverage_runs).reset_index(drop=True)
independent_coverage_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:41<00:00,  6.19it/s]


In [22]:
# Release both URL frames before the user table is loaded.
del urls
del urls_clean

# US Users

In [42]:
# Folder and file name of the per-account statistics table (plain CSV).
src = "../../data/twitter"
fname = "US_politician_twitter_account_stats_2010-11-06_to_2022-03-16.csv"

users = pd.read_csv(
    join(src, fname),
)

## Politifact, NG score & unreliable correlations

In [45]:
def run_bootstrap_reliability_score_correlations(i, data=None):
    """Compute one bootstrap replicate of pairwise score correlations across users.

    The user frame is resampled with replacement (same size as the input,
    seeded with ``i``) and Pearson correlations between several reliability
    measures are computed on the resample.

    Parameters
    ----------
    i : int
        Run number. Written to the "run" column and used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs the columns "NG_score_mean", "pf_score",
        "unreliable_share", "fishy_share_60", "accuracy_mean" and
        "transparency_mean". Defaults to the notebook-level ``users`` frame.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per correlation plus "run". All values
        except "corr_NGScore_pf" are absolute values.
    """
    if data is None:
        data = users
    user_sample = data.sample(frac=1, replace=True, random_state=i)

    def pearson(first, second):
        # Off-diagonal entry of the 2x2 correlation matrix, looked up by
        # label. The previous `.loc[second][0]` relied on positional access
        # into a label-indexed Series, which recent pandas deprecates.
        return user_sample[[first, second]].corr().loc[second, first]

    # NOTE(review): "corr_NGScore_pf" is the only value stored with its sign;
    # every other entry is wrapped in np.abs — confirm this is intended.
    # NOTE(review): "corr_NGScore_accuracy" and "corr_NGScore_transparency"
    # are computed from "fishy_share_60", not "NG_score_mean", despite their
    # names. Keys are kept unchanged because they become CSV column names.
    pf_bootstrap = pd.DataFrame({
        "corr_NGScore_pf": [pearson("NG_score_mean", "pf_score")],
        "corr_ind_pf": [np.abs(pearson("unreliable_share", "pf_score"))],
        "corr_NGScore_ind": [np.abs(pearson("NG_score_mean", "unreliable_share"))],
        "corr_NGShare_pf": [np.abs(pearson("fishy_share_60", "pf_score"))],
        "corr_NGShare_ind": [np.abs(pearson("fishy_share_60", "unreliable_share"))],
        "corr_NGScore_NGShare": [np.abs(pearson("fishy_share_60", "NG_score_mean"))],
        "corr_NGScore_accuracy": [np.abs(pearson("fishy_share_60", "accuracy_mean"))],
        "corr_NGScore_transparency": [np.abs(pearson("fishy_share_60", "transparency_mean"))],
        "run": [i],
    })
    return pf_bootstrap

In [46]:
fname = "user_reliability_score_correlations"
N_bootstrap = 10000

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    pf_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_reliability_score_correlations,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

pf_bootstrap = pd.concat(pf_runs).reset_index(drop=True)
pf_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:33<00:00, 298.61it/s]


In [58]:
# Release the user table before the LIWC data is loaded.
del users

# LIWC scores

In [4]:
# Folder holding the LIWC and tweet input files read below.
src = "../../data/twitter"

In [5]:
# LIWC-scored tweet timelines. "id" is read as a string.
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean_mask_LIWC.csv.gzip"
df = pd.read_csv(join(src, fname), compression="gzip", dtype={"id": str})

# Strip double-quote characters from the ids before they are used as the
# join key for the merge with the tweet table.
df["id"] = df["id"].str.replace('"', '')

In [6]:
# Tweet metadata to attach to the LIWC scores: only the listed columns are
# read, ids stay strings and "created_at" is parsed into datetimes.
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"
df2 = pd.read_csv(
    join(src, fname),
    usecols=[
        "id", "author_id", "party", "created_at",
        "belief", "truth", "neutral",
        "retweeted", "quoted", "reply",
    ],
    dtype={"id": str, "author_id": str},
    parse_dates=["created_at"],
    compression="gzip",
)

In [7]:
# Left-join the tweet metadata onto the LIWC rows via the shared "id" column;
# LIWC rows without a matching tweet are kept. The metadata frame is released
# afterwards.
df = df.merge(df2, how="left", on="id")
del df2

In [8]:
# Cast the three honesty-component flags to integers.
# NOTE(review): astype(int) raises if any of these columns contains NaN, e.g.
# for LIWC rows the left merge could not match — confirm every row is labelled.
for col in ["belief", "truth", "neutral"]:
    df[col] = df[col].astype(int)

In [9]:
# Give every tweet a single honesty-component label. The assignments run in
# the order belief -> truth -> neutral and later ones overwrite earlier ones,
# so a tweet flagged with several components ends up with the last match.
# The column starts out as object dtype: writing strings into a float NaN
# column is a dtype-incompatible assignment that recent pandas warns about.
df["honesty_component"] = pd.Series(np.nan, index=df.index, dtype=object)

# A boolean mask selects the same rows as the former
# `df[df[...] == 1].index` lookup without building a filtered copy first.
df.loc[df["belief"] == 1, "honesty_component"] = "belief"
df.loc[df["truth"] == 1, "honesty_component"] = "truth"
df.loc[df["neutral"] == 1, "honesty_component"] = "neutral"

In [10]:
# Mean, standard deviation and count of the selected LIWC scores per honesty
# component.
cols = ["honesty_component", "Analytic", "Authentic", "emo_pos", "emo_neg", "moral"]
(
    df[cols]
    .groupby(["honesty_component"])
    .agg(["mean", "std", "count"])
)

Unnamed: 0_level_0,Analytic,Analytic,Analytic,Authentic,Authentic,Authentic,emo_pos,emo_pos,emo_pos,emo_neg,emo_neg,emo_neg,moral,moral,moral
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count,mean,std,count
honesty_component,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
belief,63.521585,30.822591,111435,46.158038,33.928406,111435,0.86417,2.056284,111435,0.433783,1.392329,111435,0.683627,1.769463,111435
neutral,76.897466,26.131528,1423429,42.048568,33.451033,1423429,1.148689,2.541295,1423429,0.345161,1.285239,1423429,0.738635,1.93847,1423429
truth,74.144456,27.034059,271981,43.494753,32.820735,271981,0.707073,1.807511,271981,0.355489,1.224642,271981,0.620208,1.671442,271981


In [11]:
# Use the creation date as index; run_bootstrap_LIWC groups on
# index.year / index.month.
df = df.set_index("created_at")

In [12]:
def _aggregate_liwc(frame, cols, newcols, run):
    """Sum and count the LIWC columns of ``frame`` per (year, month, party).

    Year and month are taken from the frame's datetime index. For every LIWC
    column a "<name>_share" column (sum / count, lower-cased name) is added,
    and ``run`` is stored in the "run" column.
    """
    grouping = (
        frame[cols + ["party"]]
        .groupby(by=[frame.index.year, frame.index.month, "party"])
        .agg(["sum", "count"])
    )
    grouping.index = grouping.index.set_names(["year", "month", "party"])
    grouping = grouping.reset_index()
    # Flatten the (column, aggregation) MultiIndex into plain names.
    grouping.columns = newcols
    for col in cols:
        col = col.lower()
        grouping[f"{col}_share"] = grouping[f"{col}_sum"] / grouping[f"{col}_count"]
    grouping["run"] = run
    return grouping


def run_bootstrap_LIWC(i, data=None):
    """Compute one bootstrap replicate of monthly LIWC aggregates per party.

    The frame is resampled with replacement (same size as the input, seeded
    with ``i``). The aggregation is then run on the whole resample and on the
    subsets where "belief", "truth" or "neutral" equals 1.

    Parameters
    ----------
    i : int
        Run number. Written to the "run" column and used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs a datetime index and the columns "Analytic",
        "Authentic", "moral", "emo_pos", "emo_neg", "party", "belief",
        "truth" and "neutral". Defaults to the notebook-level ``df`` frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(all, belief, truth, neutral)`` aggregates, each with the columns
        "year", "month", "party", a "<name>_sum", "<name>_count" and
        "<name>_share" column per LIWC score, and "run".
    """
    cols = ["Analytic", "Authentic", "moral", "emo_pos", "emo_neg"]
    # Must follow the order of `cols`: two names (sum, count) per LIWC column.
    newcols = ["year", "month", "party",
               "analytic_sum", "analytic_count",
               "authentic_sum", "authentic_count",
               "moral_sum", "moral_count",
               "emo_pos_sum", "emo_pos_count",
               "emo_neg_sum", "emo_neg_count"]

    if data is None:
        data = df
    df_sample = data.sample(frac=1, replace=True, random_state=i)

    grouping = _aggregate_liwc(df_sample, cols, newcols, i)
    belief_grouping = _aggregate_liwc(
        df_sample[df_sample["belief"] == 1], cols, newcols, i)
    truth_grouping = _aggregate_liwc(
        df_sample[df_sample["truth"] == 1], cols, newcols, i)
    neutral_grouping = _aggregate_liwc(
        df_sample[df_sample["neutral"] == 1], cols, newcols, i)

    return grouping, belief_grouping, truth_grouping, neutral_grouping

In [13]:
dst = "../../data/twitter/bootstrapping_US"
N_bootstrap = 1000

# Every run yields four frames (all tweets, belief, truth, neutral). Collect
# the runs in a list and concatenate each kind once at the end: calling
# pd.concat inside the loop re-copies all previously collected rows on every
# iteration. The `with` block shuts the worker pool down even when a run
# raises, instead of only on the success path.
with Pool(10) as pool:
    LIWC_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_LIWC,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# Transpose the list of 4-tuples into four sequences of frames.
all_runs, belief_runs, truth_runs, neutral_runs = zip(*LIWC_runs)

LIWC_bootstrap = pd.concat(list(all_runs)).reset_index(drop=True)
LIWC_belief_bootstrap = pd.concat(list(belief_runs)).reset_index(drop=True)
LIWC_truth_bootstrap = pd.concat(list(truth_runs)).reset_index(drop=True)
LIWC_neutral_bootstrap = pd.concat(list(neutral_runs)).reset_index(drop=True)

fname = "LIWC"
LIWC_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)
fname = "LIWC_belief"
LIWC_belief_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)
fname = "LIWC_truth"
LIWC_truth_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)
fname = "LIWC_neutral"
LIWC_neutral_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.36s/it]


# Code graveyard

## Honesty components & information trustworthiness

In [51]:
# drop tweets without URLs
tweets_with_urls = tweets[tweets["has_url"] == True]

In [None]:
# set tweet creation date as index for easier sampling and aggregation
tweets_with_urls = tweets_with_urls.set_index("created_at")

In [52]:
# create data subsets for parties
dem = tweets_with_urls[tweets_with_urls["party"] == "Democrat"]
rep = tweets_with_urls[tweets_with_urls["party"] == "Republican"]
datasets = {"all":tweets_with_urls, "democrat":dem, "republican":rep}

In [13]:
# Honesty-component columns, number of bootstrap runs, and the label columns
# iterated by run_bootstrap_honesty_misinfo (three "fishy_*" columns followed
# by "C_0" ... "C_8").
components = ["neutral", "belief", "truth"]
N_bootstrap = 1000
cols = ["fishy_60", "fishy_40", "fishy_20"]
cols.extend(f"C_{i}" for i in range(9))

In [14]:
def run_bootstrap_honesty_misinfo(dataset):
    """Compute one bootstrap replicate of component ratios per label column.

    The frame ``datasets[dataset]`` is resampled with replacement (same size).
    For every column in the notebook-level ``cols`` the honesty-component
    columns (``components``) are summed per label value, and the sums for
    label 1 are divided by the sums for label 0.

    Parameters
    ----------
    dataset : str
        Key into the notebook-level ``datasets`` dict.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns "misinfo_label",
        "honesty_component" and "percentage". Besides one row per component
        there is a "<component>_diff" row (ratio minus the "neutral" ratio)
        for every non-neutral component. If a label column lacks the value 0
        or 1 in the resample, the rows gathered up to that point are returned.
    """
    # The argument is a dataset name, not a run number, so there is nothing to
    # seed from. A fresh generator per call draws its own entropy and avoids
    # forked pool workers replaying identical copies of NumPy's global RNG.
    tweet_sample = datasets[dataset].sample(
        len(datasets[dataset]),
        replace=True,
        random_state=np.random.default_rng(),
    )

    # Rows are gathered as plain dicts and turned into a frame once at the
    # end instead of growing a DataFrame with pd.concat per row.
    records = []
    for col in cols:
        # The "sum" aggregation replaces passing the builtin `sum` to agg.
        agg = tweet_sample[[col] + components].groupby(col).sum()
        try:
            res = agg.loc[1] / agg.loc[0]
        except KeyError:
            # One of the two label values is missing from this resample:
            # stop here and hand back what has been computed so far.
            return pd.DataFrame(records)
        for comp in components:
            records.append({
                "misinfo_label": col,
                "honesty_component": comp,
                "percentage": res[comp],
            })
            if comp != "neutral":
                records.append({
                    "misinfo_label": col,
                    "honesty_component": f"{comp}_diff",
                    "percentage": res[comp] - res["neutral"],
                })

    return pd.DataFrame(records)

In [15]:
fname = "misinfo_label_and_honesty_component"

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    runs_both = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_honesty_misinfo,
            iterable=["all"] * N_bootstrap),
        total=N_bootstrap))

bootstrap_results_both = pd.concat(runs_both).reset_index(drop=True)
bootstrap_results_both.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [06:06<00:00,  2.73it/s]


In [16]:
fname = "misinfo_label_and_honesty_component_dem"

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    runs_dem = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_honesty_misinfo,
            iterable=["democrat"] * N_bootstrap),
        total=N_bootstrap))

bootstrap_result_dem = pd.concat(runs_dem).reset_index(drop=True)
bootstrap_result_dem.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:20<00:00,  4.98it/s]


In [17]:
fname = "misinfo_label_and_honesty_component_rep"

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    runs_rep = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_honesty_misinfo,
            iterable=["republican"] * N_bootstrap),
        total=N_bootstrap))

bootstrap_result_rep = pd.concat(runs_rep).reset_index(drop=True)
bootstrap_result_rep.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:19<00:00,  7.18it/s]


## Honesty share

In [36]:
def run_bootstrap_user_honesty_share(i, data=None):
    """Compute one bootstrap replicate of component-share correlations across users.

    The user frame is resampled with replacement (same size as the input).
    For each of the twelve misinformation-share columns, the Pearson
    correlation, the Spearman correlation and the Spearman p-value against
    "belief_share", "truth_share" and "neutral_share" are computed.

    Parameters
    ----------
    i : int
        Run number, used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs the three "<component>_share" columns and
        the twelve misinformation-share columns listed in the body. Defaults
        to the notebook-level ``users`` frame.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns "val", "val_type" (one of
        "pearson_correlation", "spearman_correlation", "spearman_pval"),
        "component" and "misinfo_type".
    """
    if data is None:
        data = users

    # Seed every run explicitly. Without a seed pandas draws from NumPy's
    # global RNG; forked pool workers start with identical copies of that
    # state and would therefore repeat each other's resamples.
    user_sample = data.sample(frac=1, replace=True, random_state=i)

    components = ["belief", "truth", "neutral"]
    misinfo_cols = ["fishy_share_60", "fishy_share_40", "fishy_share_20",
                    "C_0_share", "C_1_share", "C_2_share",
                    "C_3_share", "C_4_share", "C_5_share", "C_6_share",
                    "C_7_share", "C_8_share"]

    # Rows are gathered as plain dicts and turned into a frame once at the
    # end instead of growing a DataFrame with pd.concat per row.
    records = []
    for col in misinfo_cols:
        pearson = {}
        spearman_corr = {}
        spearman_pval = {}
        for comp in components:
            share = f"{comp}_share"
            pearson[comp] = user_sample[[share, col]].corr().loc[share, col]
            spearman_corr[comp], spearman_pval[comp] = \
                spearmanr(user_sample[share], user_sample[col])

        # Row order per column: all Pearson values, then all Spearman
        # correlations, then all Spearman p-values.
        for val_type, values in [("pearson_correlation", pearson),
                                 ("spearman_correlation", spearman_corr),
                                 ("spearman_pval", spearman_pval)]:
            for comp in components:
                records.append({
                    "val": values[comp],
                    "val_type": val_type,
                    "component": comp,
                    "misinfo_type": col,
                })

    return pd.DataFrame(
        records, columns=["val", "val_type", "component", "misinfo_type"])

In [37]:
fname = "user_honesty_share"
N_bootstrap = 10000

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    honesty_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_user_honesty_share,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

bootstrap_honesty = pd.concat(honesty_runs).reset_index(drop=True)
bootstrap_honesty.to_csv(join(dst, fname + ".csv"), index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:47<00:00, 44.03it/s]


## Log honesty share

In [38]:
def run_bootstrap_user_honesty_share_log(i, data=None):
    """Bootstrap replicate of component-share correlations with log-transformed shares.

    The user frame is resampled with replacement (same size as the input);
    "belief_share" and "truth_share" are replaced by ``log(share + 0.01)``
    while "neutral_share" is left untransformed. For each of the twelve
    misinformation-share columns, the Pearson correlation, the Spearman
    correlation and the Spearman p-value against the three share columns are
    computed.

    Parameters
    ----------
    i : int
        Run number, used as sampling seed.
    data : pandas.DataFrame, optional
        Frame to resample. Needs the three "<component>_share" columns and
        the twelve misinformation-share columns listed in the body. Defaults
        to the notebook-level ``users`` frame.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns "val", "val_type" (one of
        "pearson_correlation", "spearman_correlation", "spearman_pval"),
        "component" and "misinfo_type".
    """
    if data is None:
        data = users

    # Seed every run explicitly. Without a seed pandas draws from NumPy's
    # global RNG; forked pool workers start with identical copies of that
    # state and would therefore repeat each other's resamples.
    user_sample = data.sample(frac=1, replace=True, random_state=i).copy()
    # The 0.01 offset keeps the logarithm finite for shares of exactly zero.
    user_sample["belief_share"] = np.log(user_sample["belief_share"] + 0.01)
    user_sample["truth_share"] = np.log(user_sample["truth_share"] + 0.01)

    components = ["belief", "truth", "neutral"]
    misinfo_cols = ["fishy_share_60", "fishy_share_40", "fishy_share_20",
                    "C_0_share", "C_1_share", "C_2_share",
                    "C_3_share", "C_4_share", "C_5_share", "C_6_share",
                    "C_7_share", "C_8_share"]

    # Rows are gathered as plain dicts and turned into a frame once at the
    # end instead of growing a DataFrame with pd.concat per row.
    records = []
    for col in misinfo_cols:
        pearson = {}
        spearman_corr = {}
        spearman_pval = {}
        for comp in components:
            share = f"{comp}_share"
            pearson[comp] = user_sample[[share, col]].corr().loc[share, col]
            spearman_corr[comp], spearman_pval[comp] = \
                spearmanr(user_sample[share], user_sample[col])

        # Row order per column: all Pearson values, then all Spearman
        # correlations, then all Spearman p-values.
        for val_type, values in [("pearson_correlation", pearson),
                                 ("spearman_correlation", spearman_corr),
                                 ("spearman_pval", spearman_pval)]:
            for comp in components:
                records.append({
                    "val": values[comp],
                    "val_type": val_type,
                    "component": comp,
                    "misinfo_type": col,
                })

    return pd.DataFrame(
        records, columns=["val", "val_type", "component", "misinfo_type"])

In [None]:
fname = "user_honesty_share_log"
N_bootstrap = 10000

# Collect the per-run frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously collected rows
# on every iteration. The `with` block shuts the worker pool down even when a
# run raises, instead of only on the success path.
with Pool(10) as pool:
    honesty_log_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_user_honesty_share_log,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

bootstrap_honesty_log = pd.concat(honesty_log_runs).reset_index(drop=True)
bootstrap_honesty_log.to_csv(join(dst, fname + ".csv"), index=False)