In [3]:
import ast
import sys
from decimal import Decimal
from os.path import join

import numpy as np
import pandas as pd

# make the project-local helper module importable
sys.path.append('../../../../utilities/twitter_functions')
import twitter_functions as tf

# Create a URL data frame

## Expand URL lists

In [20]:
# read the cleaned timeline data, restricted to the columns needed to build
# the URL table, and keep every tweet ID only once
src = "../../data/twitter"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
tweets = pd.read_csv(
    join(src, fname),
    compression="gzip",
    usecols=["id", "author_id", "created_at", "expanded_urls",
             "retweeted", "quoted", "reply"],
).drop_duplicates(subset="id")

  tweets = pd.read_csv(join(src, fname),


In [21]:
# parse the URL lists: every entry of "expanded_urls" is the string
# representation of a Python list; tweets without URLs get an empty list.
# ast.literal_eval only accepts Python literals, so - unlike eval() - text
# read from the data file can never be executed as code
tweets["expanded_urls"] = tweets["expanded_urls"]\
    .fillna("[]")\
    .apply(ast.literal_eval)
# flag tweets that contain at least one URL
tweets["has_url"] = tweets["expanded_urls"].apply(lambda x: len(x) > 0)

In [22]:
# expand the url lists such that tweets with a list of N urls are converted
# into N individual rows, one for each URL. Tweets without URLs produce a
# missing value and are dropped.
# Series.explode() does this in a single vectorised pass; building one
# pd.Series per tweet and melting the resulting wide table is far slower.
# The result has the same layout as before: one column "value" holding the
# URL, indexed by the row label of the tweet in "tweets" (index name "index").
# NOTE: on the full data set this can still take a while, which is why the
# outcome is saved below so this step can be skipped when re-doing other
# parts of the data wrangling later
urls = tweets["expanded_urls"]\
    .explode()\
    .dropna()\
    .rename("value")\
    .to_frame()\
    .rename_axis("index")

  urls = tweets['expanded_urls']\


In [26]:
# merge the expanded URL data frame with the tweet data frame: both share
# the row labels of "tweets", so joining on the index attaches the tweet ID
# to every URL row. The URL column keeps its name "value" (the two frames
# have no overlapping column names, so no "_x"/"_y" suffixes are created)
urls = pd.merge(
    urls,
    tweets[['id']],
    left_index=True,
    right_index=True)

In [34]:
# some tweets contain the same URL twice. We drop these so that every
# combination of tweet ID and URL occurs only once
urls = urls.drop_duplicates(subset=["id", "value"])

In [35]:
# attach the tweet-level columns to every URL row. Because of the left join
# tweets without any URL are kept, with a missing URL
urls = tweets.merge(urls, on="id", how="left")
del tweets

# number of URLs of the tweet a row belongs to ("expanded_urls" holds the
# full URL list of the tweet)
urls["N_urls"] = urls["expanded_urls"].apply(len)
urls = urls.rename(columns={"value": "url"})

# save the outcome
# NOTE(review): the file name says 2021-03-16 while the input file says
# 2022-03-16; the loading cell below uses the same name, so both must be
# changed together if this is corrected
urls.to_csv(
    join(src, "combined_US_politician_twitter_timelines_2010-11-06_to_2021-03-16_clean_urls.csv.xz"),
    compression="xz",
    index=False,
)

## Add tweet metrics

In [4]:
# load the data frame with the expanded URLs (one row per tweet and URL) that
# was saved at the end of the "Expand URL lists" section; "created_at" is
# parsed as a date
src = "../../data/twitter"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2021-03-16_clean_urls.csv.xz"
urls = pd.read_csv(join(src, fname), compression="xz", parse_dates=["created_at"])

In [5]:
# read the public engagement metrics of the collected tweets from the
# cleaned timeline file, one row per tweet ID
src = "../../data/twitter"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
tweet_metrics = pd.read_csv(
    join(src, fname),
    compression="gzip",
    usecols=["id", "retweet_count", "reply_count", "like_count", "quote_count"],
).drop_duplicates(subset="id")

# attach the metrics to every URL row of the corresponding tweet
urls = urls.merge(tweet_metrics, how="left", on="id")
del tweet_metrics

In [6]:
# the tweet ID and author ID columns carry additional quote characters that
# prevent parsing of these fields as numbers; strip them from both columns
for id_col in ("id", "author_id"):
    urls[id_col] = urls[id_col].apply(lambda raw: raw.replace('"', ''))

## Add unraveled URLs

Note: run the following to unravel a list of URLs:  
`python ../../../utilities/unravel_urls/unravel_urls.py url_list.csv.gzip -dst unraveled_urls3/ -v 1`

In [7]:
# load the list of originally shortened URLs with their expansions to their true
# destination. The table is merged on its "url" column below and is expected
# to provide the destination in a column "unraveled_url"
src = "../../data/twitter"
fname = "unraveled_urls.csv.xz"
unraveled_urls = pd.read_csv(join(src, fname), compression="xz")

In [8]:
# look up the true destination of every URL; URLs that are not in the list
# of unraveled URLs get a missing "unraveled_url"
urls = urls.merge(unraveled_urls, on="url", how="left")

# a URL counts as originally shortened if a destination was found for it
was_shortened = urls["unraveled_url"].notna()
urls["shortened_url"] = was_shortened

# replace the shortened URL with the unraveled URL and drop the helper column
urls.loc[was_shortened, "url"] = urls.loc[was_shortened, "unraveled_url"]
urls = urls.drop(columns=["unraveled_url"])
del was_shortened

In [9]:
# extract the domain from the URL with the project-local helper
# tf.extract_domain (defined in twitter_functions, not shown here).
# NOTE(review): the "found malformed URL" lines printed below presumably come
# from that helper - confirm what it returns for such URLs
urls["domain"] = urls["url"].apply(tf.extract_domain)

found malformed URL https
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http


## Add NewsGuard nutrition scores

Newsguard rating cutoff: 60 (see [description](https://www.newsguardtech.com/ratings/rating-process-criteria/)).

In [49]:
# load the nutrition labels
src = "../../data/newsguard/newsguard_2022-03/03"
fname = "metadata-2022030100.csv"
# if more than one score exists for the same domain, keep the most recent
# one: after the descending sort the first row of every domain is the one
# with the largest "Last Updated" value
NG_scores = pd.read_csv(join(src, fname))\
    .sort_values(by=["Domain", "Last Updated"], ascending=False)\
    .drop_duplicates(subset=["Domain"])\
    .rename(columns={"Domain": "domain"})

# threshold scores at various cutoffs to define untrustworthy domains:
# fishy_<cutoff> is 1 if the score is below the cutoff and 0 otherwise
# (including domains without a score)
for cutoff in (60, 40, 20):
    NG_scores[f"fishy_{cutoff}"] = (NG_scores["Score"] < cutoff).astype("int64")

# sets of untrustworthy domains for every cutoff
fishy_60_domains = set(NG_scores.loc[NG_scores["fishy_60"] == 1, "domain"])
fishy_40_domains = set(NG_scores.loc[NG_scores["fishy_40"] == 1, "domain"])
fishy_20_domains = set(NG_scores.loc[NG_scores["fishy_20"] == 1, "domain"])

In [57]:
# share of trustworthy (score >= 60) domains among all English and German
# language domains rated by NewsGuard, in percent
below_cutoff = NG_scores["Score"] < 60
is_EN = NG_scores["Language"] == "en"
is_DE = NG_scores["Language"] == "de"

all_EN = int(is_EN.sum())
untrustworthy_EN = int((is_EN & below_cutoff).sum())
print(f"Trustworthy EN: {100 - untrustworthy_EN/all_EN * 100:1.3f}")

all_DE = int(is_DE.sum())
untrustworthy_DE = int((is_DE & below_cutoff).sum())
print(f"Trustworthy DE: {100 - untrustworthy_DE/all_DE * 100:1.3f}")

Trustworthy EN: 62.755
Trustworthy DE: 74.483


In [11]:
# general information about a rated domain
NG_meta_cols = [
    "fishy_60", "fishy_40", "fishy_20", "domain",
    "Rating", "Score", "Country", "Language",
]
# the nine individual NewsGuard rating criteria
NG_criteria_cols = [
    "Does not repeatedly publish false content",
    "Gathers and presents information responsibly",
    "Regularly corrects or clarifies errors",
    "Handles the difference between news and opinion responsibly",
    "Avoids deceptive headlines",
    "Website discloses ownership and financing",
    "Clearly labels advertising",
    "Reveals who's in charge, including any possible conflicts of interest",
    "The site provides names of content creators, along with either contact or biographical information",
]
# all NewsGuard columns that are added to the URL table
nutrition_cols = NG_meta_cols + NG_criteria_cols
# short column names C_0 ... C_8 for the criteria, in the order listed above
nutrition_categories = {
    criterion: f"C_{position}"
    for position, criterion in enumerate(NG_criteria_cols)
}

In [12]:
# add the nutrition information to the tweet data table (matched on the
# domain of the URL) and switch to the short criterion names C_0 ... C_8
urls = urls\
    .merge(NG_scores[nutrition_cols], on="domain", how="left")\
    .rename(columns=nutrition_categories)
del NG_scores

# transform the labels into binary values: 0 if the criterion is met ("Yes"),
# 1 if it is not met ("No")
label_to_binary = {"Yes":0, "No":1}
for col in nutrition_categories.values():
    urls[col] = urls[col].replace(label_to_binary)

In [13]:
# export the list of all unique (URL, NewsGuard score) combinations for text
# scraping.
# NOTE(review): the file is written to whatever "src" currently points to;
# when the notebook is run top to bottom that is the NewsGuard directory set
# when loading the nutrition labels - confirm this is the intended location
urls[["url", "Score"]]\
    .drop_duplicates()\
    .rename(columns={"Score":"score"})\
    .to_csv(join(src, "unraveled_url_list.csv.gzip"),
            index=False, compression="gzip")

## Add alternative trustworthiness labels

In [14]:
# load the list of independently compiled trustworthiness labels for
# news sources
src = "../../data/twitter"
fname = "unstrustworthy_domain_list.csv"
alt_labels = pd.read_csv(join(src, fname))\
    .rename(columns={"type":"unreliable", "url":"Domain"})

# convert reliability labels to binary (1 = unreliable)
reliability_to_binary = {"reliable":0, "unreliable":1}
alt_labels["unreliable"] = alt_labels["unreliable"].replace(reliability_to_binary)

# merge with the tweet data table on the domain of the URL. The key column of
# the label list ("Domain") is kept as an additional column by this merge
alt_label_cols = ["accuracy", "transparency", "unreliable", "Domain"]
urls = urls.merge(alt_labels[alt_label_cols], how="left",
                  left_on="domain", right_on="Domain")
del alt_labels

## Add truth seeking & belief speaking scores

In [15]:
! rsync -avze ssh jlasser@medea:/data/honesty/corpora/Twitter/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_threshold_label.csv ../../data/twitter/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_threshold_label.csv --progress

receiving incremental file list

sent 20 bytes  received 139 bytes  106.00 bytes/sec
total size is 666,404,969  speedup is 4,191,226.22


In [16]:
# load the word matching counts for belief-speaking and truth-seeking
src = "../../data/twitter"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_threshold_label.csv"
cols = ["id", "author_id", "belief_count", "truth_count", "created_at"]
honesty_tweets = pd.read_csv(join(src, fname), usecols=cols, parse_dates=["created_at"])
# strip the quote characters that surround the IDs in the file
for id_col in ("author_id", "id"):
    honesty_tweets[id_col] = honesty_tweets[id_col]\
        .apply(lambda raw: raw.replace('"', ''))

In [17]:
# belief-speaking and truth-seeking labels for each tweet are assigned based on
# the majority of words matching to one of the two components. If there is a
# tie, the tweet is assigned to both components. This results in
# 190650 unambiguous belief tweets
# 240302 unambiguous truth tweets
# 30613 ties with count > 0 including 607 ties with count > 1 and 13 ties with count > 2
belief_hits = honesty_tweets["belief_count"]
truth_hits = honesty_tweets["truth_count"]
# ties: the same, non-zero number of matches for both components
has_tie = (truth_hits == belief_hits) & (truth_hits > 0)

honesty_tweets["belief"] = 0
honesty_tweets["truth"] = 0
honesty_tweets["neutral"] = 0

# unambiguous majority votes and ties
honesty_tweets.loc[(belief_hits > truth_hits) | has_tie, "belief"] = 1
honesty_tweets.loc[(truth_hits > belief_hits) | has_tie, "truth"] = 1

# neutral: no word matches either component
honesty_tweets.loc[(truth_hits == 0) & (belief_hits == 0), "neutral"] = 1

del belief_hits, truth_hits, has_tie

In [18]:
# attach the honesty labels to the URL table. The right join keeps every row
# of "urls"; tweets without labels get missing values
honesty_label_cols = ["id", "belief", "truth", "neutral"]
urls = honesty_tweets[honesty_label_cols].merge(urls, how="right", on="id")
del honesty_tweets

## Add party affiliation

In [19]:
# author IDs have become converted to floats / integers and lost the last 4
# digits. We have a list of the correct IDs stored as strings and use it to
# match the corrupted IDs back to the correct ones based on the first 13 digits
# of the IDs

# The correct IDs are read as text and converted with exact integer
# arithmetic. np.loadtxt defaults to float64, which cannot represent integers
# above 2**53 (16 digits) exactly, so reading the IDs as numbers would corrupt
# the last digits of long IDs again. Decimal accepts both plain ("123") and
# scientific ("1.23e+2") notation and fails loudly on anything else
raw_ids = np.loadtxt(join(src, "correct_author_ids.txt"), dtype=str, ndmin=1)
exact_ids = sorted(int(Decimal(raw_id)) for raw_id in raw_ids)
ids = pd.DataFrame({"id": [str(exact_id) for exact_id in exact_ids]})

# lookup table from the first 13 digits to the full ID, built from all IDs
# with at least 14 digits. IDs are in ascending numerical order, so if
# several IDs share a prefix the largest one is kept
partial_ids = ids[ids["id"].str.len() >= 14].copy()
partial_ids["id_part"] = partial_ids["id"].str[0:13]
partial_ids = dict(zip(partial_ids["id_part"], partial_ids["id"]))

def match_id(old_id):
    '''Maps a possibly corrupted ID string back to its correct ID.

    IDs with more than 16 characters are looked up in "partial_ids" by their
    first 13 characters. Shorter IDs and IDs without an entry in the lookup
    table are returned unchanged.
    '''
    if len(old_id) <= 16:
        return old_id
    return partial_ids.get(old_id[0:13], old_id)

In [20]:
# load party affiliation. The author IDs are wrapped in " to ensure they are
# stored as strings and not numbers; the quotes are stripped and the IDs are
# then mapped to the correct IDs
party_affiliation = pd.read_csv(join(src, "party_affiliations_complete.csv"))
party_affiliation["author_id"] = party_affiliation["author_id"]\
    .apply(lambda raw: match_id(raw.replace('"', '')))

# the author IDs of the URL table are mapped the same way, so that both
# tables use the same IDs
urls["author_id"] = urls["author_id"].apply(match_id)

# merge fishy link information and information about party affiliation
urls = urls.merge(party_affiliation, how="left", on="author_id")
del party_affiliation

# Create a tweet data frame

In [21]:
# the current "url" data frame contains one row per URL, i.e. the same
# tweet can be present more than once. To calculate the share of tweets with
# unreliable information, we first calculate the mean NewsGuard score (and
# mean accuracy and transparency) per tweet by averaging over all scores
# of URLs that are present in a given tweet and then assigning "fishy" and
# "unreliable" labels on the tweet level

# columns that are defined on the tweet level
tweet_cols = ["id", "belief", "truth", "neutral", "author_id",
              "created_at", "retweeted", "quoted", "reply", "has_url",
              "retweet_count", "reply_count", "like_count", "quote_count",
              "handle", "name", "party"]
# one row per tweet: keep the first row of every tweet ID
tweets = urls.drop_duplicates(subset=["id"])[tweet_cols].copy()

## Calculate average NewsGuard score and misinfo components

In [22]:
NewsGuard_categories = [f"C_{i}" for i in range(9)]
# per tweet: mean over the values of all URLs contained in the tweet
averaged_cols = ["Score", "transparency", "accuracy"] + NewsGuard_categories
average_scores = urls[["id"] + averaged_cols]\
    .groupby("id")\
    .mean()

# tweet-level "fishy" labels: 1 if the mean score is below the cutoff, 0 if
# it is at or above the cutoff and missing if the tweet has no score
for cutoff in [20, 40, 60]:
    tweet_score = average_scores["Score"]
    average_scores[f"fishy_{cutoff}"] = (tweet_score < cutoff)\
        .astype(float)\
        .where(tweet_score.notna())

# Ties are rounded UP: 0.5 becomes 1. This behaviour is intended: if a Tweet
# contains two URLs, one which conforms to a category (0), and one which
# doesn't (1), we want to label the full tweet as not conforming to the
# category (1).
# Python's built-in round() cannot be used for this since it rounds ties to
# the nearest even number, i.e. round(0.5) == 0.
def nan_round(entry):
    '''Rounds to the nearest integer with ties rounded up, keeping NaN.

    Returns np.nan if entry is NaN, otherwise an int.
    '''
    # NaN is the only value that is not equal to itself
    if entry != entry:
        return np.nan
    return int(np.floor(entry + 0.5))
# replace the per-tweet mean of every NewsGuard criterion by the value
# returned by nan_round
for cat in NewsGuard_categories:
    average_scores[cat] = average_scores[cat].apply(nan_round)

## Calculate average accuracy & transparency score and unreliable domains

In [23]:
# original definition: sources with transparency = 1 are unreliable
# since transparency can have non-integer values after averaging, we decide
# to label tweets with an average domain transparency value of links of
# <= 1.5 as "unreliable", since that means that the majority of domains
# linked to in the tweet are unreliable. If one domain with transparency 1
# and one domain with transparency 2 are linked, the tweet is unreliable
low_transparency = average_scores["transparency"] <= 1.5
# original definition: sources with accuracy = 1 or 2 are unreliable
# since accuracy can have non-integer values after averaging, we decide to
# label tweets with an average domain accuracy value of links of <= 2.5 as
# "unreliable", since that means that the majority of domains linked to in
# the tweet are unreliable. If one domain with accuracy 2 and one domain
# with accuracy 3 are linked, the tweet is unreliable.
low_accuracy = average_scores["accuracy"] <= 2.5

# A tweet is unreliable (1) if EITHER criterion marks it as unreliable and
# reliable (0) if it has at least one of the two values but neither marks it
# as unreliable. Tweets without transparency and accuracy values stay missing.
# Assigning the accuracy-based label after the transparency-based label would
# overwrite the latter, so that a tweet with low transparency but sufficient
# accuracy would wrongly end up as reliable.
has_rating = average_scores[["transparency", "accuracy"]].notna().any(axis=1)
average_scores["unreliable"] = np.nan
average_scores.loc[has_rating, "unreliable"] = 0
average_scores.loc[low_transparency | low_accuracy, "unreliable"] = 1
del low_transparency, low_accuracy, has_rating

# add the averaged scores and labels to the tweet table ("average_scores" is
# indexed by the tweet ID)
tweets = pd.merge(tweets, average_scores, how="left", left_on="id", right_on="id")
del average_scores

# Create a user data frame

In [24]:
# one row per account with the number of its tweets. Accounts with a missing
# handle, name or party do not form a group and are therefore not included
users = tweets\
    .groupby(["author_id", "handle", "name", "party"])["id"]\
    .count()\
    .reset_index(name="N_tweets")

## Add account stats

In [25]:
src = "../../data/twitter/US_politician_twitter_accounts/clean"
fname = "congress-member-unique-twitter-accounts_114-117.csv"
cols = ["followers_count", "following_count", "tweet_count", "created_at",
        "id"]
account_stats = pd.read_csv(join(src, fname), usecols=cols,
                            parse_dates=["created_at"])
# if there is more than one entry for the same account, keep the one with
# the latest "created_at" value
account_stats = account_stats.sort_values("created_at", ascending=False)
account_stats = account_stats.drop_duplicates(subset="id")
account_stats = account_stats.rename(columns={"id":"author_id"})
# use the same (corrected) string IDs as the user table
account_stats["author_id"] = account_stats["author_id"]\
    .astype(str)\
    .apply(match_id)

users = users.merge(account_stats, how="left", on="author_id")
del account_stats

## Add Congress information

In [26]:
src = "../../data/twitter/US_politician_twitter_accounts/clean"
fname = "congress-member-twitter-handles_114-117.csv"
# keep one row per handle: the one from the highest congress number
congress_twitter_handles = pd.read_csv(join(src, fname))
congress_twitter_handles = congress_twitter_handles.sort_values(
    by="congress", ascending=False)
congress_twitter_handles = congress_twitter_handles.drop_duplicates(
    subset="handle")
congress_twitter_handles = congress_twitter_handles.reset_index(drop=True)

users = users.merge(congress_twitter_handles, how="left", on="handle")
del congress_twitter_handles

## Add share of untrustworthy domains (NewsGuard)

In [27]:
cols = ["author_id", "fishy_60", "fishy_40", "fishy_20"]
category_cols = list(nutrition_categories.values())
# per account: sum and number of non-missing labels over all tweets that are
# not retweets
is_original = tweets["retweeted"] == False
fishy_user_count = tweets.loc[is_original, cols + category_cols]\
    .groupby("author_id")\
    .agg(["sum", "count"])

# share = labelled tweets with value 1 / all labelled tweets
for cutoff in (60, 40, 20):
    label_stats = fishy_user_count[f"fishy_{cutoff}"]
    fishy_user_count[f"fishy_share_{cutoff}"] = \
        label_stats["sum"] / label_stats["count"]

for col in category_cols:
    label_stats = fishy_user_count[col]
    fishy_user_count[f"{col}_share"] = \
        label_stats["sum"] / label_stats["count"]

# flatten the hierarchical indices
fishy_user_count = fishy_user_count.reset_index()
fishy_user_count.columns = [
    '_'.join(levels).strip("_") for levels in fishy_user_count.columns.values
]

fishy_user_count.head(2)

Unnamed: 0,author_id,fishy_60_sum,fishy_60_count,fishy_40_sum,fishy_40_count,fishy_20_sum,fishy_20_count,C_0_sum,C_0_count,C_1_sum,...,fishy_share_20,C_0_share,C_1_share,C_2_share,C_3_share,C_4_share,C_5_share,C_6_share,C_7_share,C_8_share
0,1009269193,0.0,221,0.0,221,0.0,221,0.0,221,1.0,...,0.0,0.0,0.004525,0.208145,0.022727,0.0,0.330317,0.070588,0.081448,0.099548
1,1011053278304592000,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,,,,,,,,,,


In [28]:
# add the shares calculated above to the user table
cols = [f"fishy_share_{cutoff}" for cutoff in (60, 40, 20)] \
    + [f"C_{i}_share" for i in range(9)] \
    + ["author_id"]
users = users.merge(fishy_user_count[cols], how="left", on="author_id")

## Add average NewsGuard score

In [29]:
# mean NewsGuard score per account over all tweets that are not retweets
is_original = tweets["retweeted"] == False
average_NG_scores = tweets.loc[is_original, ["author_id", "Score"]]\
    .groupby("author_id")\
    .mean()\
    .rename(columns={"Score":"NG_score_mean"})\
    .reset_index()
users = users.merge(average_NG_scores, how="left", on="author_id")

## Add average accuracy & transparency score

In [32]:
# mean accuracy and transparency per account over all tweets that are not
# retweets
is_original = tweets["retweeted"] == False
average_accuracy_transparency = tweets\
    .loc[is_original, ["author_id", "accuracy", "transparency"]]\
    .groupby("author_id")\
    .mean()\
    .rename(columns={"accuracy":"accuracy_mean",
                     "transparency":"transparency_mean"})\
    .reset_index()
users = users.merge(average_accuracy_transparency, how="left", on="author_id")

## Add share of untrustworthy domains (independent list)

In [33]:
# per account: sum and number of non-missing "unreliable" labels over all
# tweets that are not retweets
is_original = tweets["retweeted"] == False
unreliable_user_count = tweets.loc[is_original, ["author_id", "unreliable"]]\
    .groupby("author_id")\
    .agg(["sum", "count"])

label_stats = unreliable_user_count["unreliable"]
unreliable_user_count["unreliable_share"] = \
    label_stats["sum"] / label_stats["count"]

# flatten the hierarchical indices
unreliable_user_count = unreliable_user_count.reset_index()
unreliable_user_count.columns = [
    '_'.join(levels).strip("_")
    for levels in unreliable_user_count.columns.values
]

users = users.merge(unreliable_user_count[["author_id", "unreliable_share"]],
                    how="left", on="author_id")
del unreliable_user_count

## Add share of belief-speaking and truth-seeking

In [34]:
# tweets that are not retweets and have both honesty labels
is_original = tweets["retweeted"] == False
honesty_tweets = tweets\
    .loc[is_original, ["author_id", "belief", "truth", "created_at"]]\
    .dropna(subset=["belief", "truth"])\
    .copy()

In [35]:
# all honesty component tweets: per account the number of tweets labelled as
# belief-speaking / truth-seeking ("sum") and the number of tweets ("count")
honesty_label_count = honesty_tweets[["author_id", "belief", "truth"]]\
    .groupby("author_id")\
    .agg(["sum", "count"])

for col in ["belief", "truth"]:
    label_stats = honesty_label_count[col]
    honesty_label_count[f"{col}_share"] = \
        label_stats["sum"] / label_stats["count"]

# flatten the hierarchical indices
honesty_label_count = honesty_label_count.reset_index()
honesty_label_count.columns = [
    '_'.join(levels).strip("_")
    for levels in honesty_label_count.columns.values
]

In [36]:
# index the tweets by their creation date so that they can be selected by
# year below
honesty_tweets = honesty_tweets.set_index("created_at")

In [37]:
# only first 4 years: tweets created in 2013 or earlier
period = "_2010_to_2013"
in_period = honesty_tweets.index.year <= 2013
honesty_label_count_first = honesty_tweets[in_period]\
    .groupby("author_id")\
    .agg(["sum", "count"])

for col in ["belief", "truth"]:
    label_stats = honesty_label_count_first[col]
    honesty_label_count_first[f"{col}_share{period}"] = \
        label_stats["sum"] / label_stats["count"]

# flatten the hierarchical indices
honesty_label_count_first = honesty_label_count_first.reset_index()
honesty_label_count_first.columns = [
    '_'.join(levels).strip("_")
    for levels in honesty_label_count_first.columns.values
]
honesty_label_count_first["author_id"] = \
    honesty_label_count_first["author_id"].apply(match_id)
# mark the sums and counts as belonging to this period
cols = ["belief_sum", "belief_count", "truth_sum", "truth_count"]
honesty_label_count_first = honesty_label_count_first\
    .rename(columns={col:col + period for col in cols})

In [38]:
# only last 4 years: tweets created in 2019 or later
period = "_2019_to_2022"
in_period = honesty_tweets.index.year >= 2019
honesty_label_count_last = honesty_tweets\
    .loc[in_period, ["author_id", "belief", "truth"]]\
    .groupby("author_id")\
    .agg(["sum", "count"])

for col in ["belief", "truth"]:
    label_stats = honesty_label_count_last[col]
    honesty_label_count_last[f"{col}_share{period}"] = \
        label_stats["sum"] / label_stats["count"]

# flatten the hierarchical indices
honesty_label_count_last = honesty_label_count_last.reset_index()
honesty_label_count_last.columns = [
    '_'.join(levels).strip("_")
    for levels in honesty_label_count_last.columns.values
]
honesty_label_count_last["author_id"] = \
    honesty_label_count_last["author_id"].apply(match_id)
# mark the sums and counts as belonging to this period
cols = ["belief_sum", "belief_count", "truth_sum", "truth_count"]
honesty_label_count_last = honesty_label_count_last\
    .rename(columns={col:col + period for col in cols})

In [39]:
# add the belief-speaking and truth-seeking shares to the user table:
# over all years ...
share_cols = ["author_id", "belief_share", "truth_share"]
users = users.merge(honesty_label_count[share_cols],
                    how="left", on="author_id")
del honesty_label_count

# ... over the first years ...
share_cols = ["author_id", "belief_share_2010_to_2013",
              "truth_share_2010_to_2013"]
users = users.merge(honesty_label_count_first[share_cols],
                    how="left", on="author_id")
del honesty_label_count_first

# ... and over the last years
share_cols = ["author_id", "belief_share_2019_to_2022",
              "truth_share_2019_to_2022"]
users = users.merge(honesty_label_count_last[share_cols],
                    how="left", on="author_id")
del honesty_label_count_last

## Add share of neutral tweets

In [40]:
honesty_tweets = honesty_tweets.reset_index()
# neutral tweets are labelled neither as belief-speaking nor as truth-seeking
is_neutral = honesty_tweets[["belief", "truth"]].sum(axis=1) == 0
neutral_count = honesty_tweets.loc[is_neutral, ["author_id", "created_at"]]\
    .groupby("author_id")\
    .agg("count")\
    .reset_index()\
    .rename(columns={"created_at":"neutral_count"})
neutral_count["author_id"] = neutral_count["author_id"].apply(match_id)

# NOTE(review): the dropna() removes every account without a neutral tweet
# from the user table - confirm that this is intended
users = users\
    .merge(neutral_count, how="left", on="author_id")\
    .dropna(subset=["neutral_count"])
# NOTE(review): neutral tweets are counted without retweets, while N_tweets
# is the number of all tweets of the account - confirm the denominator
users["neutral_share"] = users["neutral_count"] / users["N_tweets"]
users = users.drop(columns=["neutral_count"])
del honesty_tweets
del neutral_count

## Add ideology scores

In [41]:
# load the govtrack ideology scores of both chambers for the years 2013 to
# 2020. The tables are collected in a list and concatenated once at the end
# instead of growing a data frame inside the loop, which copies all data
# read so far in every iteration
src = "../../data/twitter"
fname = "govtrack-stats-{}-{}-ideology.csv"
yearly_scores = []
for year in range(2013, 2021):
    for chamber in ["house", "senate"]:
        tmp = pd.read_csv(join(src, "ideology_scores",
                               fname.format(year, chamber)))
        tmp["year"] = year
        # names are stored like b'Name': strip the b' prefix and the
        # remaining quote characters and convert to lower case
        tmp["name"] = tmp["name"].apply(lambda x: x.replace("b'", ""))
        tmp["name"] = tmp["name"].apply(lambda x: x.replace("'", "").lower())
        yearly_scores.append(tmp)
ideology_scores = pd.concat(yearly_scores)
del yearly_scores

In [42]:
# match politician Twitter account names to govtrack politician names

# a single politician can have at maximum 8 entries for 8 different years
# 2013 to 2020; names that occur more often are not considered
counts = ideology_scores["name"].value_counts()
unique_names = counts[counts <= 8].index.tolist()

# for every remaining name keep the entry of the most recent year
is_unique_name = ideology_scores["name"].isin(unique_names)
unique_scores = ideology_scores[is_unique_name]\
    .sort_values(by="year", ascending=False)\
    .drop_duplicates(subset=["name"])\
    .set_index("name")
unique_names = list(set(unique_scores.index))

def match_score(account_name):
    '''Matches govtrack politician names to Twitter account names.

    The account name is converted to lower case and split at spaces. The
    first word that is equal to a govtrack name (the index of
    "unique_scores") determines the match.

    Returns the govtrack "id" of the matched politician, or np.nan if the
    account name is missing or none of its words is a govtrack name.
    '''
    # missing account names (NaN is not equal to itself) cannot be matched
    if account_name != account_name:
        return np.nan
    # hard matching: a word of the Twitter account name has to be identical
    # to the govtrack name. Going through the words in the order in which
    # they appear makes the result reproducible if more than one word matches
    for word in account_name.lower().split(" "):
        if word in unique_scores.index:
            return unique_scores.loc[word]["id"]
    # no match: return an explicit missing value instead of None
    return np.nan
    
# look up the govtrack ID for the account name of every user
users["ideology_score_id"] = users["name"].apply(match_score)

In [43]:
# add hand-matched missing scores
src = "../../data/twitter"
fname = "missing_govtrack_ideology_scores.csv"
missing_scores = pd.read_csv(join(src, fname))
# lookup table: Twitter handle -> govtrack ID
missing_scores = dict(zip(missing_scores["handle"],
                          missing_scores["ideology_score_id"]))

# merge on the handle since this seems to be the most consistent index between
# the two datasets
users = users.set_index("handle")
for handle, score_id in missing_scores.items():
    # only update accounts that are part of the user table: assigning to a
    # label that does not exist would silently append a new row that is
    # empty except for the ideology score ID
    if handle in users.index:
        users.loc[handle, "ideology_score_id"] = score_id
users = users.reset_index()

In [44]:
# for many accounts, there is more than one ideology score since they were
# active over many years. We calculate the mean, std and count of the ideology
# score for each govtrack ID (columns ideology_mean, ideology_std and
# ideology_count) to add this information to the user table
ideology_scores_agg = ideology_scores\
    .groupby("id")["ideology"]\
    .agg(["mean", "std", "count"])\
    .add_prefix("ideology_")\
    .reset_index()

In [45]:
# add the aggregated ideology scores to the user table via the matched
# govtrack ID; users without a match get missing values
users = users.merge(
    ideology_scores_agg,
    how="left",
    left_on="ideology_score_id",
    right_on="id",
)
del ideology_scores, ideology_scores_agg

## Add Politifact scores

In [46]:
# load the Politifact scores; the account column is renamed to "handle" so
# that it can be used as the merge key
src = "../../data"
fname = "misinfo_score_politifact.csv"
pf_scores = pd.read_csv(join(src, fname),
                        usecols=["pf_score", "elite_account"])
pf_scores = pf_scores.rename(columns={"elite_account":"handle"})

users = users.merge(pf_scores, how="left", on="handle")
del pf_scores

# Data exports

In [47]:
# directory all exports below are written to
dst = "../../data/twitter"

In [184]:
# URL table: one row per tweet and URL
urls.to_csv(join(dst, "US_URLs_2010-11-06_to_2022-03-16.csv.gzip"),
               index=False, compression="gzip")

In [48]:
# user table: one row per account
users.to_csv(join(dst, "US_politician_twitter_account_stats_2010-11-06_to_2022-03-16.csv"),
               index=False)

In [186]:
# tweets whose mean NewsGuard score is below the cutoff of 60
tweets[tweets["fishy_60"] == 1].to_csv(join(dst, "US_tweets_with_dodgy_links.csv"),
                                            index=False)

In [187]:
# tweets that contain at least one URL
tweets[tweets["has_url"] == True].to_csv(join(dst, "US_tweets_with_urls.csv.gzip"),
                                            index=False, compression="gzip")

In [188]:
# tweet table: one row per tweet
tweets.to_csv(join(dst, "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"),
                        index=False, compression="gzip")