# Get ULRs tweeted by US politicians

In [1]:
import pandas as pd
import numpy as np
from os.path import join
from os import listdir

import sys
sys.path.append('../../../../utilities/twitter_functions')
from twitter_functions import extract_domain

## Extract shortened URLs

In [48]:
src = "../../data/twitter"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2021-12-14_clean.csv.gzip"

df = pd.read_csv(join(src, fname),
                 compression="gzip",
                 usecols=["id", "author_id", "created_at", "expanded_urls"])

In [49]:
df["expanded_urls"] = df["expanded_urls"].fillna("[]")
df["expanded_urls"] = df["expanded_urls"].apply(lambda x: eval(x))
df["has_url"] = df["expanded_urls"].apply(lambda x: len(x) > 0)
df = df[df["has_url"]].drop(columns=["has_url"])

In [50]:
URLs = []
for url_list in df["expanded_urls"]:
    URLs.extend(url_list)
URLs = pd.DataFrame({"url":list(set(URLs))})
URLs["domain"] = URLs["url"].apply(extract_domain)

In [51]:
# shorteners from this repo: https://github.com/boutetnico/url-shorteners
# NOTE: update list periodically
shortener_dst = "../../../../utilities/url_shorteners/url-shorteners"
url_shorteners = list(np.loadtxt(join(shortener_dst, "list.txt"), dtype=str))

# add URL shorteners based on manual inspections of all URLs that appeared >100
# times in the dataset
url_shorteners.extend([
    "fb.me", "buff.ly", "nyti.ms", "wapo.st", "youtu.be", "1.usa.gov", "fxn.ws",
    "on.fb.me", "politi.co", "trib.al", "washex.am", "hill.cm", "cnb.cx",
    "hubs.ly", "cs.pn","n.pr", "conta.cc", "mi.tt", "usat.ly", "abcn.ws",
    "reut.rs", "cbsn.ws", "huff.to", "instagr.am", "bloom.bg", "fw.to", 
    "ift.tt", "strib.mn", "lat.ms", "afs.mn", "dpo.st", "mailchi.mp",
    "dailysign.al", "tmblr.co", "rub.io", "yhoo.it", "omny.fm", "chrl.ie",
    "tulsi.to", "apne.ws", "hrc.io", "ed.gr", "ti.me", "herit.ag", "indy.st",
    "ofa.bo", "trib.in", "azc.cc", "bsun.md", "wjcf.co", "bityl.co", "go.shr.lc"
])

In [52]:
shortened_urls = URLs[URLs["domain"].isin(url_shorteners)]
unshortened_urls = URLs[~URLs["domain"].isin(url_shorteners)]
shortened_urls["url"].to_csv("url_list.csv.gzip", compression="gzip", index=False)

In [53]:
shortened_urls["url"][1204 * 100:].to_csv("url_list2.csv.gzip", compression="gzip", index=False)

## Load unraveled URLs

Unraveling is done with the `unrael_url` utility on the server.

In [63]:
! rsync -avze ssh jlasser@medea:/home/jlasser/Honesty-project/data/twitter/unraveled_urls/ ../../data/twitter/unraveled_urls/

receiving incremental file list

sent 20 bytes  received 31,364 bytes  20,922.67 bytes/sec
total size is 6,362,451  speedup is 202.73


In [None]:
! rsync -avze ssh jlasser@medea:/home/jlasser/Honesty-project/data/twitter/unraveled_urls2/ ../../data/twitter/unraveled_urls2/

In [85]:
src = '../../data/twitter/unraveled_urls'

files = listdir(src)
unraveled_urls = pd.DataFrame()
for f in files:
    tmp = pd.read_csv(join(src, f), compression="gzip")
    unraveled_urls = pd.concat([unraveled_urls, tmp])

In [86]:
# note: unravelling crashed once because of a malformatted string and had to
# be started again, to unravel the remaining urls in a separate folder
src = '../../data/twitter/unraveled_urls2'

files = listdir(src)
for f in files:
    tmp = pd.read_csv(join(src, f), compression="gzip")
    unraveled_urls = pd.concat([unraveled_urls, tmp])
unraveled_urls = unraveled_urls.reset_index(drop=True)

## Add hosts from timeouts

In [87]:
timeouts = len(unraveled_urls) - len(unraveled_urls["status_code"].dropna())
print("{} timeouts ({:1.2f}%)".format(\
        timeouts,
        (timeouts / len(unraveled_urls["status_code"].dropna()) * 100)))

15114 timeouts (10.26%)


In [88]:
def extract_host(unraveled_url):
    if unraveled_url == unraveled_url and unraveled_url.startswith("Cannot"):
        host = unraveled_url.split(" ")[4].split(":")[0]
        return host
    else:
        return unraveled_url

In [89]:
unraveled_urls["unraveled_url"] = unraveled_urls["unraveled_url"].apply(extract_host)

In [90]:
dst = '../../data/twitter'
unraveled_urls.to_csv(join(dst, "unraveled_urls.csv.xz"), index=False,
                      compression="xz")