In [7]:
import configparser
import csv
import gc
import json
import os.path
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd

import requests
import tweepy
from dateutil.parser import parse

try:  # for notebooks
    get_ipython
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
except:  # for commandline
    from tqdm import tqdm
tqdm.pandas()

Config = configparser.ConfigParser()
Config.read('../config.cnf')

consumer_key = Config.get('twitter_keys', 'consumer_key')
consumer_secret = Config.get('twitter_keys', 'consumer_secret')
access_token = Config.get('twitter_keys', 'access_token')
access_token_secret = Config.get('twitter_keys', 'access_token_secret')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# set up access to the Twitter API
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [3]:
data_dir = Path("../data/")
queries_dir = data_dir / "queries/"
output = data_dir / "refetched/"

In [4]:
with open(str(queries_dir / 'queryInfo.txt'), 'r') as q:
    q.readline()
    for f in q:
        query = f.split('\t')[0]
        relevant = query.split(' ')[0]
        if 'chicago' in relevant:
            break

row_counts = {}
with open(str(queries_dir / 'row_counts.txt')) as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        row_counts[row[0]] = int(row[1])

In [54]:
df = pd.read_csv("../data/refetched/wired AND science.csv", dtype={'tweet_id':str,'retweet_id':str}, parse_dates=['posted_on'])
df.describe()

Unnamed: 0,tweet_id,posted_on,tweet,truncated,refetched,error,retweet_id,retweet_truncated
count,38282.0,38282,38282,38282,38282,303,15474.0,38282
unique,38281.0,35391,38282,2,2,4,3004.0,2
top,7.948450047872369e+17,2016-12-16 15:12:03,"{""retweet_count"": 0, ""in_reply_to_user_id_str""...",False,False,"[{'message': 'No status found with that ID.', ...",7.981638823847566e+17,False
freq,2.0,40,1,37384,37687,263,426.0,36927
first,,2016-11-01 00:00:19,,,,,,
last,,2017-01-18 20:03:10,,,,,,


In [55]:
rts = df[df.retweet_id.notnull()].retweet_id
unique_rts = set(rts)
print(100*len(set(df.tweet_id).intersection(unique_rts))/len(unique_rts))

91.61118508655126


In [103]:
def get_tweet_urls(t):
    ''' 
    Given a Tweet JSON, pull the URLs found inside it
    '''
    try: 
        return get_urls(t['entities']['urls'])
    except:
        return []
    
def get_retweet_urls(t):
    '''
    Given a Tweet JSON, pull the URLs of the Tweet this tweet retweeted
    '''
    try: 
        return get_urls(t['retweeted_status']['entities']['urls'])
    except:
        return []

def get_urls(urls):
    ''' 
    Generic function to extract the URLs from the urls sub-object
    '''
    try: 
        urls = [v for (k,v) in urls[0].items() if k in ('url', 'expanded_url')]
        return list(set(urls))
    except:
        return []
    
tweets['tweet_urls'] = tweets.tweet.map(get_tweet_urls)
tweets['retweet_urls'] = tweets.tweet.map(get_retweet_urls)
# del tweets['tweet']


In [94]:
def match_urls(urls, to_match):
    '''
    urls: all the URLs found in the tweet or retweet
    to_match: a substring to figure out what URLs are actually relevant
    '''
    return [url for url in urls if to_match in url]

# Note: passing in "globe" as a way of identifying globe and mail URLs
tweets['relevant_urls'] = tweets.apply(lambda row: match_urls(row['tweet_urls']+row['retweet_urls'], relevant), axis=1)

ValueError: could not broadcast input array from shape (2) into shape (4)

In [32]:
def clean_url(url):
    '''
    Strip out trailing slashes, URL query variables, anchors, etc. 
    '''
    try:
        up = urlparse(url)
        domain = '.'.join(up.netloc.split('.')[-2:]).strip()
        path = up.path.strip('/').strip()
        return '%s/%s' % (domain, path)
    except:
        raise

# clean all of the URLs
tweets['relevant_urls'] = tweets.relevant_urls.map(lambda urls: list(set([clean_url(url) for url in urls])))

# Pick the first
tweets['clean_url'] = tweets.relevant_urls.map(lambda x: x[0] if len(x) > 0 else '')

In [33]:
gc.collect()

7

In [34]:
no_relevant = tweets[tweets.relevant_urls.map(len) == 0]
shortened_urls = set(no_relevant.tweet_urls.sum() + no_relevant.retweet_urls.sum())
shortened_urls = [url for url in shortened_urls if 'twitter.com' not in url]

print("Found %s non-relevant URLs" % len(shortened_urls))

7319


In [35]:
if os.path.exists('shortened_urls.txt'):
    writemode = 'w'
else:
    writemode = 'a'

with open('shortened_urls.txt', writemode) as f:
    if writemode == 'w':
        f.write('short_url\n')
    for url in shortened_urls:
        f.write('%s\n' % url)

In [36]:
# Read in the resolved URLs and create a Map (Dict)
resolved_urls = pd.read_csv('resolved_urls.csv')
resolved_urls.dropna(subset=['url'], inplace=True)
resolved_urls = resolved_urls.set_index('short_url').to_dict()['url']

In [37]:
def expand_urls(urls):
    '''
    Look up the the resolved URL from the map
    '''
    global resolved_urls
    return [resolved_urls[url] for url in urls if url in resolved_urls]

tweets['expanded_tweet_urls'] = tweets.apply(lambda row: expand_urls(row['tweet_urls']) if len(row['relevant_urls']) == 0 else [], axis=1)
tweets['expanded_retweet_urls'] = tweets.apply(lambda row: expand_urls(row['retweet_urls']) if len(row['relevant_urls']) == 0 else [], axis=1) 

In [150]:
# Combine the expanded URLs from tweets and retweets
tweets['expanded_relevant_urls'] = tweets.apply(lambda row: match_urls(row['expanded_tweet_urls']+row['expanded_retweet_urls'], 'globe'), axis=1)

In [151]:
# Clean up the expanded URLs
tweets['expanded_relevant_urls'] = tweets.expanded_relevant_urls.map(lambda urls: list(set([clean_url(url) for url in urls])))

# Pick the first one of the URLs (if there is one)
tweets['expanded_clean_url'] = tweets.expanded_relevant_urls.map(lambda x: x[0] if len(x) > 0 else '')

In [152]:
tweets.clean_url.map(lambda x: len(x) > 0).sum()

21346

In [95]:
tweets['clean_url'] = tweets.apply(lambda row: row['clean_url'] if len(row['clean_url']) > 0 else row['expanded_clean_url'], axis=1)



In [38]:
# tweets = tweets[['tweet_id', 'clean_url']]
# gc.collect()

In [None]:
# def get_linked_tweet(urls):
#     twitter_urls = match_urls(urls, 'twitter.com')
#     try:
#         tweet_id = twitter_urls[0].split('/')[-1]
#         if tweet_id.isnumeric():
#             return int(tweet_id)
#     except:
#         pass
#     return 0
    
# tweets['link_to_tweet'] = tweets.apply(lambda row: get_linked_tweet(row['tweet_urls'] + row['retweet_urls']), axis=1)

# # sometimes the tweet link is to itself. Set to 0
# tweets['link_to_tweet'] = tweets.apply(lambda row: row['link_to_tweet'] if row['link_to_tweet'] != row['tweet_id'] else 0, axis=1)

In [None]:
# Try ths again after finding more relevant URLs. This is currently not yielding anything.

# df = tweets[tweets.clean_url.map(len) > 0][['tweet_id', 'clean_url']]
# df.columns = ['tweet_id', 'link_to_tweet_clean_url']
# df.sample(10)
# df2 = tweets.merge(df, left_on='link_to_tweet', right_on='tweet_id', how='left')

In [29]:
# mentions = pd.read_excel('theglobeandmail.xlsx')
mentions = pd.read_excel('News_mentions_2017.xlsx')
mentions = mentions[['Altmetric_ID', 'Url']]
mentions['clean_url'] = mentions.Url.map(clean_url)
mentions.sample(10)
df = tweets.merge(mentions, left_on='clean_url', right_on='clean_url', how='left')
len(df.Altmetric_ID.unique())

NameError: name 'clean_url' is not defined