In [6]:
import pandas as pd
from pathlib import Path
import csv
import operator
import json
from os import listdir
from pprint import pprint

In [7]:
# Section/term names per venue, keyed by the venue's short name.
# NOTE: a later cell rebinds `queries` to a DataFrame read from queries.csv,
# so this dict is only reachable until that cell runs.
queries = {
    'bostonglobe': ['science', 'science_extended'],
    'chicago': ['science'],
    'foxnews': ['science'],
    'theguardian': ['science'],
    'iflscience': [],
    'latimes': ['science'],
    'nytimes': ['science'],
    'sfchronicle': ['science'],
    'slate': ['bad_astronomy', 'climate_desk', 'future_tense', 'health_and_science'],
    'theglobeandmail': ['science'],
    'washingtonpost': ['animalia', 'energy-environment', 'speaking-of-science', 'to-your-health'],
    'wired': ['science'],
}

# Venue short name -> outlet display names (presumably the names used in the
# Altmetric data -- TODO confirm). One venue may have several display names,
# or none at all.
news_short_altmetric = {
    'bostonglobe': ['The Boston Globe'],
    'chicago': ['Chicago Sun-Times'],
    'foxnews': ['FOX News'],
    'theguardian': ['The Guardian'],
    'iflscience': [],
    'latimes': ['LA Times'],
    'nytimes': ['New York Times'],
    'sfchronicle': ['San Francisco Chronicle'],
    'slate': ['Slate Magazine', 'Slate France'],
    'theglobeandmail': ['The Globe and Mail'],
    'washingtonpost': ['Washington Post'],
    'wired': ['Wired.it', 'Wired.com', 'Wired.co.uk'],
}

# Reverse lookup: outlet display name -> venue short name. Venues with an
# empty name list contribute no entries.
news_altmetric_short = {
    outlet_name: venue_short
    for venue_short, outlet_names in news_short_altmetric.items()
    for outlet_name in outlet_names
}

In [8]:
# Display the reverse lookup (outlet display name -> venue short name).
news_altmetric_short

{'Chicago Sun-Times': 'chicago',
 'FOX News': 'foxnews',
 'LA Times': 'latimes',
 'New York Times': 'nytimes',
 'San Francisco Chronicle': 'sfchronicle',
 'Slate France': 'slate',
 'Slate Magazine': 'slate',
 'The Boston Globe': 'bostonglobe',
 'The Globe and Mail': 'theglobeandmail',
 'The Guardian': 'theguardian',
 'Washington Post': 'washingtonpost',
 'Wired.co.uk': 'wired',
 'Wired.com': 'wired',
 'Wired.it': 'wired'}

In [10]:
# Load the query table, indexed by its "id" column.
# NOTE: this rebinds `queries`, replacing the dict defined in the earlier cell.
queries_csv = Path("../../data") / "queries.csv"
queries = pd.read_csv(queries_csv, index_col="id")
queries.head()

Unnamed: 0_level_0,query,venue_name,venue_short,relevant_terms,found_tweets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,bostonglobe AND science_extended,The Boston Globe,bostonglobe,science_extended,2287
2,bostonglobe AND science,The Boston Globe,bostonglobe,science,29963
3,chicago AND suntimes AND science,The Chicago Suntimes,chicago,science,2003
4,foxnews AND science,FOX News,foxnews,science,431310
5,guardian AND science,The Guardian,theguardian,science,130078


In [11]:
# Collect the relevant terms of every query into one list per venue short name.
terms_grouped = queries.groupby("venue_short")['relevant_terms']
terms_grouped.apply(list)

venue_short
bostonglobe                              [science_extended, science]
chicago                                                    [science]
foxnews                                                    [science]
iflscience                                                     [nan]
latimes                                                    [science]
nytimes                                                    [science]
sfchronicle                                                [science]
slate              [bad_astronomy, climate_desk, future_tense, he...
theglobeandmail                                            [science]
theguardian                                                [science]
washingtonpost     [animalia, energy-environment, speaking-of-sci...
wired                                                      [science]
Name: relevant_terms, dtype: object

In [12]:
# List the distinct short names recorded for each venue display name.
short_names_grouped = queries.groupby("venue_name")['venue_short']
short_names_grouped.unique()

venue_name
FOX News                        [foxnews]
IFLScience                   [iflscience]
SF Chronicle                [sfchronicle]
Slate                             [slate]
The Boston Globe            [bostonglobe]
The Chicago Suntimes            [chicago]
The Globe and Mail      [theglobeandmail]
The Guardian                [theguardian]
The LA Times                    [latimes]
The NY Times                    [nytimes]
The Washington Post      [washingtonpost]
Wired                             [wired]
Name: venue_short, dtype: object

In [33]:
# Root of the data folder, relative to the notebook's working directory.
data_dir = Path("../../data/")

# Sub-folders of the data root used by the cells below.
all_tweets_dir = data_dir.joinpath("all_tweets")
selected_tweets_dir = data_dir.joinpath("refetched_tweets")
urls_dir = data_dir.joinpath("twitter_urls")
urls_cleaned_dir = data_dir.joinpath("twitter_urls_cleaned")

In [129]:
# Read the same query file from both tweet folders, indexed by tweet_id:
# `df` from all_tweets_dir, `df2` from selected_tweets_dir.
f = "chicago AND suntimes AND science.csv"
df, df2 = (
    pd.read_csv(folder / f, index_col="tweet_id")
    for folder in (all_tweets_dir, selected_tweets_dir)
)

In [109]:
# Create (or reset) the columns that the extraction cell fills in; every row
# starts out as None.
cols = [
    'entities',
    'quoted_status_id',
    'retweeted_status_id',
]
for new_col in cols:
    df2[new_col] = None

In [125]:
# Remove the retweet helper columns. errors="ignore" keeps this cell
# re-runnable: without it the second run raises KeyError because the columns
# are already gone.
df2.drop(["retweet_id", "retweet_truncated"], axis=1, inplace=True, errors="ignore")

# Decode each stored tweet payload (a JSON string in the 'tweet' column) once.
parsed_tweets = [json.loads(raw_tweet) for raw_tweet in df2['tweet']]

# Assign whole columns positionally instead of writing row by row with
# df2.loc[ix, ...]: tweet_id labels are not unique in this frame, so a
# label-based write touches every row sharing the label (last one wins) and
# is slow. Positional lists give each row the values from its own payload.
df2['entities'] = [str(parsed['entities']) for parsed in parsed_tweets]
df2['truncated'] = [str(parsed['truncated']) for parsed in parsed_tweets]

# Optional fields: None when the payload has no quoted / retweeted status.
# Writing None explicitly also clears stale values left by a previous run.
df2['quoted_status_id'] = [
    parsed.get('quoted_status_id_str') for parsed in parsed_tweets
]
df2['retweeted_status_id'] = [
    parsed['retweeted_status']['id_str'] if 'retweeted_status' in parsed else None
    for parsed in parsed_tweets
]

# Display only: the result is not assigned, so df2 keeps its raw 'tweet' column.
df2.drop("tweet", axis=1)

Unnamed: 0_level_0,posted_on,truncated,refetched,error,entities,retweeted_status_id,quoted_status_id
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
794830803037519876,2016-11-05 09:16:45,False,False,,"{'user_mentions': [], 'urls': [{'expanded_url'...",,
794830803037519876,2016-11-05 09:16:45,False,False,,"{'user_mentions': [], 'urls': [{'expanded_url'...",,
794830803037519876,2016-11-05 09:16:45,False,False,,"{'user_mentions': [], 'urls': [{'expanded_url'...",,
794846955365105664,2016-11-05 10:20:56,False,False,,"{'user_mentions': [{'name': 'Pat Kiernan', 'in...",794830803037519876,
795006492189528064,2016-11-05 20:54:52,False,False,,"{'user_mentions': [{'name': 'Pat Kiernan', 'in...",794830803037519876,
797141624199315456,2016-11-11 18:19:08,False,False,,"{'user_mentions': [], 'urls': [{'expanded_url'...",,
797142522174717953,2016-11-11 18:22:42,False,True,,"{'user_mentions': [], 'media': [{'media_url_ht...",,
798616236930187264,2016-11-15 19:58:43,True,False,"[{'code': 144, 'message': 'No status found wit...","{'user_mentions': [{'name': 'Jordan Owen', 'in...",,
800326942402580480,2016-11-20 13:16:27,False,True,,{'user_mentions': [{'name': 'ACS PMSE Division...,,
800362813701111808,2016-11-20 15:38:59,False,False,,{'user_mentions': [{'name': 'ACS PMSE Division...,800326942402580480,
