In [1]:
import datetime
import json
import os
import time
from glob import glob

import pandas as pd
from twarc import Twarc2, expansions

In [2]:
# Twitter API v2 client. The bearer token is read from the TWITTER_BEARER_TOKEN
# environment variable so the secret never has to be pasted into (and saved
# with) the notebook; it falls back to an empty string when the variable is unset.
client = Twarc2(bearer_token=os.environ.get("TWITTER_BEARER_TOKEN", ""))

In [3]:
# Twitter handles of the 16 media accounts used in this study
media = [
    "CNN", "nytimes", "FoxNews", "WSJ",
    "washingtonpost", "TIME", "ABC", "HuffPost",
    "NBCNews", "NewYorker", "NPR", "CBSNews",
    "business", "USATODAY", "MSNBC", "Newsweek",
]

In [4]:
# Load the company table (one row per company, including its Twitter handle)
corporation = pd.read_csv("fortune500.csv")

In [5]:
# Preview the first rows of the company table to check that it loaded as expected
corporation.head()

Unnamed: 0,company,location,industry,state,city,twitter,followers_m
0,Walmart,"Bentonville, AR",General Merchandisers,AR,Bentonville,Walmart,1.2
1,Exxon Mobil,"Irving, TX",Petroleum Refining,TX,Irving,exxonmobil,0.3
2,Chevron,"San Ramon, CA",Petroleum Refining,CA,San Ramon,Chevron,0.4
3,Apple,"Cupertino, CA","Computers, Office Equipment",CA,Cupertino,AppleSupport,1.4
4,General Motors,"Detroit, MI",Motor Vehicles and Parts,MI,Detroit,GM,0.8


### Retrieve media tweets

In [6]:
# Company names, in the same order as the rows of the company table
company = corporation["company"].tolist()

In [7]:
# Search queries for the media tweets; populated by the next cell
query = list()

In [8]:
# create queries to get media tweets: one query per (company, media account) pair,
# ordered company by company. The list is built in a single expression so that
# re-running this cell does not append a second copy of every query.
# NOTE(review): a multi-word company name is inserted unquoted; confirm whether the
# search is meant to match the exact phrase, which would require quoting the name.
query = [
    f"{c} from:{m}"  # only retrieve tweets that include the company names from the media's Twitter account
    for c in company
    for m in media
]

In [9]:
# Number of queries: one per (company, media account) pair
len(query)

4720

In [10]:
# Split the query list into consecutive batches of 10 (the last batch may be shorter)
# so the download can be processed and saved batch by batch
chuncks = [query[start:start + 10] for start in range(0, len(query), 10)]

In [11]:
# Number of query batches
len(chuncks)

472

In [12]:
# Time period to collect Tweets from: 2021-01-01 00:00 UTC up to 2022-01-01 00:00 UTC
start_time = datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc)
end_time = datetime.datetime(2022, 1, 1, tzinfo=datetime.timezone.utc)

In [None]:
# downloading
# Run every query batch against the full-archive search and save one file per
# batch, numbered from 1. Files go into media/ because that is where the join
# cell looks for them (glob('media/*.jsonl')).
os.makedirs("media", exist_ok=True)

for i, chunck in enumerate(chuncks, start=1):
    # Collect the flattened tweets of the whole batch in a plain list and build
    # the DataFrame once: DataFrame.append was removed in pandas 2.0, and
    # growing a frame row by row is quadratic in the number of tweets.
    tweets = []

    for q in chunck:
        search_results = client.search_all(query=q, start_time=start_time, end_time=end_time, max_results=100)
        for page in search_results:
            tweets.extend(expansions.flatten(page))
        time.sleep(2)  # short pause between consecutive queries

    # Written once per batch, after all of its queries have completed.
    # NOTE(review): to_json is called with its defaults, so the file is
    # presumably a single JSON document despite the .jsonl extension; the join
    # cell reads it back with pd.read_json defaults, so the pair is consistent.
    media_posts = pd.DataFrame(tweets)
    media_posts.to_json(f'media/{i}_datafile.jsonl')


In [None]:
# join all media tweets together
# Files are read in sorted name order so the row order is the same on every run.
files = sorted(glob('media/*.jsonl'))

# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and appending frame by frame re-copies all earlier rows each time.
frames = []
for filename in files:
    df = pd.read_json(filename)
    if not df.empty:  # batches that returned no tweets contribute no rows
        frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# is nothing to join.
media_full = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Number of rows in the combined media table
len(media_full)

In [None]:
# Preview the first rows of the combined media table
media_full.head()

In [None]:
# Preview the last rows of the combined media table
media_full.tail()

In [None]:
# Save the combined media table to a single file in the working directory.
# NOTE(review): to_json is called with its defaults, so despite the .jsonl
# extension this is presumably one JSON document rather than line-delimited
# JSON; confirm before reading it with a JSON-lines parser.
media_full.to_json("media.jsonl")

### Retrieve corporate tweets

In [13]:
# Corporate Twitter handles, in the same order as the rows of the company table
cor_twitter = corporation["twitter"].tolist()

In [14]:
# Number of corporate Twitter handles (one entry per row of the company table)
len(cor_twitter)

295

In [None]:
# create queries based on corporate Twitter account, the start time and end time are the same as the media tweets.
# Each query asks for the account's own tweets with replies excluded (-is:reply).
# Files go into corporation/ because that is where the join cell looks for them
# (glob('corporation/*.jsonl')).
os.makedirs("corporation", exist_ok=True)

for c in cor_twitter:
    # Collect the flattened tweets in a plain list and build the DataFrame once:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by row
    # is quadratic in the number of tweets.
    tweets = []
    search_results = client.search_all(query=f'from:{c} -is:reply', start_time=start_time, end_time=end_time, max_results=100)
    for page in search_results:
        tweets.extend(expansions.flatten(page))

    # save the data per corporation
    corporate_posts = pd.DataFrame(tweets)
    corporate_posts.to_json(f'corporation/{c}.jsonl')
    time.sleep(2)  # short pause between consecutive accounts

In [None]:
# join all corporate tweets together
# Files are read in sorted name order so the row order is the same on every run.
files = sorted(glob('corporation/*.jsonl'))

# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and appending frame by frame re-copies all earlier rows each time.
frames = []
for filename in files:
    df = pd.read_json(filename)
    if not df.empty:  # accounts that returned no tweets contribute no rows
        frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# is nothing to join.
cor_full = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Number of rows in the combined corporate table
len(cor_full)

In [None]:
# Preview the first rows of the combined corporate table
cor_full.head()

In [None]:
# Preview the last rows of the combined corporate table
cor_full.tail()

In [None]:
# Save the combined corporate table to a single file in the working directory.
# NOTE(review): to_json is called with its defaults, so despite the .jsonl
# extension this is presumably one JSON document rather than line-delimited
# JSON; confirm before reading it with a JSON-lines parser.
cor_full.to_json("corporation.jsonl")