# Barstool bloggers on Twitter

In [1]:
import pandas as pd
import glob
import json

In [2]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

#### Read blogger list into dataframe

In [3]:
df = pd.read_csv('data/processed/barstool_bloggers.csv')

In [4]:
twitter_handles = df.twitter.to_list()

In [5]:
twitter_handles[0:1]

['stoolpresidente']

---

#### Last update: June 27

In [6]:
for handle in twitter_handles:
    !twarc2 timeline --use-search --no-context-annotations --limit 10 {handle} data/raw/barstool_tweets/{handle}.jsonl

Set --limit of 10 reached:   0%| | Processed 4 days/13 years [00:01<33:46, 100 t
Set --limit of 10 reached:   0%| | Processed 2 days/12 years [00:02<1:05:15, 100
Set --limit of 10 reached:   0%| | Processed 3 days/12 years [00:01<40:46, 100 t
Set --limit of 10 reached:  16%|▏| Processed 1 year, 10 months/11 years [00:01<0
Set --limit of 10 reached:   0%| | Processed 6 days/11 years [00:02<22:09, 100 t
Set --limit of 10 reached:   0%| | Processed 6 days/11 years [00:01<20:43, 100 t
Set --limit of 10 reached:   1%| | Processed a month/11 years [00:02<03:24, 100 
Set --limit of 10 reached:   0%| | Processed 4 days/10 years [00:02<27:09, 100 t
Set --limit of 10 reached:   5%| | Processed 7 months/11 years [00:01<00:32, 100
Set --limit of 10 reached:   0%| | Processed 5 days/9 years [00:02<25:59, 100 tw
Set --limit of 10 reached:   0%| | Processed 6 days/8 years [00:02<19:37, 100 tw
Set --limit of 10 reached:   2%| | Processed a month/10 years [00:02<02:24, 100 
Set --limit of 10 reached:  

----

#### Get the path to each member's tweet file

In [7]:
file_path = "data/raw/barstool_tweets/"
json_files = glob.glob(file_path + "*.jsonl")

In [8]:
len(json_files)

105

In [9]:
# json_files

#### Read all the json files, loop and snag values that interest us 

In [10]:
%%time

jsons = []
data_list = []

for f in json_files:
    with open(f) as file:
        for line in file:
            jsons.append(json.loads(line))
        
for j in jsons:
    for d in j['data']:
        datadict = {
            "author_id":d['author_id'],
            "conversation_id": d['conversation_id'],
            "tweet_id": d['id'],
            "tweet_text": d['text'],
            "source": d['source'],
            "created_date_uct": d['created_at'],
            "retweets": pd.json_normalize(d['public_metrics'])['retweet_count'][0],
            "likes": pd.json_normalize(d['public_metrics'])['like_count'][0],
            "quotes": pd.json_normalize(d['public_metrics'])['quote_count'][0],
        }
        data_list.append(datadict)

CPU times: user 4.27 s, sys: 176 ms, total: 4.45 s
Wall time: 4.38 s


#### Convert the list of dicts to a pandas dataframe

In [11]:
src = pd.DataFrame(data_list)

#### How many Twitter users did we capture? 

In [12]:
len(src.author_id.unique())

105

#### Drop dupes because the requests might overlap

In [13]:
src = src.drop_duplicates(subset='tweet_id')

#### How many tweets? 

In [14]:
len(src)

10379

In [15]:
src.to_csv('data/processed/barstool_blogger_tweets_1000.csv', index=False)

In [16]:
src.head()

Unnamed: 0,author_id,conversation_id,tweet_id,tweet_text,source,created_date_uct,retweets,likes,quotes
0,34703080,1478611086442000384,1478683395035181057,@maxstarks78 @CFBHOF @Bengals @NDFootball Truly a legend RIP 🪦,Twitter for iPhone,2022-01-05T11:02:48.000Z,0,4,0
1,34703080,1471794263641964544,1471794263641964544,"RT @LRiddickESPN: If you know @DeionSanders, if you have been teammates with him, or competed against him, then you know what he is doing n…",Twitter for iPhone,2021-12-17T10:47:51.000Z,318,0,0
2,34703080,1456448587076886534,1456734885452910600,@JoiDukesTV He won’t…,Twitter for iPhone,2021-11-05T21:27:16.000Z,0,2,0
3,34703080,1438552332179066881,1438552332179066881,Only in Florida would a Chicken 🍗 🐓 be bold enough to prance around Popeye’s https://t.co/uFPJVIHKrb,Instagram,2021-09-16T17:16:17.000Z,0,2,0
4,34703080,1437072435174588417,1437177352732225539,@stoolpresidente @DeionSanders AND THE FALCONS…,Twitter for iPhone,2021-09-12T22:12:37.000Z,0,18,0
