## List of tickers and names

In [14]:
import pandas as pd



# Read the ticker / company-name CSV into a DataFrame.
tickers_csv_path = "../datasets/stock_tickers_and_names.csv"
stock_tickers_and_names = pd.read_csv(tickers_csv_path)

In [27]:
# Rename the " name" column (note the leading space) to "name" so it can be
# addressed without the stray whitespace. The result is re-assigned instead of
# using inplace=True, which pandas discourages and which prevents chaining.
stock_tickers_and_names = stock_tickers_and_names.rename(columns={" name": "name"})

In [28]:
# Draw the subset of companies that will be queried on Twitter.
# - random_state pins the draw so a fresh "Restart & Run All" scrapes the same
#   companies again.
# - n is capped at the table length because DataFrame.sample raises when asked
#   for more rows than exist (without replacement).
df_sample = stock_tickers_and_names.sample(
    n=min(150, len(stock_tickers_and_names)),
    random_state=42,
)

## Scrape Tweets for Dataset
- [Documentation](https://developer.twitter.com/en/docs/twitter-api/metrics) for `public_metrics`
    - retweet_count: A count of how many times the Tweet has been Retweeted
    - reply_count: A count of how many times the Tweet has been replied to
    - like_count: A count of how many times the Tweet has been liked
    - quote_count: A count of how many times the Tweet has been Retweeted with a new comment (message). Please note: This does not include Retweets. To get the “Retweets and comments” total as displayed on the Twitter clients, simply add retweet_count and quote_count

In [29]:
# Display the column labels of the sampled frame.
df_sample.columns

Index(['ticker', 'name'], dtype='object')

In [47]:
# Replace the sampled rows' original labels with a fresh 0..n-1 index; the old
# labels are preserved in a new "index" column. Re-assigning (rather than
# inplace=True) keeps the data lineage explicit.
df_sample = df_sample.reset_index()

In [72]:
import datetime
import os

import requests


# Twitter API v2 "recent search" endpoint.
twitter_search_endpoint = 'https://api.twitter.com/2/tweets/search/recent'

# Prefer a token supplied through the TWITTER_BEARER_TOKEN environment variable
# so the secret never has to be pasted into the notebook; fall back to the
# original placeholder when the variable is not set.
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '<bearer_token>')
headers = {
    'Authorization' : f'Bearer {bearer_token}'
}
tweet_fields = 'text,author_id,created_at,public_metrics'

# Fixed column order for every batch appended to the CSV. The file is written
# without a header, so the columns must line up with the positional `names=`
# list used when the file is read back. Fields absent from a response become
# empty cells; fields not listed here are dropped.
tweet_columns = [
    'author_id',
    'created_at',
    'text',
    'id',
    'public_metrics.retweet_count',
    'public_metrics.reply_count',
    'public_metrics.like_count',
    'public_metrics.quote_count',
]

# The timestamps below carry a literal "Z" (UTC) suffix, so "now" must be taken
# in UTC; a naive local-time now() would shift the window by the UTC offset.
now = datetime.datetime.now(datetime.timezone.utc)

# YYYY-MM-DDTHH:mm:ssZ
start = (now - datetime.timedelta(days=6)).strftime("%Y-%m-%dT%H:%M:%SZ")  # 6 days ago
end = (now - datetime.timedelta(days=3)).strftime("%Y-%m-%dT%H:%M:%SZ")    # 3 days ago


# newline="" is what pandas asks for when to_csv receives an open text handle
# (avoids doubled line endings on Windows); utf-8 keeps non-ASCII tweet text
# writable regardless of the platform default encoding.
with open("../datasets/improved_scraped_tweets.csv", "a", encoding="utf-8", newline="") as f:

    # .items() yields (index label, company name) pairs, so `i` is the same
    # value the row index provides.
    for i, company_name in df_sample["name"].items():
        parameters = {
            'query' : company_name,
            'start_time': start,
            'end_time': end,
            'tweet.fields' : tweet_fields,
        }
        # A timeout stops one stalled connection from hanging the whole run.
        res = requests.get(
            twitter_search_endpoint,
            headers=headers,
            params=parameters,
            timeout=30,
        )

        # Report HTTP errors (bad token, rate limit, ...) instead of silently
        # treating them as "no tweets found"; keep going with the next company.
        if not res.ok:
            print(f"Failed {i}: {company_name} (HTTP {res.status_code})")
            continue

        payload = res.json()  # parse the body once and reuse it
        if "data" in payload:
            (
                pd.json_normalize(payload["data"])
                .reindex(columns=tweet_columns)
                .to_csv(f, header=False, index=False)
            )
        print(f"Done {i}: {company_name}")

Done 0: Nektar Therapeutics
Done 1: ON Semiconductor
Done 2: International Paper Company
Done 3: Halliburton Company
Done 4: Sempra Energy
Done 5: PayPal
Done 6: Air Products and Chemicals
Done 7: L Brands
Done 8: BorgWarner
Done 9: Wynn Resorts
Done 10: American Airlines
Done 11: Comcast
Done 12: Patterson
Done 13: Boston Scientific
Done 14: Viacom
Done 15: TD Ameritrade Holding
Done 16: Tiffany & Co.
Done 17: Constellation Brands
Done 18: Seagate
Done 19: Simon Property
Done 20: Red Hat
Done 21: ImmunoGen
Done 22: Vertex Pharmaceuticals
Done 23: Xcel Energy
Done 24: SL Green Realty
Done 25: J. C. Penney Company
Done 26: WestRock Company
Done 27: NetApp
Done 28: AGNC Investment
Done 29: Twenty-First Century Fox
Done 30: Cigna
Done 31: Johnson & Johnson
Done 32: Assurant
Done 33: Sirius XM
Done 34: Baidu
Done 35: BB&T
Done 36: Formula One
Done 37: Lumentum
Done 38: Align
Done 39: Petríë_leo Brasileiro S.A. - Petrobras
Done 40: Kansas City Southern
Done 41: C.H. Robinson Worldwide
Done 

In [73]:
# Labels assigned positionally to the header-less scrape file.
scraped_columns = [
    'author_id',
    'created_at',
    'text',
    'id',
    'public_metrics.retweet_count',
    'public_metrics.reply_count',
    'public_metrics.like_count',
    'public_metrics.quote_count',
]

# Load every tweet row accumulated in the scrape file.
df_scraped = pd.read_csv(
    "../datasets/improved_scraped_tweets.csv",
    header=None,
    names=scraped_columns,
)

In [79]:
# Print per-column dtypes and non-null counts for the scraped tweets.
df_scraped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1276 entries, 0 to 1275
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   author_id                     1276 non-null   object
 1   created_at                    1276 non-null   object
 2   text                          1276 non-null   object
 3   id                            1276 non-null   object
 4   public_metrics.retweet_count  1267 non-null   object
 5   public_metrics.reply_count    1267 non-null   object
 6   public_metrics.like_count     1276 non-null   int64 
 7   public_metrics.quote_count    1276 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 79.9+ KB


In [82]:
# Replace missing retweet counts with 0.
retweet_col = 'public_metrics.retweet_count'
df_scraped[retweet_col] = df_scraped[retweet_col].fillna(0)