In [1]:
# VesterAI - Notebook 01b: Twitter and Reddit Scraping

"""
Objective:
Collect and store financial posts and comments related to specific stocks from:
1. Twitter (via X API or third-party dataset)
2. Reddit (via Pushshift API or PRAW)

Outcome:
- CSV files with text, author, date, source, and possibly metadata like upvotes or likes.
"""

'\nObjective:\nCollect and store financial posts and comments related to specific stocks from:\n1. Twitter (via X API or third-party dataset)\n2. Reddit (via Pushshift API or PRAW)\n\nOutcome:\n- CSV files with text, author, date, source, and possibly metadata like upvotes or likes.\n'

In [2]:
# Install required libraries (you may need to set up Twitter/Reddit credentials separately)
!pip install snscrape praw pandas --quiet
!pip install -U jupyterlab ipywidgets jupyterlab-widgets

import pandas as pd
import datetime
import os
import snscrape.modules.twitter as sntwitter
import praw

In [3]:
# Define output path
raw_data_path = "../data/raw"
os.makedirs(raw_data_path, exist_ok=True)
print(f"Data will be saved in: {raw_data_path}")

Data will be saved in: ../data/raw


In [9]:
# Install Tweepy if not already installed
!pip install tweepy --quiet

In [11]:
import tweepy
import pandas as pd
import datetime
import os

# Twitter API Bearer Token (get from https://developer.twitter.com/)
bearer_token = "AAAAAAAAAAAAAAAAAAAAAJC10AEAAAAAQ8%2FaBip%2BQI2qyhJ76P%2F5I71kvv4%3DoTmwRQrxnzqEQY4nRNsHA5w14cEo65moK9bajb14z9Uz1ljFFv"

# Authenticate
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

# Function to scrape recent tweets for a stock ticker
def scrape_twitter_v2(query="AAPL stock", max_results=100):
    tweets_data = []
    response = client.search_recent_tweets(
        query=query,
        tweet_fields=["created_at", "public_metrics", "lang"],
        max_results=min(max_results, 100)  # Twitter limits to 100 per request
    )

    for tweet in response.data:
        tweets_data.append({
            "date": tweet.created_at,
            "text": tweet.text,
            "retweets": tweet.public_metrics["retweet_count"],
            "likes": tweet.public_metrics["like_count"],
            "language": tweet.lang,
            "source": "Twitter"
        })

    return pd.DataFrame(tweets_data)

# Example: Scrape 100 tweets for AAPL
tweet_df = scrape_twitter_v2("AAPL stock", max_results=100)
tweet_file_path = os.path.join("../data/raw", "AAPL_twitter_api.csv")
tweet_df.to_csv(tweet_file_path, index=False)

print(f"Tweets saved to: {tweet_file_path}")
tweet_df.head()

Rate limit exceeded. Sleeping for 795 seconds.


Tweets saved to: ../data/raw/AAPL_twitter_api.csv


Unnamed: 0,date,text,retweets,likes,language,source
0,2025-03-25 22:54:18+00:00,Best stock traders group out there!\nupdates +...,0,0,en,Twitter
1,2025-03-25 22:53:02+00:00,"RT @finchat_io: ""Over the long term, it's hard...",34,0,en,Twitter
2,2025-03-25 22:51:18+00:00,Best stock trade Group out there! \nFree chatr...,0,0,en,Twitter
3,2025-03-25 22:50:18+00:00,Bast stock group ♥️♥️\nDiscord👌👌\n\nKFC- h...,0,0,en,Twitter
4,2025-03-25 22:47:40+00:00,Bast stock group ♥️♥️\nDiscord👌👌\n\n MAC- ht...,0,0,en,Twitter


In [5]:
# Reddit API credentials (replace with your own)
reddit = praw.Reddit(
    client_id="YYSSSxidNPugOoGJ3eAwwQ",
    client_secret="ony_INNIG9cAAkiwZ9Ry-MPzw1d0vQ",
    user_agent="vesterai_reddit_scraper"
)

# Function to scrape Reddit posts from r/stocks or r/investing
def scrape_reddit_posts(subreddit_name="stocks", query="AAPL", limit=50):
    subreddit = reddit.subreddit(subreddit_name)
    posts = []

    for submission in subreddit.search(query, limit=limit, sort="new"):
        posts.append({
            "date": datetime.datetime.fromtimestamp(submission.created_utc),
            "title": submission.title,
            "content": submission.selftext,
            "upvotes": submission.score,
            "comments": submission.num_comments,
            "source": f"Reddit ({subreddit_name})"
        })

    return pd.DataFrame(posts)

# Example: Scrape Reddit posts mentioning AAPL
reddit_df = scrape_reddit_posts("stocks", "AAPL", limit=50)
reddit_file_path = os.path.join(raw_data_path, "AAPL_reddit_posts.csv")
reddit_df.to_csv(reddit_file_path, index=False)

print(f"Reddit posts saved to: {reddit_file_path}")
reddit_df.head()

Reddit posts saved to: ../data/raw/AAPL_reddit_posts.csv


Unnamed: 0,date,title,content,upvotes,comments,source
0,2025-03-25 18:14:10,Stocks Close Higher for 3rd Straight Day; Tesl...,\nMajor indexes closed slightly higher Tuesday...,2,4,Reddit (stocks)
1,2025-03-21 05:30:31,r/Stocks Daily Discussion & Fundamentals Frida...,"This is the daily discussion, so anything stoc...",18,330,Reddit (stocks)
2,2025-03-20 20:54:15,Short sellers have made $15 billion betting ag...,Short sellers have been cleaning up to start 2...,555,32,Reddit (stocks)
3,2025-03-20 02:08:52,A Deeper Dive on Trump's Tariffs and Market Po...,Trump’s latest trade policies are hammering th...,0,10,Reddit (stocks)
4,2025-03-19 15:47:39,The Fate of the S&P's 10 Most Popular Stocks,Almost all of the 10 largest stocks in the Sta...,0,5,Reddit (stocks)


In [12]:
print("Social Media Data Collection Summary:")
print(f"Tweets: {len(tweet_df)} → saved to {tweet_file_path}")
print(f"Reddit posts: {len(reddit_df)} → saved to {reddit_file_path}")
print("\nNext: Sentiment analysis on these posts in Notebook 03.")

Social Media Data Collection Summary:
Tweets: 100 → saved to ../data/raw/AAPL_twitter_api.csv
Reddit posts: 50 → saved to ../data/raw/AAPL_reddit_posts.csv

Next: Sentiment analysis on these posts in Notebook 03.
