In [1]:
!pip install python-dotenv



In [2]:
import os
from dotenv import load_dotenv

# Read the .env file and expose its entries as environment variables
load_dotenv()

# Reddit credentials, looked up one by one; os.getenv gives None for any
# variable that is not set.
client_id, client_secret, user_agent, username, password = map(
    os.getenv,
    (
        "REDDIT_CLIENT_ID",
        "REDDIT_CLIENT_SECRET",
        "REDDIT_USER_AGENT",
        "REDDIT_USERNAME",
        "REDDIT_PASSWORD",
    ),
)

In [3]:
import praw
import requests
import pandas as pd
from datetime import datetime

# Build the PRAW client from the credentials read from the environment
reddit_auth = dict(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password,
)
reddit = praw.Reddit(**reddit_auth)

# Get historical posts from Pushshift
def get_pushshift_data(query, subreddits, after, before, size=150, timeout=30):
    """Fetch submissions matching ``query`` from the Pushshift search endpoint.

    One GET request is issued per subreddit and the results are flattened
    into a single list.

    Args:
        query: Search string, sent as the ``q`` parameter.
        subreddits: Iterable of subreddit names; each one is queried separately.
        after: Lower time bound, passed through unchanged as ``after``.
        before: Upper time bound, passed through unchanged as ``before``.
        size: Value sent as the ``size`` parameter of each request.
        timeout: Seconds ``requests`` waits for each response before giving
            up, so a stalled connection cannot hang the notebook forever.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``subreddit``,
        ``date``, ``url`` and ``score``. A subreddit whose request does not
        come back with HTTP 200 contributes no rows; a ``RuntimeWarning`` is
        emitted for it so the gap is visible instead of silent.

    Raises:
        Any exception raised by ``requests.get`` (connection errors,
        timeouts) or by decoding the response body as JSON propagates to
        the caller. ``KeyError`` is raised if a post lacks ``created_utc``.
    """
    import warnings  # local import keeps the cell independent of the import cell

    url = "https://api.pushshift.io/reddit/search/submission/"
    posts = []

    for subreddit in subreddits:
        params = {
            "q": query,
            "subreddit": subreddit,
            "after": after,
            "before": before,
            "size": size,
            "sort": "desc"
        }
        res = requests.get(url, params=params, timeout=timeout)
        if res.status_code != 200:
            # Report the failure and move on: the remaining subreddits are
            # still worth collecting.
            warnings.warn(
                f"Pushshift request for r/{subreddit} returned HTTP "
                f"{res.status_code}; no posts collected for it.",
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        # A 200 payload without a "data" entry is treated as "no results".
        for post in res.json().get("data") or []:
            posts.append({
                "title": post.get("title", ""),
                "text": post.get("selftext", ""),
                "subreddit": subreddit,
                # fromtimestamp without tz yields a naive datetime in the
                # machine's local timezone.
                "date": datetime.fromtimestamp(post["created_utc"]),
                "url": post.get("url", ""),
                "score": post.get("score", 0)
            })
    return posts

# Get recent posts from Reddit (PRAW)
def get_recent_posts(query, subreddits, limit=100):
    """Search each subreddit through the module-level ``reddit`` client.

    Args:
        query: Search string handed to ``Subreddit.search``.
        subreddits: Iterable of subreddit names, searched in the given order.
        limit: Passed to ``search`` as ``limit`` for every subreddit.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``subreddit``,
        ``date``, ``url`` and ``score``, grouped by subreddit in input order.
        No date filtering is applied here. ``date`` comes from
        ``datetime.fromtimestamp`` and is therefore a naive datetime in the
        machine's local timezone.
    """
    return [
        {
            "title": submission.title,
            "text": submission.selftext,
            "subreddit": name,
            "date": datetime.fromtimestamp(submission.created_utc),
            "url": submission.url,
            "score": submission.score,
        }
        for name in subreddits
        for submission in reddit.subreddit(name).search(query, limit=limit)
    ]

# Search configuration
query = "canadian housing OR house prices OR rent"
subs = [
    "CanadaHousing",
    "TorontoRealEstate",
    "Vancouver",
    "PersonalFinanceCanada",
]
after, before = "2023-01-01", "2024-12-31"

# Collect from both sources (Pushshift first, then PRAW) and write one CSV
all_data = [
    *get_pushshift_data(query, subs, after, before),
    *get_recent_posts(query, subs),
]
pd.DataFrame(all_data).to_csv("canadian_housing_reddit.csv", index=False)

print(" Done. File saved: canadian_housing_reddit.csv")


✅ Done. File saved: canadian_housing_reddit.csv


In [5]:
# get_pushshift_data / get_recent_posts return plain lists of dicts, so up to
# this point all_data is a list; rebind it to a DataFrame for inspection.
all_data = pd.DataFrame(all_data)

# Preview the first rows of the combined frame
all_data.head()

Unnamed: 0,title,text,subreddit,date,url,score
0,"“Generally speaking, Canada undertaxes housing...",[https://ricochet.media/en/3786/it-feels-hopel...,CanadaHousing,2021-09-22 19:44:12,https://www.reddit.com/r/canadahousing/comment...,794
1,Is there a Canadian city/town where you feel t...,"Everywhere I look, I don't feel comfortable bu...",CanadaHousing,2024-02-15 14:03:05,https://www.reddit.com/r/canadahousing/comment...,117
2,"This is the problem with Canadian Real-Estate,...","**Problem, in short:**\n\nThe Canadian real es...",CanadaHousing,2025-05-26 01:06:19,https://www.reddit.com/r/canadahousing/comment...,0
3,Canadian Real Estate will lead to a devastatin...,Anytime there’s a high frequency of fraud for ...,CanadaHousing,2022-04-21 20:29:33,https://www.reddit.com/r/canadahousing/comment...,298
4,Want to remove speculators from Canadian housi...,"There is a solution, apart from increasing sup...",CanadaHousing,2021-11-23 21:46:14,https://www.reddit.com/r/canadahousing/comment...,344


In [6]:
# Show the column labels of the combined frame
all_data.columns

Index(['title', 'text', 'subreddit', 'date', 'url', 'score'], dtype='object')

In [7]:
# Summarise row count, per-column non-null counts and dtypes
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   title      400 non-null    object        
 1   text       400 non-null    object        
 2   subreddit  400 non-null    object        
 3   date       400 non-null    datetime64[ns]
 4   url        400 non-null    object        
 5   score      400 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 18.9+ KB
