In [20]:
# Import required libraries for Reddit API data collection
import os  # For reading API credentials from environment variables

import pandas as pd  # For data manipulation and analysis
import requests  # For making HTTP requests to Reddit API

In [21]:
# Reddit API credentials are read from environment variables so that no secret
# is stored in the notebook (or in its version history / shared copies).
# Set these before starting the kernel:
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD
# A missing variable raises a KeyError naming it, so misconfiguration fails in
# this cell rather than as an authentication error further down.
# NOTE: any credentials that were previously hardcoded in this cell should be
# treated as compromised and rotated.
CLIENT_ID = os.environ['REDDIT_CLIENT_ID']  # Reddit app client ID
SECRET_KEY = os.environ['REDDIT_CLIENT_SECRET']  # Reddit app secret key

# Set up authentication for Reddit API
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_KEY)  # Basic auth with client credentials
headers = {'User-Agent': 'MyAPI/0.0.1'}  # User agent sent with every request

# Form payload for the OAuth password grant (sent to the access_token endpoint)
data = {
    'grant_type': 'password',  # Using password grant type for authentication
    'username': os.environ['REDDIT_USERNAME'],  # Reddit username
    'password': os.environ['REDDIT_PASSWORD'],  # Reddit password
}

In [22]:
def get_valid_token():
    """
    Obtain an access token from Reddit's OAuth endpoint using the password grant.

    Relies on the module-level ``auth`` (client credentials), ``headers``
    (User-Agent) and ``data`` (grant payload) defined in the configuration cell.

    Returns:
        str: Access token for Reddit API authentication

    Raises:
        RuntimeError: If the endpoint responds with a non-200 status, or with a
            200 response whose body does not contain an ``access_token``.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Request access token from Reddit OAuth endpoint
    res = requests.post(
        'https://www.reddit.com/api/v1/access_token',
        auth=auth,  # Basic authentication with client credentials
        headers=headers,  # Required user agent header
        data=data,  # Password grant credentials
        timeout=30,  # seconds; without a timeout requests may block indefinitely
    )

    if res.status_code != 200:
        raise RuntimeError(f"Failed to get access token: {res.status_code} {res.text}")

    # A 200 status alone does not guarantee a token: the body may be an error
    # payload (e.g. {"error": "invalid_grant"}) or not JSON at all. Surface
    # that as a clear error instead of a bare KeyError / decode error.
    try:
        payload = res.json()
    except ValueError:
        payload = None

    token = payload.get('access_token') if isinstance(payload, dict) else None
    if not token:
        raise RuntimeError(f"Failed to get access token: {res.status_code} {res.text}")
    return token


def parse_reddit_data(res):
    """
    Parse a Reddit listing response into a pandas DataFrame of post data.

    Args:
        res (requests.Response): Response object whose JSON body has the shape
            ``{'data': {'children': [{'data': {...post fields...}}, ...]}}``.
            Only ``res.json()`` is used.

    Returns:
        pd.DataFrame: One row per post, with columns (always present and in
        this order, even when the listing is empty):
            - subreddit: Name of the subreddit
            - title: Post title
            - author: Username of post author
            - selftext: Post content/body text
            - url: Post URL
            - created_utc: Creation timestamp (UTC)
            - score: Net score as reported by the API
            - upvote_ratio: Ratio of upvotes to total votes
            - upvotes: Number of upvotes (API field ``ups``)
            - downvotes: Number of downvotes (API field ``downs``)
            - num_comments: Number of comments on the post

    Raises:
        KeyError: If the response lacks the listing structure or a post is
            missing one of the fields above.
    """
    # Output column -> key inside each post's 'data' payload.
    # Dict order defines the column order of the resulting DataFrame.
    field_map = {
        'subreddit': 'subreddit',
        'title': 'title',
        'author': 'author',
        'selftext': 'selftext',
        'url': 'url',
        'created_utc': 'created_utc',
        'score': 'score',
        'upvote_ratio': 'upvote_ratio',
        'upvotes': 'ups',
        'downvotes': 'downs',
        'num_comments': 'num_comments',
    }

    records = [
        {column: child['data'][key] for column, key in field_map.items()}
        for child in res.json()['data']['children']
    ]

    # Passing columns explicitly keeps the schema stable for an empty listing,
    # where pandas would otherwise return a frame with no columns at all.
    return pd.DataFrame(records, columns=list(field_map))
    

def get_reddit_data(subreddit='uoft', limit=100):
    """
    Fetch the newest posts from a subreddit via the Reddit OAuth API.

    A fresh access token is requested on every call, then a single request is
    made to the subreddit's ``/new`` listing.

    Args:
        subreddit (str, optional): Name of the subreddit to fetch posts from.
                                 Defaults to 'uoft'.
        limit (int, optional): Maximum number of posts to fetch (1-100).
                              Passed to the API unchanged; Reddit's listing
                              endpoints document 100 as the maximum per request.
                              Defaults to 100.

    Returns:
        pd.DataFrame: DataFrame containing Reddit post data, as produced by
        ``parse_reddit_data``.

    Raises:
        RuntimeError: If the token request or the listing request does not
            succeed with HTTP 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    access_token = get_valid_token()

    # Build per-call headers rather than writing the token into the shared
    # module-level `headers` dict, so calling this function has no hidden
    # effect on other cells that use `headers`.
    request_headers = {**headers, 'Authorization': f'bearer {access_token}'}

    res = requests.get(
        f'https://oauth.reddit.com/r/{subreddit}/new',  # API endpoint for new posts
        headers=request_headers,
        params={
            'limit': limit,  # Limit number of posts returned
            # Opt out of Reddit's legacy HTML-escaping of <, > and & in JSON
            # bodies, so titles/selftext contain '&' rather than '&amp;'.
            'raw_json': 1,
        },
        timeout=30,  # seconds; without a timeout requests may block indefinitely
    )

    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch data: {res.status_code} {res.text}")
    return parse_reddit_data(res)

In [None]:
# Collect Reddit data from the UofT subreddit
# Fetches the latest 100 posts (the maximum get_reddit_data documents for a
# single call) and stores them in a DataFrame
df = get_reddit_data('uoft', 100)

# Display the first few rows to verify data collection
df.head()

Unnamed: 0,subreddit,title,author,selftext,url,created_utc,score,upvote_ratio,upvotes,downvotes,num_comments
0,UofT,I have received my OSSD yet UofT says I have n...,Educational-One-4576,I received an email yesterday telling me that ...,https://i.redd.it/g15ertuk0udf1.jpeg,1752931000.0,3,1.0,3,0,0
1,UofT,"Can I attend other lectures in MAT235, like it...",missmyballs,The title. I was not able to enroll into the o...,https://www.reddit.com/r/UofT/comments/1m3v2ji...,1752928000.0,2,1.0,2,0,0
2,UofT,Concerns about my chances in getting into UOFT...,PokemonPikachu01,I am student at a STEM school. My results last...,https://www.reddit.com/r/UofT/comments/1m3t9n7...,1752922000.0,0,0.5,0,0,6
3,UofT,health checkups for newly arriving internation...,Electronic-Lab-5693,did you have to go to a doctor to test for dis...,https://www.reddit.com/r/UofT/comments/1m3sw2s...,1752921000.0,2,1.0,2,0,2
4,UofT,"advice for MAT137Y1, boris khesin &amp; alejan...",Ok-Brilliant-8144,im waitlisted for mat137 (boris khesin) and i ...,https://www.reddit.com/r/UofT/comments/1m3rbb5...,1752914000.0,4,1.0,4,0,4


In [None]:
# Fetch the raw listing response for r/uoft; the unparsed response is left in `res`.
# Get fresh access token for this session
access_token = get_valid_token()
# Note: this writes the bearer token into the shared module-level `headers`
# dict, so every later request that passes `headers` will carry it.
headers['Authorization'] = f'bearer {access_token}'

# Make request to Reddit API for new posts from the subreddit
res = requests.get(
    'https://oauth.reddit.com/r/uoft/new',  # API endpoint for new posts
    headers=headers,  # Headers with authorization
    params={'limit': 100},  # Limit number of posts returned
    timeout=30,  # seconds; without a timeout requests may block indefinitely
)