In [1]:
# import used for handling tabular data (dataframes, CSV, etc.)
# import a library to make HTTP requests (like visiting a website)
# import a tool to parse and extract content from HTML (web pages)
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# store the links of URLs you wish to scrape
# in this example, we are using old Reddit as they are much easier to scrape than the modern site
links = ["https://old.reddit.com/r/canada/",
        "https://old.reddit.com/r/publichealth/",
        "https://old.reddit.com/r/depression/"]

In [4]:
# Import time to add delays between requests to respect server limits (slide 21)
import time

def scrape_to_dataframe(urls):
    # Initialize an empty list to store scraped data as dictionaries
    data = []
    
    # Set a user-agent header to mimic a browser and avoid bot detection (slide 23)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    # Iterate through each URL in the provided list
    for url in urls:
        # Send a GET request to the URL with the user-agent header
        response = requests.get(url, headers=headers)
        
        # Check if the request was successful (status code 200)
        if response.status_code != 200:
            # Print an error message if the request fails
            print(f"Failed to fetch {url}: Status code {response.status_code}")
            continue
        
        # Create a BeautifulSoup object to parse the HTML content (slide 23)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all post elements with class="thing" (slide 22)
        posts = soup.find_all('div', class_='thing')
        
        # Iterate over each post to extract title and upvotes
        for post in posts:
            # Extract title from <a class="title"> within <p class="title">
            title_elem = post.find('p', class_='title')
            title = title_elem.find('a', class_='title').text.strip() if title_elem and title_elem.find('a', class_='title') else 'No Title'
            
            # Extract upvotes from <div class="score"> (or "score unvoted")
            score_elem = post.find('div', class_='score')
            upvotes = score_elem.text.strip() if score_elem and score_elem.text.strip() != '•' else '0'
            
            # Append a dictionary with url, title, and upvotes to the data list
            data.append({
                'url': url,
                'title': title,
                'upvotes': upvotes
            })
        
        # Add a 2-second delay to avoid overwhelming the server (slide 21)
        time.sleep(2)
    
    # Return the list of dictionaries containing scraped data
    return data

In [7]:
# Call the scrape_to_dataframe function with the list of URLs
data = scrape_to_dataframe(links)

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data)

# Print the DataFrame to verify the scraped data
print(df)

# Export the DataFrame to a CSV file in the data/ directory
df.to_csv("data/reddit_posts.csv", index=False)

# Print a confirmation message
print("Scraped data exported to reddit_posts.csv")

                                     url  \
0       https://old.reddit.com/r/canada/   
1       https://old.reddit.com/r/canada/   
2       https://old.reddit.com/r/canada/   
3       https://old.reddit.com/r/canada/   
4       https://old.reddit.com/r/canada/   
..                                   ...   
81  https://old.reddit.com/r/depression/   
82  https://old.reddit.com/r/depression/   
83  https://old.reddit.com/r/depression/   
84  https://old.reddit.com/r/depression/   
85  https://old.reddit.com/r/depression/   

                                                title upvotes  
0     Saturdays and Sundays are now Opinion-free days     431  
1   McIntosh sets world record in 400M freestyle a...    1190  
2                          Witness the Rise of Ultron       6  
3   Nearly 2 in 3 say Canada should not join Trump...    3169  
4   Mexico’s relationship with Canada at a 6-year ...     787  
..                                                ...     ...  
81        can’t sleep a