## Reddit Web Scraper

In [8]:
# imports
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
import random
import time


In [9]:
# Function definitions for read/write html
def read_html(path):
    """Load the file at ``path`` and return its contents as raw bytes.

    Args:
        path: Location of the file to read.

    Returns:
        The complete file contents as ``bytes`` (the file is opened in
        binary mode, so no decoding is performed).
    """
    source = open(path, mode='rb')
    try:
        payload = source.read()
    finally:
        # Always release the file handle, even if the read fails.
        source.close()
    return payload

def write_html(html, path):
    """Write raw HTML bytes to ``path``, creating parent directories as needed.

    Args:
        html: Bytes to write; the file is opened in binary mode.
        path: Destination file path. May be a bare filename with no
            directory component, in which case no directory is created.
    """
    directory = os.path.dirname(path)
    # dirname() is '' for a bare filename and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    if directory:
        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(html)

In [10]:
# Make website cache many posts by scrolling and save html content afterwards
MAX_SCROLLS = 5  # upper bound on scroll-to-bottom iterations
url = 'https://www.reddit.com/r/AmItheAsshole/'
raw_html_path = 'data/aita-reddit-html.txt'

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Pass a user-agent string picked by fake_useragent to Chrome
ua = UserAgent()
user_agent = ua.random
chrome_options.add_argument(f"user-agent={user_agent}")

# Initialize WebDriver
service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)

# try/finally guarantees the browser process is shut down even if loading
# or scrolling raises; otherwise a failed run leaks a Chrome instance.
try:
    driver.get(url)

    # Base pause is drawn once per run; extra jitter is added on every scroll.
    SCROLL_PAUSE_TIME = random.uniform(2, 5)
    last_height = driver.execute_script("return document.body.scrollHeight")

    num_scrolls = 0
    while num_scrolls < MAX_SCROLLS:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        time.sleep(SCROLL_PAUSE_TIME + random.uniform(0.5, 1.5))

        # Calculate new scroll height and compare with last scroll height;
        # an unchanged height means no further content was loaded.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        num_scrolls += 1

        # Random mouse movement to simulate human behavior
        action = ActionChains(driver)
        action.move_by_offset(random.randint(0, 100), random.randint(0, 100)).perform()

    # Capture the rendered DOM while the session is still alive.
    page_html = driver.page_source
finally:
    driver.quit()

directory = os.path.dirname(raw_html_path)
os.makedirs(directory, exist_ok=True)

with open(raw_html_path, 'w', encoding='utf-8') as file:
    file.write(page_html)

In [11]:
# Parse webpage HTML and get links to each article
with open(raw_html_path, 'r', encoding='utf-8') as file:
    raw_html = file.read()

soup = BeautifulSoup(raw_html, 'html.parser')
articles = soup.find_all('article', class_='w-full m-0')

print(f"Number of articles found: {len(articles)}")

links = []
for article in articles:
    shreddit_post = article.find('shreddit-post')
    # find() returns None when the tag is absent and get() returns None when
    # the attribute is absent; skip such articles instead of crashing here
    # or when the link is written out below.
    href = shreddit_post.get('content-href') if shreddit_post is not None else None
    if href:
        links.append(href)

skipped = len(articles) - len(links)
if skipped:
    print(f"Skipped {skipped} article(s) without a post link")

# One link per line
with open('data/reddit-links.txt', 'w', encoding='utf-8') as f:
    for link in links:
        f.write(link + '\n')


Number of articles found: 50


In [12]:
# Perform new fetch on each post link
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block indefinitely
for index, link in enumerate(links):
    try:
        aita_post_req = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Network failure or timeout on one post should not abort the whole run.
        print(f"Skipping post {index} ({link}): {exc}")
        continue
    if not aita_post_req.ok:
        # Do not save error pages (4xx/5xx): they do not contain the post
        # markup the parsing step expects.
        print(f"Skipping post {index} ({link}): HTTP {aita_post_req.status_code}")
        continue
    # The index in the file name ties the saved HTML back to its position in `links`.
    file_path = f'data/posts-html/aita-post{index}-html.txt'
    write_html(aita_post_req.content, file_path)

In [14]:
# Get text from HTML served from the post link
posts_html_path = 'data/posts-html'
posts_path = 'data/posts'
HTML_SUFFIX = '-html'  # marker in the saved HTML file names, e.g. aita-post3-html.txt

os.makedirs(posts_path, exist_ok=True)

# os.listdir() returns entries in arbitrary order, so sort for reproducible
# runs and name each output after its input file rather than after its
# position in the listing (which would not match the index of the HTML file).
items = sorted(os.listdir(posts_html_path))
for item in items:
    html_file = os.path.join(posts_html_path, item)
    if not os.path.isfile(html_file):
        # Ignore sub-directories such as notebook checkpoint folders.
        continue

    aita_post_soup = BeautifulSoup(read_html(html_file), 'html.parser')
    # The post body is expected two <div> levels below the container; each
    # find() returns None when the markup differs, so guard every step.
    post_container = aita_post_soup.find('div', class_='text-neutral-content')
    div1 = post_container.find('div') if post_container is not None else None
    div2 = div1.find('div') if div1 is not None else None
    if div2 is None:
        print(f"Skipping {item}: post body not found")
        continue

    p_elements = div2.find_all('p')
    # get_text(strip=True) strips every text fragment and joins them with no
    # separator, gluing words together around inline tags (<em>, <a>, ...).
    # Strip only the ends of each paragraph instead.
    post_text = '\n\n'.join(p.get_text().strip() for p in p_elements)  # Concatenate text

    # aita-post3-html.txt -> aita-post3.txt
    stem = os.path.splitext(item)[0]
    if stem.endswith(HTML_SUFFIX):
        stem = stem[:-len(HTML_SUFFIX)]
    file_path = os.path.join(posts_path, f'{stem}.txt')
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(post_text)


## Text To Speech