## Reddit Web Scraper

In [1]:
# imports
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
import random
import time
import string


In [2]:
# Function definitions for read/write html
def read_html(path):
    """Return the raw bytes stored in the file at ``path``.

    The file is opened in binary mode, so no decoding is applied.
    """
    with open(path, mode='rb') as html_file:
        raw_bytes = html_file.read()
    return raw_bytes

def write_html(html, path):
    """Write ``html`` (bytes) to ``path``, creating parent directories as needed.

    Args:
        html: Bytes to store; the file is opened in binary mode.
        path: Destination file path. May be a bare filename, in which case
            the file is written to the current working directory.
    """
    directory = os.path.dirname(path)
    # dirname() is '' for a bare filename and os.makedirs('') raises, so only
    # create the directory when there is one. exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(html)

In [3]:
# Make website cache many posts by scrolling and save html content afterwards
url = 'https://www.reddit.com/r/AmItheAsshole/'
raw_html_path = 'data/aita-reddit-html.txt'
MAX_SCROLLS = 5  # upper bound on scroll-to-bottom iterations
SCROLL_PAUSE_TIME = random.uniform(2, 5)  # base wait (seconds) after each scroll

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Present a randomly chosen user agent string to the site
ua = UserAgent()
user_agent = ua.random
chrome_options.add_argument(f"user-agent={user_agent}")

# Initialize WebDriver
service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)

# Everything that talks to the browser runs inside try/finally so the Chrome
# process is always shut down, even if navigation or scrolling raises.
try:
    driver.get(url)
    last_height = driver.execute_script("return document.body.scrollHeight")

    num_scrolls = 0
    while num_scrolls < MAX_SCROLLS:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for more posts to load; the extra jitter varies the delay per scroll
        time.sleep(SCROLL_PAUSE_TIME + random.uniform(0.5, 1.5))

        # Stop early when the page height no longer grows (nothing new loaded)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        num_scrolls += 1

        # Random mouse movement to simulate human behavior
        action = ActionChains(driver)
        action.move_by_offset(random.randint(0, 100), random.randint(0, 100)).perform()

    # Capture the rendered page before the browser is closed
    page_source = driver.page_source
finally:
    driver.quit()

directory = os.path.dirname(raw_html_path)
os.makedirs(directory, exist_ok=True)

with open(raw_html_path, 'w', encoding='utf-8') as file:
    file.write(page_source)

In [4]:
# Parse webpage HTML and get links to each article
with open(raw_html_path, 'r', encoding='utf-8') as file:
    raw_html = file.read()

soup = BeautifulSoup(raw_html, 'html.parser')
articles = soup.find_all('article', class_='w-full m-0')

print(f"Number of articles found: {len(articles)}")

# Collect the 'content-href' attribute of the <shreddit-post> inside each article.
# Articles without that element or attribute are skipped: find() and get()
# return None in those cases, which would otherwise crash here or when the
# links are written to disk below.
links = []
for article in articles:
    shreddit_post = article.find('shreddit-post')
    href = shreddit_post.get('content-href') if shreddit_post is not None else None
    if href:
        links.append(href)

if len(links) != len(articles):
    print(f"Skipped {len(articles) - len(links)} article(s) without a post link")

# One link per line
with open('data/reddit-links.txt', 'w', encoding='utf-8') as f:
    for link in links:
        f.write(link + '\n')


Number of articles found: 50


In [5]:
# Perform new fetch on each post link
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the cell forever

for index, link in enumerate(links):
    try:
        aita_post_req = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
        # Do not store error pages (4xx/5xx) as if they were post HTML
        aita_post_req.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping post {index} ({link}): {exc}")
        continue
    # The index in the file name is the link's position in `links`
    file_path = f'data/posts-html/aita-post{index}-html.txt'
    write_html(aita_post_req.content, file_path)

In [6]:
# Get text from HTML served from the post link
posts_html_path = 'data/posts-html'
posts_path = 'data/posts'
allowed_chars = set(string.printable)  # characters kept in the saved text

os.makedirs(posts_path, exist_ok=True)

# Sorted so the processing order does not depend on the file system
items = sorted(os.listdir(posts_html_path))
for item in items:
    aita_post_soup = BeautifulSoup(read_html(os.path.join(posts_html_path, item)), 'html.parser')

    # The paragraphs are looked up two <div> levels below the
    # 'text-neutral-content' container. Each find() returns None when the
    # element is missing, so bail out instead of raising AttributeError.
    post_container = aita_post_soup.find('div', class_='text-neutral-content')
    div1 = post_container.find('div') if post_container is not None else None
    div2 = div1.find('div') if div1 is not None else None
    if div2 is None:
        print(f"Skipping {item}: post body not found")
        continue

    p_elements = div2.find_all('p')
    post_text = '\n\n'.join(p.get_text(strip=True) for p in p_elements) # Concatenate text
    # Drop every character outside string.printable
    post_text = ''.join(ch for ch in post_text if ch in allowed_chars)

    # Reuse the index embedded in the source file name
    # ('aita-post<N>-html.txt' -> 'aita-post<N>.txt') so post N keeps its
    # number; numbering by listing position would shuffle the posts.
    if item.endswith('-html.txt'):
        post_name = item[:-len('-html.txt')]
    else:
        post_name = os.path.splitext(item)[0]
    file_path = os.path.join(posts_path, f'{post_name}.txt')
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(post_text)


## Text To Speech

In [7]:
# imports
from gtts import gTTS

In [7]:
# function to create audio file from text
def textToSpeech(text, output_file):
    """Synthesize ``text`` with gTTS (lang='en', slow=False) and save it to ``output_file``."""
    gTTS(text=text, lang='en', slow=False).save(output_file)

In [10]:
# Read text from data/posts directory and save audio files in audio directory
audio_dir = 'output/audio'
os.makedirs(audio_dir, exist_ok=True)

# Pause between requests: issuing them back to back ended in
# "429 (Too Many Requests)" from the TTS API.
TTS_PAUSE_SECONDS = 2

# sorted() makes the index -> post mapping deterministic across file systems
for index, filename in enumerate(sorted(os.listdir(posts_path))):
    path = os.path.join(posts_path, filename)
    audio_path = os.path.join(audio_dir, f'post-audio{index}.mp3')

    # Resume support: keep audio finished by a previous run so a re-run after
    # a rate-limit error only requests what is still missing.
    if os.path.exists(audio_path):
        continue

    # The post files were written as UTF-8, so read them back the same way
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    if not text.strip():
        # NOTE(review): gTTS is expected to reject empty text - confirm
        print(f"Skipping {filename}: no text to speak")
        continue

    # Write to a temporary name and rename on success, so an interrupted
    # request never leaves a partial file that looks finished.
    partial_path = audio_path + '.part'
    textToSpeech(text, partial_path)
    os.replace(partial_path, audio_path)

    time.sleep(TTS_PAUSE_SECONDS)

gTTSError: 429 (Too Many Requests) from TTS API. Probable cause: Unknown

## Video Subtitle Generator

In [1]:
# imports
import whisper
import ffmpeg
import subprocess
import os
import torch

In [None]:
# Transcribe one generated audio file with Whisper and inspect its segments
audio_file_path = 'output/audio/post-audio9.mp3'
model = whisper.load_model('base')

result = model.transcribe(audio_file_path)

# First the whole segment list at once, then one segment per line
print(result["segments"])
for segment in result['segments']:
    print(segment)

# Disabled experiment kept for reference:
#options = whisper.DecodingOptions(fp16=False)
#decoded_result = whisper.decode(model, result['audio'], options)


In [3]:
# Build a black 1280x720 video that carries the narration audio
audio_file_path = 'output/audio/post-audio9.mp3'
video_file_path = 'output/video/post-video9.mp4'

# ffmpeg does not create missing output directories
os.makedirs(os.path.dirname(video_file_path), exist_ok=True)

ffmpeg_command = [
    'ffmpeg',
    '-y',  # overwrite an existing output so the cell can be re-run
    '-f', 'lavfi',
    # No 'duration' on the color source: together with -shortest the video
    # ends when the audio ends, instead of being cut off at a fixed 30 s.
    '-i', 'color=size=1280x720:rate=30:color=black',
    '-i', audio_file_path,
    '-c:v', 'libx264',
    '-tune', 'stillimage',
    '-c:a', 'aac',
    '-b:a', '192k',
    '-pix_fmt', 'yuv420p',
    '-shortest',
    video_file_path
]

# check=True raises CalledProcessError when ffmpeg exits with a non-zero status
subprocess.run(ffmpeg_command, check=True)

print("saved video")

In [3]:
video_file = 'output/video/post-video9.mp4'
audio_output = 'output/audio/extracted-audio.mp3'
subtitles_file = 'output/subtitles/subtitles.srt'  # Assume this file is generated
output_video_file = 'output/video/post-video9-subtitled.mp4'

# Ensure the output directories exist
os.makedirs(os.path.dirname(audio_output), exist_ok=True)
os.makedirs(os.path.dirname(subtitles_file), exist_ok=True)
os.makedirs(os.path.dirname(output_video_file), exist_ok=True)

# Extract audio from the video
extract_audio_command = [
    'ffmpeg',
    '-y',  # overwrite the previous extraction so re-running the cell succeeds
    '-i', video_file,
    '-q:a', '0',
    '-map', 'a',  # select only the audio stream(s)
    audio_output
]

# Run the FFmpeg command; stdout/stderr are captured as text so they can be
# reported if ffmpeg exits with a non-zero status.
try:
    # Extract audio
    subprocess.run(extract_audio_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    print(f"Audio extracted to {audio_output}")

except subprocess.CalledProcessError as e:
    print(f"An error occurred: {e}")
    print(f"Command output: {e.output}")
    print(f"Stderr: {e.stderr}")

Audio extracted to output/audio/extracted-audio.mp3


In [5]:
audio_output = 'output/audio/extracted-audio.mp3'
subtitles_file = 'output/subtitles/subtitles.srt'

# Free cached GPU memory and report usage before loading the model.
# Guarded so the cell also runs on machines without a CUDA device.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary())

model = whisper.load_model('base')

# Transcribe the audio file; word_timestamps=True requests per-word timing
result = model.transcribe(audio_output, word_timestamps=True)

# Function to convert seconds to SRT time format
def seconds_to_srt_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative number of seconds (int or float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    # Convert to whole milliseconds once and split with integer arithmetic.
    # Truncating the fractional part separately loses a millisecond to float
    # error (e.g. 1.001 would be rendered as ',000').
    total_ms = int(round(max(0.0, float(seconds)) * 1000))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millisec = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millisec:03}"

# Create the SRT content: one numbered cue per transcribed word
srt_entries = []
counter = 1

for segment in result['segments']:
    for word in segment.get('words', []):
        text = word['word'].strip()
        if not text:
            continue  # nothing to display for this cue
        # Use the word's own timing. Using the segment's start/end would give
        # every word of a segment the same time span, so they would all be
        # shown at once.
        start_time = seconds_to_srt_time(word['start'])
        end_time = seconds_to_srt_time(word['end'])
        srt_entries.append(f"{counter}\n{start_time} --> {end_time}\n{text}\n\n")
        counter += 1

srt_content = "".join(srt_entries)

# Write the SRT content to a file
os.makedirs(os.path.dirname(subtitles_file), exist_ok=True)
with open(subtitles_file, 'w', encoding='utf-8') as srt_file:
    srt_file.write(srt_content)

print(f"Subtitles have been saved as {subtitles_file}")

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 294981 KiB | 723231 KiB |  43224 MiB |  42935 MiB |
|       from large pool | 218732 KiB | 533001 KiB |  37547 MiB |  37334 MiB |
|       from small pool |  76249 KiB | 190230 KiB |   5676 MiB |   5601 MiB |
|---------------------------------------------------------------------------|
| Active memory         | 294981 KiB | 723231 KiB |  43224 MiB |  42935 MiB |
|       from large pool | 218732 KiB | 533001 KiB |  37547 MiB |  37334 MiB |
|       from small pool |  76249 KiB | 190230 KiB |   5676 MiB |   5601 MiB |
|---------------------------------------------------------------

In [2]:
video_file = 'output/video/post-video9.mp4'
output_video_file = 'output/video/post-video9-subtitled.mp4'
subtitles_file = 'output/subtitles/subtitles.srt'

# Add subtitles to the video
add_subtitles_command = [
    'ffmpeg',
    '-y',  # overwrite the previous result so re-running the cell succeeds
    '-i', video_file,
    '-vf', f"subtitles={subtitles_file}",  # render the SRT cues onto the frames
    '-c:a', 'copy',  # keep the audio stream as is; only the video is re-encoded
    output_video_file
]

# Run the FFmpeg command to add subtitles; output is captured as text so it
# can be shown on success and reported on failure.
try:
    add_subtitles_result = subprocess.run(add_subtitles_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    print(f"Subtitled video created at {output_video_file}")
    print(add_subtitles_result.stdout)
    print(add_subtitles_result.stderr)
except subprocess.CalledProcessError as e:
    print(f"An error occurred while adding subtitles: {e}")
    print(f"Stderr: {e.stderr}")

Subtitled video created at output/video/post-video9-subtitled.mp4

ffmpeg version 7.0.1-essentials_build-www.gyan.dev Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.2.0 (Rev5, Built by MSYS2 project)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-bzlib --enable-lzma --enable-zlib --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-sdl2 --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-libaom --enable-libopenjpeg --enable-libvpx --enable-mediafoundation --enable-libass --enable-libfreetype --enable-libfribidi --enable-libharfbuzz --enable-libvidstab --enable-libvmaf --enable-libzimg --enable-amf --enable-cuda-llvm --enable-cuvid --enable-dxva2 --enable-d3d11va --enable-d3d12va --enable-ffnvcodec --enable-libvpl --enable-nvdec --enable-nvenc --enable-vaapi --enable-libgme -