## import

In [62]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from time import sleep
import json, os


## instantiate webdriver

In [63]:
## Instantiate the Chrome webdriver used by every later cell.
## Check your installed Google Chrome version first and download the matching chromedriver.
driver = webdriver.Chrome()

In [64]:
## Open the "social grep" search page, which gives the old posts of a subreddit.
## e.g. https://socialgrep.com/search?query=%2Fr%2FLanguageTechnology%2Cafter%3A2010-01-01&order_by=oldest
## (original reddit url = 'https://www.reddit.com/r/xxxxxxxxx/')

subreddit = 'Windows' # choose by yourself
start_date = '2010-01-01' # choose by yourself

## query is the URL-encoded form of "/r/<subreddit>,after:<start_date>"
url = (
    'https://socialgrep.com/search'
    f'?query=%2Fr%2F{subreddit}%2Cafter%3A{start_date}'
    '&order_by=oldest'
)

repeat_time = 4   # how many times to scroll
waiting_time = 2  # seconds to wait after each scroll

driver.get(url)

## jump to the bottom of the page `repeat_time` times, pausing after each jump
for _attempt in range(repeat_time):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(waiting_time)

## example of one post

In [65]:
## function to scrape
def get_content(post, subreddit):
    """Extract vote count, title, body text and date from one post card.

    Parameters
    ----------
    post : object exposing ``.a`` and ``.select_one(css_selector)``
        One ``div.card-body`` element of the parsed search-result page.
    subreddit : str
        Name of the subreddit being scraped. It is lower-cased and used to
        recognise cards whose title is only ``/r/<subreddit>``.

    Returns
    -------
    dict or None
        ``{"vote": int, "title": str, "text": str or None, "date": str}``.
        ``None`` is returned when the card has no title link, when its date
        cannot be read, or when it has no body text and its title is just
        ``/r/<subreddit>``.
    """
    # A missing or non-numeric vote element is treated as zero votes.
    try:
        vote = int(post.select_one('span.text-info').text)
    except (AttributeError, TypeError, ValueError):
        vote = 0

    # A card without a title link is not a usable record.
    try:
        title = post.a.text
    except AttributeError:
        return None

    # Body text is optional; a missing element and an empty string both become None.
    try:
        text = post.select_one('div.post_content').get_text(separator='\n').strip()
    except AttributeError:
        text = None
    else:
        if text == '':
            text = None

    # The date is read as the second comma-separated field of the subtitle.
    # A card without that field is skipped rather than aborting the whole
    # scrape, because callers rely on every record having a usable date.
    try:
        date = post.select_one('h6.card-subtitle').text.split(',')[1].strip()
    except (AttributeError, IndexError):
        return None

    # Cards with no text whose title is only the subreddit name are dropped.
    if text is None and title == f"/r/{subreddit.lower()}":
        return None

    return {
        "vote" : vote,
        "title" : title,
        "text" : text,
        "date" : date
    }

In [66]:
## Parse the rendered page. The parser is named explicitly so the result does not
## depend on which optional parser libraries happen to be installed.
soup = BeautifulSoup(driver.page_source, 'html.parser')
posts = soup.select('div.card-body') # content is under here

get_content(posts[1], subreddit) # show one example

{'vote': 0,
 'title': "Does anyone know how to fix Vundo.JD? AVG 'fixes' it and it comes back alive again.",
 'text': None,
 'date': '2010-01-02'}

# scraping loop: page through the posts by date

In [67]:
## Load the posts saved by an earlier run, if any, so scraping can resume.
json_path = f'{subreddit}.json'
if os.path.exists(json_path):
    with open(json_path, 'r', encoding='utf8') as f:
        scraped_data = json.load(f)
else:
    ## if the file does not exist, start with a new list
    scraped_data = []

## Resume from the date of the last saved post. A file holding an empty list has
## no last post, so in that case `url` is left unchanged.
if scraped_data:
    new_date = scraped_data[-1]['date']
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'

In [68]:
## scrape and append to `scraped_data`
## RUN THIS CELL AGAIN AND AGAIN until getting the latest post

for _ in tqdm(range(200)): # set repeat time

    ## load the current search page, then scroll to the bottom and wait,
    ## using the same scroll settings as the first page load
    driver.get(url)
    for _scroll in range(repeat_time):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(waiting_time)

    ## get HTML (explicit parser: do not depend on whichever one is installed)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    posts = soup.select('div.card-body')

    ## iterate each post, keeping those get_content() turns into a record
    for post in posts:
        one_post_dict = get_content(post, subreddit)
        if one_post_dict is not None:
            scraped_data.append(one_post_dict)

    ## nothing has been scraped at all: there is no last date to continue from
    if not scraped_data:
        break

    ## save to json after every page so an interruption loses at most one page
    ## (indent=0 writes the same newline-only layout as the former indent=False)
    with open(f'{subreddit}.json', 'w', encoding='utf8') as f:
        json.dump(scraped_data, f, indent=0, ensure_ascii=False)

    ## set new date
    new_date = scraped_data[-1]['date']
    next_url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'

    ## the last date did not advance, so the next request would be identical:
    ## stop instead of re-appending the same posts for the remaining iterations
    if next_url == url:
        break
    url = next_url


100%|██████████| 200/200 [58:20<00:00, 17.50s/it]  


## to dataframe and drop duplicates

In [None]:
## Load the saved posts into a DataFrame and drop rows that are exact duplicates.
df = pd.read_json(f'{subreddit}.json').drop_duplicates()
df

In [None]:
## count the missing values in each column (posts scraped without body text have a null `text`)
df.isna().sum()

In [None]:
## rows whose text is exactly '[removed]' or '[deleted]'
placeholder_texts = ['[removed]', '[deleted]']
is_placeholder = df['text'].isin(placeholder_texts)
df[is_placeholder]