Goal: To scrape as much of the old Overwatch forums as I can.

## Imports

In [None]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from selenium import webdriver
import requests
from time import sleep
from bs4 import BeautifulSoup
import json

from sklearn.model_selection import train_test_split

### Functions for use within the scrapers

In [None]:
# Fetch a page and hand back its parsed HTML.
def get_html(url, timeout=30):
    """Download *url* and return it parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error.  Without it a stalled connection blocks the whole
            scrape indefinitely.

    Returns:
        BeautifulSoup: the response body parsed with the ``lxml`` parser.
        The HTTP status is not checked, so error pages are parsed and
        returned like any other page.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'lxml')

In [None]:
# Helpers that turn one forum listing page into (topic_url, page_count) tuples.
def topic_page_count(posts_num, posts_per_page=20):
    """Return how many pages are needed to display *posts_num* posts.

    A topic always has at least one page, so zero or negative counts give 1.
    """
    if posts_num <= 0:
        return 1
    return -(-posts_num // posts_per_page)      # ceiling division


def urls_with_numbers(forum_page_url):
    """List every topic on a forum listing page with its number of pages.

    Args:
        forum_page_url: URL of a forum listing page, e.g.
            'https://us.battle.net/forums/en/overwatch/22813879/?page=3'.

    Returns:
        list[tuple[str, int]]: one tuple per ``ForumTopic`` element, where
        tuple[0] is the absolute url of the topic and tuple[1] is the number
        of pages in the topic.

    Raises:
        KeyError: if a topic element lacks ``href`` or ``data-forum-topic``,
            or its JSON has no ``lastPosition`` entry.
    """
    forum_soup = get_html(forum_page_url)                   # Input URL get Soup
    topic_tuples = []
    for forum_topic in forum_soup.find_all(attrs={'class': "ForumTopic"}):
        topic_url = 'https://us.battle.net' + forum_topic.attrs['href']
        # NOTE(review): assumes 'lastPosition' is the 1-based position of the
        # topic's last post, i.e. its post count - confirm against the site.
        posts_num = json.loads(forum_topic.attrs['data-forum-topic'])['lastPosition']
        topic_tuples.append((topic_url, topic_page_count(posts_num)))
    return topic_tuples


In [None]:
def save_posts(list_of_dicts, saver, forum_page):
    """Dump the scraped posts to a CSV file named ``saver + str(forum_page)``.

    Args:
        list_of_dicts: One dict per post; keys outside the fixed column list
            below are dropped and missing keys become empty cells.
        saver: Path prefix of the output file.
        forum_page: Forum page number appended to the prefix.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for a path target (``None``).
    """
    column_order = [
        'text',
        'date',
        'ids_dict',
        'post_num',
        'auth_posts',
        'prof_link',
        'title',
        'forum_page',
        'statuses',
        'topic_url',
    ]
    destination = saver + str(forum_page)
    posts_frame = pd.DataFrame(list_of_dicts, columns=column_order)
    return posts_frame.to_csv(destination, index=False)

In [None]:
# This function was used to simplify the most recent version of the scraping code.
# It'll likely be replaced by the smaller function further down with hardcoded bits removed.
#
# Module-level holders for the pieces of the topic page currently being parsed.
# turn_to_dict() reads whatever these names are bound to at call time whenever
# the matching argument is left out.
words = []
dates = []
TopicPosts = []
title = []
forum_page = []
prof_link = []
auth_posts = []
topic_keys  = []


def turn_to_dict(post, words = None,
                 dates = None, TopicPosts = None,
                 title = None, forum_page = None,
                 prof_link = None, auth_posts = None, topic_keys = None):
    """Collect everything known about post number *post* into one dict.

    Args:
        post: Index of the post within the per-page lists below.
        words: Post body elements (``.contents`` is stored as 'text').
        dates: Timestamp elements ('data-tooltip-content' is stored as 'date').
        TopicPosts: Post container elements; supply 'ids_dict', 'post_num'
            and, when present, 'statuses'.
        title: Title of the topic, stored as-is.
        forum_page: Page in the forum, stored as-is.
        prof_link: Author link elements ('href' is stored as 'prof_link').
        auth_posts: Per-post sequences whose first item is the author's
            post count.
        topic_keys: Sequence whose ``[post][0]`` is stored as 'topic_url'.

    Any argument left as ``None`` falls back to the module-level name of the
    same spelling *as currently bound*.  The previous signature captured the
    list objects at definition time, so rebinding the globals went unseen.

    Returns:
        dict: the post's fields.  If any mandatory field cannot be looked up
        the placeholder ``{"If you're seeing this": "something went wrong."}``
        is used as the base instead; the optional fields ('statuses',
        'prof_link', 'auth_posts') are added whenever they are available.
    """
    def current(name, value):
        # An omitted argument means "use the module-level name as bound now".
        return globals()[name] if value is None else value

    words      = current('words', words)
    dates      = current('dates', dates)
    TopicPosts = current('TopicPosts', TopicPosts)
    title      = current('title', title)
    forum_page = current('forum_page', forum_page)
    prof_link  = current('prof_link', prof_link)
    auth_posts = current('auth_posts', auth_posts)
    topic_keys = current('topic_keys', topic_keys)

    # Only "this piece is missing" errors are tolerated; anything else
    # (including Ctrl-C) now propagates instead of being swallowed.
    lookup_errors = (IndexError, KeyError, AttributeError, TypeError)

    post_dict = {"If you're seeing this": "something went wrong."}
    try:
        post_dict = {                               # Creation & Statement of dicts
            'text'      : words[post].contents,                     # Text of the post
            'date'      : dates[post].attrs['data-tooltip-content'],# Date of the (unedited) post
            'ids_dict'  : TopicPosts[post].attrs['data-topic-post'],# Author info & votes of the post
            'post_num'  : TopicPosts[post].attrs['id'] ,            # Post number in the topic
            'title'     : title,                    # Title of Topic
            'forum_page': forum_page,               # Page in the forum
            # NOTE(review): topic_keys is indexed by *post* here; callers must
            # pass a sequence aligned with the posts - confirm intended usage.
            'topic_url' : topic_keys[post][0]}
    except lookup_errors:
        pass                                        # Keep the placeholder dict

    try:
        post_dict['statuses']   = TopicPosts[post].attrs['data-topic']
    except lookup_errors:
        pass

    try:
        # Store the whole href; indexing it with [0] kept only its first character.
        post_dict['prof_link']  = prof_link[post].attrs['href']
    except lookup_errors:
        pass

    try:
        post_dict['auth_posts'] = auth_posts[post][0]  # Number of posts author has made
    except lookup_errors:
        pass

    return post_dict

In [None]:
# Function to replace above function in usability
def add_attempt(dictionary, title, addition):
    """Store *addition* under ``str(title)`` in *dictionary*, best effort.

    Args:
        dictionary: Mapping to update in place.
        title: Key for the new entry; converted with ``str``.
        addition: Value to store.  It is evaluated by the caller before this
            function runs, so errors raised while computing it are NOT
            caught here.

    Returns:
        None.  Ordinary errors while storing the entry are ignored, but
        KeyboardInterrupt/SystemExit propagate so a running scrape can
        still be cancelled.
    """
    try:
        dictionary[str(title)] = addition
    except Exception:
        pass
    return

In [None]:
# For code simplification later.
# One-item list holding the "(Edited)" marker text.  The scraping cells compare
# each TopicPost-timestamp link's `.contents` against this same literal to keep
# edit markers out of `dates`.
edt = ['\n\t\t\t\t\t\t\t\xa0(Edited)\n']

In [None]:
# Function for clicking next until the final page of the topic has been scraped.
def click_next():
    """Click the topic's "NEXT" button and report whether that worked.

    Looks through every ``Button-content`` element on the page currently
    loaded in the module-level ``browser`` and clicks the *second* one whose
    text is exactly ``'NEXT'``.

    Returns:
        bool: ``True`` when the click went through, ``False`` when it could
        not be made (for example fewer than two NEXT buttons were found).
        KeyboardInterrupt/SystemExit are no longer swallowed, so Ctrl-C
        still stops the scrape.
    """
    try:
        nexts = [element
                 for element in browser.find_elements_by_class_name('Button-content')
                 if element.text == 'NEXT']
        # NOTE(review): index 1 presumably targets a second pager on the page
        # - confirm against the live markup.
        nexts[1].click()
    except Exception:
        return False
    return True

## Versions of scraping code

This was the original code that I used to scrape the forum.  I wanted to get a small and simple amount of data, and as such this only obtains the first page of every topic and will only grab topics in pages divisible by 5 or 10, depending on what I set it to previously.

In [None]:
# ORIGINAL SCRAPING SAMPLING CODE
# Selenium-driven sampler: opens each forum listing page, clicks into every
# topic title and records only the text and timestamp label of each post.
# Relies on save_posts() and click_next() defined earlier in the notebook.
#ops = webdriver.chrome.options.Options()
#ops.add_argument('--dns-prefetch-disable')
path = '../Garage/chromedriver'                                         # Path to Chromedriver
browser = webdriver.Chrome(executable_path = path)#, options=ops)         # Open browser
list_of_dicts = []                                                      # One {'text', 'date'} dict per scraped post
saver = './data/full_scrapes/Overwatch_Test_'                           # Path prefix handed to save_posts

for forum_page in range(1,1000):
    try:                                                                # Failsafe around the whole forum page
        current_list = 'https://us.battle.net/forums/en/overwatch/22813879/?page='+ str(forum_page)
        browser.get(current_list)                                       # Go to the forum listing page
        title_list = browser.find_elements_by_class_name("ForumTopic-title")
        if forum_page % 100 == 0:                                       # Checkpoint save every 100 forum pages
            save_posts(list_of_dicts, saver, forum_page)

        for i in range(len(title_list)):
            try:                                                        # Failsafe around a single topic
                title_list[i].click()                                   # Click on Title i
                more_pages = True                                       # True means there's more pages
                #print('start dict')
                while more_pages == True:                                     # Loop until click_next() returns False
                    soup = BeautifulSoup(browser.page_source, 'lxml')         # Soup of all
                    words = soup.find_all('div',{"class" :'TopicPost-bodyContent'}) # Words
                    dates = soup.find_all('a',{"class" :'TopicPost-timestamp'})     # Dates
                    for post in range(len(words)):
                        post_dict = {                                         # Creation & Statement of dicts
                            'text' : words[post].text,
                            'date' : dates[post].text
                        }
                        list_of_dicts.append(post_dict)

                    more_pages = click_next()                           # Clicks "NEXT" otherwise return False
                    # NOTE(review): the two lines below run inside the while loop, so the
                    # browser is sent back to the listing right after NEXT is clicked and the
                    # next iteration parses the listing page, not the topic - confirm intent.
                    browser.get(current_list)                                            # Back
                    title_list = browser.find_elements_by_class_name("ForumTopic-title")
            except:
                # Bare except: also catches KeyboardInterrupt, so Ctrl-C lands here too.
                print("Woops", forum_page, i)
                save_posts(list_of_dicts, saver, forum_page)            # Save what has been collected so far
                sleep(10)
                browser.refresh()
                browser.get(current_list)
    except:
        print("Woops", forum_page)
        save_posts(list_of_dicts, saver, forum_page)                    # Save what has been collected so far
        sleep(30)
        browser.refresh()
        browser.get(current_list)
save_posts(list_of_dicts, saver, forum_page)                            # Final save after the last forum page
# Keep the scraped text/date pairs in a DataFrame for inspection.
# (A stray trailing `bb` token used to make this line a syntax error.)
df = pd.DataFrame(list_of_dicts, columns = ['text','date'])

This was an expanded version of the previous code, still with selenium.  I wanted to grab as much information from the forums as possible.
I also added the ops to the webdriver because a Stack Overflow page suggested it for the Timeout error I kept getting when running this code.  It did not help.

In [None]:
# OLD CODE WITH PROBABLY TOO MANY BROWSER CALLS
# Selenium-only scraper: every field of every post is read through the live
# browser.  Relies on save_posts() and click_next() defined earlier.
ops = webdriver.chrome.options.Options()
ops.add_argument('--dns-prefetch-disable')
path = '../Garage/chromedriver'                                         # Path to Chromedriver
browser = webdriver.Chrome(executable_path = path, options=ops)         # Open browser
list_of_dicts = []
saver = './data/full_scrapes/Overwatch_Pearl_'                          # Path to save

for forum_page in range(0,1001):     # BE CAREFUL WITH THIS             # For each page in chunk of posts
    try:                                                                # Failsafe within forum page
        current_list = 'https://us.battle.net/forums/en/overwatch/22813879/?page='+ str(forum_page)
        browser.get(current_list)                                       # Go to forum page
        title_list = browser.find_elements_by_class_name("ForumTopic-title")# Define list of topic pages to click
        # `and`, not `&`: with `&` the test parsed as the chained comparison
        # `forum_page % 100 == (0 & forum_page) != 0`, which is always False,
        # so the periodic save never ran.
        if forum_page % 100 == 0 and forum_page != 0:                   # Failsafe saver per 100 pages
            save_posts(list_of_dicts, saver, forum_page)

        for i in range(len(title_list)):                                # For every title in the list
            try:                                                        # Failsafe in topic page
                title_list[i].click()                                   # Click on Title i
                more_pages = True                                       # True means there's more pages
                while more_pages:                                       # While true that there's more pages...
                    title      = browser.find_element_by_class_name('Topic-title').text
                    dates      = [e.get_attribute('data-tooltip-content') for e in browser.find_elements_by_class_name('TopicPost-timestamp') if e.text != ' (Edited)']
                    words      = [e.text for e in browser.find_elements_by_class_name('TopicPost-bodyContent')]
                    nums_dict  = [e.get_attribute('data-topic-post') for e in browser.find_elements_by_class_name('TopicPost')]
                    post_num   = [e.get_attribute('id') for e in browser.find_elements_by_class_name('TopicPost')]
                    auth_posts = [e.text for e in browser.find_elements_by_class_name('Author-posts') if e.get_attribute('data-toggle') == 'tooltip']
                    prof_link  = [e.get_attribute('href') for e in browser.find_elements_by_class_name('Author-avatar ')]
                    all_imgs   = [e.get_attribute('src') for e in browser.find_elements_by_css_selector('img')]
                    # `e and ...` skips images without a src (None) instead of raising.
                    auth_img   = [e for e in all_imgs if e and 'blznav' not in e][1:-1]

                    # NOTE(review): save_posts() writes an 'ids_dict' column, not
                    # 'nums_dict' or 'auth_img', so those two fields are dropped on save.
                    per_post_fields = {
                        'text'      : words,                            # Text of the post
                        'date'      : dates,                            # Date of the (unedited) post
                        'nums_dict' : nums_dict,                        # Author info & votes of the post
                        'post_num'  : post_num,                         # Post number in the topic
                        'prof_link' : prof_link,                        # Link to author profile
                        'auth_img'  : auth_img,                         # Profile image of author
                        'auth_posts': auth_posts,                       # Number of posts author has made
                    }
                    for post in range(len(words)):                      # For each post in topic
                        # A fresh dict per post: previously a failed lookup left
                        # `post_dict` pointing at the previous post, which was then
                        # appended (and mutated) a second time.
                        post_dict = {'title': title,                    # Title of Topic
                                     'forum_page': forum_page}          # Page in the forum
                        for field, values in per_post_fields.items():
                            if post < len(values):                      # Keep whatever is available
                                post_dict[field] = values[post]
                        list_of_dicts.append(post_dict)
                    more_pages = click_next()                           # Clicks "NEXT" otherwise return False
                browser.get(current_list)                               # Returns to current forum page
                title_list = browser.find_elements_by_class_name("ForumTopic-title") # Redefine list for freshness

            except Exception:                                           # Ctrl-C is no longer swallowed here
                print("Woops", forum_page, i)                           # Informs of the problem
                save_posts(list_of_dicts, saver, forum_page)            # Save current posts
                sleep(10)                                               # Safety sleep
                browser.refresh()                                       # Safety Refresh
                browser.get(current_list)                               # Return to forum page
                title_list = browser.find_elements_by_class_name("ForumTopic-title") # Redefine for freshness
    except Exception:
        print("Woops", forum_page)                                      # Informs that there's a problem
        save_posts(list_of_dicts, saver, forum_page)                    # Save current posts
        sleep(30)                                                       # Long Safety Sleep
        browser.refresh()                                               # Safety Refresh
        browser.get(current_list)                                       # Return to forum page

save_posts(list_of_dicts, saver, forum_page)                            # Finally save the posts
df = pd.DataFrame(list_of_dicts, columns = ['text','date'])             # Return the dataframe for looking! :D

Unfortunately the above code resulted in a kind of timeout error, so I decided to try a simpler approach.  This was when I decided to limit the number of requests that I made to the browser by instead using BeautifulSoup once I had the source of the page.

In [None]:
# Selenium + BeautifulSoup scraper: the browser is only used to navigate (and
# to read image sources); every other post field is parsed from
# browser.page_source.  Relies on save_posts() defined earlier.
#ops = webdriver.chrome.options.Options()
#ops.add_argument('--dns-prefetch-disable')
path = '../Garage/chromedriver'                                         # Path to Chromedriver
browser = webdriver.Chrome(executable_path = path)#, options=ops)         # Open browser
list_of_dicts = []                                                      # One dict per scraped post
saver = './data/full_scrapes/Overwatch_Test_'                           # Path prefix handed to save_posts

for forum_page in range(1,1000):
    try:                                                                # Failsafe within forum page
        current_list = 'https://us.battle.net/forums/en/overwatch/22813879/?page='+ str(forum_page)
        browser.get(current_list)                                       # Go to forum page
        title_list = browser.find_elements_by_class_name("ForumTopic-title")
        if forum_page % 100 == 0:                                       # Checkpoint save every 100 forum pages
            save_posts(list_of_dicts, saver, forum_page)

        for i in range(len(title_list)):                                # For every title in the list
            try:                                                        # Failsafe in topic page
                title_list[i].click()                                   # Click on Title i
                more_pages = True                                       # True means there's more pages
                next_page = 2                                           # Number of the next topic page to request
                current_topic = browser.current_url                     # Base URL of the topic just opened
                while more_pages == True:                               # While true that there's more pages...
                    soup       = BeautifulSoup(browser.page_source, 'lxml')         # Soup of words
                    title      = soup.find(attrs={'class':'Topic-title'}).contents
                    TopicPosts = soup.find_all(attrs = {"class" :'TopicPost'})
                    # Timestamp links whose contents are just the "(Edited)" marker are skipped.
                    dates      = [e for e in soup.find_all('a',{"class" :'TopicPost-timestamp'}) if e.contents != ['\n\t\t\t\t\t\t\t\xa0(Edited)\n']]
                    words      = soup.find_all('div',{"class" :'TopicPost-bodyContent'})
                    auth_posts = [e.contents for e in soup.find_all('a',attrs = {'class' :'Author-posts'})]
                    prof_link  = soup.find_all(attrs = {"class" :'Author-avatar '})
                    # Image sources are still read through the live browser.
                    all_imgs   = [e.get_attribute('src') for e in browser.find_elements_by_css_selector('img')]
                    auth_img   = [e for e in all_imgs if 'blznav' not in e and len(e) != 0][1:-1]
                    for post in range(len(words)):                      # For each post in topic
                        # If anything in the dict literal raises, `post_dict` keeps its
                        # previous value and that older dict is appended again below.
                        try:
                            post_dict = {                               # Creation & Statement of dicts
                                'text'      : words[post].contents,                     # Text of the post
                                'date'      : dates[post].attrs['data-tooltip-content'],# Date of the (unedited) post
                                'ids_dict'  : TopicPosts[post].attrs['data-topic-post'],# Author info & votes of the post
                                'post_num'  : TopicPosts[post].attrs['id'] ,            # Post number in the topic
                                'statuses'  : TopicPosts[post].attrs['data-topic'],
                                'prof_link' : prof_link[post].attrs['href'],            # Link to author profile
                                'auth_img'  : auth_img[post],           # Profile image of author
                                'title'     : title,                    # Title of Topic
                                'forum_page': forum_page}               # Page in the forum
                        except:
                            pass
                        try:
                            post_dict['auth_posts'] = auth_posts[post]  # Number of posts author has made
                        except:
                            pass
                        list_of_dicts.append(post_dict)
                    # Try the next topic page by URL instead of clicking NEXT.
                    # NOTE(review): when the page has no 'error-type' element, find()
                    # returns None, `.contents` raises and the except below ends the
                    # loop, so presumably only page 1 is ever scraped - confirm.
                    try:
                        browser.get(current_topic + '?page=' + str(next_page))
                        soup = BeautifulSoup(browser.page_source, 'lxml')
                        if soup.find(attrs = {'class':'error-type'}).contents[0] == '404':
                            more_pages = False
                        else:
                            next_page += 1
                    except:
                        more_pages = False
                browser.get(current_list)                               # Returns to current forum page
                #sleep(3)                                                # Tiny safety sleep
                title_list = browser.find_elements_by_class_name("ForumTopic-title") # Redefine list for freshness

            except:
                # Bare except: also catches KeyboardInterrupt.
                print("Woops", forum_page, i)                           # Informs of the problem
                save_posts(list_of_dicts, saver, forum_page)            # Save current posts
                sleep(10)                                               # Safety sleep
                browser.refresh()                                       # Safety Refresh
                browser.get(current_list)                               # Return to forum page
                title_list = browser.find_elements_by_class_name("ForumTopic-title") # Redefine for freshness
    except:
        print("Woops", forum_page)                                      # Informs that there's a problem
        save_posts(list_of_dicts, saver, forum_page)                    # Save current posts
        sleep(30)                                                       # Long safety sleep
        browser.refresh()                                               # Safety Refresh
        browser.get(current_list)                                       # Return to forum page
save_posts(list_of_dicts, saver, forum_page)                            # Finally save the posts
df = pd.DataFrame(list_of_dicts, columns = ['text','date'])             # Text/date view for inspection

In [None]:
# THIS is our new official one??
# Same Selenium + BeautifulSoup approach as the previous cell, restarted at
# forum page 72, with separate KeyboardInterrupt handlers and a reworked
# next-page check.  Relies on save_posts() defined earlier.
path = '../Garage/chromedriver'                                         # Path to Chromedriver
browser = webdriver.Chrome(executable_path = path)#, options=ops)         # Open browser
list_of_dicts = []                                                      # One dict per scraped post
saver = './data/full_scrapes/Overwatch_Test_'                           # Path prefix handed to save_posts

for forum_page in range(72,1000):
    try:                                                                # Failsafe within forum page
        current_list = 'https://us.battle.net/forums/en/overwatch/22813879/?page='+ str(forum_page)
        browser.get(current_list)                                       # Go to forum page
        title_list = browser.find_elements_by_class_name("ForumTopic-title")
        if forum_page % 100 == 0:                                       # Checkpoint save every 100 forum pages
            save_posts(list_of_dicts, saver, forum_page)

        for i in range(len(title_list)):                                # For every title in the list
            try:                                                        # Failsafe in topic page
                title_list[i].click()                                   # Click on Title i
                more_pages = True                                       # True means there's more pages
                next_page = 2                                           # Number of the next topic page to request
                current_topic = browser.current_url                     # Base URL of the topic just opened
                while more_pages == True:                               # While true that there's more pages...
                    soup       = BeautifulSoup(browser.page_source, 'lxml')         # Soup of words
                    title      = soup.find(attrs={'class':'Topic-title'}).contents
                    TopicPosts = soup.find_all(attrs = {"class" :'TopicPost'})
                    # Timestamp links whose contents are just the "(Edited)" marker are skipped.
                    dates      = [e for e in soup.find_all('a',{"class" :'TopicPost-timestamp'}) if e.contents != ['\n\t\t\t\t\t\t\t\xa0(Edited)\n']]
                    words      = soup.find_all('div',{"class" :'TopicPost-bodyContent'})
                    auth_posts = [e.contents for e in soup.find_all('a',attrs = {'class' :'Author-posts'})]
                    prof_link  = soup.find_all(attrs = {"class" :'Author-avatar '})
                    # Image sources are still read through the live browser.
                    all_imgs   = [e.get_attribute('src') for e in browser.find_elements_by_css_selector('img')]
                    auth_img   = [e for e in all_imgs if 'blznav' not in e and len(e) != 0][1:-1]
                    for post in range(len(words)):                      # For each post in topic
                        # If anything in the dict literal raises, `post_dict` keeps its
                        # previous value and that older dict is appended again below.
                        try:
                            post_dict = {                               # Creation & Statement of dicts
                            'text'      : words[post].contents,                     # Text of the post
                            'date'      : dates[post].attrs['data-tooltip-content'],# Date of the (unedited) post
                            'ids_dict'  : TopicPosts[post].attrs['data-topic-post'],# Author info & votes of the post
                            'post_num'  : TopicPosts[post].attrs['id'] ,            # Post number in the topic
                            'auth_img'  : auth_img[post],           # Profile image of author
                            'title'     : title,                    # Title of Topic
                            'forum_page': forum_page}               # Page in the forum
                        except:
                            pass
                        # Optional fields share one try: the first one that is missing
                        # also skips the ones after it.
                        try:
                            post_dict['statuses']   = TopicPosts[post].attrs['data-topic']
                            post_dict['prof_link']  = prof_link[post].attrs['href']
                            post_dict['auth_posts'] = auth_posts[post]  # Number of posts author has made
                        except:
                            pass
                        list_of_dicts.append(post_dict)
                    # Load the next topic page by URL.  When the page has no 'error-type'
                    # element, find() returns None, `.contents` raises, and the except
                    # bumps next_page so the while loop parses the page just loaded.
                    # NOTE(review): if an 'error-type' element exists but is not '404',
                    # neither more_pages nor next_page changes, so the same page would
                    # be reloaded indefinitely - confirm whether that can happen.
                    try:
                        browser.get(current_topic + '?page=' + str(next_page))
                        soup = BeautifulSoup(browser.page_source, 'lxml')
                        #print('attempted new page')
                        if soup.find(attrs = {'class':'error-type'}).contents[0] == '404':
                            #print("got a 404")
                            more_pages = False
                        #else:
                        #    print("there is another page and we added 1")
                        #    next_page += 1
                    except:
                        next_page += 1
                browser.get(current_list)                               # Returns to current forum page
                #sleep(3)                                                # Tiny safety sleep
                title_list = browser.find_elements_by_class_name("ForumTopic-title") # Redefine list for freshness
            except KeyboardInterrupt:
                # Only prints; the loop carries on with the next topic afterwards.
                print("Canceled!")
            except:
                print("TimedOut on Forum Section:", forum_page,"Topic number:", i, "Topic Page:", next_page-1)                           # Informs of the problem
                save_posts(list_of_dicts, saver, forum_page)            # Save current posts
                sleep(10)                                               # Safety sleep
                browser.refresh()                                       # Safety Refresh
                browser.get(current_list)                               # Return to forum page
                title_list = browser.find_elements_by_class_name("ForumTopic-title") # Redefine for freshness
            #except:
            #    print('SOMETHING ELSE WENT WRONG')
            #    pass
    except KeyboardInterrupt:
        # Only prints; the loop carries on with the next forum page afterwards.
        print("Canceled!")
    except:
        print("TimedOut on Forum Section:", forum_page)                 # Informs that there's a problem
        save_posts(list_of_dicts, saver, forum_page)                    # Save current posts
        sleep(30)                                                       # Long safety sleep
        browser.refresh()                                               # Safety Refresh
        browser.get(current_list)                                       # Return to forum page
    #except:
    #    print("SOMETHING ELSE WENT WRONG")
    #    pass
save_posts(list_of_dicts, saver, forum_page)                            # Finally save the posts
df = pd.DataFrame(list_of_dicts, columns = ['text','date'])             # Text/date view for inspection

I then tried to find the bug by removing the try/excepts, but no dice.

In [None]:
# THIS is just before deleting a bunch of junk
# Requests-based scraper (no browser): uses urls_with_numbers() to list the
# topics on each forum page, get_html() to fetch every page of each topic,
# turn_to_dict() to build one dict per post, and save_posts() to write CSVs.
# The try/excepts are intentionally left out so failures surface immediately.
list_of_dicts = []
saver = './data/full_scrapes/Overwatch_Test_'

for forum_page in range(0,100):
    topic_keys = urls_with_numbers('https://us.battle.net/forums/en/overwatch/22813879/?page='+ str(forum_page))
    if forum_page % 100 == 0:                                     # Checkpoint save every 100 forum pages
        save_posts(list_of_dicts, saver, forum_page)
    for topic_number, topic_stats in enumerate(topic_keys):       # For every (url, page count) tuple
        topic_url, page_count = topic_stats
        pages_scraped = 0
        for page in range(1, page_count + 1):                     # Topic pages are numbered from 1
            if page > 1:
                URL = topic_url + '?page=' + str(page)
            else:
                URL = topic_url
            page_soup  = get_html(URL)                            # Soup of the topic page
            title_tag  = page_soup.find(attrs={'class':'Topic-title'})
            if title_tag is None:
                # Not a topic page (e.g. an error page past the last real
                # page), so stop paging through this topic.
                break
            title      = title_tag.contents
            dates      = [e for e in page_soup.find_all('a',{"class" :'TopicPost-timestamp'}) if e.contents != ['\n\t\t\t\t\t\t\t\xa0(Edited)\n']]
            words      = page_soup.find_all('div',{"class" :'TopicPost-bodyContent'})
            TopicPosts = page_soup.find_all(attrs = {"class" :'TopicPost'})
            auth_posts = [e.contents for e in page_soup.find_all('a',attrs = {'class' :'Author-posts'})]
            prof_link  = page_soup.find_all(attrs = {"class" :'Author-avatar '})
            for post in range(len(words)):                        # For each post on this topic page
                # turn_to_dict() looks up topic_keys[post][0] for the topic url,
                # so hand it one copy of this topic's tuple per post.
                post_dict = turn_to_dict(post,
                                         words = words,
                                         dates = dates,
                                         TopicPosts = TopicPosts,
                                         title = title,
                                         forum_page = forum_page,
                                         prof_link = prof_link,
                                         auth_posts = auth_posts,
                                         topic_keys = [topic_stats] * len(words))
                list_of_dicts.append(post_dict)
            pages_scraped += 1
        print("Success!:", forum_page,"Topic number:", topic_number, "Topic Page:", pages_scraped)
save_posts(list_of_dicts, saver, forum_page)                      # Finally save the posts
df = pd.DataFrame(list_of_dicts, columns = ['text','date'])       # Text/date view for inspection

In [None]:
# Selenium-only scraper without the outer try/excepts, with progress prints
# after each stage.  Relies on save_posts() and click_next() defined earlier.
path_to_chromedriver = '../Garage/chromedriver'                          # Path to Chromedriver
browser = webdriver.Chrome(executable_path = path_to_chromedriver)       # Open browser
list_of_dicts = []                                                       # One dict per scraped post
saver = './data/full_scrapes/Overwatch_'                                 # Path to save

for forum_page in range(0,1001):                                         # For each page in chunk of posts
    current_list = 'https://us.battle.net/forums/en/overwatch/22813879/?page='+ str(forum_page)
    browser.get(current_list)                                        # Go to forum page
    title_list = browser.find_elements_by_class_name("ForumTopic-title")# Define list of topic pages to click
    if forum_page % 100 == 0:                                        # Failsafe saver per 100 pages
        save_posts(list_of_dicts, saver, forum_page)

    for i in range(len(title_list)):                                 # For every title in the list
        title_list[i].click()                                    # Click on Title i
        more_pages = True                                        # True means there's more pages
        print('start dict')
        while more_pages == True:                                # While true that there's more pages...
            # Every field below is read through the live browser, one list per field.
            title      = browser.find_element_by_class_name('Topic-title').text
            dates      = [e.get_attribute('data-tooltip-content') for e in browser.find_elements_by_class_name('TopicPost-timestamp') if e.text != ' (Edited)']
            words      = [e.text for e in browser.find_elements_by_class_name('TopicPost-bodyContent')]
            nums_dict  = [e.get_attribute('data-topic-post') for e in browser.find_elements_by_class_name('TopicPost')]
            post_num   = [e.get_attribute('id') for e in browser.find_elements_by_class_name('TopicPost')]
            auth_posts = [e.text for e in browser.find_elements_by_class_name('Author-posts') if e.get_attribute('data-toggle') == 'tooltip']
            prof_link  = [e.get_attribute('href') for e in browser.find_elements_by_class_name('Author-avatar ')]
            all_imgs   = [e.get_attribute('src') for e in browser.find_elements_by_css_selector('img')]
            # NOTE(review): the [1:-1] slice presumably trims non-avatar images at
            # both ends of the page - confirm against the markup.
            auth_img   = [e for e in all_imgs if 'blznav' not in e and len(e) != 0][1:-1]
            print('dict components defined')
            for post in range(len(words)):                       # For each post in topic
                # If anything in the dict literal raises, `post_dict` keeps its previous
                # value and that older dict is appended again below.
                try:
                    post_dict = {                                    # Creation & Statement of dicts
                        'text'      : words[post],                   # Text of the post
                        'date'      : dates[post],                   # Date of the (unedited) post
                        'nums_dict' : nums_dict[post],               # Author info & votes of the post
                        'post_num'  : post_num[post],                # Post number in the topic
                        'prof_link' : prof_link[post],               # Link to author profile
                        'auth_img'  : auth_img[post],                # Profile image of author
                        'title'     : title,                         # Title of Topic
                        'forum_page': forum_page                     # Page in the forum
                    }
                except:
                    pass
                try:
                    post_dict['auth_posts'] = auth_posts[post]   # Number of posts author has made
                except:
                    pass
                list_of_dicts.append(post_dict)
                print('dict created and appended')
            more_pages = click_next()                            # Clicks "NEXT" otherwise return False
            print('next clicked')
        browser.get(current_list)                                # Returns to current forum page
        sleep(0.5)                                               # Tiny safety sleep
        print('slept')
        title_list = browser.find_elements_by_class_name("ForumTopic-title") # Redefine list for freshness
save_posts(list_of_dicts, saver, forum_page)                             # Finally save the posts
df = pd.DataFrame(list_of_dicts, columns = ['text','date'])              # Return the dataframe for looking! :D

Below is the latest version of the scraping code.  Very buggy.  At this point I had to shelve the project to focus on other things, both because these edits didn't fix the timeout error and because the effort for my jupothixic

In [None]:
# THIS is for turning into the Requests library
# Walks forum listing pages 0-100, requests every page of every topic listed
# there, and appends one dict per post (built by turn_to_dict) to list_of_dicts.
list_of_dicts = []
saver = './data/full_scrapes/Overwatch_Test_'                   # The path & starting name for saving

for forum_page in range(101):                                   # Will go through all the forum pages specified
    topic_keys = urls_with_numbers('https://us.battle.net/forums/en/overwatch/22813879/?page='+ str(forum_page))
    if forum_page % 100 == 0:                                   # Periodic checkpoint of what has been collected
        save_posts(list_of_dicts, saver, forum_page)
    for topic_stats in topic_keys:                              # topic_stats = (topic url, page count)
        for page in range(topic_stats[1]):                      # for page # in range of # of pages in the topic
            if page == 1:
                # Index 1 would re-request the same first-page URL that index 0
                # already handled, so skip it BEFORE sleeping and fetching rather
                # than downloading the page and throwing the result away.
                print("Skipped duplicate page")
                continue
            if page > 1:                                        # Later pages need the ?page= suffix
                URL = topic_stats[0] + '?page=' + str(page)
            else:
                URL = topic_stats[0]                            # This will be the first page
            sleep(5)                                            # Pause between requests

            # This chunk defines the lists of things that we want per row
            page_soup  = get_html(URL)                          # Soup of the page
            title      = page_soup.find(attrs={'class':'Topic-title'}).contents
            dates      = [e for e in page_soup.find_all('a',{"class" :'TopicPost-timestamp'}) if e.contents !=edt]
            words      = page_soup.find_all('div',{"class" :'TopicPost-bodyContent'})
            TopicPosts = page_soup.find_all(attrs = {"class" :'TopicPost'})
            auth_posts = [e.contents for e in page_soup.find_all('a',attrs = {'class' :'Author-posts'})]
            prof_link  = page_soup.find_all(attrs = {"class" :'Author-avatar '})
            print("F-Page:", forum_page, "Posts/Topic:", topic_stats[1],"Page:",page,"URL:",URL )
            for post in range(len(words)):                      # For each post that's present
                list_of_dicts.append(turn_to_dict(post, words,
                                                  dates, TopicPosts, title, forum_page,
                                                  prof_link, auth_posts, topic_keys))  # Add the dict of post to list of posts
        print("END F-page:", forum_page,"Topic Page:", page, "Last URL:", URL) # Sanity check
df = save_posts(list_of_dicts, saver, forum_page)               # Should both save the data & create a df to see

In [None]:
# Save whatever list_of_dicts holds so far and keep save_posts' return value as df
df = save_posts(list_of_dicts, saver, forum_page)

In [None]:
# Show the collected posts as a table restricted to these ten columns
post_columns = [
    'text', 'date', 'ids_dict', 'post_num', 'auth_posts',
    'prof_link', 'title', 'forum_page', 'statuses', 'topic_url',
]
pd.DataFrame(list_of_dicts, columns=post_columns)

In [None]:
# Absolute URL of every topic found in the parsed forum page `resoup`
forum_topics = resoup.find_all(attrs={'class': "ForumTopic"})
list_of_topic_links = ['https://us.battle.net' + topic.attrs['href'] for topic in forum_topics]

In [None]:
# Sanity check: print the page-count arithmetic for every topic in `resoup`
test = resoup.find_all(attrs={'class': "ForumTopic"})
for i, topic in enumerate(test):
    print('-------')
    print(i)
    posts = json.loads(topic.attrs['data-forum-topic'])['lastPosition']
    print('posts:', posts)
    pages = posts//20 + 1                      # 20 posts per page, +1 for the first
    print("pages:", pages)
    # Same remainder rule as urls_with_numbers (> 20). The previous "> 21"
    # left a 21-post topic one page short.
    if posts % 20 != 0 and posts > 20:
        pages += 1
        print("has remainder", posts%20, 'pages:', pages)
    

In [None]:
# Recompute the page count of every topic in `resoup` (the value is not stored yet)
test = resoup.find_all(attrs={'class': "ForumTopic"})
for topic in test:
    posts = json.loads(topic.attrs['data-forum-topic'])['lastPosition']
    pages = posts//20 + 1                      # 20 posts per page, +1 for the first
    # Same remainder rule as urls_with_numbers (> 20). The previous "> 21"
    # left a 21-post topic one page short.
    if posts % 20 != 0 and posts > 20:
        pages += 1
# TODO: unfinished statement left here by the author; as code it was a syntax
# error that stopped the whole cell from running:
# for  in test

In [None]:
Sample Data:

In [None]:
#df_24to77     = pd.read_csv('./data/Overwatch_24to77_posts_77.csv')
#df_149        = pd.read_csv('./data/Overwatch_posts_149')          # An error skipped pages 24 to 77
# Load the five sample scrape files, in the same order as the names they are bound to
sample_csv_paths = [
    './data/Overwatch_Sample_to1500_160',
    './data/Overwatch_SAMPLE150to1000_990',
    './data/Overwatch_SAMPLEposts_1500',
    './data/Overwatch_Sample1500to5k_5000',
    './data/Overwatch_Sample5kto10k_7250',
]
(df_s0to150,
 df_s150to1000,
 df_s1500,
 df_s1500to5k,
 df_s5kto10k) = [pd.read_csv(csv_path) for csv_path in sample_csv_paths]

In [None]:
def _strip_tabs_and_newlines(raw_date):
    """Return raw_date with every tab and newline character removed."""
    return raw_date.replace('\t','').replace('\n','')


# Clean the 'date' column of each sample frame in place
for frame in (df_s0to150, df_s150to1000, df_s1500, df_s1500to5k, df_s5kto10k):
    frame['date'] = frame['date'].map(_strip_tabs_and_newlines)

In [None]:
# All five sample frames gathered so later cells can loop over them
dfs_listed = [df_s0to150, df_s150to1000, df_s1500, df_s1500to5k, df_s5kto10k]

In [None]:
# Print every post in the 150-1000 sample that carries the moderator removal marker.
for text in df_s150to1000['text']:
    if not isinstance(text, str):
        # A non-string cell cannot be searched with `in` (presumably NaN from
        # an empty CSV field); print it so it stays visible, as the old
        # catch-all except branch did.
        print(text)
    elif '<Removed by forum moderator for toxicity>' in text:
        print(text)

In [None]:
# Print every post, across all sample frames, whose text contains '<Removed'.
for df in dfs_listed:
    for text in df['text']:
        # Non-string cells are skipped explicitly instead of via a bare
        # `except: pass`; the discarded `df.head(1)` call had no effect and is gone.
        if isinstance(text, str) and '<Removed' in text:
            print(text)

In [None]:
# Show how often each distinct 'date' value occurs in every sample frame
for df in dfs_listed:
    date_counts = df['date'].value_counts()
    print(date_counts)

Tested Via Pieces

In [None]:
# Pull the per-post fields off the topic page currently loaded in `browser`.
dates      = [e.get_attribute('data-tooltip-content') for e in browser.find_elements_by_class_name('TopicPost-timestamp') if e.text != ' (Edited)']
words      = [e.text for e in browser.find_elements_by_class_name('TopicPost-bodyContent')]
topic_posts = browser.find_elements_by_class_name('TopicPost')   # Looked up once, reused for both lists below
nums_dict  = [e.get_attribute('data-topic-post') for e in topic_posts]
post_num   = [e.get_attribute('id') for e in topic_posts]
auth_posts = [e.text for e in browser.find_elements_by_class_name('Author-posts') if e.get_attribute('data-toggle') == 'tooltip']
prof_link  = [e.get_attribute('href') for e in browser.find_elements_by_class_name('Author-avatar ')]
all_imgs   = [e.get_attribute('src') for e in browser.find_elements_by_css_selector('img')]
# Drop empty or missing src values first so the substring test never runs on a
# non-string, then drop 'blznav' images and the first and last remaining entries.
auth_img   = [src for src in all_imgs if src and 'blznav' not in src][1:-1]

In [None]:
# Pseudocode sketch, not runnable: "xpath" and `DO something` are placeholders.
# It outlines two ways of visiting every link: loading each href directly, or
# clicking each link and navigating back afterwards.
links = driver.find_elements_by_xpath("xpath")
links = [link.get_attribute('href') for link in driver.find_elements_by_xpath("xpath")]
for link in links:
    driver.get(link)
# NOTE(review): `links` was rebound above to the href attribute values, so the
# .click() call below looks like it targets the wrong objects - confirm intent.
for x in range(len(links)):
    links[x].click()
    try:
        driver.implicitly_wait(3)
        DO something
        driver.back()
        print("Mission completed!!")
    except (ElementNotVisibleException, NoSuchElementException):
        driver.back()
        print("No action")

In [None]:
# Parse the page the browser is showing and compare how many post bodies and
# timestamp links were found.
soup = BeautifulSoup(browser.page_source, 'lxml')
words = soup.find_all('div', attrs={"class": 'TopicPost-bodyContent'})
dates = soup.find_all('a', attrs={"class": 'TopicPost-timestamp'})

print(len(words), len(dates), sep='\n')

In [None]:
# Testing Code # 1
# Open forum page 9, click the title at index 4 and show that topic's title text.
path_to_chromedriver = '../Garage/chromedriver'
browser = webdriver.Chrome(executable_path=path_to_chromedriver)
current_list = 'https://us.battle.net/forums/en/overwatch/22813879/?page=9'
browser.get(current_list)
title_list = browser.find_elements_by_class_name("ForumTopic-title")
chosen_title = title_list[4]
chosen_title.click()
topic_heading = browser.find_element_by_class_name('Topic-title')
topic_heading.text

In [None]:
# Testing Code # 2
# Open a fresh browser on one specific page of the current topic and parse it.
path_to_chromedriver = '../Garage/chromedriver'
browser = webdriver.Chrome(executable_path=path_to_chromedriver)
topic_page_url = current_topic + '?page=' + str(next_page)
browser.get(topic_page_url)
page_markup = browser.page_source
soup = BeautifulSoup(page_markup, 'lxml')
#soup.find_all('div',{"class" :'TopicPost-bodyContent'})

In [None]:
# Visit every topic on the forum page in turn: click its title, wait, go back.
title_list = browser.find_elements_by_class_name("ForumTopic-title")
topic_total = len(title_list)                 # Fixed up front; the list is re-fetched inside the loop
for i in range(topic_total):
    current_title = title_list[i]
    print(current_title)
    current_title.click()
    sleep(1)
    browser.back()
    # Look the titles up again after navigating back, then show the element at the same index
    title_list = browser.find_elements_by_class_name("ForumTopic-title")
    print(title_list[i])

In [None]:
def unknown_funct(title_list, list_of_dicts, browser):
    """Click through each topic title and collect the text and date of its posts.

    For every index of ``title_list`` the title is clicked, the browser's
    page source is parsed, and one ``{'text', 'date'}`` dict per post body is
    appended to ``list_of_dicts`` (mutated in place). The browser then goes
    back and the title elements are looked up again.

    Args:
        title_list: Clickable title elements of the forum listing page.
        list_of_dicts: List that receives the post dicts.
        browser: Driver object providing ``page_source``, ``back()`` and
            ``find_elements_by_class_name()``.

    Returns:
        None.

    If handling a topic raises an ordinary exception, the error is printed,
    the posts gathered so far are saved with ``save_posts`` (using the
    module-level ``saver`` and ``forum_page``) and the loop moves on.
    """
    for i in range(len(title_list)):
        try:
            title_list[i].click()                                           # Click on Title i
            soup = BeautifulSoup(browser.page_source, 'lxml')               # Soup of all
            words = soup.find_all('div',{"class" :'TopicPost-bodyContent'}) # Words
            dates = soup.find_all('a',{"class" :'TopicPost-timestamp'})     # Dates
            sleep(0.5)

            for post in range(len(words)):
                post_dict = {                                         # Creation & Statement of dicts
                    'text' : words[post].text,
                    'date' : dates[post].text
                }
                list_of_dicts.append(post_dict)

            browser.back()                                            # Back
            title_list = browser.find_elements_by_class_name("ForumTopic-title")
        except Exception as err:
            # Catch Exception rather than everything, so Ctrl-C / SystemExit can
            # still stop a long scrape, and report what actually went wrong.
            print("Woops", forum_page, i, repr(err))
            save_posts(list_of_dicts, saver, forum_page)

In [None]:
# Very alpha pseudocode
# Not runnable: "xpath" and `DO something` are placeholders. It outlines two
# ways of visiting every link: loading each href directly, or clicking each
# link and navigating back afterwards.
links = driver.find_elements_by_xpath("xpath")
links = [link.get_attribute('href') for link in driver.find_elements_by_xpath("xpath")]
for link in links:
    driver.get(link)
# NOTE(review): `links` was rebound above to the href attribute values, so the
# .click() call below looks like it targets the wrong objects - confirm intent.
for x in range(len(links)):
    links[x].click()
    try:
        driver.implicitly_wait(3)
        DO something
        driver.back()
        print("Mission completed!!")
    except (ElementNotVisibleException, NoSuchElementException):
        driver.back()
        print("No action")