In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil import parser
from elasticsearch import Elasticsearch 

In [2]:
# config
# URL of the first listing page of the forum section to crawl.
BASEURL = 'https://forum.vbulletin.com/forum/vbulletin-5-connect/vbulletin-5-suggestions'
# Page-number limit: passed to getPageUrlFromPattern and used to bound the crawl loop.
MAXPAGE = 20
# True  -> collect page URLs from the pagination links found on BASEURL.
# False -> build page URLs from BASEURL and MAXPAGE with getPageUrlFromPattern.
PAGINATION = False

# ElasticSearch
# Host and port handed to the Elasticsearch client.
HOST = 'localhost'
PORT = 9200
# Index name and document type used when each scraped post is indexed.
INDEX = 'vbulletin'
DOCTYPE = 'post'

In [3]:
# get page URL from pagination
def getPageUrlFromPagination():
    """Collect page URLs from the pagination buttons on the BASEURL page.

    Downloads BASEURL, then looks for anchors carrying an href whose class
    attribute matches one of two pagination-button class strings (the
    primary/current-page button first, then the secondary buttons).

    Returns:
        dict mapping page number (int, parsed from the anchor text) to the
        anchor's href value.
    """
    buttonClasses = (
        'js-pagenav-button b-button b-button--narrow js-shrink-event-child b-button--primary page js-pagenav-current-button js-pagenav-first-button',
        'js-pagenav-button b-button b-button--narrow js-shrink-event-child b-button--secondary',
    )
    html = requests.get(BASEURL).text
    document = BeautifulSoup(html, 'html.parser')
    pageUrls = {}
    # Same lookup for both button styles; later matches overwrite earlier
    # ones that share a page number.
    for cssClass in buttonClasses:
        for anchor in document.find_all('a', href=True, attrs={'class': cssClass}):
            pageNumber = int(anchor.text)
            pageUrls[pageNumber] = anchor['href']
    return pageUrls

# get page URL from page pattern
def getPageUrlFromPattern(baseUrl, maxPage):
    """Build listing-page URLs from the "/pageN" URL pattern.

    Page 1 is the bare ``baseUrl``; every later page N is
    ``{baseUrl}/page{N}``.

    Args:
        baseUrl: URL of the first listing page.
        maxPage: highest page number to generate (inclusive).

    Returns:
        dict mapping page number (int) to URL (str). Page 1 is always
        present, even when ``maxPage`` is smaller than 1.
    """
    arrUrl = {1: baseUrl}
    # range() excludes its stop value, so +1 is required for the page
    # numbered maxPage itself to be generated.
    for i in range(2, maxPage + 1):
        arrUrl[i] = f'{baseUrl}/page{i}'
    return arrUrl

In [4]:
# utility library
def formatDate(strDate):
    """Parse a date string with dateutil's parser and return its isoformat() string."""
    return parser.parse(strDate).isoformat()

In [5]:
# parsing Library
def getSubject(tr):
    """Return the text of the topic-title link found inside the row ``tr``."""
    titleLink = tr.find('a', attrs={'class': 'topic-title js-topic-title'})
    return titleLink.text
def getAuthor(tr):
    """Return the topic author's name from the row ``tr``.

    The name is the text of the first link inside the topic-info div.

    Returns:
        The author name, or '' when the topic-info div or its link is
        missing (the same div is treated as optional by getCreatedAt).
    """
    layer = tr.find('div', attrs={'class':'topic-info h-clear h-hide-on-small h-hide-on-narrow-column'})
    if layer is None:
        return ''
    authorLink = layer.find('a')
    if authorLink is None:
        return ''
    return authorLink.text
def getCreatedAt(tr):
    """Return the topic creation time of the row ``tr`` as an ISO string.

    The date text is read from the ``span.date`` element inside the
    topic-info div and converted with formatDate.

    Returns:
        The formatted date string, or None when either the topic-info div
        or the date span is missing.
    """
    layer = tr.find('div', attrs={'class':'topic-info h-clear h-hide-on-small h-hide-on-narrow-column'})
    if layer is None:
        return None
    dateSpan = layer.find('span', attrs={'class':'date'})
    # The span is looked up separately from the div, so it needs its own guard.
    if dateSpan is None:
        return None
    return formatDate(dateSpan.text)
def getResponses(tr):
    """Return the response count shown in the row ``tr``.

    The count is the first whitespace-separated token of the posts-count
    div text, with thousands separators (",") removed.

    Returns:
        The count as int, or 0 when the div is missing or its text is empty.

    Raises:
        ValueError: if the first token is not a number.
    """
    countDiv = tr.find('div', attrs={'class':'posts-count'})
    if countDiv is None:
        return 0
    # split() without an argument ignores leading/trailing whitespace and
    # newlines, which would otherwise produce an empty first token.
    tokens = countDiv.text.split()
    if not tokens:
        return 0
    return int(tokens[0].replace(",", ""))
def getViews(tr):
    """Return the view count shown in the row ``tr``.

    The count is the first whitespace-separated token of the views-count
    div text, with thousands separators (",") removed.

    Returns:
        The count as int, or 0 when the div is missing or its text is empty.

    Raises:
        ValueError: if the first token is not a number.
    """
    countDiv = tr.find('div', attrs={'class':'views-count'})
    if countDiv is None:
        return 0
    # split() without an argument ignores leading/trailing whitespace and
    # newlines, which would otherwise produce an empty first token.
    tokens = countDiv.text.split()
    if not tokens:
        return 0
    return int(tokens[0].replace(",", ""))
def getLastPostBy(tr):
    """Return the name of the last poster shown in the row ``tr``.

    The name is the text of the first link inside the lastpost-by div of
    the cell-lastpost table cell.

    Returns:
        The poster name, or '' when the cell, the div, or the link is
        missing. The result is always a string.
    """
    cellLastpost = tr.find('td', attrs={'class':'cell-lastpost'})
    if cellLastpost is None:
        return ''
    divLastpost = cellLastpost.find('div', attrs={'class':'lastpost-by'})
    if divLastpost is None:
        return ''
    posterLink = divLastpost.find('a')
    # Keep the lookup result in its own variable so a missing link yields
    # the '' default instead of leaking None to the caller.
    if posterLink is None:
        return ''
    return posterLink.text
def getLastPostTime(tr):
    """Return the last-post time of the row ``tr`` as formatted by formatDate.

    Reads the text of the ``span.post-date`` element inside the
    cell-lastpost table cell. Returns None when the cell or the span is
    not found.
    """
    lastpostCell = tr.find('td', attrs={'class':'cell-lastpost'})
    if not lastpostCell:
        return None
    dateSpan = lastpostCell.find('span', attrs={'class':'post-date'})
    if not dateSpan:
        return None
    return formatDate(dateSpan.text)

In [6]:
# iterate over page
# Choose where the page URLs come from: scraped pagination links or the
# URL pattern built from BASEURL and MAXPAGE.
if PAGINATION:
    arrPage = getPageUrlFromPagination()
else:
    arrPage = getPageUrlFromPattern(BASEURL, MAXPAGE)

connES = Elasticsearch([{'host': HOST, 'port': PORT}])
# connES.indices.delete(index=INDEX)

# Walk only the page numbers that are actually present in arrPage, so a
# pagination scrape that found fewer pages than MAXPAGE does not raise
# KeyError. Pages beyond MAXPAGE are skipped.
for pageNumber in sorted(arrPage):
    if pageNumber > MAXPAGE:
        break
    link = arrPage[pageNumber]
    objSoupHtml = BeautifulSoup(requests.get(link).text, 'html.parser')
    # Each topic row becomes one document in the index.
    for tr in objSoupHtml.find_all('tr', attrs={'class':'topic-item'}):
        objPost = {
            'Post_subject': getSubject(tr),
            'Author': getAuthor(tr),
            'created_at': getCreatedAt(tr),
            'responses': getResponses(tr),
            'views_count': getViews(tr),
            'last_post_by': getLastPostBy(tr),
            'last_post_time': getLastPostTime(tr),
        }
        # NOTE(review): doc_type is deprecated in newer Elasticsearch
        # clients — confirm against the client version in use.
        connES.index(index=INDEX, doc_type=DOCTYPE, body=objPost)
print('Completed')

1
{'Post_subject': 'Voting for your favorite suggestions...', 'Author': 'Wayne Luke', 'created_at': '2013-08-06T21:51:00', 'responses': 7, 'views_count': 910, 'last_post_by': 'In Omnibus', 'last_post_time': '2016-08-28T07:05:00'}
{'Post_subject': 'Before making a Suggestion...', 'Author': 'Wayne Luke', 'created_at': '2012-09-09T06:57:00', 'responses': 0, 'views_count': 946, 'last_post_by': 'Wayne Luke', 'last_post_time': '2012-09-09T06:57:00'}
{'Post_subject': 'vbulletin 5 marketplace', 'Author': 'ddkfoundations', 'created_at': '2019-06-20T10:20:00', 'responses': 2, 'views_count': 49, 'last_post_by': 'zweeper', 'last_post_time': '2019-06-20T22:23:00'}
{'Post_subject': 'Signing into the VBulletin forums with username OR email', 'Author': 'kbarg', 'created_at': '2019-05-21T15:54:00', 'responses': 3, 'views_count': 67, 'last_post_by': 'kbarg', 'last_post_time': '2019-05-27T19:33:00'}
{'Post_subject': 'VB 5 Shoutbox', 'Author': 'riser4what', 'created_at': '2019-05-19T10:06:00', 'responses'

{'Post_subject': 'URL Whitelist Permissions By Usergroup', 'Author': 'In Omnibus', 'created_at': '2017-11-17T04:48:00', 'responses': 1, 'views_count': 88, 'last_post_by': 'rhens', 'last_post_time': '2017-11-17T09:56:00'}
{'Post_subject': 'Topic redirects', 'Author': 'chriske', 'created_at': '2017-11-13T05:37:00', 'responses': 0, 'views_count': 60, 'last_post_by': 'chriske', 'last_post_time': '2017-11-13T05:37:00'}
{'Post_subject': 'Why are these features missing?', 'Author': 'ABDUR7MAAN', 'created_at': '2017-11-03T19:02:00', 'responses': 2, 'views_count': 193, 'last_post_by': 'delicjous', 'last_post_time': '2017-11-08T07:00:00'}
{'Post_subject': 'Sign up, login via Google+ and Twitter?', 'Author': 'botia', 'created_at': '2017-10-14T05:56:00', 'responses': 3, 'views_count': 150, 'last_post_by': 'Wayne Luke', 'last_post_time': '2017-10-17T11:35:00'}
{'Post_subject': 'An option to mention a usergroup with @', 'Author': 'hanm13', 'created_at': '2017-10-12T04:27:00', 'responses': 1, 'views_

{'Post_subject': 'Members who have visited today', 'Author': 'goxy63', 'created_at': '2016-10-28T17:15:00', 'responses': 3, 'views_count': 72, 'last_post_by': 'goxy63', 'last_post_time': '2016-10-29T06:42:00'}
{'Post_subject': 'vBulletin5 Project Tools', 'Author': 'In Omnibus', 'created_at': '2016-07-06T04:23:00', 'responses': 0, 'views_count': 120, 'last_post_by': 'In Omnibus', 'last_post_time': '2016-07-06T04:23:00'}
{'Post_subject': 'Display Announcements to specific usergroup', 'Author': 'Dovaleh', 'created_at': '2016-03-02T07:12:00', 'responses': 1, 'views_count': 185, 'last_post_by': 'HowToEvery', 'last_post_time': '2016-06-13T01:53:00'}
{'Post_subject': 'Request :Advanced Forum Statistics on vbulletin 5', 'Author': 'franzes80', 'created_at': '2016-05-14T03:22:00', 'responses': 1, 'views_count': 254, 'last_post_by': 'Trevor Hannant', 'last_post_time': '2016-05-15T23:28:00'}
{'Post_subject': "Email notifications don't pick up changes made to phrases in AdminCP", 'Author': 'Fhabio'

{'Post_subject': 'Ahhh, my eyes! Tracking lines or alternating colors would be nice!!!', 'Author': 'djcaseanova', 'created_at': '2015-03-10T14:20:00', 'responses': 4, 'views_count': 146, 'last_post_by': 'Wayne Luke', 'last_post_time': '2015-04-05T11:36:00'}
{'Post_subject': 'Embed mp3s into forum', 'Author': 'donnyaz', 'created_at': '2015-03-27T04:31:00', 'responses': 3, 'views_count': 105, 'last_post_by': 'Mark.B', 'last_post_time': '2015-04-02T03:20:00'}
{'Post_subject': 'A couple suggestions', 'Author': 'BugOutGirl', 'created_at': '2015-03-12T08:44:00', 'responses': 1, 'views_count': 75, 'last_post_by': 'Dominic', 'last_post_time': '2015-03-19T04:30:00'}
{'Post_subject': 'View latest posts', 'Author': 'online68', 'created_at': '2015-03-13T07:18:00', 'responses': 2, 'views_count': 227, 'last_post_by': 'Mark.B', 'last_post_time': '2015-03-14T12:08:00'}
{'Post_subject': 'Will we ever be able to delete our own replacement variables?', 'Author': 'BugOutGirl', 'created_at': '2015-03-11T13

{'Post_subject': 'Header redesign + Search bar improvement', 'Author': 'Miykichii', 'created_at': '2014-05-18T10:20:00', 'responses': 0, 'views_count': 198, 'last_post_by': 'Miykichii', 'last_post_time': '2014-05-18T10:20:00'}
{'Post_subject': 'Implementing proper live notification system', 'Author': 'Miykichii', 'created_at': '2014-05-18T10:18:00', 'responses': 0, 'views_count': 237, 'last_post_by': 'Miykichii', 'last_post_time': '2014-05-18T10:18:00'}
8
{'Post_subject': 'Voting for your favorite suggestions...', 'Author': 'Wayne Luke', 'created_at': '2013-08-06T21:51:00', 'responses': 7, 'views_count': 910, 'last_post_by': 'In Omnibus', 'last_post_time': '2016-08-28T07:05:00'}
{'Post_subject': 'Before making a Suggestion...', 'Author': 'Wayne Luke', 'created_at': '2012-09-09T06:57:00', 'responses': 0, 'views_count': 946, 'last_post_by': 'Wayne Luke', 'last_post_time': '2012-09-09T06:57:00'}
{'Post_subject': "Improving Member's list", 'Author': 'Miykichii', 'created_at': '2014-05-18T1

{'Post_subject': 'Not a poll', 'Author': 'DemOnstar', 'created_at': '2013-09-10T09:03:00', 'responses': 6, 'views_count': 83, 'last_post_by': 'DemOnstar', 'last_post_time': '2013-09-11T11:36:00'}
{'Post_subject': 'vBulletin 5 LDAP auth', 'Author': 'Halandar', 'created_at': '2013-09-10T06:59:00', 'responses': 0, 'views_count': 617, 'last_post_by': 'Halandar', 'last_post_time': '2013-09-10T06:59:00'}
{'Post_subject': 'Member list: Improvements on design/usability and features', 'Author': 'TLMD', 'created_at': '2013-08-23T03:27:00', 'responses': 11, 'views_count': 1207, 'last_post_by': 'CheeseMan316', 'last_post_time': '2013-09-03T10:53:00'}
{'Post_subject': 'Add an AdminCP option to allow the jQuery js file to be included in the <head>', 'Author': 'Glenn Vergara', 'created_at': '2013-08-29T16:30:00', 'responses': 0, 'views_count': 168, 'last_post_by': 'Glenn Vergara', 'last_post_time': '2013-08-29T16:30:00'}
{'Post_subject': 'Add hook before </body> tag fter all script includes', 'Author

{'Post_subject': 'vB5: Navbar: Textfield for jumping to pages should be auto-selected', 'Author': 'TLMD', 'created_at': '2013-06-10T22:23:00', 'responses': 11, 'views_count': 87, 'last_post_by': 'TLMD', 'last_post_time': '2013-06-20T01:10:00'}
{'Post_subject': 'vB4 CMS Searchable archive', 'Author': 'jdj', 'created_at': '2013-05-24T05:13:00', 'responses': 2, 'views_count': 47, 'last_post_by': 'jdj', 'last_post_time': '2013-06-07T01:42:00'}
{'Post_subject': 'To do list', 'Author': 'DemOnstar', 'created_at': '2013-05-25T07:11:00', 'responses': 6, 'views_count': 53, 'last_post_by': 'DemOnstar', 'last_post_time': '2013-06-03T22:15:00'}
{'Post_subject': 'Any way to add "This solved my issue" and "This was helpful" options to my forums', 'Author': 'Dieds', 'created_at': '2013-05-17T08:50:00', 'responses': 4, 'views_count': 64, 'last_post_by': 'IggyP', 'last_post_time': '2013-05-19T11:55:00'}
{'Post_subject': 'bring back table editing', 'Author': 'IggyP', 'created_at': '2013-05-19T02:42:00', 

{'Post_subject': 'New features..', 'Author': 'DemOnstar', 'created_at': '2013-02-08T03:50:00', 'responses': 2, 'views_count': 44, 'last_post_by': 'DemOnstar', 'last_post_time': '2013-02-11T02:39:00'}
{'Post_subject': 'improvement for the human verification option', 'Author': 'sukagwe', 'created_at': '2013-02-07T21:04:00', 'responses': 1, 'views_count': 66, 'last_post_by': 'Joe D.', 'last_post_time': '2013-02-09T03:51:00'}
{'Post_subject': 'motion captcha to prevent spam', 'Author': 'sukagwe', 'created_at': '2013-01-31T19:22:00', 'responses': 10, 'views_count': 198, 'last_post_by': 'DemOnstar', 'last_post_time': '2013-02-08T04:30:00'}
{'Post_subject': 'Guests Can Only See Thread Opener', 'Author': 'Mondy', 'created_at': '2013-01-30T18:29:00', 'responses': 1, 'views_count': 47, 'last_post_by': 'Wayne Luke', 'last_post_time': '2013-02-07T11:07:00'}
{'Post_subject': 'Instant Notification By Email for forums', 'Author': 'hamids47', 'created_at': '2013-01-23T23:25:00', 'responses': 1, 'views

{'Post_subject': 'Words and buttons touching the edges of tables', 'Author': 'SKSApps', 'created_at': '2012-10-05T12:30:00', 'responses': 1, 'views_count': 42, 'last_post_by': 'Wayne Luke', 'last_post_time': '2012-10-06T07:29:00'}
{'Post_subject': 'integration between vBulletin + facebook', 'Author': '3bir.com', 'created_at': '2012-09-28T10:22:00', 'responses': 1, 'views_count': 302, 'last_post_by': 'Wayne Luke', 'last_post_time': '2012-09-28T10:37:00'}
15
{'Post_subject': 'Voting for your favorite suggestions...', 'Author': 'Wayne Luke', 'created_at': '2013-08-06T21:51:00', 'responses': 7, 'views_count': 910, 'last_post_by': 'In Omnibus', 'last_post_time': '2016-08-28T07:05:00'}
{'Post_subject': 'Before making a Suggestion...', 'Author': 'Wayne Luke', 'created_at': '2012-09-09T06:57:00', 'responses': 0, 'views_count': 946, 'last_post_by': 'Wayne Luke', 'last_post_time': '2012-09-09T06:57:00'}
{'Post_subject': 'The clean way to upgrade', 'Author': 'Veniamin', 'created_at': '2012-09-28T