# QA Data Mining: Serverless Forum

In [174]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os

# For the web crawler
import time
from selenium import webdriver

%matplotlib inline

In [2]:
# Seed URLs: the main listing page of every forum category to crawl
seed_urls = [
    'https://forum.serverless.com/c/serverless-framework/5',
    'https://forum.serverless.com/c/serverless-architectures/6',
    'https://forum.serverless.com/c/event-gateway/8',
]

In [21]:
# Single-category seed list (serverless-framework only).
# NOTE(review): no other visible cell references `qa_url` — confirm it is still needed.
qa_url = ['https://forum.serverless.com/c/serverless-framework/5']

In [4]:
# Build the list of human-readable category names, one per seed URL
category_list = []
for url in seed_urls:
    # The category slug is the fifth '/'-separated piece of the URL;
    # its hyphens become spaces ('event-gateway' -> 'event gateway')
    qa_category = url.split('/')[4].replace('-', ' ')
    category_list.append(qa_category)
    print(url)
print(category_list)

https://forum.serverless.com/c/serverless-framework/5
https://forum.serverless.com/c/serverless-architectures/6
https://forum.serverless.com/c/event-gateway/8
['serverless framework', 'serverless architectures', 'event gateway']


Each QA topic is identified by its href and title. To get the questions and answers, I have to iterate through every question link. First, all topic headers are in a `<table>` element with `class="topic-list ember-view"`.
All of the topic information is in the `<tbody>` with `class="topic-list-body"`. For each `<tr>` we will collect the following data:
* `id="ember109"`: this contains the data id of each QA. It is useful for cleaning the dataset.
* `<span class="link-top-line">` > `<a class="title raw-link raw-topic-link">`, which contains the URL of each QA. We need it to visit each question and collect the rest of the content and the replies.
*  `href="/t/base-configuration-for-multiple-resources/20103"`: this is the URL in the `<a>` element.
*  The title string inside `<a class="title raw-link raw-topic-link">…</a>`.
*  `<a class="discourse-tag simple">` to collect the tag name, e.g. `data-tag-name="variables"`.

In [64]:
# Scroll function
# This function takes two arguments. The driver that is being used and a timeout.
# The driver is used to scroll and the timeout is used to wait for the page to load.

def scroll(driver, timeout):
    """Keep scrolling to the bottom of the page until its height stops growing.

    Args:
        driver: Object exposing ``execute_script(js)`` (a Selenium WebDriver
            in this notebook).
        timeout: Seconds to sleep after each scroll so new content can load.

    Returns:
        None. The function returns once two consecutive height measurements
        are equal.
    """
    height_script = "return document.body.scrollHeight"

    # Height before the first scroll is the baseline for comparison
    previous_height = driver.execute_script(height_script)

    while True:
        # Jump to the current bottom of the document
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page time to append more content
        time.sleep(timeout)

        current_height = driver.execute_script(height_script)
        if current_height == previous_height:
            # Nothing new was added by the last scroll: we are done
            return
        previous_height = current_height

In [88]:
def extract_data(seed_urls, scroll_pause=4):
    """Scrape the topic listing of every category page into one DataFrame.

    For each seed URL a Firefox WebDriver session is opened, the page is
    scrolled with ``scroll`` until it stops growing, and every
    ``<td class="main-link">`` cell in the loaded HTML is turned into a row.

    Args:
        seed_urls: Iterable of category URLs. The category name is read from
            the fifth '/'-separated piece of each URL, with hyphens replaced
            by spaces.
        scroll_pause: Seconds passed to ``scroll`` as the wait between
            scrolls. Defaults to 4.

    Returns:
        pandas.DataFrame with columns ``id``, ``title``, ``url``,
        ``category`` and ``tags`` (a list of tag strings per row). The frame
        is empty, but still has these columns, when nothing was scraped.

    Raises:
        IndexError: If a seed URL has fewer than five '/'-separated pieces.
    """
    columns = ['id', 'title', 'url', 'category', 'tags']
    title_link_attrs = {"class": "title raw-link raw-topic-link"}

    qa_data = []
    for url in seed_urls:
        # Parse the category before launching the browser so that a malformed
        # URL cannot leave a driver process running.
        qa_category = url.split('/')[4].replace('-', ' ')

        driver = webdriver.Firefox()
        try:
            driver.get(url)
            scroll(driver, scroll_pause)
            # Grab the HTML only after all lazily loaded rows are present
            page_source = driver.page_source
        finally:
            # Always close the WebDriver session, even if loading/scrolling failed
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        for cell in soup.find_all('td', class_="main-link"):
            link = cell.find('a', attrs=title_link_attrs)
            if link is None or not link.get('href'):
                # No topic link in this cell: nothing to record for it
                continue
            link_url = link['href']

            # The tag container only exists when the topic has at least one tag
            tag_line = cell.find('div', 'discourse-tags')
            tags = [] if tag_line is None else [a.text for a in tag_line.select('a')]

            qa_data.append({
                'id': link_url.split('/')[-1],  # last path piece of the href
                'title': link.text,
                'url': link_url,
                'category': qa_category,
                'tags': tags,
            })

    # Passing `columns` fixes the column order and keeps the schema when empty
    return pd.DataFrame(qa_data, columns=columns)

In [111]:
%%time
# Crawl every seed category (one Firefox session per category).
# The recorded run below took about 12 minutes of wall time.
results = extract_data(seed_urls)
results

CPU times: total: 8.66 s
Wall time: 12min 1s


Unnamed: 0,id,title,url,category,tags
0,18,About the Serverless Framework category,/t/about-the-serverless-framework-category/18,serverless framework,[]
1,20103,Base configuration for multiple resources,/t/base-configuration-for-multiple-resources/2...,serverless framework,[variables]
2,20102,Using existing buckets creates lambda function...,/t/using-existing-buckets-creates-lambda-funct...,serverless framework,[lambda]
3,20094,CredentialProviderError - migrating to AWS v3 sdk,/t/credentialprovidererror-migrating-to-aws-v3...,serverless framework,[]
4,20096,How do i set X-Content-Type-Options header in ...,/t/how-do-i-set-x-content-type-options-header-...,serverless framework,[aws]
...,...,...,...,...,...
4952,3573,"Req.body is null (using express, body-parser, ...",/t/req-body-is-null-using-express-body-parser-...,event gateway,"[aws, lambda]"
4953,3260,How to move context requestId and caller to th...,/t/how-to-move-context-requestid-and-caller-to...,event gateway,[aws]
4954,3208,Event gatway/ local development - Confused sli...,/t/event-gatway-local-development-confused-sli...,event gateway,[aws]
4955,2998,Is Event Gateway truly Serverless?,/t/is-event-gateway-truly-serverless/2998,event gateway,[]


In [None]:
# Keep the scrape output in `results`; the cleaning cells below work on `data`
data = pd.DataFrame(results)

In [113]:
# Remove the "About the ... category" header topics from the scraped list
noise_corpus = ["About the Serverless Framework category", "About the Serverless Architectures category", "About the Event Gateway category"]
for title in noise_corpus:
    # regex=False: the titles are literal text, not patterns.
    # na=False: a missing title counts as "not noise" and keeps the mask boolean,
    # so `~` below cannot fail on NaN.
    noise_df = data['title'].str.contains(title, regex=False, na=False)
    data = data[~noise_df]
# Renumber the rows 0..n-1 (returns a new frame instead of mutating in place)
data = data.reset_index(drop=True)

In [219]:
# Reload the topic list from 'discourse_questions.csv'.
# NOTE(review): in top-to-bottom order this read comes before the cell that
# writes the file, so a fresh Restart & Run All relies on a CSV left over from
# an earlier run — confirm the intended cell order.
# NOTE(review): after the CSV round trip `tags` appears to hold strings such as
# "['aws', 'lambda']" rather than lists (see the quoted values in the output).
data = pd.read_csv('discourse_questions.csv')

In [220]:
data

Unnamed: 0,id,title,url,category,tags
0,20103,Base configuration for multiple resources,/t/base-configuration-for-multiple-resources/2...,serverless framework,['variables']
1,20102,Using existing buckets creates lambda function...,/t/using-existing-buckets-creates-lambda-funct...,serverless framework,['lambda']
2,20094,CredentialProviderError - migrating to AWS v3 sdk,/t/credentialprovidererror-migrating-to-aws-v3...,serverless framework,[]
3,20096,How do i set X-Content-Type-Options header in ...,/t/how-do-i-set-x-content-type-options-header-...,serverless framework,['aws']
4,11726,Is there a way to delete my account?,/t/is-there-a-way-to-delete-my-account/11726,serverless framework,[]
...,...,...,...,...,...
4949,3573,"Req.body is null (using express, body-parser, ...",/t/req-body-is-null-using-express-body-parser-...,event gateway,"['aws', 'lambda']"
4950,3260,How to move context requestId and caller to th...,/t/how-to-move-context-requestid-and-caller-to...,event gateway,['aws']
4951,3208,Event gatway/ local development - Confused sli...,/t/event-gatway-local-development-confused-sli...,event gateway,['aws']
4952,2998,Is Event Gateway truly Serverless?,/t/is-event-gateway-truly-serverless/2998,event gateway,[]


In [166]:
# Save the topic list to CSV (index column omitted) so it can be reloaded
# with pd.read_csv instead of repeating the browser crawl
data.to_csv('discourse_questions.csv', index=False)

In [221]:
# Relative topic paths (e.g. '/t/<slug>/<id>'); extract_post prepends the
# forum domain to each one before requesting it
post_urls = data['url']
post_urls

0       /t/base-configuration-for-multiple-resources/2...
1       /t/using-existing-buckets-creates-lambda-funct...
2       /t/credentialprovidererror-migrating-to-aws-v3...
3       /t/how-do-i-set-x-content-type-options-header-...
4            /t/is-there-a-way-to-delete-my-account/11726
                              ...                        
4949    /t/req-body-is-null-using-express-body-parser-...
4950    /t/how-to-move-context-requestid-and-caller-to...
4951    /t/event-gatway-local-development-confused-sli...
4952            /t/is-event-gateway-truly-serverless/2998
4953               /t/increase-lambda-execution-time/2688
Name: url, Length: 4954, dtype: object

In [222]:
domain = "https://forum.serverless.com"
def extract_post(post_urls, domain, percent_increment=5, timeout=30):
    """Download the opening post of every topic, one output row per URL.

    Each entry of ``post_urls`` is appended to ``domain`` and fetched with
    ``requests.get``. The ``<div id="post_1">`` element of the response is
    parsed for the publication time and the post text.

    Exactly one row is produced for every URL, in input order. When the page
    has no ``post_1`` element (or the time / text element is missing) the
    corresponding fields are ``None``, so the result always lines up
    row-for-row with ``post_urls`` and can be joined to it by position.

    Args:
        post_urls: Sized iterable of URL paths (e.g. ``'/t/<slug>/<id>'``).
        domain: Scheme and host that each path is appended to.
        percent_increment: Print a progress message each time at least this
            many additional percent of the URLs have been processed.
        timeout: Seconds ``requests.get`` may wait before giving up.

    Returns:
        pandas.DataFrame with columns ``content``, ``post_time`` and
        ``scaped_time`` (ISO-8601 timestamp of when the row was scraped; the
        column name is kept as-is because the saved dataset uses it).

    Raises:
        Exceptions raised by ``requests.get`` (including timeouts) are not
        caught and propagate to the caller.
    """
    # Local import so the function works without relying on another cell
    from datetime import datetime

    columns = ['content', 'post_time', 'scaped_time']
    post_data = []  # One dict per URL, in input order

    total_row = len(post_urls)
    processed_percent = 0  # Last percentage that was reported

    for processed_rows, url_tail in enumerate(post_urls, start=1):
        # Full address of the topic page
        response = requests.get(domain + url_tail, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        first_post = soup.find('div', id='post_1')

        post_time = None
        post_content = None
        if first_post is not None:
            time_tag = first_post.find('time', attrs={'class': 'post-time'})
            if time_tag is not None:
                # Published date of the post
                post_time = time_tag.get('datetime')
            body_tag = first_post.find('div', attrs={'itemprop': 'text'})
            if body_tag is not None:
                post_content = body_tag.text.strip()

        # Always append, even when the post could not be parsed, to keep the
        # rows aligned with post_urls
        post_data.append({
            'content': post_content,
            'post_time': post_time,
            'scaped_time': datetime.now().isoformat(),
        })

        # Integer arithmetic avoids float rounding just below a whole percent
        percent_done = processed_rows * 100 // total_row
        if percent_done >= processed_percent + percent_increment:
            processed_percent = percent_done  # Update progress
            latest_row = post_data[-1]['post_time']
            print(f'Processed {processed_percent}% of the data')
            print(f'The latest data in post_time row: {latest_row}')

    # Passing `columns` fixes the column order and keeps the schema when empty
    return pd.DataFrame(post_data, columns=columns)
            

In [223]:
%%time
# Fetch the first post of every topic (one HTTP request per URL).
# The later column-wise concat expects this to line up row-for-row with `data`.
post_df = extract_post(post_urls, domain)
post_df

Processed 5% of the data
The latest data in post_time row: 2021-08-29T13:33:07Z
Processed 10% of the data
The latest data in post_time row: 2022-08-23T19:44:09Z
Processed 15% of the data
The latest data in post_time row: 2022-01-16T03:35:34Z
Processed 20% of the data
The latest data in post_time row: 2020-01-02T06:51:21Z
Processed 25% of the data
The latest data in post_time row: 2020-06-15T12:00:53Z
Processed 30% of the data
The latest data in post_time row: 2020-12-30T18:29:33Z
Processed 35% of the data
The latest data in post_time row: 2020-06-29T16:54:31Z
Processed 40% of the data
The latest data in post_time row: 2020-05-26T01:19:35Z
Processed 45% of the data
The latest data in post_time row: 2017-09-27T02:43:28Z
Processed 50% of the data
The latest data in post_time row: 2019-09-18T01:02:10Z
Processed 55% of the data
The latest data in post_time row: 2019-05-05T12:59:02Z
Processed 60% of the data
The latest data in post_time row: 2019-01-22T08:39:27Z
Processed 65% of the data
The

Unnamed: 0,content,post_time,scaped_time
0,"Hi,\nI am attempting to consolidate the boiler...",2024-04-18T10:17:01Z,2024-04-21T03:24:21.082346
1,I have an existing S3 bucket and a lambda func...,2024-04-18T08:10:55Z,2024-04-21T03:24:21.981367
2,"When I migrated my app from AWS v2 to v3 sdk, ...",2024-04-16T06:56:26Z,2024-04-21T03:24:22.855259
3,I’m trying t set the X-Content-Type-Options he...,2024-04-16T16:43:16Z,2024-04-21T03:24:23.734330
4,Is there a way to delete my account?,2020-06-09T11:21:51Z,2024-04-21T03:24:24.616235
...,...,...,...
4949,"Hello,\nI followed the steps listed in this tu...",2018-01-22T15:58:09Z,2024-04-21T04:37:16.309613
4950,We’re trying to encrypt our payload before sen...,2017-12-06T03:09:23Z,2024-04-21T04:37:17.203009
4951,"Hi,\nI just read the blog post on the event ga...",2017-11-28T16:34:04Z,2024-04-21T04:37:18.148581
4952,I’ve been through the examples and have to say...,2017-11-01T02:45:27Z,2024-04-21T04:37:18.974628


In [None]:
# Convert new data to DataFrame
new_df = pd.DataFrame(post_df)

# pd.concat(axis=1) pairs rows by index label, so the join is only meaningful
# when both frames hold exactly one row per topic, in the same order. Fail
# loudly instead of silently pairing posts with the wrong topics.
if len(new_df) != len(data):
    raise ValueError(
        f'Row count mismatch: {len(data)} topics but {len(new_df)} scraped posts; '
        'cannot join them by position'
    )

# Concatenate the existing DataFrame with the new DataFrame along the columns axis.
# Both indexes are reset so rows are matched purely by position.
updated_df = pd.concat(
    [data.reset_index(drop=True), new_df.reset_index(drop=True)],
    axis=1,
)

In [228]:
# Display the updated DataFrame
updated_df.head(10)

Unnamed: 0,id,title,url,category,tags,content,post_time,scaped_time
0,20103,Base configuration for multiple resources,/t/base-configuration-for-multiple-resources/2...,serverless framework,['variables'],"Hi,\nI am attempting to consolidate the boiler...",2024-04-18T10:17:01Z,2024-04-21T03:24:21.082346
1,20102,Using existing buckets creates lambda function...,/t/using-existing-buckets-creates-lambda-funct...,serverless framework,['lambda'],I have an existing S3 bucket and a lambda func...,2024-04-18T08:10:55Z,2024-04-21T03:24:21.981367
2,20094,CredentialProviderError - migrating to AWS v3 sdk,/t/credentialprovidererror-migrating-to-aws-v3...,serverless framework,[],"When I migrated my app from AWS v2 to v3 sdk, ...",2024-04-16T06:56:26Z,2024-04-21T03:24:22.855259
3,20096,How do i set X-Content-Type-Options header in ...,/t/how-do-i-set-x-content-type-options-header-...,serverless framework,['aws'],I’m trying t set the X-Content-Type-Options he...,2024-04-16T16:43:16Z,2024-04-21T03:24:23.734330
4,11726,Is there a way to delete my account?,/t/is-there-a-way-to-delete-my-account/11726,serverless framework,[],Is there a way to delete my account?,2020-06-09T11:21:51Z,2024-04-21T03:24:24.616235
5,20090,Serverless › Authorization is currently down,/t/serverless-authorization-is-currently-down/...,serverless framework,[],"Hi,\nWhen I try to deploy my app I get the fol...",2024-04-15T23:17:51Z,2024-04-21T03:24:25.482221
6,20080,Cannot log in to app.serverless.com,/t/cannot-log-in-to-app-serverless-com/20080,serverless framework,[],I cannot log in to app.serverless.com. I know ...,2024-04-10T05:57:59Z,2024-04-21T03:24:26.352198
7,20084,Sls package on an M1 MacBook gives a docker ru...,/t/sls-package-on-an-m1-macbook-gives-a-docker...,serverless framework,[],I have installed the serverless framework on m...,2024-04-11T14:32:24Z,2024-04-21T03:24:27.269003
8,20077,Error on Deploy: Request Entity Too Large,/t/error-on-deploy-request-entity-too-large/20077,serverless framework,"['aws', 'lambda', 'api-gateway', 'cicd']","Hi, I am trying to deploy a Sls NodeJS applica...",2024-04-08T13:24:40Z,2024-04-21T03:24:28.120206
9,19616,No module named ‘pydantic_core._pydantic_core’,/t/no-module-named-pydantic-core-pydantic-core...,serverless framework,"['aws', 'lambda']",I am trying to deploy a python lambda with the...,2023-10-24T02:13:23Z,2024-04-21T03:24:29.017074


In [227]:
# Save the combined topic + first-post dataset to CSV (index column omitted)
updated_df.to_csv('discourse_data.csv', index=False)