## Web scraping code
A web crawler that extracts forum data (topic titles, tags, and posts) from the Beoworld forum.

In [1]:
import requests
from bs4 import BeautifulSoup
import time
# Bang & Olufsen support article address. NOTE(review): none of the forum
# scraping functions visible in this notebook read this constant.
URL = "https://support.bang-olufsen.com/hc/en-us/articles/360043398612-How-do-I-add-my-Beoplay-E8-to-the-Bang-Olufsen-app"


In [131]:
def retrieve_all_posts(url):
    """Fetch a forum topic page and extract its title, tags, and posts.

    Args:
        url: Address of a single topic page.

    Returns:
        A dict with the keys ``'header_name'`` (text of the topic title),
        ``'url'`` (the address that was fetched), ``'tags:'`` (list of tag
        strings; empty when the page has no tag block) and ``'posts'``
        (list of dicts with ``'number'``, ``'author'``, ``'role'`` and
        ``'text'``, in page order).

    Raises:
        IndexError: If the page has no ``h1`` title element, or a post
            lacks an author-name or author-role element.

    Errors raised by ``requests.get`` are not caught here and propagate
    to the caller.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

    # Getting post header
    header = soup.select('h1[class="entry-title"]')[0].text

    # Finding post tags. select_one yields None when the topic has no tag
    # block, so untagged topics produce an empty list instead of raising
    # IndexError on an empty select() result.
    tags = soup.select_one('div[class="bbp-topic-tags"]')
    if tags is not None:
        tags_list = [a_tag.text for a_tag in tags.find_all('a')]
    else:
        tags_list = []

    # Every element whose class attribute contains "loop-item" is treated
    # as one post.
    posts = soup.select('div[class*="loop-item"]')

    all_posts = []
    for i, post in enumerate(posts):
        author = post.select('span[class="bbp-author-name"]')[0].text
        role = post.select('div[class="bbp-author-role"]')[0].text

        # Remove all <blockquote> elements from the post, as there are
        # instances where they quote previous comments
        for blockquote in post.find_all('blockquote'):
            blockquote.decompose()

        # Concatenate all texts from a post message
        text = " ".join(p.text for p in post.select('p'))

        full_post = {
            'number': i,
            'author': author,
            'role': role,
            'text': text,
        }
        all_posts.append(full_post)

    # NOTE: the tags key is spelled 'tags:' (with a trailing colon). It is
    # kept unchanged so existing consumers of this dict keep working.
    forum_data = {
        'header_name': header,
        'url': url,
        'tags:': tags_list,
        'posts': all_posts,
    }

    return forum_data


def link_crawler(master_url, page_nr=1):
    """Collect topic links from consecutive listing pages of a forum.

    Pages are requested as ``f'{master_url}{page_nr}'`` with ``page_nr``
    increasing by one per request. Crawling stops at the first page that
    contains no topic permalink anchors.

    Args:
        master_url: URL prefix to which the page number is appended.
        page_nr: Number of the first page to request.

    Returns:
        List of ``href`` values of the topic permalink anchors, in the
        order they were encountered.

    Errors raised by ``requests.get`` are not caught here and propagate
    to the caller.
    """
    link_list = []
    while True:
        url = f'{master_url}{page_nr}'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        posts = soup.select('a[class="bbp-topic-permalink"]')

        # breaking loop when iterated over all site pages
        if not posts:
            print("no posts on page")
            break

        for anchor in posts:
            link = anchor.get('href')
            # Skip anchors without an href so that no None entries end up
            # in the result.
            if link:
                link_list.append(link)

        # simple monitoring: report once the page has actually been
        # processed, before moving on to the next page number
        if page_nr % 10 == 0:
            print(str(page_nr) + " scraped")

        page_nr += 1

    return link_list

        

def write_links_to_file(link_list, file_name='links.txt'):
    """Save links to a text file, one link per line.

    Args:
        link_list: Iterable of links to store.
        file_name: Path of the file to write; existing content is replaced.
    """
    with open(file_name, 'w') as out:
        out.writelines(f"{entry}\n" for entry in link_list)


def load_links_from_file(file_name):
    """Read a text file and return its lines as a list of links.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; blank lines are returned as empty strings.

    Args:
        file_name: Path of the file to read.

    Returns:
        List with one stripped string per line of the file.
    """
    links = []
    with open(file_name, 'r') as handle:
        for raw_line in handle:
            links.append(raw_line.strip())
    return links


def writer():
    """Placeholder for a routine that writes all the records.

    Not implemented yet: the function performs no work and returns
    ``None``.
    """
    return None




In [132]:
# Crawl every listing page of each forum and store the collected topic
# links in a text file named after the forum ("<forum_name>.txt").
forum_links = {
    "general_discussion": "https://forum.beoworld.org/forums/forum/general-forum/general-discussion-questions/page/",
    "workbench": "https://forum.beoworld.org/forums/forum/products-related-forums/the-workbench/page/",
}
for forum_name, root_link in forum_links.items():
    # root_link ends in "/page/"; link_crawler appends the page number.
    full_link_list = link_crawler(root_link)
    write_links_to_file(full_link_list, f"{forum_name}.txt")

10 scraped
20 scraped
30 scraped
40 scraped
50 scraped
60 scraped
70 scraped
no posts on page
10 scraped
no posts on page


In [105]:
# Scrape a single topic page as a quick check of retrieve_all_posts.
url = "https://forum.beoworld.org/forums/topic/beosound-core-connectivity-and-alternatives/" 
posts = retrieve_all_posts(url)



# NOTE(review): link_crawler is defined with a required `master_url`
# parameter, so this bare call raises TypeError as written. Presumably a
# leftover from experimentation -- confirm which forum URL was intended.
link_crawler()



#  BEOWORLD SUPPORT

In [19]:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Bang & Olufsen support article address. NOTE(review): defined here but not
# used in the active code of this cell; the browser is pointed at `red`.
URL = "https://support.bang-olufsen.com/hc/en-us/articles/360043398612-How-do-I-add-my-Beoplay-E8-to-the-Bang-Olufsen-app"
# Reddit thread that the browser session below opens.
red = "https://www.reddit.com/r/BangandOlufsen/comments/1b2s62m/beoplay_ex_is_nearly_perfect/"
# Create a Chrome WebDriver instance and load the Reddit thread in it.
driver = webdriver.Chrome()
driver.get(red)
# Disabled steps kept from earlier experiments against the support site:
# accepting the cookie banner and opening support section/article pages.
# cookie_button = driver.find_element(By.ID, 'acceptButton')
# cookie_button.click()
# driver.get("https://support.bang-olufsen.com/hc/en-us/sections/360007312332-Beoplay-E8")
# driver.implicitly_wait(2)
# driver.get("https://support.bang-olufsen.com/hc/en-us/articles/360020150597-Can-I-use-my-Beosound-Explore-outdoors")
