# Web Mining & Coginitive Computing - MSc 2025 - FS2024
## Group Assignment 1

Authors: **Bachem**, Kilian;
**Mohr**, Otis

In [82]:
# Load all packages needed for the code below

import requests # Needed for sending GET requests to the NBB website
from bs4 import BeautifulSoup # Needed for finding and extracting elements from the request response
import pandas as pd # Needed for generating/analyzing DataFrames
from datetime import datetime # Needed for Timestamps

In [83]:
# Define dict variable 'links', which holds the NBB website URLs for each notebook brand
links = {
    "acer": "https://www.notebooksbilliger.de/notebooks/acer+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "apple": "https://www.notebooksbilliger.de/notebooks/apple+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "asus": "https://www.notebooksbilliger.de/notebooks/asus+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "dell": "https://www.notebooksbilliger.de/notebooks/dell+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "hp": "https://www.notebooksbilliger.de/notebooks/hp+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "huawei": "https://www.notebooksbilliger.de/notebooks/huawei+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "lenovo": "https://www.notebooksbilliger.de/notebooks/lenovo+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "msi": "https://www.notebooksbilliger.de/notebooks/msi+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle",
    "samsung": "https://www.notebooksbilliger.de/notebooks/samsung+notebooks/page/1?perPage=50&sort=popularity&order=desc&availability=alle"
}


In [4]:
# Define a custom request header that is used by the requests package to retrieve data from the NBB website. 
# The header suggests to the website that we are retrieving information from a regular browser.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"}

In [86]:
### SCRAPING THE DATA FROM THE WEBSITE ###


# Dictionary to store the request response for each brand webpage
responses = {}


# Next, iterate through all links, try retrieving the content and save them in the responses variable if the server responds with 200.
for brand, url in links.items():
    # Make a GET request to the URL
    req = requests.get(url, headers=headers)

    # For the subsequent steps, record the date of retrieval in a variable 'date_retrieved'
    date_retrieved = datetime.now().strftime("%Y-%m-%d")

    # ONLY if the server responds with 'OK' (in other words, the request was successful), go ahead and save the data.
    if req.status_code == 200:
        # Save the response under a variable name equal to the respective key value
        responses[brand] = req
        print(f"Completed request for {brand}")
        time.sleep(8) # Delay of 8 seconds between queries for compliance
    else:
        # If the request fails, notify us that the request was unsuccessful
        print(f"Error {req.status_code} received when trying to retrieve data for {brand}")
        time.sleep(8) # Delay of 8 seconds between queries for compliance

Completed request for acer
Completed request for apple
Completed request for asus
Completed request for dell
Completed request for hp
Completed request for huawei
Completed request for lenovo
Completed request for msi
Completed request for samsung


In [87]:
### DATA EXTRACTION ###

# Create a new list in which the products for all brands will be stored later. Pre-fill the date at which the request was performed.
# The actual data will be stored in a list in the 'data' key.
all_data = {
    'date-retrieved': date_retrieved,
    'data': []
}
    
# Iterate through the server response for each brand page and filter out the relevant data of the first 10 products.
for brand, data in responses.items():

    # Create a BeautifulSoup object from the request html code
    soup = BeautifulSoup(data.text, 'html')

    # Assuming that every product is inside a HTML div  with the class 'product-card__content-wrapper'.
    # Find 10 elements max which match the criteria, and save them in a new variable, 'products'.
    products = soup.find_all('div', class_='product-card__content-wrapper', limit=10)

    # Create a (temporary) list that will collect the product entries for one brand
    brand_data = []

    # Iterate through the products identified in the current soup object and extract only the desired data
    for product in products:

        # The vendor is already known, no need to extract it again.

        # First, find the product title in the div with the class name 'product-card__product-heading-title
        title_element = product.find('div', {'class':'product-card__product-heading-title'})
        # Strip only the text and save it in the 'title' variable. If the title_element is empty, return a "Not Found" failsafe
        title = title_element.text.strip() if title_element else "Title Not Found"

        # Exact same method for the product price, also with a failsafe.
        price_element = product.find('div', {'class':'js-product-price'})
        price = price_element.text.strip() if price_element else "Price Not Found"

        # Exact same method for the product availability, also with a failsafe.
        availability_element = product.find('span', {'class':'product-detail__availability'})
        availability = availability_element.text.strip() if availability_element else "Availability Not Found"

        # Then, extract the evaluation (star rating). This is a bit more tricky, since we are only interested in the sum of filled stars.
        # Thus, we count the number of star images in the rating which are ending in 'full.svg', therefore leaving out empty stars.
        rating_images = product.findAll('img', {'class': 'rating__star'})
        evaluation = sum(1 for img in rating_images if 'full.svg' in img['src'])

        # Lastly, extract the rating count. Same method as in the beginning, with a failsafe.
        # Also, getting rid of the parentheses and converting String into int
        ratingcount_element = product.find('span', {'class':'rating__count'})
        ratingcount = int(ratingcount_element.text.strip().replace('(','').replace(')','')) if ratingcount_element else "Rating Not Found"

        
        # Quick Console Output Tester. Uncomment to use. CONSIDER REMOVING THIS BEFORE HANDING IN THE ASSIGNMENT.
        # print(f"{title}: {price}. Availability: {availability}\nRating: {evaluation} based on {ratingcount} ratings\n")


        # For each item, create a (temporary) dictionary which aggregates the extracted information...
        item_data = {
            'title': title,
            'price': price,
            'availability': availability,
            'evaluation': evaluation,   
            'ratingcount': ratingcount
        }

        #... and then add it to the list of items for the brand in the current iteration.
        brand_data.append(item_data)

    # After all items have been extracted for one brand, add the list of items to the all_data variable, in form of a dict.
    brand_dict = {
        'brand': brand,
        'items': brand_data
    }
    all_data['data'].append(brand_dict)
        

In [88]:
### SAVING DATAFRAME AS CSV ###


# Erstellung eines DataFrames aus der Gesamtliste
df = pd.DataFrame(all_data)

# Speichern des DataFrames in einer CSV-Datei
csv_file_name = 'nbb_notebooks_allbrands.csv'
df.to_csv(csv_file_name, index=False)
print(f"Data saved to {csv_file_name}")

PermissionError: [Errno 13] Permission denied: 'nbb_notebooks_allbrands.csv'

In [81]:
df

Unnamed: 0,date-retrieved,data
0,2024-03-21 11:33:12,"{'brand': 'acer', 'items': [{'title': 'Acer Ex..."
1,2024-03-21 11:33:12,"{'brand': 'apple', 'items': [{'title': 'Apple ..."
2,2024-03-21 11:33:12,"{'brand': 'asus', 'items': [{'title': 'ASUS RO..."
3,2024-03-21 11:33:12,"{'brand': 'dell', 'items': [{'title': 'Dell La..."
4,2024-03-21 11:33:12,"{'brand': 'hp', 'items': [{'title': 'HP 250 G9..."
5,2024-03-21 11:33:12,"{'brand': 'huawei', 'items': [{'title': 'HUAWE..."
6,2024-03-21 11:33:12,"{'brand': 'lenovo', 'items': [{'title': 'Lenov..."
7,2024-03-21 11:33:12,"{'brand': 'msi', 'items': [{'title': 'MSI Thin..."
8,2024-03-21 11:33:12,"{'brand': 'samsung', 'items': [{'title': 'SAMS..."
