# Web Mining & Cognitive Computing - MSc 2025 - FS2024
## Group Assignment 1

Authors: **Bachem**, Kilian;
**Mohr**, Otis

In [1]:
# Load all packages needed for the code below

import requests # Needed for sending GET requests to the NBB website
from bs4 import BeautifulSoup # Needed for finding and extracting elements from the request response
import pandas as pd # Needed for generating/analyzing DataFrames
import time # Needed for the retrieval date stamp (time.strftime) and the pauses between requests (time.sleep)

In [2]:
# Define dict variable 'links', which holds the NBB website URLs for each notebook brand.
# Every brand page uses the same URL layout; only the lower-cased brand name in the path
# differs. The URLs are therefore generated from a single template, so that query
# parameters (page size, sorting, availability filter) are changed in exactly one place.
NBB_BRANDS = ("Acer", "Apple", "Asus", "Dell", "HP", "Huawei", "Lenovo", "MSI", "Samsung")
NBB_LISTING_URL = (
    "https://www.notebooksbilliger.de/notebooks/{slug}+notebooks/page/1"
    "?perPage=50&sort=popularity&order=desc&availability=alle"
)

# Keys keep the display spelling of the brand (e.g. "HP"); the URL uses the lower-cased form ("hp").
links = {brand: NBB_LISTING_URL.format(slug=brand.lower()) for brand in NBB_BRANDS}


In [3]:
# Custom request header handed to the requests package for every call to the NBB website.
# Its User-Agent value is that of a regular desktop Firefox browser, so the website is told
# that the request originates from a normal browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) "
        "Gecko/20100101 Firefox/123.0"
    ),
}

In [6]:
### SCRAPING THE DATA FROM THE WEBSITE ###

# Pause (in seconds) between two consecutive queries, for compliance.
REQUEST_DELAY_SECONDS = 8
# Upper bound (in seconds) for connecting to / waiting on the server.
# Without a timeout, requests.get() may wait forever on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

# Dictionary to store the request response for each brand webpage
responses = {}

# Record the date of retrieval once, before any request is sent. Later cells read
# 'date_retrieved', so it has to exist even if no request succeeds, and it must be
# the same for all brands of one run.
date_retrieved = time.strftime('%Y-%m-%d')

# Iterate through all links, try retrieving the content and keep the response only
# if the server answers with 200.
for position, (brand, url) in enumerate(links.items(), start=1):
    try:
        # Make a GET request to the URL
        req = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as exc:
        # Connection problems, timeouts etc.: report them and carry on with the next brand
        # instead of aborting the whole run.
        print(f"Request for {brand} failed: {exc}")
    else:
        if req.status_code == 200:
            # ONLY if the server responds with 'OK', save the response under the brand name.
            responses[brand] = req
            print(f"Completed request for {brand}")
        else:
            # If the request fails, notify us that the request was unsuccessful
            print(f"Error {req.status_code} received when trying to retrieve data for {brand}")

    # Wait between queries for compliance; no need to wait after the final one.
    if position < len(links):
        time.sleep(REQUEST_DELAY_SECONDS)

Completed request for Acer
Completed request for Apple
Completed request for Asus
Completed request for Dell
Completed request for HP
Completed request for Huawei
Completed request for Lenovo
Completed request for MSI
Completed request for Samsung


In [7]:
### DATA EXTRACTION ###

# Maximum number of products taken from each brand page.
MAX_PRODUCTS_PER_BRAND = 10


def _div_text(product, css_class, fallback):
    """Return the stripped text of the first <div> with class `css_class` inside `product`.

    `fallback` is returned when no such element exists.
    """
    element = product.find('div', {'class': css_class})
    return element.text.strip() if element is not None else fallback


def _availability_text(product):
    """Return the availability string of one product card.

    The text can be spread over several elements inside the
    'product-card__availability' div. Only the *direct* children are visited:
    the text of a child already contains the text of everything nested in it,
    so visiting every descendant would repeat nested text.
    """
    wrapper = product.find('div', class_='product-card__availability')
    if wrapper is None:
        return "Availability not found"

    parts = [child.get_text(strip=True) for child in wrapper.find_all(recursive=False)]
    if not parts:
        # No nested elements at all: fall back to whatever text the wrapper itself holds,
        # so that a value from a previous product can never leak into this one.
        return wrapper.get_text(strip=True)
    return ''.join(parts)


def _filled_star_count(product):
    """Count the rating star images whose source contains 'full.svg' (empty stars are left out)."""
    stars = product.find_all('img', {'class': 'rating__star'})
    # .get() instead of ['src']: an <img> without a src attribute counts as "not a full star"
    # rather than raising a KeyError.
    return sum(1 for img in stars if 'full.svg' in (img.get('src') or ''))


def _rating_count(product):
    """Return the number of ratings as int, or "Rating Not Found" if it cannot be determined.

    The count is rendered inside parentheses, e.g. "(4)". Only the digits are kept, so
    parentheses, whitespace and separator characters do not break the int conversion.
    """
    element = product.find('span', {'class': 'rating__count'})
    if element is None:
        return "Rating Not Found"
    digits = ''.join(ch for ch in element.text if ch in '0123456789')
    return int(digits) if digits else "Rating Not Found"


# List in which the products for all brands are collected (one dict per product).
all_data = []

# Iterate through the server response for each brand page and filter out the relevant
# data of the first MAX_PRODUCTS_PER_BRAND products.
for brand, data in responses.items():

    # Create a BeautifulSoup object from the response HTML. The parser is named explicitly
    # ('html.parser' ships with Python) so the result does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(data.text, 'html.parser')

    # Assuming that every product is inside a HTML div with the class 'product-card__content-wrapper'.
    products = soup.find_all('div', class_='product-card__content-wrapper', limit=MAX_PRODUCTS_PER_BRAND)

    # Build one record per product and append all records of this brand to the overall list.
    all_data.extend(
        {
            'date-retrieved': date_retrieved,  # Directly filled in from global variable (date of the GET request)
            'vendor': brand,  # Directly filled in from loop parameter
            'title': _div_text(product, 'product-card__product-heading-title', "Title Not Found"),
            'price': _div_text(product, 'js-product-price', "Price Not Found"),
            'availability': _availability_text(product),
            'evaluation': _filled_star_count(product),
            'ratingcount': _rating_count(product),
        }
        for product in products
    )
        

In [8]:
### SAVING DATAFRAME AS CSV ###


# Turn the collected product records (all_data) into a single pandas DataFrame.
df = pd.DataFrame(data=all_data)

# Write the DataFrame to a .csv file without the index column. The 'utf-8-sig' encoding
# keeps characters like € intact.
# The retrieval date is part of the file name, so files from different dates do not
# overwrite each other.
csv_file_name = f'nbb_notebooks_allbrands_{date_retrieved}.csv'
df.to_csv(path_or_buf=csv_file_name, encoding='utf-8-sig', index=False)
print(f"Data saved to {csv_file_name}")

Data saved to nbb_notebooks_allbrands_2024-03-21.csv


In [9]:
### PRICE COMPARISON BETWEEN TWO DATASETS ###

# to be added at a later stage. #


In [10]:
# Show the scraped table as the cell output for a visual check of the extracted columns.
df

Unnamed: 0,date-retrieved,vendor,title,price,availability,evaluation,ratingcount
0,2024-03-21,Acer,Acer Aspire 3 (A317-53-7117),"579,00 €",Sofort ab Lager/Express,4,4
1,2024-03-21,Acer,Acer Extensa 215 (EX215-55-30UU),"319,00 €",Sofort ab Lager/Express,1,2
2,2024-03-21,Acer,Acer Aspire 5 (A515-57G-541Q),"549,00 €",Sofort ab Lager/Express,4,4
3,2024-03-21,Acer,Acer Aspire 5 (A517-58GM-72YC),"1.199,00 €",Sofort ab Lager/Express,3,2
4,2024-03-21,Acer,Acer Chromebook 314 (CB314-2HT-K4GV),"199,00 €",Sofort ab Lager/Express,4,9
...,...,...,...,...,...,...,...
85,2024-03-21,Samsung,SAMSUNG Galaxy Book3 360 B-Ware,"835,99 €",Sofort ab Lager/Express,0,0
86,2024-03-21,Samsung,SAMSUNG Galaxy Book3 Ultra B-Ware,"2.351,02 €",Sofort ab Lager/Express,4,1
87,2024-03-21,Samsung,SAMSUNG Galaxy Book3 Ultra B-Ware,"2.015,99 €",Sofort ab Lager/Express,4,1
88,2024-03-21,Samsung,SAMSUNG Galaxy Book3 Pro,"1.599,00 €",Sofort ab Lager,0,0
