# Lego scraping

This notebook scrapes the title, URL, price and rating of the Lego sets listed under each theme on lego.com (en-gb pages) and exports the data to a `.csv` file.

## Imports

In [42]:
from pathlib import Path
from time import sleep

import pandas as pd
import requests as req
from bs4 import BeautifulSoup

## Setup

In [34]:
# Root of the Lego site; the relative URLs used in this notebook are appended to it.
BASE_URL = "https://www.lego.com"

## Scrape themes

In [35]:
# Path of the themes page, relative to BASE_URL.
themes_url = "/en-gb/themes"

In [36]:
# Fetch the themes page. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() fails fast on an HTTP error instead of
# letting an error page be parsed as if it were the themes listing.
response = req.get(BASE_URL + themes_url, timeout=30)
response.raise_for_status()

In [37]:
# Parse the HTML. The parser is named explicitly so the result does not depend
# on which optional parsers happen to be installed (and to avoid bs4's
# GuessedAtParserWarning); "html.parser" ships with Python.
soup = BeautifulSoup(response.content, "html.parser")

In [38]:
# Collect every <article> element on the themes page; each one is read as a theme below.
articles = soup.find_all("article")

In [39]:
# Build one {"name", "url"} record per theme article.
# NOTE(review): "hlipzx" looks like a generated CSS class name — presumably it
# can change when the site is redeployed; confirm it still matches before a re-run.
themes = [
    {
        "name": article.find(class_="hlipzx").text,
        "url": article.find("a")["href"],
    }
    for article in articles
]

## Scrape sets

In [137]:
def parse_set(lego_set):
    """Extract the title, URL, price and rating from one product element.

    Parameters
    ----------
    lego_set : bs4 Tag (or any object offering the same ``find`` API)
        Element holding the markup of a single product.

    Returns
    -------
    dict
        Keys ``"title"``, ``"url"``, ``"price"`` and ``"rating"``. A value is
        ``None`` when the corresponding element is absent from the markup, so
        one incomplete product does not abort the whole scrape.
    """
    # The first link carrying a data-test attribute provides the title and URL.
    title_block = lego_set.find("a", attrs={"data-test": True})

    # Match on the exact data-test value instead of filtering find_all()
    # results and indexing [0], which raised IndexError on a missing element.
    price_block = lego_set.find("span", attrs={"data-test": "product-price"})
    rating_container = lego_set.find("div", attrs={"data-test": "product-leaf-rating"})

    rating_block = None
    if rating_container is not None:
        # The rating is read from the title attribute of an inner div.
        rating_block = rating_container.find("div", attrs={"title": True})

    price = None
    # The price is the second child node of the price span; guard against a
    # span that has fewer children than expected.
    if price_block is not None and len(price_block.contents) > 1:
        price = price_block.contents[1]

    return {
        "title": title_block.text if title_block is not None else None,
        "url": title_block.get("href") if title_block is not None else None,
        "price": price,
        "rating": rating_block["title"] if rating_block is not None else None,
    }

In [138]:
def get_sets(theme, timeout=30, delay=1):
    """Scrape every set listed under one theme, following its paginated listing.

    Parameters
    ----------
    theme : dict
        Needs a ``"url"`` entry (path appended to ``BASE_URL``) and a
        ``"name"`` entry (copied into each returned record).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.
    delay : float, optional
        Seconds to pause between consecutive page requests.

    Returns
    -------
    list of dict
        One record per set: the fields produced by ``parse_set`` plus
        ``"theme"``.
    """
    listing_url = f"{BASE_URL}{theme['url']}"
    sets = []
    page = 1

    while True:

        # Load the next page; requests builds the "?page=N" query string.
        res = req.get(listing_url, params={"page": page}, timeout=timeout)

        # A non-200 response marks the end of the listing. Stop before parsing
        # so the body of an error page is never scanned for products.
        if res.status_code != 200:
            break

        # Find all the sets on this page.
        soup = BeautifulSoup(res.content, "html.parser")
        products = soup.find_all("div", attrs={"data-test": "product-leaf"})

        # A page without products also ends the listing; without this check
        # the loop would never terminate if out-of-range pages return 200.
        if not products:
            break

        sets.extend({**parse_set(p), "theme": theme["name"]} for p in products)
        page += 1
        sleep(delay)

    return sets

### Scrape sets by theme

In [139]:
# Gather the records of every theme into a single flat list.
holder = []

for theme in themes:
    holder += get_sets(theme)

In [140]:
# Number of set records collected across all themes.
len(holder)

1024

## Data export

In [142]:
# One row per scraped set; the columns come from the keys of the record dicts.
lego_sets = pd.DataFrame(holder)

In [143]:
# Preview the first rows before exporting.
lego_sets.head()

Unnamed: 0,title,url,price,rating,theme
0,Taj Mahal,/en-gb/product/taj-mahal-21056,£104.99,4.7,Architecture
1,Statue of Liberty,/en-gb/product/statue-of-liberty-21042,£89.99,4.6,Architecture
2,The White House,/en-gb/product/the-white-house-21054,£89.99,4.8,Architecture
3,Singapore,/en-gb/product/singapore-21057,£54.99,4.6,Architecture
4,Tokyo,/en-gb/product/tokyo-21051,£54.99,4.4,Architecture


In [145]:
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails here, after the whole scrape.
output_path = Path("./output/lego.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
lego_sets.to_csv(output_path, index=False)