# Create sample data

I wanted some sample data that was easier to understand than an actual end-user event data stream.

I thought a collection of files with simple colour data might be easier to use and understand;
it would also work well with the colourful diagram I have in mind.

I found a wonderful website with lots of example colours:
[Colours Explained](https://www.colorsexplained.com/meanings-of-the-colors/)

This little Jupyter notebook uses Requests to download the colour webpages and Beautiful Soup
to extract just the basic colour information.
It then saves all of this colour data into simple `.json.gz` compressed JSON files, which
I have uploaded to S3 for the blog.


In [1]:
!pip install pyfunctional



In [1]:
import gzip
import json
import re
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Generator

import requests
from bs4 import BeautifulSoup
from functional import seq  # type: ignore[reportMissingTypeStubs]

## Domain model

In [10]:
@dataclass(frozen=True)
class Colour:
    """An immutable record describing one named colour shade.

    Instances are what the page scraper in this notebook yields; they carry
    no identifier of their own.
    """

    # Name of the shade, e.g. "Carmine"
    name: str
    # Base colour the shade was listed under, e.g. "red"
    base: str
    # HEX colour code as text, e.g. "#960018"
    hex: str
    # Red, green and blue values, e.g. (150, 0, 24)
    rgb: tuple[int, int, int]


@dataclass(frozen=True)
class ColourId:
    """An immutable colour record that also carries an identifier.

    It holds the colour's name, base colour, HEX code and RGB triple plus an
    ``id`` string, and can turn itself into a plain dictionary.
    """

    id: str
    name: str
    base: str
    hex: str
    rgb: tuple[int, int, int]

    def toJSON(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary.

        Keys appear in the order id, name, base, hex, rgb. The ``rgb`` value
        is the stored tuple itself; it is not converted to a list.
        """
        field_names = ("id", "name", "base", "hex", "rgb")
        return {field_name: getattr(self, field_name) for field_name in field_names}

In [3]:
def add_id_to_colour(colour: Colour) -> ColourId:
    """Return a ``ColourId`` copy of ``colour`` tagged with a new random UUID.

    The identifier is a version-4 UUID rendered as a string, which makes the
    colour look more like an 'event'. The input object is not modified.
    """
    return ColourId(
        id=str(uuid.uuid4()),
        name=colour.name,
        base=colour.base,
        hex=colour.hex,
        rgb=colour.rgb,
    )

## Data extraction functions

These functions help parse the data in the HTML pages.

In [4]:
def extract_hex_colour(text: str) -> str | None:
    """Extract the HEX colour from the given text"""
    # Define the regular expression pattern to match the HTML HEX color
    hex_colour_pattern = r"#[0-9a-fA-F]{6}"

    # Use the re.findall() method to extract all matches of the pattern in the text
    match = re.search(hex_colour_pattern, text, re.IGNORECASE)
    return match.group(0) if match is not None else None


# Sanity checks for extract_hex_colour: a code embedded in a sentence is
# returned as written, and text without a code gives None.
# These run every time the cell is executed.
test_text = "This text contains a color code: #FFA500 and more text after"
result = extract_hex_colour(test_text)
assert result == "#FFA500", f"Expected #FFA500 but got '{result}'"
assert extract_hex_colour("No hex colour") is None, "Should have retrieved None result"

In [5]:
def extract_rgb_colour(text: str) -> tuple[int, int, int] | None:
    """Extract the RBG colour information from the given text"""
    # Define the regular expression pattern to match the RGB color
    rgb_colour_pattern = r"rgb\s*(\d{1,3}),\s*(\d{1,3}),\s*(\d{1,3})"

    # Use the re.search() method to find the first match of the pattern in the text
    match = re.search(rgb_colour_pattern, text, re.IGNORECASE)

    # If a match is found, extract the RGB color values from the match object
    if not match:
        return None

    # Turn the values into numbers
    red = int(match.group(1))
    green = int(match.group(2))
    blue = int(match.group(3))
    return (red, green, blue)


# Sanity check for extract_rgb_colour: a triple embedded in a sentence is
# returned as a tuple of ints.
# NOTE: 342 is not a valid 0-255 channel value; the check shows that the
# function accepts any number of one to three digits without range checking.
test_text = "This text contains a color code: rgb 1, 2, 342 and more text after"
result = extract_rgb_colour(test_text)
assert result == (1, 2, 342), f"Expected (1,2,342) but got '{result}'"

## Helper function to save a compressed JSON file

In [7]:
def save_to_json_gz_file(file_name: str, data: Any) -> None:
    """Write ``data`` as indented JSON into the gzip-compressed file ``file_name``.

    The JSON text uses an indent of 2 and is encoded as UTF-8 before being
    compressed. Missing parent directories of ``file_name`` are created.

    Args:
        file_name: Path of the ``.json.gz`` file to create or overwrite.
        data: Any value that ``json.dumps`` can serialise.

    Raises:
        TypeError: If ``data`` contains a value that is not JSON serialisable.
        OSError: If the directory or the file cannot be created or written.
    """
    # Serialise before touching the file system, so a serialisation error
    # cannot leave an empty or truncated archive behind.
    json_bytes = json.dumps(data, indent=2).encode("utf-8")

    # The target directory may not exist yet on a fresh checkout.
    Path(file_name).parent.mkdir(parents=True, exist_ok=True)

    with gzip.open(file_name, "wb") as file_out:
        file_out.write(json_bytes)

## Download and extract data from a colour webpage

In [6]:
def download_colour_data(
    base_colour: str,
    url: str,
    timeout: float = 30.0,
) -> Generator[Colour, None, None]:
    """Download the page at ``url`` and yield every colour found on it.

    This is a generator, so the page is only requested when iteration starts.

    Args:
        base_colour: Stored as the ``base`` of every yielded colour.
        url: Address of the colour page to download.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive site cannot hang the notebook indefinitely.

    Yields:
        A ``Colour`` for each ``<p class="has-white-color">`` paragraph that
        has a ``<strong>`` name and contains both a HEX and an RGB value.
        Paragraphs missing any of these are skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # The HTTP status is not checked here: a page without matching paragraphs
    # simply yields nothing.
    response = requests.get(url, timeout=timeout)

    # Use Beautiful Soup to parse the HTML content of the response
    soup = BeautifulSoup(response.content, "html.parser")

    for paragraph in soup.find_all("p", class_="has-white-color"):
        # The colour name is the text of the paragraph's <strong> element
        name_element = paragraph.find("strong")
        if not name_element:
            continue
        name = name_element.text

        # Both values are read from the paragraph's full text
        paragraph_text = paragraph.text
        colour_hex = extract_hex_colour(paragraph_text)
        rgb = extract_rgb_colour(paragraph_text)
        if colour_hex is None or rgb is None:
            continue

        # We have all the information needed to create the colour
        yield Colour(name, base_colour, colour_hex, rgb)


# Test our code against the live "shades of red" page.
# NOTE: this cell needs network access, and the expected values below are
# hard-coded, so it presumably fails if the site changes its content or
# ordering - TODO confirm before relying on it.
test_url = "https://www.colorsexplained.com/shades-of-red-color-names/"
result = download_colour_data("red", test_url)
# Only the first three colours from the generator are compared
actual_colours: list[Colour] = seq(result).take(3)  # type: ignore

expected1 = Colour("Carmine", "red", "#960018", (150, 0, 24))
expected2 = Colour("Burgundy", "red", "#8D021F", (141, 2, 31))
expected3 = Colour("Vivid burgundy", "red", "#9F1D35", (159, 29, 53))

assert (
    actual_colours[0] == expected1
), f"Expected {expected1} but got {actual_colours[0]}"
assert (
    actual_colours[1] == expected2
), f"Expected {expected2} but got {actual_colours[1]}"
assert (
    actual_colours[2] == expected3
), f"Expected {expected3} but got {actual_colours[2]}"

## "main" code to download, extract and save all the colour data

In [19]:
def create_colours_url(colour_name: str) -> str:
    """Build the "shades of <colour>" page URL for ``colour_name``.

    The name is inserted into the URL exactly as given; no escaping or case
    conversion is applied.
    """
    colours_url = f"https://www.colorsexplained.com/shades-of-{colour_name}-color-names/"
    return colours_url

In [18]:
# The base colours to download. Each name is inserted into the page URL by
# create_colours_url, stored as the base of every colour found on that page,
# and used as the stem of the output file name.
colour_names = [
    "black",
    "blue",
    "gray",
    "green",
    "orange",
    "pink",
    "purple",
    "red",
    "yellow",
]

In [None]:
# For every base colour: download its shades, add a GUID ID to each one and
# save the result into a .json.gz file for the blog post to process
for colour in colour_names:
    url = create_colours_url(colour)
    colours = download_colour_data(colour, url)

    # These are plain dictionaries (the toJSON() output), not ColourId
    # objects, so that they can be serialised to JSON.
    colours_with_ids_json: list[dict[str, Any]] = [
        add_id_to_colour(found_colour).toJSON() for found_colour in colours
    ]

    num_found_colours = len(colours_with_ids_json)
    if num_found_colours == 0:
        # Nothing is written for a colour whose page gave no results
        print(f"NO colours found for: {colour}")
    else:
        print(f"Found {num_found_colours} colours for {colour}")
        output_file_name = f"./small-files/{colour}.json.gz"
        save_to_json_gz_file(output_file_name, colours_with_ids_json)