In [None]:
pip install requests beautifulsoup4 pandas sentence-transformers




In [1]:
import os
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util

# ================= BASIC SETUP =================
# BooksRun API key. Prefer supplying it through the BOOKSRUN_KEY environment
# variable so the credential does not have to live in the notebook.
# SECURITY: the literal fallback below is a credential committed to source.
# It is kept only so existing runs keep working; rotate the key and remove
# the fallback once the environment variable is configured.
BOOKSRUN_KEY = os.environ.get("BOOKSRUN_KEY", "8ctef3gdn61t1aw96ubh")

# Directory that receives the CSV/JSON exports; created eagerly (including
# any missing parents) so later writes cannot fail on a missing folder.
EXPORT_DIR = Path("output_15_books")
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Sentence-embedding model used to compare scraped titles with candidate
# titles. Loaded once at module level because construction is expensive
# (the model files are downloaded on first use).
MODEL = SentenceTransformer("all-MiniLM-L6-v2")

# Root of the scraped site and the paginated catalogue beneath it.
BASE_URL = "https://books.toscrape.com/"
CATALOGUE_URL = urljoin(BASE_URL, "catalogue/")

# Headers sent with every scrape request.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# ================= SYSTEM =================
class IntelligentPricingSystem15Books:
    """Scrape catalogue books, resolve their ISBNs and derive a selling price.

    For every book on the catalogue pages the pipeline:

    1. scrapes title, price and UPC from the book's detail page,
    2. looks the title up on Google Books and accepts the first candidate
       whose title embedding is similar enough to the scraped title,
    3. asks BooksRun for a used price for the resolved ISBN-13,
    4. applies a fixed discount to that competitor price.

    Books that fail any step are skipped. Accepted books are appended to
    ``self.records`` as dictionaries and exported by :meth:`execute`.
    """

    # Minimum cosine similarity between the scraped title and a candidate
    # title for the candidate to be accepted.
    SIMILARITY_THRESHOLD = 0.70
    # Seconds to wait for any single HTTP request.
    REQUEST_TIMEOUT = 10
    # Competitor prices strictly above this value get the higher discount.
    HIGH_PRICE_THRESHOLD = 30
    HIGH_PRICE_DISCOUNT = 0.15
    DEFAULT_DISCOUNT = 0.10
    # Pause after each accepted book, in seconds.
    POLITENESS_DELAY = 0.5

    def __init__(self, api_key):
        """Store the BooksRun API key and start with an empty record list."""
        self.api_key = api_key
        self.records = []

    # -------- GOOGLE BOOKS ISBN RESOLUTION --------
    def resolve_isbn(self, title):
        """Resolve *title* to ISBNs through the Google Books volumes API.

        Returns a tuple ``(isbn10, isbn13, matched_title, author)`` for the
        first candidate whose title similarity reaches
        ``SIMILARITY_THRESHOLD``. Either ISBN may be ``None`` when the
        candidate does not list it. Returns four ``None`` values when the
        lookup fails or no candidate is similar enough.
        """
        no_match = (None, None, None, None)

        try:
            # ``params`` URL-encodes the title; interpolating it into the URL
            # by hand breaks on titles containing '&', '#', '?' and similar.
            data = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params={"q": f"intitle:{title}"},
                timeout=self.REQUEST_TIMEOUT,
            ).json()
        except (requests.RequestException, ValueError) as exc:
            print(f"⚠️ Google Books lookup failed for {title!r}: {type(exc).__name__}")
            return no_match

        items = data.get("items") if isinstance(data, dict) else None
        if not items:
            return no_match

        # Encode the reference title once; only candidates are encoded per item.
        base_vec = MODEL.encode(title)

        for item in items:
            info = item.get("volumeInfo", {})
            cand_title = info.get("title", "")

            score = util.cos_sim(base_vec, MODEL.encode(cand_title)).item()
            if score < self.SIMILARITY_THRESHOLD:
                continue

            isbn10, isbn13 = self._extract_isbns(info.get("industryIdentifiers", []))
            author = ", ".join(info.get("authors", ["Unknown"]))
            return isbn10, isbn13, cand_title, author

        return no_match

    @staticmethod
    def _extract_isbns(identifiers):
        """Return ``(isbn10, isbn13)`` from a list of identifier dicts.

        The first entry of each type wins; malformed entries are ignored.
        """
        found = {}
        for entry in identifiers or []:
            if isinstance(entry, dict):
                found.setdefault(entry.get("type"), entry.get("identifier"))
        return found.get("ISBN_10"), found.get("ISBN_13")

    # -------- BOOKSRUN PRICE --------
    def booksrun_price(self, isbn):
        """Return a used price for *isbn* from BooksRun, or ``0.0`` if none.

        BooksRun's own used offer is preferred; otherwise the cheapest
        marketplace used offer is returned. Any request or decoding failure
        yields ``0.0`` so the caller can simply skip the book.
        """
        url = f"https://booksrun.com/api/v3/price/buy/{isbn}"
        try:
            payload = requests.get(
                url,
                params={"key": self.api_key},
                timeout=self.REQUEST_TIMEOUT,
            ).json()
        except (requests.RequestException, ValueError) as exc:
            # Only the exception type is printed: requests' messages can
            # include the full URL, which would leak the API key.
            print(f"⚠️ BooksRun lookup failed for ISBN {isbn}: {type(exc).__name__}")
            return 0.0

        result = payload.get("result") if isinstance(payload, dict) else None
        offers = result.get("offers") if isinstance(result, dict) else None
        return self._best_used_price(offers)

    @staticmethod
    def _used_price(offer):
        """Return the positive used price in *offer* as a float, else ``None``.

        Tolerates placeholders such as ``"none"`` at any level instead of
        raising on them.
        """
        if not isinstance(offer, dict):
            return None
        used = offer.get("used")
        if not isinstance(used, dict):
            return None
        try:
            value = float(used.get("price"))
        except (TypeError, ValueError):
            return None
        return value if value > 0 else None

    @classmethod
    def _best_used_price(cls, offers):
        """Pick BooksRun's own used price, else the cheapest marketplace one."""
        if not isinstance(offers, dict):
            return 0.0

        own_price = cls._used_price(offers.get("booksrun"))
        if own_price is not None:
            return own_price

        marketplace = offers.get("marketplace")
        if not isinstance(marketplace, list):
            return 0.0
        prices = [
            price
            for price in (cls._used_price(entry) for entry in marketplace)
            if price is not None
        ]
        return min(prices, default=0.0)

    @classmethod
    def _discount_for(cls, competitor_price):
        """Return the discount rate applied to *competitor_price*."""
        if competitor_price > cls.HIGH_PRICE_THRESHOLD:
            return cls.HIGH_PRICE_DISCOUNT
        return cls.DEFAULT_DISCOUNT

    # -------- SCRAPING HELPERS --------
    def _fetch_soup(self, url):
        """Fetch *url* and return the parsed page, or ``None`` on failure."""
        try:
            resp = requests.get(url, headers=HEADERS, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"⚠️ Request failed for {url}: {type(exc).__name__}")
            return None
        if not resp.ok:
            return None
        # Parse the raw bytes so BeautifulSoup can detect the encoding from
        # the document itself instead of relying on the header-based guess
        # behind ``resp.text``.
        return BeautifulSoup(resp.content, "html.parser")

    def _build_record(self, detail_url):
        """Scrape one detail page and return its pricing record, or ``None``.

        ``None`` means the book is skipped: the page could not be fetched or
        parsed, no ISBN-13 was resolved, or no positive competitor price
        was found.
        """
        d_soup = self._fetch_soup(detail_url)
        if d_soup is None:
            return None

        heading = d_soup.find("h1")
        price_tag = d_soup.select_one(".price_color")
        if heading is None or price_tag is None:
            return None

        title = heading.text.strip()
        try:
            # Strip the currency symbol and any stray characters.
            our_price = float(re.sub(r"[^\d.]", "", price_tag.text))
        except ValueError:
            return None

        upc = ""
        for row in d_soup.select("table tr"):
            if row.th and row.td and row.th.text == "UPC":
                upc = row.td.text
                break

        isbn10, isbn13, clean_title, author = self.resolve_isbn(title)
        if not isbn13:
            return None

        competitor_price = self.booksrun_price(isbn13)
        if competitor_price <= 0:
            return None

        discount = self._discount_for(competitor_price)
        final_price = round(competitor_price * (1 - discount), 2)

        # NOTE(review): the column labels say "£", but the competitor price is
        # whatever currency BooksRun quotes (presumably USD) - confirm before
        # comparing it with the scraped source price.
        return {
            "UPC": upc,
            "ISBN-10": isbn10,
            "ISBN-13": isbn13,
            "Book Title": clean_title,
            "Author": author,
            "BookstoScrape URL": detail_url,
            "BooksRun ISBN": isbn13,
            "Source Price (£)": our_price,
            "Competitor Price (£)": competitor_price,
            "Discount Applied": f"{round(discount * 100)}%",
            "Final Price (£)": final_price,
            "Status": "SUCCESS",
        }

    # -------- MAIN EXECUTION --------
    def execute(self, required_books=15):
        """Collect up to *required_books* priced books and export them.

        Walks the catalogue page by page until enough books have been
        accepted or a page yields no books. All records held in
        ``self.records`` are written to ``EXPORT_DIR`` as CSV and JSON and
        returned as a DataFrame.
        """
        collected = 0
        page_no = 1

        print(f"\n🚀 Collecting EXACTLY {required_books} books\n")

        while collected < required_books:
            page_url = urljoin(CATALOGUE_URL, f"page-{page_no}.html")

            print(f"📘 Scraping page {page_no}")
            soup = self._fetch_soup(page_url)
            books = soup.select(".product_pod") if soup is not None else []
            if not books:
                # Past the last catalogue page, or the page was unavailable.
                break

            for book in books:
                if collected >= required_books:
                    break

                link = book.select_one("h3 a[href]")
                if link is None:
                    continue

                record = self._build_record(urljoin(page_url, link["href"]))
                if record is None:
                    continue

                self.records.append(record)
                collected += 1
                print(f"✅ [{collected}/{required_books}] {record['Book Title']}")
                time.sleep(self.POLITENESS_DELAY)

            page_no += 1

        df = pd.DataFrame(self.records)
        EXPORT_DIR.mkdir(parents=True, exist_ok=True)
        df.to_csv(EXPORT_DIR / "ai_pricing_15_books.csv", index=False)
        df.to_json(EXPORT_DIR / "ai_pricing_15_books.json", indent=4)

        # Report what actually happened rather than a hard-coded count.
        if collected < required_books:
            print(f"\n⚠️ Only {collected} of {required_books} books could be priced\n")
        else:
            print(f"\n🎉 SUCCESS — {collected} BOOKS PROCESSED\n")
        return df


# ================= RUN =================
# Build the pricing engine with the configured BooksRun key and collect
# the 15-book dataset (this also writes the CSV/JSON exports).
engine = IntelligentPricingSystem15Books(api_key=BOOKSRUN_KEY)
final_df = engine.execute(15)

# Print the full table between two 150-character rules, without the index.
divider = "=" * 150
print(divider, "📊 FINAL AI PRICING REPORT", divider, sep="\n")
print(final_df.to_string(index=False))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


🚀 Collecting EXACTLY 15 books

📘 Scraping page 1
✅ [1/15] The Dirty Little Secrets of Getting Your Dream Job
📘 Scraping page 2
📘 Scraping page 3
📘 Scraping page 4
📘 Scraping page 5
✅ [2/15] orange: The Complete Collection 1
✅ [3/15] Lumberjanes Vol. 1
📘 Scraping page 6
📘 Scraping page 7
✅ [4/15] Thomas Jefferson and the Tripoli Pirates
📘 Scraping page 8
✅ [5/15] The Regional Office is Under Attack!
✅ [6/15] The Murder of Roger Ackroyd
✅ [7/15] The Matchmaker's Playbook
📘 Scraping page 9
✅ [8/15] The 10% Entrepreneur
✅ [9/15] Redeeming Love
📘 Scraping page 10
📘 Scraping page 11
✅ [10/15] Wild Swans
📘 Scraping page 12
✅ [11/15] The Star-Touched Queen
📘 Scraping page 13
✅ [12/15] The Marriage of Opposites
✅ [13/15] The Immortal Life of Henrietta Lacks
✅ [14/15] The Guilty: a Will Robie Novel 4
✅ [15/15] The Bane Chronicles

🎉 SUCCESS — 15 BOOKS PROCESSED

📊 FINAL AI PRICING REPORT
             UPC    ISBN-10       ISBN-13                                         Book Title                

📌 **Observations**

1. **Accurate Book Data Collection**
The system successfully scraped book details such as title, UPC, and source price from the Books to Scrape website using HTTP requests and HTML parsing.

2. **Intelligent ISBN Resolution**
Book titles were matched with Google Books API results using Sentence-BERT semantic similarity (a candidate is accepted when its cosine similarity to the scraped title is at least 0.70), so ISBN-10 and ISBN-13 values can be identified even when the titles do not match as exact strings.

3. **External Market Price Integration**
The system fetched real-world competitor prices from the BooksRun API, allowing comparison between local prices and market prices.

4. **AI-Based Pricing Strategy**
A fixed, rule-based discount was applied to the competitor price:

* 15% discount when the competitor price is above 30
* 10% discount otherwise

The rule itself is deterministic; the AI-assisted part of the pipeline is the semantic title matching used to find the right competitor listing.

5. **Data Validation & Filtering**
Books for which no ISBN-13 could be resolved, or for which no positive competitor price was returned, were automatically skipped, so that only records with complete pricing data were included in the final dataset.

6. **Controlled Data Volume**
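The system collected exactly 15 priced books, scanning catalogue pages 1–13 and skipping titles that could not be matched or priced along the way, showing precise control over data collection and execution flow.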

7. **Structured Output Generation**
The final pricing data was exported in both CSV and JSON formats, making it suitable for reporting, analysis, or integration with other systems.

8. **Efficient and Lightweight Execution**
By using requests and BeautifulSoup instead of browser automation, the system achieved faster execution with lower resource usage.

🧾 **Conclusion**

This project successfully demonstrates an AI-driven intelligent pricing system by integrating web scraping, semantic text matching, and real-time market data. The use of Sentence-BERT for ISBN resolution significantly improves accuracy compared to traditional string-matching techniques.

By combining competitor price analysis with rule-based discount strategies, the system generates competitive and data-driven pricing decisions suitable for e-commerce and online retail platforms. The solution is efficient, scalable, and closely aligned with real-world pricing challenges.

Overall, the project showcases how AI and NLP techniques can enhance business intelligence, making it a strong practical implementation of dynamic pricing, market analysis, and automation.