This notebook is used to scrape this website: https://www.coffeereview.com/review/

Sitemap: https://www.coffeereview.com/sitemap_index.xml



# Import Libraries

In [22]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
from datetime import datetime

# Fetch URLs in the sitemap

In [19]:
# Step 1: Get all review URLs from the sitemap
sitemap_url = "https://www.coffeereview.com/review-sitemap.xml"

# Browser-like User-Agent sent with the request below; scrape_text() also reads
# this module-level `headers` dict.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

# Fetch the sitemap XML. Without a timeout, requests waits indefinitely on a
# stalled connection, which would freeze the notebook.
response = requests.get(sitemap_url, headers=headers, timeout=30)

if response.status_code != 200:
    print(f"❌ Failed to fetch sitemap. Status code: {response.status_code}")
    # Raise instead of calling exit(): inside an IPython/Jupyter kernel, exit()
    # asks the kernel to shut down (losing all notebook state) rather than just
    # stopping this cell. An exception halts "Run All" and keeps the kernel alive.
    raise RuntimeError(
        f"Could not fetch sitemap {sitemap_url} (status code {response.status_code})"
    )

# Parse the XML and collect the text of every <loc> element (one per listed page).
soup = BeautifulSoup(response.content, "xml")
review_urls = [loc.text for loc in soup.find_all("loc")]
print(f"✅ Found {len(review_urls)} review URLs.")

✅ Found 1001 review URLs.


In [20]:
# The first sitemap ("https://www.coffeereview.com/review-sitemap.xml") lists the
# review index page (https://www.coffeereview.com/review/) as its first entry.
# That page is not an individual review, so it is removed before scraping.
if sitemap_url == "https://www.coffeereview.com/review-sitemap.xml":
    # Pop only when the first entry really is the index page. An unconditional
    # pop(0) would discard a genuine review URL every time this cell is re-run.
    if review_urls and review_urls[0].rstrip("/") == "https://www.coffeereview.com/review":
        print(review_urls.pop(0))
    print(f"{len(review_urls)} URLs need to be scrapped.")

https://www.coffeereview.com/review/
1000 URLs need to be scrapped.


In [None]:
# Sanity check before scraping: the sitemap "review-sitemap9.xml" is expected to
# list 379 URLs, and any other sitemap is expected to list exactly 1000.
# NOTE: The 379 figure might change if more reviews are added.
expected_url_count = (
    379
    if sitemap_url == "https://www.coffeereview.com/review-sitemap9.xml"
    else 1000
)
assert len(review_urls) == expected_url_count

# Scraping loop

In [23]:
# Step 2: Function to scrape text from a review page
def scrape_text(url, timeout=30):
    """Download a coffee review page and return all of its text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the review page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a single stalled connection would block the whole
            scraping loop indefinitely.

    Returns:
        A string consisting of the line ``URL: <url>``, a blank line, and then
        the page text (one text fragment per line, each stripped of surrounding
        whitespace). Returns ``None`` if the server answers with a status code
        other than 200 or if any exception occurs; a message is printed in both
        cases.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"❌ Skipped {url} (status code {response.status_code})")
            return None

        # Extract text using BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
        all_text = soup.get_text(separator="\n", strip=True)

        # Prefix the URL so each saved file records where its text came from
        return f"URL: {url}\n\n{all_text}"

    except Exception as e:
        # Deliberately broad: one bad page (network error, timeout, parse
        # failure) must not abort a long multi-URL scraping run.
        print(f"❌ Error scraping {url}: {e}")
        return None

In [25]:
# Step 3: Save each review text using URL as the file name
save_dir = "coffee_reviews_text"
os.makedirs(save_dir, exist_ok=True)  # Create directory if not exists

total = len(review_urls)
failed_urls = []  # URLs for which scrape_text() returned nothing

# Tip: slice the list (e.g. `review_urls[:5]`) for a quick test run.
for i, url in enumerate(review_urls, start=1):
    print(f"🔄 Scraping {i}/{total}: {url}")

    text_content = scrape_text(url)
    if text_content:
        # Sanitize URL for file name: every character that is not a word
        # character or `-` is replaced with `_`
        safe_filename = re.sub(r"[^\w\-]", "_", url)
        file_path = os.path.join(save_dir, f"{safe_filename}.txt")

        # Save the text
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(text_content)

        print(f"✅ Saved: {file_path}")
    else:
        # Remember the failure so it can be reported at the end instead of
        # being buried in the per-URL log.
        failed_urls.append(url)

    # Avoid overwhelming the server (polite scraping); no pause is needed once
    # the last URL has been processed.
    if i < total:
        time.sleep(3)

# Only claim complete success when nothing was skipped.
if failed_urls:
    print(f"⚠️ {len(failed_urls)} of {total} reviews could not be scraped:")
    for failed_url in failed_urls:
        print(f"   {failed_url}")
else:
    print("🎯 All reviews scraped and saved!")

🔄 Scraping 1/1000: https://www.coffeereview.com/review/100-colombian/
✅ Saved: coffee_reviews_text\https___www_coffeereview_com_review_100-colombian_.txt
🔄 Scraping 2/1000: https://www.coffeereview.com/review/moka-java/
✅ Saved: coffee_reviews_text\https___www_coffeereview_com_review_moka-java_.txt
🔄 Scraping 3/1000: https://www.coffeereview.com/review/java/
✅ Saved: coffee_reviews_text\https___www_coffeereview_com_review_java_.txt
🔄 Scraping 4/1000: https://www.coffeereview.com/review/sumatra-gayo-mountain/
✅ Saved: coffee_reviews_text\https___www_coffeereview_com_review_sumatra-gayo-mountain_.txt
🔄 Scraping 5/1000: https://www.coffeereview.com/review/folgers-french-roast/
✅ Saved: coffee_reviews_text\https___www_coffeereview_com_review_folgers-french-roast_.txt
🔄 Scraping 6/1000: https://www.coffeereview.com/review/folger-mountain-grown-coffee/
✅ Saved: coffee_reviews_text\https___www_coffeereview_com_review_folger-mountain-grown-coffee_.txt
🔄 Scraping 7/1000: https://www.coffeerevie

# Ignore: Check the scraped content from 1 webpage for 1 coffee

In [None]:
# Target webpage: a single review page used to inspect what the scraper receives
url = "https://www.coffeereview.com/review/wilton-benitez-colombia-yellow-bourbon/"

# Set User-Agent to simulate a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

# Send the request. The timeout stops the cell from hanging forever if the
# server never answers.
response = requests.get(url, headers=headers, timeout=30)

# Check response status code
if response.status_code == 200:
    print("✅ Request successful!")
else:
    print(f"❌ Request failed, status code: {response.status_code}")


✅ Request successful!


In [None]:
# Dump the raw HTML held in `response` to a text file for manual inspection.
html_sample_path = "coffee_review_html_code_sample.txt"
with open(html_sample_path, mode="w", encoding="utf-8") as html_file:
    html_file.write(response.text)

print("✅ File saved.")

✅ File saved.


In [15]:
# Extract all text from the HTML held in `response`, one fragment per line.
soup = BeautifulSoup(response.text, "html.parser")
all_text = soup.get_text(separator="\n", strip=True)  # Extract all text

text_sample_path = "coffee_review_text_sample.txt"
with open(text_sample_path, "w", encoding="utf-8") as f:
    f.write(all_text)

# Report the name of the file that was actually written (the previous message
# named a different file, coffee_review.txt).
print(f"✅ Plain text file saved as {text_sample_path}. You can open it with Notepad or VSCode.")


✅ Plain text file saved as coffee_review.txt. You can open it with Notepad or VSCode.
