# Import Required Libraries

In [17]:
import requests
import json
from transformers import pipeline
from playwright.async_api import async_playwright


In [25]:
# Step 3: Setup Hugging Face API token
API_TOKEN = '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'  #  Hugging Face API token
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-7b-instruct"  # Falcon 7B or another suitable model
headers = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json"  # Correctly added this line
}



In [26]:
# Step 4: Function to get selectors using Hugging Face Inference API
def get_selectors_from_model(prompt):
    data = json.dumps({"inputs": prompt})
    response = requests.post(API_URL, headers=headers, data=data)

    if response.status_code == 200:
        result = response.json()
        return result
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None



In [32]:
# Step 5: Generate Selectors
def generate_selectors(html_content):
    prompt = (
        "Extract CSS selectors for the following elements in this HTML:\n"
        "- Title of the review\n"
        "- Body of the review\n"
        "- Rating\n"
        "- Reviewer's name\n\n"
        f"HTML:\n{html_content}"
    )
    selectors = get_selectors_from_model(prompt)
    if selectors:
        return selectors
    return {}  # Return an empty dictionary if no selectors are generated



In [33]:
# Step 6: Scrape Reviews
async def scrape_reviews(page_url, selectors):
    if not selectors:  # If selectors are empty or None, return an empty list
        print("No selectors found.")
        return []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(page_url)
        reviews = []

        # Use the selectors to query elements
        review_elements = await page.query_selector_all(selectors.get("review", ".review"))
        for element in review_elements:
            title = await element.query_selector(selectors.get("title", ".review-title"))
            body = await element.query_selector(selectors.get("body", ".review-body"))
            rating = await element.query_selector(selectors.get("rating", ".review-rating"))
            reviewer = await element.query_selector(selectors.get("reviewer", ".reviewer-name"))
            
            if body:
                reviews.append({
                    "title": await title.inner_text() if title else "N/A",
                    "body": await body.inner_text(),
                    "rating": await rating.inner_text() if rating else "N/A",
                    "reviewer": await reviewer.inner_text() if reviewer else "N/A"
                })
        await browser.close()
        return reviews



In [34]:
# Step 7: Filter Non-Review Content
def filter_reviews(reviews):
    return [review for review in reviews if "body" in review and review["body"] and len(review["body"]) > 30]


In [35]:
# Step 8: Example HTML content
html_example = """
<html>
    <div class='review'>
        <h3>Excellent Product</h3>
        <p>This cream worked wonders for my sore muscles after exercise. I highly recommend it!</p>
        <span class='rating'>5</span>
        <span class='reviewer'>John Doe</span>
    </div>
    <div class='review'>
        <h3>Not great</h3>
        <p>Didn't work as expected. The smell is too strong, and it left my skin irritated.</p>
        <span class='rating'>2</span>
        <span class='reviewer'>Jane Smith</span>
    </div>
</html>
"""


In [36]:
# Step 9: Run the Scraping and Filtering
selectors = generate_selectors(html_example)
print(f"Generated Selectors: {selectors}")

# Example URL (replace with the actual product review URL)
page_url = "https://2717recovery.com/products/recovery-cream"

# Scrape reviews from the page
reviews = await scrape_reviews(page_url, selectors)
print(f"Scraped Reviews: {reviews}")

# Filter reviews to discard non-relevant content
filtered_reviews = filter_reviews(reviews)
print("Filtered Reviews:", filtered_reviews)


Generated Selectors: [{'generated_text': "Extract CSS selectors for the following elements in this HTML:\n- Title of the review\n- Body of the review\n- Rating\n- Reviewer's name\n\nHTML:\n\n<html>\n    <div class='review'>\n        <h3>Excellent Product</h3>\n        <p>This cream worked wonders for my sore muscles after exercise. I highly recommend it!</p>\n        <span class='rating'>5</span>\n        <span class='reviewer'>John Doe</span>\n    </div>\n    <div class='review'>\n        <h3>Not great</h3>\n        <p>Didn't work as expected. The smell is too strong, and it left my skin irritated.</p>\n        <span class='rating'>2</span>\n        <span class='reviewer'>Jane Smith</span>\n    </div>\n</html>\n\nCSS:\n\nh3 {\n    text-align: center;\n    font-size: 24px;\n}\nspan {\n    background-color: #ccc;\n    color: #000;\n    padding: 10px 20px;\n}\n\nTo select the title of the review, body of the review, and rating of each review in this HTML, you can use the following CSS:\n

TimeoutError: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://2717recovery.com/products/recovery-cream", waiting until "load"
