In [1]:
from flask import Flask, request, jsonify
import threading
from bs4 import BeautifulSoup
import openai
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Setup driver with Chrome in headless mode
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Initialize Flask app
app = Flask(__name__)

# Route to handle scraping and filtering
@app.route('/scrape', methods=['POST'])
def scrape():
    data = request.json
    url = data.get('url')
    prompt = data.get('prompt')

    if not url or not prompt:
        return jsonify({"error": "Please provide both a URL and a prompt."}), 400

    # Scraping function: Crawl page and extract HTML content
    def crawl_page(url):
        driver = setup_driver()
        driver.get(url)
        page_source = driver.page_source
        driver.quit()
        return page_source

    # Crawl the webpage
    page_source = crawl_page(url)

    # Optimized extraction: Get first few paragraphs to minimize processing
    def extract_content(page_source):
        soup = BeautifulSoup(page_source, 'html.parser')

        # Limit extraction to first 5 <p> tags for speed
        paragraphs = soup.find_all('p', limit=5)
        paragraph_texts = [p.get_text() for p in paragraphs]

        return paragraph_texts

    # Extract relevant content from page
    extracted_content = extract_content(page_source)

    # Initialize OpenAI API
    openai.api_key = '*your_openai_key*'

    # Function to determine relevance of content using OpenAI
    def is_content_relevant(prompt, content):
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant that helps decide if content is relevant to a user's prompt."},
                {"role": "user", "content": f"Based on the instruction '{prompt}', is the following content relevant: {content}?"}
            ]
        )
        answer = response['choices'][0]['message']['content'].strip().lower()
        return 'yes' in answer

    # Filter content in larger chunks instead of individual paragraphs for speed
    def filter_relevant_content(prompt, content_list):
        chunked_content = " ".join(content_list[:2])  # Create a chunk of the first two paragraphs
        if is_content_relevant(prompt, chunked_content):
            return chunked_content
        return "No relevant content found."

    # Filter the content based on prompt
    relevant_content = filter_relevant_content(prompt, extracted_content)

    # Return JSON response
    return jsonify({
        "url": url,
        "prompt": prompt,
        "relevant_content": relevant_content
    })

# Run Flask in background thread
def run_flask():
    app.run(port=5000, debug=True, use_reloader=False)

threading.Thread(target=run_flask).start()


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [1]:
from flask import Flask, request, jsonify
import threading
from bs4 import BeautifulSoup
import openai
import requests  # Using requests for faster page fetching

# Initialize Flask app
app = Flask(__name__)

# Route to handle scraping and filtering
@app.route('/scrape', methods=['POST'])
def scrape():
    data = request.json
    url = data.get('url')
    prompt = data.get('prompt')

    if not url or not prompt:
        return jsonify({"error": "Please provide both a URL and a prompt."}), 400

    # Use requests to fetch the webpage
    def crawl_page(url):
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None

    # Crawl the webpage
    page_source = crawl_page(url)
    if page_source is None:
        return jsonify({"error": "Failed to fetch webpage."}), 500

    # Optimized extraction: Get first few paragraphs to minimize processing
    def extract_content(page_source):
        soup = BeautifulSoup(page_source, 'html.parser')

        # Limit extraction to first 5 <p> tags for speed
        paragraphs = soup.find_all('p', limit=5)
        paragraph_texts = [p.get_text() for p in paragraphs]

        return paragraph_texts

    # Extract relevant content from page
    extracted_content = extract_content(page_source)

    # Initialize OpenAI API
    openai.api_key = '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'

    # Function to determine relevance of content using OpenAI
    def is_content_relevant(prompt, content):
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant that helps decide if content is relevant to a user's prompt."},
                {"role": "user", "content": f"Based on the instruction '{prompt}', is the following content relevant: {content}?"}
            ]
        )
        answer = response['choices'][0]['message']['content'].strip().lower()
        return 'yes' in answer

    # Filter content in larger chunks instead of individual paragraphs for speed
    def filter_relevant_content(prompt, content_list):
        chunked_content = " ".join(content_list[:2])  # Create a chunk of the first two paragraphs
        if is_content_relevant(prompt, chunked_content):
            return chunked_content
        return "No relevant content found."

    # Filter the content based on prompt
    relevant_content = filter_relevant_content(prompt, extracted_content)

    # Return JSON response
    return jsonify({
        "url": url,
        "prompt": prompt,
        "relevant_content": relevant_content
    })

# Run Flask in background thread
def run_flask():
    app.run(port=5000, debug=True, use_reloader=False)

threading.Thread(target=run_flask).start()


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [2]:
import requests
response = requests.post(
    "http://127.0.0.1:5000/scrape",
    json={
        "url": "https://en.wikipedia.org/wiki/Artificial_intelligence",
        "prompt": "Find information about artificial intelligence"
    }
)
print(response.json())


INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 00:38:52] "POST /scrape HTTP/1.1" 200 -


{'prompt': 'Find information about artificial intelligence', 'relevant_content': '\n Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.[1] Such machines may be called AIs.\n', 'url': 'https://en.wikipedia.org/wiki/Artificial_intelligence'}


In [3]:
!lt --port 5000

your url is: https://eleven-experts-decide.loca.lt


INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 00:39:36] "POST /scrape HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 00:41:09] "POST /scrape HTTP/1.1" 200 -


^C
