# In [None]:
import json
import os
import time

import openai
from selenium import webdriver
from selenium.webdriver.common.by import By

# -------------------------
# Step 1: Scrape Website using Selenium
# -------------------------
def scrape_website(url, output_file="raw_data.txt", wait_seconds=5):
    """Load a page in Chrome and save the text of its ``<body>`` to a file.

    Args:
        url: Address of the page to open.
        output_file: Path of the text file (written as UTF-8) that receives
            the scraped text. An existing file is overwritten.
        wait_seconds: Fixed delay, in seconds, between requesting the page
            and reading its text, to give the page time to load.

    Raises:
        Any exception raised while loading or reading the page propagates
        to the caller; the browser is closed before it does.
    """
    driver = webdriver.Chrome()  # Ensure ChromeDriver is installed
    try:
        driver.get(url)
        time.sleep(wait_seconds)  # allow page to load

        # Extract all visible text
        body_text = driver.find_element(By.TAG_NAME, "body").text
    finally:
        # Always shut the browser down, even when navigation or the element
        # lookup fails, so a failed run does not leave Chrome running.
        driver.quit()

    # Save to file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(body_text)

    print(f"Data scraped and saved to {output_file}")

# -------------------------
# Step 2: Read from Text File
# -------------------------
def read_text_file(file_path="raw_data.txt"):
    """Return the entire contents of a UTF-8 text file as one string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the file.
    """
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

# -------------------------
# Step 3: Process with OpenAI LLM
# -------------------------
def extract_structured_data(raw_text):
    """Ask an OpenAI chat model to turn proposal text into structured data.

    The raw text is embedded in a prompt that requests JSON with the keys
    ``title``, ``duration``, ``budget``, ``deadline`` and ``key_highlights``.

    Args:
        raw_text: Unstructured proposal text to analyse.

    Returns:
        The value decoded from the model's reply when the reply is valid
        JSON; otherwise the dict ``{"raw_summary": <reply>}`` holding the
        reply unchanged.
    """
    prompt = f"""
    You are an information extraction assistant. 
    From the following unstructured proposal text, extract and return structured JSON with keys:
    - title
    - duration
    - budget
    - deadline
    - key_highlights

    Text:
    {raw_text}
    """

    # NOTE(review): ``openai.ChatCompletion`` is the legacy (pre-1.0) SDK
    # interface — confirm the installed openai version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0  # ask for the least random completion
    )

    content = response["choices"][0]["message"]["content"]

    # Try parsing JSON, fall back to the raw reply if it is plain text.
    # Only decoding failures are handled: ValueError covers
    # json.JSONDecodeError, TypeError covers a non-string (e.g. None) reply.
    # Interrupts and unrelated errors are no longer swallowed.
    try:
        structured_data = json.loads(content)
    except (ValueError, TypeError):
        structured_data = {"raw_summary": content}

    return structured_data

# -------------------------
# Step 4: Run Pipeline
# -------------------------
if __name__ == "__main__":
    # Prefer the key from the OPENAI_API_KEY environment variable so a real
    # credential never has to be written into this file; the placeholder is
    # kept as the fallback when the variable is not set.
    openai.api_key = os.environ.get("OPENAI_API_KEY", "your_api_key_here")

    # Example website (replace with actual proposal site)
    url = "https://example.com/proposal123"

    # Step 1: Scrape the page text into the default raw_data.txt file
    scrape_website(url)

    # Step 2: Load the raw text back from that file
    raw_text = read_text_file()

    # Step 3: Extract JSON summary
    structured_output = extract_structured_data(raw_text)

    print("\n=== Structured Proposal Data ===")
    print(json.dumps(structured_output, indent=4))
