In [1]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

ModuleNotFoundError: No module named 'requests'

In [None]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs before the prefix test: a key with leading
# whitespace fails `startswith("sk-proj-")` too, and would otherwise be
# reported with the misleading "doesn't start sk-proj-" message.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


In [None]:
# Create the OpenAI client object; later cells call
# `openai.chat.completions.create(...)` through this name.
# NOTE(review): a later cell runs `import openai`, which rebinds this name
# to the module itself - confirm which of the two is intended.
openai = OpenAI()


In [None]:
# A class to represent a Webpage
# If you're not familiar with Classes, check out the "Intermediate Python" notebook

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A fetched web page reduced to its title and text content.

    Attributes:
        url:   the URL that was requested.
        title: the page's <title> text, or "No title found".
        text:  text of the <body> with script/style/img/input elements
               removed; "" when the document has no <body>.
    """

    def __init__(self, url):
        """
        Create this Website object from the given url using the BeautifulSoup library

        Args:
            url: address of the page to download and parse.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                does not complete within the timeout.
        """
        self.url = url
        # Without a timeout a stalled server would hang the notebook cell forever.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        # `soup.title.string` is None for an empty <title> or one with nested
        # tags, so fall back to the placeholder in that case as well.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"

        # Non-HTML or truncated responses can parse without a <body>;
        # previously this raised AttributeError on `soup.body(...)`.
        body = soup.body
        if body is None:
            self.text = ""
            return

        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator="\n", strip=True)

In [None]:
# Fetch and parse one page, then print its title and extracted body text.
# `Adi` is reused by later cells (`user_prompt_for(Adi)`, `messages_for(Adi)`),
# so this cell must run before them.
Adi = Website("https://www.ncbi.nlm.nih.gov/books/NBK526128/")
print(Adi.title)
print(Adi.text)

In [None]:
# System message sent with every summarization request. Adjacent string
# literals are concatenated at compile time, so this is one single-line string.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [None]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    """
    Build the user message asking for a summary of `website`.

    Args:
        website: object exposing `title` and `text` attributes.

    Returns:
        A string with the title line, the fixed instructions, a blank
        line, and then the page text.
    """
    title_line = f"You are looking at a website titled {website.title}"
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return title_line + instructions + website.text

In [None]:
print(user_prompt_for(Adi))

In [None]:
# Minimal two-turn conversation (system persona + one user question) used by
# the preview call to the chat completions API.
messages = [
    dict(role="system", content="You are a snarky assistant"),
    dict(role="user", content="What is 2 + 2?"),
]

In [None]:
import os

import openai

# Never hard-code secrets in a notebook: anything committed here is readable by
# everyone with access to the file or its history. Read the key from the
# environment (populated from .env by `load_dotenv`) instead.
# The key that used to be pasted here must be treated as leaked and revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")


In [None]:
# To give you a preview -- calling OpenAI with system and user messages:

response = openai.chat.completions.create(model="gpt-4o-mini", messages=messages)
print(response.choices[0].message.content)

In [None]:
def messages_for(website):
    """
    Assemble the chat payload for summarizing `website`.

    Returns:
        A two-element list: the system message carrying `system_prompt`,
        followed by the user message produced by `user_prompt_for(website)`.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

In [None]:
messages_for(Adi)

In [None]:

def summarize(url):
    """
    Download the page at `url` and return the model's summary of it.

    Args:
        url: address passed to `Website`.

    Returns:
        The message content of the first completion choice.
    """
    page = Website(url)
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages_for(page),
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
summarize("https://www.ncbi.nlm.nih.gov/books/NBK526128/")

In [None]:
# A function to display the above summary nicely in the Jupyter output, using markdown

def display_summary(url):
    """Summarize the page at `url` and render the summary as Markdown in the cell output."""
    display(Markdown(summarize(url)))

In [None]:
display_summary("https://www.ncbi.nlm.nih.gov/books/NBK526128/")

In [None]:
## For sites that render their content with JavaScript, the scraper above cannot extract the text, so summarization fails. To counter this, we use Selenium.

In [None]:
# Use the %pip magic explicitly so the install targets the running kernel's
# environment and does not depend on IPython automagic being enabled.
# `requests` is included because the notebook imports it.
%pip install openai selenium webdriver-manager python-dotenv beautifulsoup4 requests


In [None]:
# Imports
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display

from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Load the OpenAI API key from the environment (.env is read by load_dotenv)
# and report whether one was found.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
print("API key loaded successfully." if api_key else "OpenAI API key missing!")

# Client object used by the Selenium-based summarizer.
client = OpenAI(api_key=api_key)

# ✅ Function to fetch page content using Selenium
def get_page_content(url, wait_seconds=5):
    """
    Load `url` in headless Chrome and return the rendered page source.

    Args:
        url: address to open.
        wait_seconds: maximum time to wait for the <body> to contain text
            after navigation, giving JavaScript-rendered pages a chance to
            populate. Defaults to 5.

    Returns:
        The page source as reported by the driver once the wait finishes
        (or times out).
    """
    # Imported locally because these names are not in the notebook's import cell.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support.ui import WebDriverWait

    options = Options()
    # The `options.headless` property was deprecated and later removed in
    # Selenium 4; assigning to it on current versions silently does nothing
    # and a visible browser window opens. The command-line flag is the
    # supported way to request headless mode.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)
        # An implicit wait only affects element lookups, so calling it after
        # `get` waited for nothing. Wait explicitly until the body has text.
        try:
            WebDriverWait(driver, wait_seconds).until(
                lambda d: d.find_element(By.TAG_NAME, "body").text.strip()
            )
        except TimeoutException:
            # Best effort: return whatever has rendered so far.
            pass
        page_source = driver.page_source
    finally:
        # Always release the browser process, even if navigation failed.
        driver.quit()

    return page_source

# ✅ Function to extract text from HTML
def extract_text_from_html(html_content):
    """
    Return the text content of an HTML document.

    <script> and <style> elements are removed first; the remaining text
    nodes are joined with single spaces and the result is stripped of
    leading/trailing whitespace.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    for unwanted in document.find_all(['script', 'style']):
        unwanted.decompose()
    return document.get_text(separator=' ').strip()

# ✅ Function to summarize using OpenAI
def summarize_text(text, max_tokens=300):
    """
    Ask the `gpt-4o` chat model to summarize `text`.

    Args:
        text: webpage text sent as the user message.
        max_tokens: value forwarded as the request's `max_tokens`.

    Returns:
        The message content of the first completion choice.
    """
    conversation = [
        {"role": "system", "content": "Summarize the following webpage content:"},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        max_tokens=max_tokens,
        temperature=0.5,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# ✅ Main Workflow
def summarize_website(url):
    """
    Render `url` with Selenium, summarize its text, and display the result.

    Only the first 4000 characters of the extracted text are sent to the
    model. The summary is shown as Markdown under a linked heading.
    """
    rendered_html = get_page_content(url)
    # Slicing is a no-op on shorter strings, so no separate length check is needed.
    excerpt = extract_text_from_html(rendered_html)[:4000]
    digest = summarize_text(excerpt)
    display(Markdown(f"### Summary of [{url}]({url})\n\n{digest}"))

# ✅ Example Usage
summarize_website("https://www.bbc.com/news")  


In [None]:
summarize_website("https://openai.com")

In [None]:
# `!pip` runs whichever pip is first on PATH, which may belong to a different
# environment than the kernel; `%pip` installs into the kernel's environment.
%pip install PyMuPDF openai




Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   ------ --------------------------------- 3.1/18.7 MB 23.1 MB/s eta 0:00:01
   --------- ------------------------------ 4.5/18.7 MB 12.2 MB/s eta 0:00:02
   ---------------- ----------------------- 7.9/18.7 MB 14.3 MB/s eta 0:00:01
   ------------------- -------------------- 8.9/18.7 MB 11.8 MB/s eta 0:00:01
   --------------------- ------------------ 10.0/18.7 MB 10.4 MB/s eta 0:00:01
   --------------------------- ------------ 13.1/18.7 MB 11.1 MB/s eta 0:00:01
   ------------------------------ --------- 14.4/18.7 MB 11.2 MB/s eta 0:00:01
   ---------------------------------------  18.4/18.7 MB 11.6 MB/s eta 0:00:01
   ---------------------------------------- 18.7/18.7 MB 11.4 MB/s eta 0:00:00
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3

In [None]:
# NOTE(review): `extract_text_from_pdf` and `pdf_path` are defined only in a
# later cell of this notebook, so this cell fails on a fresh kernel unless
# that cell has been run first.
# Prints the length of the extracted text followed by a short preview.
pdf_text = extract_text_from_pdf(pdf_path)
print(len(pdf_text))
print(pdf_text[:1000])  # Print first 1000 characters


0



In [None]:
# Explicit %pip magic: installs into the kernel's environment and does not
# rely on IPython automagic resolving a bare `pip`.
%pip install pytesseract pdf2image pillow


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image

   ---------------------------------------- 2/2 [pdf2image]

Successfully installed pdf2image-1.17.0 pytesseract-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pytesseract
from pdf2image import convert_from_path

def extract_text_with_ocr(pdf_path):
    """
    OCR every page of the PDF at `pdf_path` and return the combined text.

    The PDF is converted to images with `convert_from_path`; each image is
    passed to `pytesseract.image_to_string` and the results are
    concatenated in page order with no separator.
    """
    page_images = convert_from_path(pdf_path)
    return "".join(pytesseract.image_to_string(page) for page in page_images)


In [None]:
# 📄 PDF Summarizer with Chunking Support using OpenAI API + OCR fallback

import fitz  # PyMuPDF
import openai
import pytesseract
from pdf2image import convert_from_path
import os

# Tesseract OCR executable. Defaults to the standard Windows install
# location; set the TESSERACT_CMD environment variable to override it on
# machines where Tesseract lives elsewhere.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)


# OpenAI API key: read from the environment instead of embedding it in the
# notebook. The key that used to be pasted here must be treated as leaked
# and revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """
    Return the embedded text of every page of the PDF at `pdf_path`.

    Page texts are concatenated in page order with no separator. The
    result is an empty string when no page yields any text.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# OCR fallback for image-based PDFs
def extract_text_with_ocr(pdf_path):
    """
    Fallback extractor: rasterize the PDF at `pdf_path` and OCR each page.

    Returns:
        The OCR output of all pages concatenated in order, with no separator.
    """
    recognized_pages = [
        pytesseract.image_to_string(page_image)
        for page_image in convert_from_path(pdf_path)
    ]
    return "".join(recognized_pages)

# Function to chunk text based on approximate token size
def chunk_text(text, max_tokens=3000):
    """
    Split `text` into whitespace-delimited chunks of bounded size.

    Size is measured in characters, not model tokens: each word costs
    `len(word) + 1` (the +1 accounts for the joining space), and a chunk's
    total cost never exceeds `max_tokens` unless a single word alone is
    larger than the budget, in which case that word forms its own chunk.

    Args:
        text: the text to split; runs of whitespace are collapsed.
        max_tokens: per-chunk character budget.

    Returns:
        A list of non-empty strings; `[]` for empty or whitespace-only input.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = len(word) + 1  # +1 for space
        # Flush only when the chunk already holds something: this prevents
        # emitting an empty chunk when the very first word is over budget.
        if current_chunk and current_length + word_length > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        # Count the word in the chunk it lands in, so the word that opens a
        # new chunk is charged against that chunk's budget.
        current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Function to summarize text using OpenAI API
def summarize_text(text, model="gpt-4o-mini"):
    """
    Ask the chat model `model` to summarize `text` as PDF content.

    NOTE(review): an earlier cell of this notebook defines another
    `summarize_text` (with a `max_tokens` parameter); running this cell
    replaces that definition.

    Returns:
        The message content of the first completion choice.
    """
    prompt_messages = [
        {"role": "system", "content": "Summarize the following PDF content."},
        {"role": "user", "content": text},
    ]
    completion = openai.chat.completions.create(model=model, messages=prompt_messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Function to summarize large text by splitting and summarizing each chunk
def summarize_large_text(text, model="gpt-4o-mini"):
    """
    Summarize text of arbitrary length by summarizing it chunk by chunk.

    The text is split with `chunk_text`, each chunk is summarized with
    `summarize_text`, and the partial summaries are then summarized once
    more into a single result.

    Args:
        text: the text to summarize.
        model: model name forwarded to `summarize_text`.

    Returns:
        The final summary; "" when `text` contains nothing to summarize.
    """
    # Drop blank chunks so an empty prompt is never sent to the API.
    chunks = [chunk for chunk in chunk_text(text) if chunk.strip()]
    if not chunks:
        return ""

    summaries = []
    for position, chunk in enumerate(chunks, start=1):
        print(f"Summarizing chunk {position} of {len(chunks)}...")
        summaries.append(summarize_text(chunk, model=model))

    # With a single chunk its summary already is the final answer; a second
    # pass would only spend another API call summarizing a summary.
    if len(summaries) == 1:
        return summaries[0]

    print("Summarizing combined summaries...")
    return summarize_text(' '.join(summaries), model=model)

# Example usage: summarize one local PDF, falling back to OCR when the file
# has no extractable text layer.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pdf_path = r"C:\Users\DELL\Downloads\aadhar.pdf"

pdf_text = extract_text_from_pdf(pdf_path)

# Whitespace-only output counts as "no text" and triggers the OCR fallback.
if not pdf_text.strip():
    print("No text found in PDF, using OCR...")
    pdf_text = extract_text_with_ocr(pdf_path)

if pdf_text.strip():
    final_summary = summarize_large_text(pdf_text)
    print("\nFinal Summary:\n")
    print(final_summary)
else:
    print("Still empty after OCR. Please check your PDF file.")


No text found in PDF, using OCR...
Summarizing chunk 1 of 3...
Summarizing chunk 2 of 3...
Summarizing chunk 3 of 3...
Summarizing combined summaries...

Final Summary:

The document is an Aadhaar card registration or enrollment letter from the Unique Identification Authority of India (UIDAI). It contains personal details of an individual, including their name, address in Telangana, Aadhar number, date of birth, and contact information. Key highlights include:

1. **Aadhaar as Identity Proof**: It is emphasized that Aadhaar serves as proof of identity but not as confirmation of citizenship or date of birth. Users are encouraged to verify their information through online methods or QR code scanning.

2. **Documentation Updates**: Individuals should update their identity and address supporting documents every ten years.

3. **Aadhaar Benefits**: It allows access to a variety of government and non-government services.

4. **Security Practices**: Users should keep their contact information