In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
from sklearn.preprocessing import LabelEncoder


#### Step 1: Load the PDF from a URL

In [2]:
import time
import requests
import io
import pymupdf as fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
import time

In [3]:
# Browser-like request headers; fetch_pdf passes this dict to requests.get()
# for every download.
# NOTE(review): presumably needed because some hosts reject requests that do
# not look like they come from a browser — confirm against the target sites.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Referer": "https://www.google.com",
    "Accept": "application/pdf",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive"
}

def fetch_pdf(url, timeout=20):
    """Download ``url`` and return its raw bytes if the payload looks like a PDF.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A ``(payload, status_code, elapsed_seconds)`` tuple. On success
        ``payload`` is the response body as ``bytes`` and ``status_code`` is
        the HTTP status of the response. On failure ``payload`` is a ``str``
        describing the problem and ``status_code`` is:

        * 415 when the body does not start with ``%PDF`` (the body is also
          written to ``debug_response.html`` for inspection),
        * 408 on a request timeout,
        * the server's HTTP error status when one is available, else 500.

        ``elapsed_seconds`` is the time spent in this call, including on the
        failure paths.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()  # Raise error for HTTP codes like 403, 404

        total_time = time.perf_counter() - start_time

        # Some servers answer 200 with an HTML page instead of the PDF; keep
        # the body on disk so the failure can be debugged.
        if response.content[:4] != b'%PDF':
            with open("debug_response.html", "wb") as f:
                f.write(response.content)
            return "Invalid PDF. HTML or different content returned.", 415, total_time

        return response.content, response.status_code, total_time

    except requests.exceptions.Timeout:
        elapsed = time.perf_counter() - start_time
        return f"Timeout: Could not retrieve PDF from {url}", 408, elapsed
    except requests.exceptions.RequestException as e:
        elapsed = time.perf_counter() - start_time
        # Prefer the real HTTP error status (e.g. 403/404) over a blanket 500;
        # fall back to 500 for connection-level failures with no response.
        failed_response = getattr(e, "response", None)
        status = getattr(failed_response, "status_code", None)
        if status is None or status < 400:
            status = 500
        return f"Error downloading PDF: {str(e)}", status, elapsed


In [4]:

def is_valid_pdf(pdf_bytes):
    """Return True when ``pdf_bytes`` begins with the ``%PDF`` magic marker."""
    magic = b'%PDF'
    header = pdf_bytes[:len(magic)]
    return header == magic

def perform_ocr(pdf_bytes):
    """Perform OCR on a scanned PDF (fallback).

    Every page is rendered to an image with ``convert_from_bytes`` and passed
    through ``pytesseract.image_to_string``; the per-page results are
    concatenated in page order.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The recognised text with surrounding whitespace stripped, or the
        message ``"OCR could not extract text."`` when nothing but whitespace
        (or nothing at all) was recognised.
    """
    images = convert_from_bytes(pdf_bytes)
    text = "".join(pytesseract.image_to_string(img) for img in images)

    # Strip before testing for emptiness: whitespace-only OCR output would
    # otherwise pass the check and be returned as an empty string.
    stripped = text.strip()
    return stripped if stripped else "OCR could not extract text."

In [5]:
def extract_text_from_pdf(pdf_bytes):
    """Extract the text layer of a PDF, falling back to OCR for scanned pages.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted text with surrounding whitespace stripped. As soon as
        any page yields no text, the whole document is passed to
        ``perform_ocr`` and its result is returned instead. Failures are
        reported as message strings rather than exceptions:

        * ``"Invalid PDF or HTML returned instead of PDF."`` when the bytes do
          not pass ``is_valid_pdf``,
        * ``"Failed to open PDF: ..."`` when the document cannot be opened,
        * ``"No extractable text found."`` when no text was collected.
    """
    if not is_valid_pdf(pdf_bytes):
        return "Invalid PDF or HTML returned instead of PDF."

    pdf_file = io.BytesIO(pdf_bytes)

    try:
        doc = fitz.open(stream=pdf_file, filetype="pdf")
    except Exception as e:
        return f"Failed to open PDF: {str(e)}"

    needs_ocr = False
    page_texts = []
    try:
        for page in doc:
            page_text = page.get_text("text")
            if not page_text.strip():  # If no text, fallback to OCR
                needs_ocr = True
                break
            page_texts.append(page_text)
    finally:
        # Release the document on every path, including the OCR fallback
        # and any exception raised while reading a page.
        doc.close()

    if needs_ocr:
        # OCR works from the raw bytes, so the closed document is not needed.
        return perform_ocr(pdf_bytes)

    text = "".join(page_texts)
    return text.strip() if text else "No extractable text found."

In [6]:
def read_pdf_from_url(url):
    """Download the PDF at ``url`` and return its text.

    Returns a ``(text_or_message, status_code, total_time)`` tuple. A ``url``
    of ``'-'`` is treated as "no URL" and yields status 400 without any
    download. When ``fetch_pdf`` reports a problem, its message, status and
    timing are passed through unchanged.
    """
    if url == '-':
        return "No URL provided", 400, 0

    payload, status_code, total_time = fetch_pdf(url)

    # fetch_pdf hands back a str only for errors; real downloads are bytes
    # and still need their text extracted.
    if not isinstance(payload, str):
        payload = extract_text_from_pdf(payload)

    return payload, status_code, total_time

#### Step 2: Clean and Process the Extracted Text

In [7]:
from nltk.corpus import stopwords, words
import nltk

# Download the two NLTK corpora that the sets below are built from.
nltk.download('stopwords')
nltk.download('words')
stop_words = set(stopwords.words('english'))  # English stopwords; looked up by remove_stopwords
valid_words = set(words.words())            # English word list; looked up by dictionary_filter

import spacy
# spaCy pipeline that remove_entities runs over the text.
nlp = spacy.load('en_core_web_sm')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vchopra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/vchopra/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
def remove_line_breaks(text):
    """Flatten line breaks, re-joining words hyphenated across a line end.

    A lowercase word split as ``exam-`` + newline + ``ple`` becomes
    ``example``; every remaining run of newlines becomes a single space.

    Args:
        text: Text that may contain newline characters.

    Returns:
        The text with all newline characters removed or replaced.
    """
    # Re-join hyphenated words first: once the newlines are replaced by
    # spaces, this pattern has nothing left to match.
    text = re.sub(r'([a-z])-\r?\n([a-z])', r'\1\2', text)
    # Any other line break (single or repeated) becomes one space.
    text = re.sub(r'\n+', ' ', text)
    return text

def remove_extra_spaces(text):
    """Collapse each whitespace run to one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

def remove_special_characters(text):
    """Delete every character that is neither an ASCII letter nor whitespace."""
    disallowed = re.compile(r'[^a-zA-Z\s]')
    return disallowed.sub('', text)

def remove_page_headers(text):
    """Strip page markers such as "Page 1 of 20" or "page 3" (any letter case)."""
    page_marker = re.compile(r'page \d+\s?(of\s?\d+)?', flags=re.IGNORECASE)
    return page_marker.sub('', text)



def remove_stopwords(text):
    """Drop whitespace-separated tokens found in ``stop_words``; re-join with single spaces."""
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def fix_broken_words(text):
    """Merge word pieces separated by a hyphen followed by one whitespace character.

    For example ``"exam- ple"`` becomes ``"example"``; a hyphen with no
    whitespace after it (``"well-known"``) is left untouched.
    """
    split_word = re.compile(r'(\w+)-\s(\w+)')
    return split_word.sub(lambda m: m.group(1) + m.group(2), text)

def dictionary_filter(text):
    """Keep only tokens whose lowercase form is present in ``valid_words``."""
    recognised = []
    for token in text.split():
        if token.lower() in valid_words:
            recognised.append(token)
    return ' '.join(recognised)


def remove_entities(text):
    """Run the spaCy pipeline ``nlp`` over ``text`` and drop tokens tagged with an entity type.

    The surviving tokens are re-joined with single spaces.
    """
    parsed = nlp(text)
    untagged = []
    for tok in parsed:
        # An empty ent_type_ means the token carries no entity label.
        if not tok.ent_type_:
            untagged.append(tok.text)
    return ' '.join(untagged)

def clean_pdf_text(text:str):
    """Normalise raw PDF text into a space-separated string of plain words.

    Steps, in order: lowercase, flatten line breaks, collapse whitespace,
    re-join hyphen-split words, remove page markers, strip non-letter
    characters, drop stopwords, keep dictionary words only, and drop tokens
    that spaCy tags as entities.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text.
    """
    text = text.lower()
    text = remove_line_breaks(text)
    text = remove_extra_spaces(text)
    text = fix_broken_words(text)
    # Page markers ("page 1 of 20") have to go before special characters are
    # stripped: that step deletes the digits the page pattern needs to match.
    text = remove_page_headers(text)
    text = remove_special_characters(text)
    text = remove_stopwords(text)
    text = dictionary_filter(text)
    # NOTE(review): entity removal runs on lowercased, stopword-free text;
    # spaCy may tag entities differently there than on raw text — confirm
    # this ordering matches the preprocessing used for training.
    text = remove_entities(text)
    return text

### Step 3: Load the Model

In [9]:
# torchtext's "basic_english" tokenizer; called by preprocess_text.
tokenizer = get_tokenizer("basic_english")
# 100-dimensional GloVe "6B" vectors.
# NOTE(review): `glove` is not referenced by any other code visible in this
# notebook — confirm whether it is still needed.
glove = GloVe(name='6B', dim=100)

In [10]:
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear head, for sequence classification.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits.
        pad_idx: Vocabulary index used as the embedding's ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they must stay
        # as they are for saved checkpoints to load.
        self.embedding = nn.Embedding(
            vocab_size,
            embedding_dim,
            padding_idx=pad_idx,
        )
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=1,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits computed from the LSTM's final hidden state.

        ``text`` holds token indices, batch dimension first (the LSTM is
        built with ``batch_first=True``).
        """
        token_vectors = self.embedding(text)
        _, (final_hidden, _) = self.lstm(token_vectors)
        # final_hidden[-1] selects the hidden state of the last LSTM layer.
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)

def preprocess_text(text, vocab, max_len=50):
    """Turn raw text into a padded ``(1, max_len)`` tensor of token indices.

    Args:
        text: Input string; it is lowercased and split with the module-level
            ``tokenizer``.
        vocab: Mapping from token to integer index. It is indexed with every
            token and, via ``pad_sequence``, with ``'<pad>'``.
        max_len: Length the index sequence is truncated or padded to.

    Returns:
        A ``torch.long`` tensor with a leading batch dimension of 1.
    """
    tokens = tokenizer(text.lower())
    numericalized = [vocab[token] for token in tokens]

    # pad_sequence needs the vocab to look up the padding index; the call
    # previously omitted it and raised TypeError.
    padded_sequence = pad_sequence(numericalized, vocab, max_len=max_len)

    # Wrap in a list to add the batch dimension.
    return torch.tensor([padded_sequence], dtype=torch.long)
    
def pad_sequence(seq, vocab, max_len=50):
    """Truncate or right-pad the list ``seq`` to exactly ``max_len`` items.

    Padding uses ``vocab['<pad>']``; the lookup only happens when ``seq`` is
    not longer than ``max_len``.
    """
    shortfall = max_len - len(seq)
    if shortfall < 0:
        return seq[:max_len]
    filler = [vocab['<pad>']] * shortfall
    return seq + filler

# Define the numericalize_and_pad function
def numericalize_and_pad(texts, vocab, max_len=50):
    """Convert token sequences to index lists of uniform length.

    Args:
        texts: Iterable of token sequences; each element is iterated and every
            item is looked up in ``vocab``.
        vocab: Mapping from token to integer index. Only ``in`` and ``[]`` are
            used, so objects without a ``.get`` method also work.
        max_len: Length every index list is truncated or right-padded to.

    Returns:
        A list with one list of ``max_len`` indices per input sequence.
        Padding uses ``vocab['<pad>']`` when that entry exists, else 0.
    """
    # Use 0 as default pad value if '<pad>' is not in vocab
    pad_value = vocab['<pad>'] if '<pad>' in vocab else 0

    padded = []
    for text in texts:
        # Padding is done inline: the previous call passed max_len and
        # pad_value to pad_sequence in the wrong positions.
        ids = [vocab[token] for token in text][:max_len]
        ids.extend([pad_value] * (max_len - len(ids)))
        padded.append(ids)
    return padded

def return_loaded_model():
    """Load the trained classifier, its vocabulary and the label encoder from disk.

    Reads three files under ``./auxillary_files/``: ``vocab.pth``,
    ``lstm_model.pth`` and ``label_encoder.pth``.

    Returns:
        A ``(model, vocab, label_encoder)`` tuple. The model is an
        ``LSTMClassifier`` in eval mode with the saved weights loaded.
    """
    # NOTE(security): torch.load unpickles the file contents, which can run
    # arbitrary code; only load these files from a trusted source.
    # Load the same vocabulary used during training
    vocab = torch.load('./auxillary_files/vocab.pth')  # Load the vocab dictionary
    vocab.set_default_index(vocab['<unk>'])  # Handle unknown words

    # The layer sizes must agree with the tensors in the checkpoint, or
    # load_state_dict below raises a size-mismatch error.
    vocab_size = len(vocab)
    embedding_dim = 100
    hidden_dim = 128
    output_dim = 4  # Number of classes
    pad_idx = 1  # assumes '<pad>' has index 1 in the saved vocab — TODO confirm

    model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx)
    # map_location='cpu' lets a checkpoint saved from a GPU be loaded on a
    # machine without CUDA; the model itself is built on the CPU.
    state_dict = torch.load('./auxillary_files/lstm_model.pth', map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    print("Model loaded successfully.")

    # Label Encoder: restore the saved class list so that inverse_transform
    # maps predicted indices back to label names.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = torch.load('./auxillary_files/label_encoder.pth')  # Load label encoder

    return model, vocab, label_encoder
    

### Step 4: Run Inference

In [11]:
def classify_pdf_from_url(url):
    """Download a PDF, clean its text and classify it with the saved LSTM model.

    Args:
        url: Address of the PDF to classify.

    Returns:
        On success, a dict with keys ``'predicted_class'`` (decoded label),
        ``'confidence'`` (highest softmax probability, rounded to 2 decimals)
        and ``'extracted_clean_text'`` (the full cleaned text).
        On failure, a ``str``: ``"Failed to retrieve PDF: ..."`` when the
        download did not return status 200, or the extraction message itself
        when no usable text could be extracted.
    """
    # Step-1: - Fetch the PDF content
    extracted_text, status_code, _ = read_pdf_from_url(url)

    if status_code != 200:
        return f"Failed to retrieve PDF: {extracted_text}"

    # Extraction problems come back as message strings with status 200.
    # Detect all of them before the expensive cleaning step, so they are
    # never cleaned and classified as if they were document text.
    extraction_failed = (
        not extracted_text
        or extracted_text.startswith(("Failed", "Invalid PDF"))
        or extracted_text in ("OCR could not extract text.", "No extractable text found.")
    )
    if extraction_failed:
        return extracted_text

    # Step-2: - Clean the extracted text
    clean_text = clean_pdf_text(extracted_text)

    # Step-3: - Load the model
    model, vocab, label_encoder = return_loaded_model()

    def preprocess_inference_text(text, vocab, max_len=50):
        """Whitespace-tokenize ``text`` and return a (1, max_len) index tensor."""
        # Step 1: Tokenize the text (Simple split by space, customize if needed)
        tokens = text.lower().split()

        # Step 2: Numericalize (convert tokens to indices using vocab)
        numericalized = [vocab[token] for token in tokens]

        # Step 3: Pad the sequence to max_len
        padded_sequence = pad_sequence(numericalized, vocab, max_len)

        # Step 4: Convert to tensor and return
        return torch.tensor([padded_sequence], dtype=torch.long)  # Add batch dimension

    # Step-4: - Preprocess text and make predictions
    input_tensor = preprocess_inference_text(clean_text, vocab)

    with torch.no_grad():
        output = model(input_tensor)
        probabilities = F.softmax(output, dim=1)
        predicted_class_idx = torch.argmax(probabilities, dim=1).item()
        confidence = torch.max(probabilities).item()

    predicted_label = label_encoder.inverse_transform([predicted_class_idx])[0]

    final_output = {
        'predicted_class': predicted_label,
        'confidence': round(confidence, 2),
        'extracted_clean_text': clean_text  # Full cleaned text (not truncated)
    }
    return final_output
    

In [12]:
# Example run: fetch, clean and classify the PDF at this URL.
url = "https://media.iuseelite.com/specsheet2/ows-cyl-101.pdf"
final_output = classify_pdf_from_url(url)
# Bare expression so the notebook displays the result.
final_output

Model loaded successfully.


{'predicted_class': 'lighting',
 'confidence': 0.93,
 'extracted_clean_text': 'series sconce cylinder wall sconce series wattage w high pressure sodium w high pressure sodium w high pressure sodium w metal halide w metal halide w metal halide w metal halide compact fluorescent compact fluorescent incandescent w incandescent voltage color bronze replacement lens kit photo double fuse min temp bal standard w included please consult factory consult factory volt available w fixture guide example use decorative cylindrical shape architectural treatment complement blend building general lighting wall washing apartment parking recreation construction aluminum housing reflector made specular aluminum high efficiency removable baffle convert fixture silicone gasket effectively outside sealing optical chamber porcelain socket screw shell pin base standard ballast minimum starting temperature f c extreme temperature ballast available finish dark bronze polyester powder finish standard lens froste