In [None]:
%pip install PyMuPDF 
%pip install fitz
%pip install tqdm
%pip install bs4
%pip install PyPDF2
%pip install langchain
%pip install nltk
%pip install spacy



In [None]:
import warnings

# Ignore all warnings (not recommended unless you know the implications)
warnings.filterwarnings('ignore')

# Filter out specific warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)


In [None]:
import json
import os
import re
from urllib.parse import urljoin

import PyPDF2
import requests
from bs4 import BeautifulSoup

# Page to scrape for annual-report PDF links; passed to fetch_pdf_urls() below.
url = "https://www.icar-crida.res.in/publications_annualreports.html"

def fetch_pdf_urls(url):
    """Return the annual-report PDF links found on the page at ``url``.

    Every ``<a href>`` ending in ``.pdf`` is resolved against ``url`` and kept
    only if the absolute address matches the
    ``https://www.icar-crida.res.in/<...>/Annualreports/<file>.pdf`` layout.
    Duplicate links are dropped; first-seen order is preserved.

    Args:
        url: Address of the HTML page to scrape.

    Returns:
        list[str]: Absolute PDF URLs. Empty if the page could not be fetched
        (the error is printed, not raised).
    """
    try:
        # Without a timeout a stalled server would hang the notebook forever.
        # A timeout surfaces as a RequestException and is reported below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch webpage: {url} with error: {e}")
        return []

    # Only PDFs that sit directly inside an Annualreports/ directory qualify.
    report_pattern = re.compile(
        r'https://www\.icar-crida\.res\.in/.*?/Annualreports/[^/]+\.pdf$'
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    pdf_links = []
    seen = set()
    for link in soup.find_all('a'):
        href = (link.get('href') or '').strip()
        if not href.endswith('.pdf'):
            continue
        # urljoin resolves relative, root-relative and protocol-relative
        # ("//host/x.pdf") hrefs correctly. Gluing the site root onto the href
        # would turn the last form into a bogus on-site URL.
        absolute = urljoin(url, href)
        if absolute in seen or not report_pattern.match(absolute):
            continue
        seen.add(absolute)
        pdf_links.append(absolute)
    return pdf_links

def download_pdfs(pdf_urls, download_dir='downloaded_pdfs', timeout=60):
    """Download each URL into ``download_dir`` and return the local paths.

    The file name is the last ``/``-separated segment of the URL. A file that
    already exists is not fetched again but is still included in the result.
    Failed downloads are reported with ``print`` and left out of the result.

    Args:
        pdf_urls: Iterable of PDF URLs.
        download_dir: Target directory; created if missing.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        list[str]: Paths of the files that are available locally, in input order.
    """
    os.makedirs(download_dir, exist_ok=True)
    downloaded_files = []
    for pdf_url in pdf_urls:
        filename = pdf_url.split('/')[-1]
        filepath = os.path.join(download_dir, filename)
        if os.path.exists(filepath):
            print(f"Skipping {filename}. Already downloaded.")
            downloaded_files.append(filepath)
            continue

        # Write to a side file and rename only when complete. An interrupted
        # download must not leave a truncated "<name>.pdf" that the exists()
        # check above would treat as finished on the next run.
        partial_path = filepath + '.part'
        try:
            with requests.get(pdf_url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    # Stream in blocks so large reports are not held in memory.
                    for block in response.iter_content(chunk_size=64 * 1024):
                        f.write(block)
            os.replace(partial_path, filepath)
        except requests.RequestException as e:
            print(f"Failed to download {filename}: {e}")
            continue
        finally:
            # After a successful rename the side file no longer exists.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        downloaded_files.append(filepath)
        print(f"Downloaded {filename} to {download_dir}")
    return downloaded_files

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of ``pdf_path``, concatenated in page order.

    The file is read with ``PyPDF2.PdfReader``; page texts are joined with no
    separator between them.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Join while the file is still open: pages are read through the handle.
        return ''.join(page.extract_text() for page in pages)

def clean_text(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

# Scrape the index page for report links.
pdf_urls = fetch_pdf_urls(url)

# Keep a record of the scraped links next to the notebook.
urls_as_json = json.dumps(pdf_urls, ensure_ascii=False, indent=2)
with open('icar_crida_report_urls.json', 'w', encoding='utf-8') as f:
    f.write(urls_as_json)

print(pdf_urls)

# Fetch the PDFs themselves; files already on disk are reused.
downloaded_pdfs = download_pdfs(pdf_urls)
print("PDF download and URL scraping completed!")

# Directory to save the text files
text_dir = 'extracted_texts'
os.makedirs(text_dir, exist_ok=True)

# Process each downloaded PDF: extract, normalise whitespace, save as .txt
for pdf_path in downloaded_pdfs:
    text = extract_text_from_pdf(pdf_path)
    cleaned_text = clean_text(text)
    # Swap only the final extension. str.replace('.pdf', '.txt') would also
    # rewrite any '.pdf' that happens to occur inside the file stem.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    txt_filename = pdf_stem + '.txt'
    txt_filepath = os.path.join(text_dir, txt_filename)
    with open(txt_filepath, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)
    print(f"Extracted, cleaned, and saved text from {pdf_path} to {txt_filepath}")

print("Text extraction, cleaning, and saving completed!")


In [None]:
import fitz
import os
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of ``pdf_path`` with newlines turned into spaces.

    The PDF is read with PyMuPDF (``fitz``). Any error while opening or reading
    is printed and an empty string is returned, so callers can skip the file.

    Note: this redefines the PyPDF2-based function of the same name from the
    earlier cell; after this cell runs, that earlier version is shadowed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Concatenated page text, or ``""`` on failure.
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse; a bare fitz.open() left every document open.
        with fitz.open(pdf_path) as doc:
            return "".join(
                page.get_text().replace('\n', ' ')  # Remove \n characters
                for page in doc
            )
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""
from nltk.tokenize import sent_tokenize, word_tokenize

# Function to preprocess text and chunk it into overlapping token windows
def preprocess_and_chunk_sentences(text, max_chunk_size=300, chunk_overlap=10):
    """Split ``text`` into space-joined token chunks that overlap by a few tokens.

    The text is sentence-tokenized, then word-tokenized. Tokens accumulate
    until the space-joined chunk is at least ``max_chunk_size`` characters
    long; the chunk is then emitted and the next one starts with the last
    ``chunk_overlap`` tokens of the emitted chunk. Chunks may end mid-sentence.

    NOTE(review): ``sent_tokenize``/``word_tokenize`` need NLTK's tokenizer
    data; nothing visible in this notebook downloads it - confirm it is installed.

    Args:
        text: Input text.
        max_chunk_size: Length threshold in characters (not tokens).
        chunk_overlap: Number of trailing tokens carried into the next chunk;
            ``0`` disables overlap.

    Returns:
        list[str]: The chunks, in order. Empty for text with no tokens.

    Raises:
        ValueError: If ``chunk_overlap`` is negative.
    """
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must be non-negative")

    chunks = []
    current_chunk = []
    current_len = 0    # always equals len(' '.join(current_chunk))
    fresh_tokens = 0   # tokens appended since the last emitted chunk

    for sentence in sent_tokenize(text):  # Tokenize text into sentences
        for token in word_tokenize(sentence):  # Tokenize each sentence into words
            # Track the joined length incrementally instead of re-joining the
            # whole chunk for every token.
            current_len += len(token) + (1 if current_chunk else 0)
            current_chunk.append(token)
            fresh_tokens += 1
            if current_len >= max_chunk_size:
                chunks.append(' '.join(current_chunk))
                # current_chunk[-0:] is the WHOLE list, so a zero overlap has
                # to be handled explicitly or the buffer would never be cleared.
                current_chunk = current_chunk[-chunk_overlap:] if chunk_overlap else []
                current_len = len(' '.join(current_chunk))
                fresh_tokens = 0

    # Emit the remainder only if it holds tokens beyond the carried-over
    # overlap; otherwise it would just repeat the tail of the previous chunk.
    if fresh_tokens:
        chunks.append(' '.join(current_chunk))

    return chunks


# Function to process PDFs and directly create text files
def process_pdfs_and_create_files(pdf_dir='downloaded_pdfs', output_folder='final_chunks'):
    """Chunk every PDF in ``pdf_dir`` and write one text file per PDF.

    For each PDF the text is extracted with ``extract_text_from_pdf`` and split
    with ``preprocess_and_chunk_sentences``. The chunks are written to
    ``<output_folder>/<pdf stem>.txt``, each followed by a blank line. PDFs
    that yield no text produce no output file.

    Args:
        pdf_dir: Directory containing the PDFs.
        output_folder: Directory for the chunk files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

    # Select the PDFs up front so the progress bar counts only real work.
    # Sorting makes the order reproducible (os.listdir order is arbitrary) and
    # the lower() check also picks up files named "*.PDF".
    pdf_names = sorted(
        name for name in os.listdir(pdf_dir) if name.lower().endswith('.pdf')
    )

    for filename in tqdm(pdf_names, desc="Processing PDFs"):
        pdf_path = os.path.join(pdf_dir, filename)
        text = extract_text_from_pdf(pdf_path)
        if not text:
            # Say so instead of skipping silently (e.g. image-only PDFs).
            print(f"Skipped {filename}: no text extracted")
            continue

        # Preprocess and chunk text
        chunks = preprocess_and_chunk_sentences(text)
        # Create text file for each PDF
        output_file_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")
        with open(output_file_path, 'w', encoding='utf-8') as f:
            # A blank line separates consecutive chunks.
            f.writelines(chunk + '\n\n' for chunk in chunks)
        print(f"Processed {filename}: {len(chunks)} chunks")

# Run the chunking pipeline with its defaults: read the PDFs in
# 'downloaded_pdfs' and write one chunk file per PDF into 'final_chunks'.
process_pdfs_and_create_files()
