In [1]:
import functools
import logging
import os
import re

import fitz  # PyMuPDF
import joblib  # For saving models
import nltk
import numpy as np
import pandas as pd
import pytesseract
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from tika import detector
from tika import parser as tika_parser
from tqdm import tqdm

print("--- Starting One-Time Pre-processing ---")

# --- 1. Set up all paths and functions (from your notebook) ---

# Machine-specific locations. Each one can be overridden with an environment
# variable (TESSERACT_CMD, POPPLER_PATH, DATASET_FOLDER) so the script can run
# on another machine without editing the source; when the variable is unset or
# empty, the original hard-coded default below is used.
tesseract_install_path = (
    os.environ.get('TESSERACT_CMD')
    or r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)
# pytesseract shells out to this executable for OCR.
pytesseract.pytesseract.tesseract_cmd = tesseract_install_path
# Directory passed to pdf2image as `poppler_path` when rasterising PDFs.
poppler_install_path = (
    os.environ.get('POPPLER_PATH')
    or r"C:\Users\VAMSHI KRISHNA BABU\poppler\poppler-25.07.0\Library\bin"
)
# Root folder that is walked for PDFs; its first-level sub-folders are used
# as author names.
DATASET_FOLDER = (
    os.environ.get('DATASET_FOLDER')
    or r"C:\Users\VAMSHI KRISHNA BABU\Applied AI A-2\Dataset"
)

# (Copy your helper functions here)
_logger = logging.getLogger(__name__)


def _extract_with_pymupdf(pdf_path):
    """Return the concatenated text of every page as read by PyMuPDF."""
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def _extract_with_tika(pdf_path):
    """Return the stripped 'content' field reported by Tika ("" if absent)."""
    parsed = tika_parser.from_file(pdf_path)
    return (parsed.get('content') or "").strip()


def _extract_with_ocr(pdf_path):
    """Convert the PDF to page images and OCR each one with Tesseract."""
    images = convert_from_path(pdf_path, poppler_path=poppler_install_path)
    return "".join(
        pytesseract.image_to_string(img, lang='eng') for img in images
    )


def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, falling back through several backends.

    The backends are tried in this order: PyMuPDF, Tika, then OCR
    (pdf2image + Tesseract). The first one that produces text containing
    something other than whitespace wins.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text (already stripped when it comes from Tika,
        unmodified otherwise), or "" when every backend fails or finds
        no text.
    """
    backends = (
        ("PyMuPDF", _extract_with_pymupdf),
        ("Tika", _extract_with_tika),
        ("OCR", _extract_with_ocr),
    )
    for backend_name, extract in backends:
        try:
            text = extract(pdf_path)
        except Exception as exc:
            # Deliberately broad: extraction is best-effort and the backends
            # raise unrelated exception types. The failure is logged instead
            # of being silently dropped, then the next backend is tried.
            _logger.warning(
                "%s extraction failed for %s: %s", backend_name, pdf_path, exc
            )
            continue
        if text.strip():
            return text
    return ""

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and reuse it afterwards."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise raw text into a space-separated string of content words.

    The text is lower-cased, every character other than ASCII letters and
    whitespace is deleted, the result is tokenized with NLTK, and English
    stop words are removed.

    Note that unwanted characters are deleted rather than replaced by a
    space, so "state-of-the-art" becomes the single token "stateoftheart".

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    letters_only = _NON_LETTER_RE.sub('', text.lower())
    # The stop-word set is cached: this function is called once per paper and
    # reloading the list from the NLTK corpus each time is wasted work.
    stop_words = _english_stop_words()
    return " ".join(
        token for token in word_tokenize(letters_only)
        if token not in stop_words
    )

# --- 2. Build the database (This is the slow part) ---
# Walks DATASET_FOLDER, extracts and cleans the text of every PDF found, and
# collects one row per readable paper.
print("Building author database... (This will take time)")
authors_data = []
for root, dirs, files in tqdm(os.walk(DATASET_FOLDER), desc="Scanning Folders"):
    # The author is the first path component below DATASET_FOLDER, so papers
    # in nested sub-folders still belong to their top-level author folder.
    # (PDFs placed directly in DATASET_FOLDER get the author name '.'.)
    relative_path = os.path.relpath(root, DATASET_FOLDER)
    author_name = relative_path.split(os.path.sep)[0]
    for paper_file in files:
        # Case-insensitive match so that e.g. "paper.PDF" is not skipped.
        if not paper_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(root, paper_file)
        raw_text = extract_text_from_pdf(pdf_path)
        if not raw_text:
            # Nothing could be extracted from this file; leave it out.
            continue
        processed_text = preprocess_text(raw_text)
        authors_data.append({
            'author': author_name,
            'paper': paper_file,
            'processed_text': processed_text
        })
# Explicit columns keep the frame well-formed even when no paper was found.
author_df = pd.DataFrame(
    authors_data, columns=['author', 'paper', 'processed_text']
)
print(f"Database built with {len(author_df)} papers.")
if author_df.empty:
    # Stop here with a clear message instead of letting a later step fail
    # with an obscure error on an empty database.
    raise RuntimeError(
        f"No readable PDF files were found under {DATASET_FOLDER!r}; "
        "check the DATASET_FOLDER path and the PDF extraction tools."
    )

# --- 3. Build and save the TF-IDF Model ---
print("Building TF-IDF model...")
# Fit on the cleaned paper texts using unigrams and bigrams, with the
# document-frequency cut-offs given by min_df / max_df.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(author_df['processed_text'])

# Persist both the fitted vectorizer and the matrix it produced.
print("Saving TF-IDF models to disk...")
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.joblib')
joblib.dump(value=tfidf_matrix, filename='tfidf_matrix.joblib')

# --- 4. Build and save the Embedding Model ---
print("Building Embedding model... (This will also take time)")
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode the cleaned text of every paper, passed in author_df row order.
paper_embeddings = model.encode(
    author_df['processed_text'].tolist(),
    show_progress_bar=True,
)

# Store the embeddings as a NumPy .npy file.
print("Saving Embeddings to disk...")
np.save(file='paper_embeddings.npy', arr=paper_embeddings)

# --- 5. Save the clean Author DataFrame ---
# Parquet is used as the on-disk format for the author table.
print("Saving author database to disk...")
author_df.to_parquet('author_database.parquet')

# Final summary: one message per line, listing every artifact written above.
print(
    "\n--- ✅ PRE-PROCESSING COMPLETE! ---",
    "The following files have been created in your folder:",
    " - author_database.parquet",
    " - paper_embeddings.npy",
    " - tfidf_vectorizer.joblib",
    " - tfidf_matrix.joblib",
    "\nYou are now ready to use the fast app.py script.",
    sep="\n",
)


--- Starting One-Time Pre-processing ---
Building author database... (This will take time)


Scanning Folders: 14it [00:10,  1.08s/it]

MuPDF error: library error: FT_New_Memory_Face(Times-Bold): unknown file format

MuPDF error: library error: FT_New_Memory_Face(Times-Bold): unknown file format



Scanning Folders: 72it [02:10,  1.81s/it]


Database built with 637 papers.
Building TF-IDF model...
Saving TF-IDF models to disk...
Building Embedding model... (This will also take time)


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Saving Embeddings to disk...
Saving author database to disk...

--- ✅ PRE-PROCESSING COMPLETE! ---
The following files have been created in your folder:
 - author_database.parquet
 - paper_embeddings.npy
 - tfidf_vectorizer.joblib
 - tfidf_matrix.joblib

You are now ready to use the fast app.py script.
