In [6]:
import pdfplumber
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
from transformers import pipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import warnings
from transformers import logging

# Keep the run quiet: ignore Python UserWarnings and lower transformers' log verbosity.
# NOTE: `logging` here is `transformers.logging` (imported above), not the stdlib module.
warnings.filterwarnings(action='ignore', category=UserWarning)
logging.set_verbosity_error()

# Helper class to simulate the expected structure by text splitter
class Document:
    """Minimal document record exposing ``page_content`` and ``metadata``.

    Instances are built from extracted page text and handed to the text
    splitter's ``split_documents`` call elsewhere in this file.

    Args:
        text: The text of the document; stored as ``page_content``.
        metadata: Optional mapping of extra information about the document
            (for example its source file or page number). It is copied, so
            later changes to the caller's mapping do not affect the document.
            Defaults to an empty dict.
    """

    def __init__(self, text, metadata=None):
        self.page_content = text  # Text of the document
        # A fresh dict per instance: never share one mutable default between documents.
        self.metadata = {} if metadata is None else dict(metadata)

    def __repr__(self):
        return (
            f"{type(self).__name__}(page_content={self.page_content!r}, "
            f"metadata={self.metadata!r})"
        )

# Normalize raw text: reject anything that is not a non-blank string, flatten newlines
def prepare_text(text):
    """Return ``text`` with every newline replaced by a single space.

    Anything that is not a ``str``, and any string made up only of
    whitespace, yields ``""`` so callers can test the result for truthiness
    instead of handling errors.
    """
    if not isinstance(text, str):
        return ""
    if not text.strip():
        return ""
    # Splitting on "\n" and re-joining with " " swaps each newline for one space.
    return " ".join(text.split("\n"))

# Read every page of each PDF with pdfplumber and keep one Document per non-empty page
pdf_files = ["Msiri_one.pdf", "Msiri_two_many.pdf"]

documents = []
for pdf_file in pdf_files:
    with pdfplumber.open(pdf_file) as pdf:
        # Lazily clean each page's extracted text; pages that clean to "" are skipped.
        page_texts = map(prepare_text, (page.extract_text() for page in pdf.pages))
        documents.extend(Document(page_text) for page_text in page_texts if page_text)

# Fail early: nothing downstream can work without at least one page of text.
if not documents:
    raise ValueError("No documents loaded, check PDF paths and contents.")

# Break the page-level documents into overlapping chunks (1000 characters, 100 overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
splits = text_splitter.split_documents(documents)

# Embed the text of every chunk with a SentenceTransformer model
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
splits_content = [chunk.page_content for chunk in splits]
embeddings = embedding_model.encode(splits_content)

# Build a flat L2 FAISS index as wide as one embedding vector and load all chunk vectors.
# Row i of the index corresponds to splits[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
chunk_vectors = np.array(embeddings)
index.add(chunk_vectors)

# Function to query documents based on textual query
def query_documents(query, top_k=5):
    """Return the indexed chunks closest to ``query``.

    The query is cleaned with ``prepare_text``, embedded with the module-level
    ``embedding_model`` and searched against the module-level FAISS ``index``.

    Args:
        query: Free-text query. Non-string or blank input is embedded as the
            empty string, because ``prepare_text`` maps such input to ``""``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` entries of ``splits``, in the order the
        index search returns them. The list is empty when ``top_k`` is not
        positive or the index holds no vectors.
    """
    # FAISS pads the result with label -1 when asked for more neighbours than
    # it stores; indexing ``splits[-1]`` would then silently return the last
    # chunk as a bogus hit. Never request more than the index contains.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []
    query_embedding = embedding_model.encode([prepare_text(query)])
    _, indices = index.search(query_embedding, k)
    # Belt and braces: drop any remaining "no result" (-1) labels.
    return [splits[i] for i in indices[0] if i >= 0]

# Hugging Face pipelines that back the summarization and question-answering chains.
# Each pipeline takes its model and its tokenizer from the same checkpoint name.
SUMMARIZATION_CHECKPOINT = "facebook/bart-large-cnn"
QUESTION_ANSWERING_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"

summarization_pipeline = pipeline(
    "summarization",
    model=SUMMARIZATION_CHECKPOINT,
    tokenizer=SUMMARIZATION_CHECKPOINT,
)
question_answering_pipeline = pipeline(
    "question-answering",
    model=QUESTION_ANSWERING_CHECKPOINT,
    tokenizer=QUESTION_ANSWERING_CHECKPOINT,
)

class CombineDocsChain:
    """Join several documents into one text and summarize it.

    Args:
        llm: Callable invoked as ``llm(text, truncation=True, max_length=512,
            min_length=30, do_sample=False)`` that returns a sequence whose
            first item is a mapping with a ``'summary_text'`` key. In this
            file it is the Hugging Face summarization pipeline.
    """

    def __init__(self, llm):
        self.llm = llm

    def __call__(self, documents):
        """Summarize the space-joined ``page_content`` of ``documents``.

        Args:
            documents: Iterable of objects exposing ``page_content``.

        Returns:
            The summary string, or ``""`` when there is no text to summarize.
        """
        combined_text = " ".join(doc.page_content for doc in documents)
        # Nothing to summarize: skip the model call rather than feed it empty input.
        if not combined_text.strip():
            return ""
        # Several concatenated chunks can exceed the model's maximum input
        # length; ask the pipeline to truncate instead of failing on long input.
        result = self.llm(
            combined_text,
            truncation=True,
            max_length=512,
            min_length=30,
            do_sample=False,
        )
        return result[0]['summary_text']

class QuestionGeneratorChain:
    """Answer a question against a context by delegating to ``llm``.

    ``llm`` is called with ``question`` and ``context`` keyword arguments and
    must return a mapping that has an ``'answer'`` key.
    """

    def __init__(self, llm):
        self.llm = llm

    def __call__(self, context, question):
        """Return the ``'answer'`` field of the model's response."""
        response = self.llm(context=context, question=question)
        return response['answer']

# Testing the document processing components
# Step 1: retrieve the chunks nearest to a sample query and summarize them.
query = "I feel worthless"
relevant_docs = query_documents(query)
combine_docs_chain = CombineDocsChain(summarization_pipeline)
summary = combine_docs_chain(relevant_docs)
print(f"Summary: {summary}")

# Step 2: answer a follow-up question, using the summary as the context.
question = "I feel stressed?"
question_gen_chain = QuestionGeneratorChain(question_answering_pipeline)
answer = question_gen_chain(context=summary, question=question)
# Show the answer as well; previously it was computed but never displayed.
print(f"Answer: {answer}")





Summary: The social context in which a person lives is a big influence in self-esteem. The feeling of being worthless is caused by your inside being not matching your outside result. Therapy could be a great way to change this feeling of worthlessness.
Answer: worthless
