In [17]:
!pip install scikit-learn-intelex pymupdf langchain-google-genai langchain-community python-dotenv faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [4]:
from sklearnex import patch_sklearn
patch_sklearn()  # Use optimized scikit-learn
import fitz  # PyMuPDF for PDF handling
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
import os
from dotenv import load_dotenv
from google.colab import drive

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [6]:
# Mount Google Drive at /content/drive.
# NOTE(review): no visible cell reads from /content/drive -- confirm the mount is still needed.
drive.mount('/content/drive')
from google.colab import userdata
# Read the Google API key from Colab Secrets instead of hard-coding it in the notebook.
api_key = userdata.get('GOOGLE_API_KEY')

# Fail fast on an empty key; the embedding and chat model helpers below all read `api_key`.
if not api_key:
    raise ValueError("API key not found. Please set your API key in Google Colab Secrets.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
from google.colab import files
# Open Colab's upload widget. The result is keyed by filename; the save cell below
# writes each value to disk in binary mode.
uploaded_files = files.upload()

# Directory that will hold the uploaded PDFs; exist_ok=True keeps this cell re-runnable.
pdf_directory = '/content/pdf_files'
os.makedirs(pdf_directory, exist_ok=True)

Saving GS 95001-9 OLD.pdf to GS 95001-9 OLD.pdf


In [9]:
# Write every uploaded file into pdf_directory under its original filename.
for filename, file_bytes in uploaded_files.items():
    target_path = os.path.join(pdf_directory, filename)
    with open(target_path, 'wb') as f:
        f.write(file_bytes)

In [10]:
def get_pdf_text(pdf_file_path):
    """Return the text of every page of the PDF at ``pdf_file_path``.

    Pages are read in document order and their text is joined with no
    separator. The document is closed when the ``with`` block exits.
    """
    with fitz.open(pdf_file_path) as doc:
        return "".join(page.get_text() for page in doc)

In [11]:
def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split text into overlapping chunks for embedding.

    Args:
        text: The full text to split. ``None`` or blank text yields no chunks.
        chunk_size: Maximum chunk length passed to the splitter (default 500,
            the value this notebook originally hard-coded).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (default 100).

    Returns:
        A list of chunk strings; empty when there is nothing to split.
    """
    # Short-circuit so callers can pass the result of a failed/empty extraction safely.
    if not text or not text.strip():
        return []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_text(text)

In [12]:
def get_vector_store(text_chunks, api_key, index_path="faiss_index"):
    """Embed text chunks, build a FAISS vector store and save it to disk.

    Args:
        text_chunks: Non-empty list of chunk strings to embed.
        api_key: Google API key passed to the embedding model.
        index_path: Where the index is saved (default "faiss_index", the path
            the query helpers in this notebook load from).

    Returns:
        The FAISS vector store that was just built and saved.

    Raises:
        ValueError: If ``text_chunks`` is empty, e.g. when the PDFs contain no
            extractable text. Raised before any embedding call is made.
    """
    if not text_chunks:
        raise ValueError("No text chunks to index; the PDF files may contain no extractable text.")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local(index_path)
    return vector_store


In [13]:
def initialize_vector_store(force_rebuild=False):
    """Build the FAISS vector store from every PDF in ``pdf_directory``.

    Nothing happens when a "faiss_index" path already exists, unless
    ``force_rebuild`` is true (use that after uploading new PDFs, otherwise
    queries keep hitting the stale index).

    Args:
        force_rebuild: Rebuild the index even if "faiss_index" already exists.

    Raises:
        ValueError: If no text could be extracted from any PDF in the directory.
    """
    if os.path.exists("faiss_index") and not force_rebuild:
        return

    print("Creating FAISS vector store from the uploaded PDF files...")

    # Match the extension case-insensitively (".PDF" is common) and sort so the
    # processing order does not depend on os.listdir's arbitrary ordering.
    pdf_names = sorted(name for name in os.listdir(pdf_directory) if name.lower().endswith(".pdf"))

    texts = []
    for filename in pdf_names:
        raw_text = get_pdf_text(os.path.join(pdf_directory, filename))
        texts.append(raw_text)
        print(f"Debug: Processed {filename} with text length: {len(raw_text)}")

    # A newline between documents keeps the last word of one file from fusing
    # with the first word of the next.
    all_text = "\n".join(texts)
    if not all_text.strip():
        raise ValueError(f"No extractable text found in any PDF under {pdf_directory}.")

    text_chunks = get_text_chunks(all_text)
    print(f"Debug: Total number of chunks created: {len(text_chunks)}")

    get_vector_store(text_chunks, api_key)
    print("Vector store created successfully!")


In [27]:
def get_conversational_chain():
    """Build the "stuff" question-answering chain used to answer queries.

    The chain pairs a gemini-pro chat model (temperature 0.3, authenticated
    with the module-level ``api_key``) with a prompt that takes ``context``
    and ``question`` variables.
    """
    qa_template = """
        You are an expert assistant with deep knowledge in various fields. When responding to queries:

        1. Start with a clear and concise explanation based on the user's concern or query.
        2. Provide accurate and relevant information based on the context.
        3. Explain complex concepts in simple, easy-to-understand language.
        4. Include relevant details or examples when applicable.
        5. Suggest practical next steps or actions the user can take if relevant.
        6. Mention any important deadlines or considerations if necessary.
        7. Clarify if any additional information or resources would be helpful.
        8. Try to list important points as bullet points one below the other when necessary, rather than in paragraphs.

        Remember to maintain a professional yet approachable tone, and always aim to provide helpful and informative responses.
        Context: {context}
        Question: {question}
        Response:
    """
    qa_prompt = PromptTemplate(input_variables=["context", "question"], template=qa_template)
    llm = ChatGoogleGenerativeAI(google_api_key=api_key, temperature=0.3, model="gemini-pro")
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)

In [28]:
def user_input(user_question):
    """Answer a question from the saved FAISS index, printing debug traces.

    Args:
        user_question: The question text typed by the user.

    Returns:
        The chain's "output_text"; "No response generated." if the chain
        output has no such key; a short prompt if the question is blank; or
        "An error occurred: ..." if any step raises (the error is also printed).
    """
    # Don't spend an embedding/LLM call on a blank question.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
        print("Debug: Embeddings created")

        # SECURITY: allow_dangerous_deserialization unpickles data stored with the index.
        # That is acceptable only because "faiss_index" is written by this notebook's own
        # get_vector_store; never load an index from an untrusted source this way.
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
        print("Debug: Vector store loaded")

        docs = new_db.similarity_search(user_question)
        print(f"Debug: Found {len(docs)} relevant documents")
        print(f"Debug: First doc content: {docs[0].page_content if docs else 'No docs found'}")

        chain = get_conversational_chain()
        # Calling the chain object directly is deprecated in LangChain; invoke() is the
        # supported entry point and accepts the same return_only_outputs flag.
        response = chain.invoke({"input_documents": docs, "question": user_question}, return_only_outputs=True)
        print(f"Debug: Generated response: {response}")

        return response.get("output_text", "No response generated.")
    except Exception as e:
        # Deliberate best-effort: report the failure as text so the interactive cell keeps running.
        print(f"Error in user_input: {str(e)}")
        return f"An error occurred: {str(e)}"


In [18]:
# Build the FAISS index from the uploaded PDFs. initialize_vector_store skips the work
# when a "faiss_index" path already exists, so re-running this cell is cheap.
initialize_vector_store()

Creating FAISS vector store from the uploaded PDF files...
Debug: Processed GS 95001-9 OLD.pdf with text length: 34499
Debug: Total number of chunks created: 85
Vector store created successfully!


In [30]:
# Working Outline based Output
user_question = input("Enter your question: ")
# Get the response
response = user_input(user_question)
print(response)

Enter your question: What is VDC?
Debug: Embeddings created
Debug: Vector store loaded
Debug: Found 4 relevant documents
Debug: First doc content: Application 
Mo 
Month(s) 
OCV 
Open-circuit voltage  
PDI 
Pre-delivery inspection; 
transfer inspection before vehicle is delivered to dealership and/or customer. 
PU 
Production interruption 
PWF 
Parking, standby, driving; 
status management for function releases and power supply in the vehicle; replaces the "pin 
control". 
SoC 
State of charge, indicated in % 
SoP 
Start of Production 
VDC 
Vehicle distribution center 
VPC 
Vehicle preparation center 
VZG 
Garching shipping center 
Wk
Debug: Generated response: {'output_text': 'VDC stands for Vehicle Distribution Center.'}
VDC stands for Vehicle Distribution Center.


In [31]:
def get_response(user_question):
    """Answer a question from the saved FAISS index without debug output.

    Args:
        user_question: The question text typed by the user.

    Returns:
        The chain's "output_text"; "No response generated." if the chain
        output has no such key; a short prompt if the question is blank; or
        "An error occurred: ..." if any step raises.
    """
    # Don't spend an embedding/LLM call on a blank question.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)

        # SECURITY: allow_dangerous_deserialization unpickles data stored with the index.
        # That is acceptable only because "faiss_index" is written by this notebook's own
        # get_vector_store; never load an index from an untrusted source this way.
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

        docs = new_db.similarity_search(user_question)

        chain = get_conversational_chain()
        # Calling the chain object directly is deprecated in LangChain; invoke() is the
        # supported entry point and accepts the same return_only_outputs flag.
        response = chain.invoke({"input_documents": docs, "question": user_question}, return_only_outputs=True)

        return response.get("output_text", "No response generated.")
    except Exception as e:
        # Deliberate best-effort: report the failure as text so the interactive cell keeps running.
        return f"An error occurred: {str(e)}"

In [32]:
# Prompt the user for a question
user_question = input("Enter your question: ")

# Get the response
response = get_response(user_question)
print(response)

Enter your question: What is VDC?
VDC stands for Vehicle Distribution Center.
