In [1]:
pip install pdfplumber



# Step 1: Extract Text from PDFs

In [2]:
# Mount Google Drive so files stored there are reachable from the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pdfplumber
import os

# Root folder holding the source PDF files. The path sits under the Google Drive
# mount point, so Drive has to be mounted before it is used. The folder is walked
# recursively further down, so PDFs in subfolders are picked up as well.
pdf_folder_path = "/content/drive/MyDrive/Data_Consigli"  # Folder path containing the PDF files

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Pages are joined with a newline so that the last word of one page does not
    fuse with the first word of the next. Pages for which no text is returned
    are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The joined page text (possibly an empty string), or ``None`` if the
        file could not be opened or parsed.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # extract_text() can yield None or '' for a page; drop those.
            # The join runs inside the `with` so pages are read while the file is open.
            return '\n'.join(text for text in page_texts if text)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"Error reading {pdf_path}: {e}")
        return None

# Extract text from all PDF files in the folder (subfolders included).
# Results are keyed by bare file name; a value of None or '' means no text was obtained.
pdf_texts = {}
print("Starting text extraction from PDF files...")
for root, dirs, files in os.walk(pdf_folder_path):
    for file in files:
        if not file.endswith('.pdf'):
            continue
        file_path = os.path.join(root, file)
        if file in pdf_texts:
            # Same file name seen in another subfolder: the earlier result is replaced.
            print(f"Warning: duplicate file name {file}; overwriting earlier result with {file_path}")
        text = extract_text_from_pdf(file_path)
        pdf_texts[file] = text
        # Report success only when text was actually produced, so failures are
        # not hidden behind an "Extracted" message.
        if text:
            print(f"Extracted text from {file}")
        else:
            print(f"No text could be extracted from {file}")

# Show a short sample of each document so the extraction can be reviewed by eye
print("\nExtraction completed. Sample content from extracted PDFs:\n")
for file_name, text in pdf_texts.items():
    if text:
        # Print a sample of the extracted text
        print(f"Sample content from {file_name}:\n{text[:1000]}\n{'-'*80}\n")
    else:
        print(f"No text extracted from {file_name}.\n{'-'*80}\n")

extracted_count = sum(1 for text in pdf_texts.values() if text)
print(f"{extracted_count} of {len(pdf_texts)} files yielded text.")
print("Text extraction process finished. All files processed.")

Starting text extraction from PDF files...
Extracted text from BMW_Annual_Report_2023.pdf
Extracted text from BMW_Annual_Report_2022.pdf
Extracted text from BMW_Annual_Report_2021.pdf
Extracted text from Tesla_Annual_Report_2023.pdf
Extracted text from Tesla_Annual_Report_2022.pdf
Extracted text from Ford_Annual_Report_2023.pdf
Extracted text from Ford_Annual_Report_2022.pdf


# Step 2: Clean and Prepare Data

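Once the text is extracted, preprocess it before indexing. The code below collapses runs of whitespace and strips special characters (everything except letters, digits, underscores, whitespace, periods, and commas), then saves each cleaned document as a `.txt` file. Note that it does not detect or remove page headers and footers; if those matter for your documents, remove them in a separate step.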

In [None]:
import re

def clean_text(text, keep_chars=".,"):
    """Normalize extracted PDF text for indexing.

    Characters other than word characters, whitespace and ``keep_chars`` are
    removed first; runs of whitespace are collapsed afterwards. Doing it in
    this order means that deleting a symbol that stood between two spaces
    (e.g. ``"a - b"``) does not leave a double space behind.

    Args:
        text: Raw text to clean.
        keep_chars: Punctuation characters to preserve in addition to word
            characters and whitespace. Defaults to period and comma; pass
            e.g. ``".,%$-"`` to keep symbols that carry meaning in figures.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # re.escape keeps characters such as '-', ']' or '^' literal inside the class.
    unwanted = rf'[^\w\s{re.escape(keep_chars)}]'
    text = re.sub(unwanted, '', text)
    # Collapse whitespace last so gaps left by removed symbols are merged too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean all extracted texts; files for which no text was extracted are skipped
cleaned_texts = {file_name: clean_text(text) for file_name, text in pdf_texts.items() if text}

# Save cleaned text into files for later use
output_folder = "cleaned_texts"
os.makedirs(output_folder, exist_ok=True)

for file_name, text in cleaned_texts.items():
    # Swap only the final extension. str.replace('.pdf', '.txt') would also
    # rewrite a ".pdf" that happens to occur in the middle of the file name.
    base_name, _ = os.path.splitext(file_name)
    output_path = os.path.join(output_folder, base_name + '.txt')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Cleaned text saved to {output_path}")


# Step 3: Index the Data for RAG

We'll use FAISS (Facebook AI Similarity Search) to index the cleaned text data. FAISS is a library for efficient similarity search and clustering of dense vectors.

In [None]:
pip install faiss-cpu

Convert Text to Embeddings
We need to convert the cleaned text into embeddings. We'll use a pre-trained model from the sentence-transformers package to generate these embeddings.

In [None]:
!pip install sentence-transformers

Python Code for Indexing
Here's the Python code to convert text into embeddings and index them using FAISS:

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Sentence-embedding model used to turn each document into a vector
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Function to read cleaned text files
def read_cleaned_texts(folder_path):
    """Load every ``.txt`` file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each file name (with extension) to its full content,
        decoded as UTF-8. Files with other extensions are ignored.
    """
    def _load(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    txt_names = [entry for entry in os.listdir(folder_path) if entry.endswith('.txt')]
    return {entry: _load(os.path.join(folder_path, entry)) for entry in txt_names}

# Load every cleaned document from disk
cleaned_text_folder = "cleaned_texts"
cleaned_texts = read_cleaned_texts(cleaned_text_folder)

# Debugging print statements
print(f"Number of documents read: {len(cleaned_texts)}")

# Keep names and texts as parallel lists: row i of the index belongs to document_names[i]
document_names = [name for name in cleaned_texts]
document_texts = [cleaned_texts[name] for name in document_names]

if document_texts:
    # One embedding vector per document.
    # NOTE(review): each whole report is encoded as a single input; the model
    # presumably truncates long inputs, so only the start of each document may
    # be represented. Consider chunking the documents; confirm against the
    # model's maximum sequence length.
    document_embeddings = model.encode(document_texts)

    # Debugging print statements
    print(f"Shape of document_embeddings: {document_embeddings.shape}")

    # Flat L2 index sized to the embedding width
    dimension = document_embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(np.array(document_embeddings))

    # Persist the index and the row-to-name mapping for the QA step
    faiss.write_index(faiss_index, "document_faiss_index.index")
    with open("document_names.txt", 'w', encoding='utf-8') as names_file:
        names_file.writelines(name + "\n" for name in document_names)

    print("Index created and saved successfully.")
else:
    print("No documents found in the specified folder.")

# Step 4: Implement the LLM-Powered QA System

In [None]:
pip install openai #== 0.28

In [None]:
import openai
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

# Artifacts written by the indexing step
INDEX_PATH = "document_faiss_index.index"
NAMES_PATH = "document_names.txt"

# Embedding model (must match the one used to build the index) and the saved index
model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = faiss.read_index(INDEX_PATH)

# Document names, one per line, in the same order as the index rows
with open(NAMES_PATH, 'r', encoding='utf-8') as names_file:
    document_names = names_file.read().splitlines()

# Set the OpenAI API key without embedding it in the notebook: read it from the
# OPENAI_API_KEY environment variable, or prompt for it (input is not echoed).
# SECURITY: any key that has ever been saved in a notebook or repository must be
# treated as leaked and revoked in the OpenAI dashboard.
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")

# Function to perform the retrieval
def retrieve_relevant_documents(query, top_k=5):
    """Return the documents whose embeddings are closest to the query.

    Args:
        query: Question text to embed and search for.
        top_k: Maximum number of documents to return. Values larger than the
            number of indexed vectors are clamped.

    Returns:
        A list of ``(document_name, distance)`` tuples in the order returned
        by the index search. Empty when ``top_k`` is not positive or the index
        holds no vectors.
    """
    if top_k <= 0 or faiss_index.ntotal == 0:
        return []
    # Never ask for more neighbours than the index holds.
    k = min(top_k, faiss_index.ntotal)
    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    distances, indices = faiss_index.search(query_embedding, k)
    # FAISS pads missing neighbours with index -1, which would otherwise pick
    # document_names[-1]; also guard against an index/name-list size mismatch.
    return [
        (document_names[idx], distances[0][rank])
        for rank, idx in enumerate(indices[0])
        if 0 <= idx < len(document_names)
    ]

# Function to truncate context to fit within the token limit
def truncate_context(context, max_tokens=1000):
    """Keep at most the first ``max_tokens`` whitespace-separated words.

    The limit is counted in words produced by ``str.split()``, not in model
    tokens. The kept words are re-joined with single spaces, so any original
    whitespace runs are normalized.
    """
    return ' '.join(context.split()[:max_tokens])

# Function to generate an answer using GPT
def generate_answer(query, context):
    """Ask the chat model to answer ``query`` given ``context``.

    The context is shortened with ``truncate_context`` before it is placed in
    the user message, and the reply is limited to 150 tokens.

    Args:
        query: The user's question.
        context: Text the answer should draw on.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    shortened = truncate_context(context)
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context: {shortened}\n\nQuestion: {query}\nAnswer:"},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        max_tokens=150,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

# Interactive QA system
def interactive_qa(top_k=5, context_words=1000):
    """Run a console question-answering loop until the user exits.

    For each question the closest documents are retrieved, an excerpt of each
    is combined into one context, and the generated answer is printed.

    The word budget is split evenly between the retrieved documents. Joining
    the full documents and truncating afterwards would keep only the opening
    words of the top-ranked document, so the other documents would never
    reach the model.

    Args:
        top_k: Number of documents to retrieve per question.
        context_words: Total number of context words sent with each question.
    """
    while True:
        try:
            query = input("Enter your question (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupted cell or closed stdin: leave the loop quietly.
            print()
            break
        if query.lower() == 'exit':
            break
        if not query:
            # Nothing to ask; prompt again without calling the API.
            continue
        relevant_docs = retrieve_relevant_documents(query, top_k=top_k)
        # Skip names that have no loaded text instead of failing with KeyError.
        available = [name for name, _ in relevant_docs if name in cleaned_texts]
        if not available:
            print("No indexed documents are available to answer this question.\n")
            continue
        words_per_doc = max(1, context_words // len(available))
        context = " ".join(
            truncate_context(cleaned_texts[name], max_tokens=words_per_doc)
            for name in available
        )
        answer = generate_answer(query, context)
        print(f"Answer: {answer}\n")

# Start the interactive QA system. This call blocks, prompting for questions
# until the user types 'exit'.
interactive_qa()