Imports and Configuration

In [1]:
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from openai import OpenAI
import ipywidgets as widgets
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import asyncio

# Configuration
# Endpoints and keys are read from environment variables so that secrets never
# have to be typed into (and saved with) the notebook. Each value falls back to
# an empty string when its variable is not set.
FORM_RECOGNIZER_ENDPOINT = os.environ.get("FORM_RECOGNIZER_ENDPOINT", "")
FORM_RECOGNIZER_KEY = os.environ.get("FORM_RECOGNIZER_KEY", "")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY", "")


Initialize Clients


In [2]:
# Shared in-memory state used by the cells below.
vector_store = dict()          # filled by add_to_vector_store(), keyed by file name
conversation_history = list()  # chat messages sent to / received from the model

# Chat-completions client, pointed at the configured OpenAI endpoint.
client = OpenAI(api_key=AZURE_OPENAI_KEY, base_url=AZURE_OPENAI_ENDPOINT)

# Form Recognizer client used by extract_text().
document_analysis_client = DocumentAnalysisClient(
    credential=AzureKeyCredential(FORM_RECOGNIZER_KEY),
    endpoint=FORM_RECOGNIZER_ENDPOINT,
)


Document Processing Functions

In [3]:
def extract_text(file_content):
    """Run the ``prebuilt-read`` model on a document and return its text.

    Args:
        file_content: The document payload handed to
            ``document_analysis_client.begin_analyze_document``.

    Returns:
        str: Every recognised line of every page, in order, each followed by
        a newline. Empty string when no lines were recognised.
    """
    analysis = document_analysis_client.begin_analyze_document(
        model_id="prebuilt-read",
        document=file_content,
    ).result()  # blocks until the analysis has finished

    return "".join(
        line.content + "\n"
        for page in analysis.pages
        for line in page.lines
    )

def split_text_into_chunks(text, chunk_size=1000):
    """Split ``text`` into consecutive, non-overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        list[str]: The chunks in order. Every chunk has ``chunk_size``
        characters except possibly the last one. Empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. A negative step
            would otherwise silently yield no chunks at all and drop the
            whole document.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

def add_to_vector_store(file_name, chunks):
    """Vectorise ``chunks`` with TF-IDF and register them under ``file_name``.

    A fresh ``TfidfVectorizer`` is fitted on the chunks of this one file. The
    resulting entry in the module-level ``vector_store`` holds the chunks, the
    dense TF-IDF matrix and the fitted feature names. An existing entry with
    the same file name is replaced. A confirmation line is printed.
    """
    tfidf = TfidfVectorizer()
    dense_matrix = tfidf.fit_transform(chunks).toarray()

    entry = {
        "chunks": chunks,
        "vectors": dense_matrix,
        "feature_names": tfidf.get_feature_names_out(),
    }
    vector_store[file_name] = entry

    print(f"✓ Stored {len(chunks)} chunks for '{file_name}'")


Search and Retrieval Functions

In [4]:
def search_vector_store(query, top_k=3, min_similarity=0.1):
    """Return the stored chunks most similar to ``query`` across all documents.

    Args:
        query: The search text.
        top_k: Maximum number of hits to return (also the per-document cap
            before the global ranking). Non-positive values yield no hits.
        min_similarity: Hits whose cosine similarity is not strictly greater
            than this value are discarded.

    Returns:
        list[dict]: Up to ``top_k`` dicts with the keys ``'filename'``,
        ``'chunk'`` and ``'similarity'``, sorted by descending similarity.

    Side effects:
        The vectorizer fitted for a document is cached on that document's
        ``vector_store`` entry under ``'vectorizer'`` so later queries reuse it.
    """
    # A negative-zero slice (``[-0:]``) would select every chunk, so bail out early.
    if top_k <= 0:
        return []

    results = []

    for filename, file_data in vector_store.items():
        vectorizer = file_data.get("vectorizer")
        if vectorizer is None:
            # Fit on the stored chunks (with the stored vocabulary) rather than
            # on the query: fitting on the query alone would learn IDF weights
            # from a one-document corpus, so the query vector would not be
            # weighted like the stored chunk vectors it is compared against.
            vectorizer = TfidfVectorizer(vocabulary=list(file_data["feature_names"]))
            vectorizer.fit(file_data["chunks"])
            file_data["vectorizer"] = vectorizer

        query_vector = vectorizer.transform([query]).toarray()
        similarities = cosine_similarity(query_vector, file_data["vectors"])[0]

        # Best-first indices, capped at top_k per document.
        for idx in np.argsort(similarities)[::-1][:top_k]:
            if similarities[idx] > min_similarity:
                results.append({
                    'filename': filename,
                    'chunk': file_data["chunks"][idx],
                    'similarity': similarities[idx]
                })

    results.sort(key=lambda hit: hit['similarity'], reverse=True)
    return results[:top_k]

def format_context(relevant_chunks):
    """Render search hits as a plain-text context block for the model prompt.

    Args:
        relevant_chunks: Iterable of dicts with ``'filename'`` and ``'chunk'``
            keys.

    Returns:
        str: A header followed by one ``[Document: <name>]`` passage per hit,
        or a fixed "nothing found" sentence when there are no hits.
    """
    if relevant_chunks:
        passages = [
            f"[Document: {hit['filename']}]\n{hit['chunk']}\n\n"
            for hit in relevant_chunks
        ]
        return "Relevant passages from the documents:\n\n" + "".join(passages)

    return "No relevant context found in the documents."


AI Response Function

In [8]:
def get_ai_response(messages, model="gpt-4o", temperature=0.1):
    """Send a chat history to the model and return the reply text.

    Args:
        messages: List of chat messages (dicts with ``role`` and ``content``)
            passed straight to ``client.chat.completions.create``.
        model: Model (deployment) name to request. Defaults to ``"gpt-4o"``.
        temperature: Sampling temperature. Defaults to ``0.1``.

    Returns:
        str: The content of the first choice, or an empty string when the
        reply carries no text content. If the request fails for any reason,
        a string starting with ``"Error getting AI response: "`` is returned
        instead of raising, so the chat UI can display it.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        # ``content`` may be None; normalise so callers always get a string.
        return response.choices[0].message.content or ""
    except Exception as e:
        return f"Error getting AI response: {str(e)}"


UI Style and Components


In [11]:
# Stylesheet prepended to the chat transcript HTML by create_chat_interface().
# It defines the bubble look (.chat-message), the right-aligned user and
# left-aligned assistant variants, and the per-message wrapper
# (.message-container) with its fade-in animation.
CHAT_STYLE = """
    <style>
        .chat-message {
            margin: 12px 8px;
            padding: 10px 16px;
            border-radius: 20px;
            max-width: 70%;
            display: inline-block;
            box-shadow: 0 1px 2px rgba(0,0,0,0.1);
            word-wrap: break-word;
            line-height: 1.4;
            font-size: 15px;
        }
        .user-message {
            background-color: #0084ff;
            color: white;
            float: right;
            border-bottom-right-radius: 4px;
        }
        .assistant-message {
            background-color: #00a67d;
            color: white;
            float: left;
            border-bottom-left-radius: 4px;
        }
        .message-container {
            clear: both;
            overflow: hidden;
            margin: 4px 0;
            animation: fadeIn 0.3s ease;
        }
        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(10px); }
            to { opacity: 1; transform: translateY(0); }
        }
    </style>
"""

def create_chat_interface():
    """Build the widgets that make up the chat panel.

    Returns:
        tuple: ``(messages_area, input_box, send_button, clear_button,
        status_label)`` — the HTML transcript area (stylesheet plus an empty
        scrollable container), the question text box, the Send and Clear Chat
        buttons, and the HTML status line.
    """
    def make_button(label, width, style):
        # All buttons differ only in caption, width and colour scheme.
        return widgets.Button(
            description=label,
            layout=widgets.Layout(width=width),
            button_style=style,
        )

    scroll_container = '<div style="height: 400px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; background-color: #f8f9fa; border-radius: 8px;"></div>'
    ready_notice = '<div style="color: gray; font-style: italic;">Ready to chat about your documents...</div>'

    messages_area = widgets.HTML(value=CHAT_STYLE + scroll_container)
    input_box = widgets.Text(
        placeholder='Type your question here...',
        layout=widgets.Layout(width='80%'),
    )
    send_button = make_button('Send', '19%', 'primary')
    clear_button = make_button('Clear Chat', '100%', 'warning')
    status_label = widgets.HTML(value=ready_notice)

    return messages_area, input_box, send_button, clear_button, status_label

def create_file_upload():
    """Build the document upload control and its status line.

    Returns:
        tuple: ``(upload_widget, status_label)`` — a multi-file ``FileUpload``
        restricted to image, PDF and Word extensions, and an HTML label that
        initially reports that nothing has been uploaded.
    """
    return (
        widgets.FileUpload(
            accept='.jpg,.jpeg,.png,.pdf,.doc,.docx',
            multiple=True,
            description='Upload Documents',
        ),
        widgets.HTML(
            value='<div style="color: gray;">No documents uploaded yet.</div>',
        ),
    )

def add_message_to_display(messages_area, role, content):
    """Append one chat bubble to the transcript shown in ``messages_area``.

    Args:
        messages_area: Object with a string ``value`` attribute holding the
            transcript HTML, whose last ``</div>`` closes the scroll container.
        role: ``'user'`` selects the user bubble style; any other value
            selects the assistant style.
        content: Message text. It is HTML-escaped before insertion and its
            newlines are rendered as ``<br>``.
    """
    import html  # local import: only this helper needs it

    # Strip only the LAST closing tag (the scroll container's). Cutting at the
    # first '</div>' would drop the closing tags of earlier messages and nest
    # every new bubble inside the previous one.
    container_html = messages_area.value.rsplit('</div>', 1)[0]

    message_class = 'user-message' if role == 'user' else 'assistant-message'

    # Escape so user/model text cannot inject markup or break the transcript.
    safe_content = html.escape(str(content)).replace('\n', '<br>')

    new_message = (
        f'<div class="message-container">'
        f'<div class="chat-message {message_class}">{safe_content}</div>'
        f'</div>'
    )
    messages_area.value = container_html + new_message + '</div>'

Main Application Setup

In [12]:
def _iter_uploaded_files(upload_value):
    """Yield ``(filename, content_bytes)`` for each file in a FileUpload value.

    Two shapes are handled: a dict keyed by file name whose values carry a
    ``'content'`` entry, and a sequence of per-file dicts carrying ``'name'``
    and ``'content'``. Content is normalised to ``bytes`` so downstream code
    sees a single type.
    """
    if isinstance(upload_value, dict):
        for filename, file_info in upload_value.items():
            yield filename, bytes(file_info['content'])
    else:
        for file_info in upload_value or ():
            yield file_info['name'], bytes(file_info['content'])


def setup_document_chat():
    """Build, wire up and display the complete document chat interface.

    Creates the upload and chat widgets, attaches the upload / send / clear
    handlers, and displays the assembled layout. Uploaded files are run
    through ``extract_text`` -> ``split_text_into_chunks`` ->
    ``add_to_vector_store``. Each question is answered by ``get_ai_response``
    using the chunks found by ``search_vector_store`` as context, and the
    exchange is recorded in the module-level ``conversation_history``.
    """
    import html  # local import: used to escape text placed into status HTML

    # Create widgets
    upload_widget, upload_status = create_file_upload()
    messages_area, input_box, send_button, clear_button, chat_status = create_chat_interface()

    # Pristine transcript (stylesheet + empty container); restored by "Clear Chat".
    empty_chat_html = messages_area.value

    def handle_upload(change):
        """Process every uploaded file, whichever shape the widget value has."""
        uploaded_files = list(_iter_uploaded_files(upload_widget.value))
        if not uploaded_files:
            # Nothing to process (e.g. the value was reset); keep the current status.
            return

        failures = []
        for filename, content in uploaded_files:
            safe_name = html.escape(filename)
            upload_status.value = f'<div style="color: blue;">Processing {safe_name}...</div>'
            try:
                text = extract_text(content)
                chunks = split_text_into_chunks(text)
                add_to_vector_store(filename, chunks)
            except Exception as exc:
                # Report the failure in the UI instead of leaving the status
                # stuck on "Processing..."; keep going with the other files.
                failures.append(f'{safe_name}: {html.escape(str(exc))}')

        if failures:
            upload_status.value = (
                '<div style="color: red;">✗ Failed to process: '
                + '; '.join(failures)
                + '</div>'
            )
        else:
            upload_status.value = '<div style="color: green;">✓ All documents processed successfully!</div>'

    def on_send_button_click(b):
        """Answer the question currently in the input box."""
        query = input_box.value
        if not query.strip():
            return

        chat_status.value = '<div style="color: blue;">Processing your question...</div>'
        add_message_to_display(messages_area, 'user', query)

        try:
            relevant_chunks = search_vector_store(query)
            context = format_context(relevant_chunks)
        except Exception as exc:
            # Surface retrieval errors before touching the conversation history.
            chat_status.value = (
                f'<div style="color: red;">Search failed: {html.escape(str(exc))}</div>'
            )
            return

        if not conversation_history:
            conversation_history.append({
                "role": "system",
                "content": "You are a helpful assistant that answers questions based on the provided document context. "
                          "Keep responses concise and relevant to the documents."
            })

        conversation_history.append({
            "role": "user",
            "content": f"{context}\n\nQuestion: {query}"
        })

        response = get_ai_response(conversation_history)
        conversation_history.append({
            "role": "assistant",
            "content": response
        })
        add_message_to_display(messages_area, 'assistant', response)

        input_box.value = ''
        chat_status.value = '<div style="color: gray;">Ready for next question...</div>'

    def on_clear_button_click(b):
        """Reset the transcript and forget the conversation."""
        messages_area.value = empty_chat_html
        conversation_history.clear()
        chat_status.value = '<div style="color: gray;">Chat cleared. Ready to start new conversation...</div>'

    # Connect handlers. The send handler is a plain function: it performs no
    # awaits, and running it directly means failures are not lost inside an
    # un-awaited asyncio task.
    upload_widget.observe(handle_upload, names='value')
    send_button.on_click(on_send_button_click)
    clear_button.on_click(on_clear_button_click)
    # NOTE(review): recent ipywidgets releases appear to flag Text.on_submit as
    # deprecated; the suggested replacement (observing 'value' with
    # continuous_update=False) presumably also fires when the box loses
    # focus, so on_submit is kept here — confirm before switching.
    input_box.on_submit(lambda sender: on_send_button_click(None))

    # Create layout
    upload_section = widgets.VBox([
        widgets.HTML('<h3>1. Upload Documents</h3>'),
        upload_widget,
        upload_status
    ])

    chat_section = widgets.VBox([
        widgets.HTML('<h3>2. Chat with Documents</h3>'),
        messages_area,
        widgets.HBox([input_box, send_button]),
        clear_button,
        chat_status
    ])

    # Display interface
    display(widgets.VBox([
        widgets.HTML('<h2>Document Chat Assistant</h2>'),
        upload_section,
        widgets.HTML('<hr>'),
        chat_section
    ]))

# Run the application: build the widgets and display the chat interface.
if __name__ == "__main__":
    setup_document_chat()


  input_box.on_submit(lambda x: asyncio.create_task(on_send_button_click(None)))


VBox(children=(HTML(value='<h2>Document Chat Assistant</h2>'), VBox(children=(HTML(value='<h3>1. Upload Docume…

✓ Stored 2 chunks for 'TimeTable.pdf'
✓ Stored 4 chunks for 'Swapanth_Resumee.pdf'
