In [None]:
from IPython.display import IFrame

# Embed the YouTube player for video 38aMTXY2usU in the cell output (560x315 px frame).
IFrame('https://www.youtube.com/embed/38aMTXY2usU', width=560, height=315)



!pip install "jedi>=0.16" "cryptography>=41.0.5,<44"

!pip install langchain langchain-core arxiv langchain_community docx2txt pypdf langchain_chroma sentence_transformers langchain-together unstructured together

import getpass
import os

# Reuse a non-empty TOGETHER_API_KEY from the environment; otherwise prompt for
# it without echoing the value to the notebook output.
os.environ["TOGETHER_API_KEY"] = (
    os.environ.get("TOGETHER_API_KEY")
    or getpass.getpass("Enter API key for Together AI: ")
)

from langchain.chat_models import init_chat_model

# Chat model served through the Together provider.
model = init_chat_model(
    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    model_provider="together",
)

In [2]:
# LangSmith tracing configuration.
# SECURITY: the API key must never be committed in the notebook. It is taken from
# the environment when present and otherwise requested interactively. Any key
# that was previously stored in this cell should be treated as leaked and revoked.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter API key for LangSmith: ")
os.environ["LANGCHAIN_PROJECT"] = "langchain-course"

In [None]:
!pip install PyPDF2

In [14]:
import getpass
import os
import re
import sqlite3
import time
import uuid
from datetime import datetime
from typing import List, Dict

import arxiv
from google.colab import files  # For Colab file upload
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_together import ChatTogether, TogetherEmbeddings

# Environment setup
# SECURITY: credentials are never hardcoded here. Each key is read from the
# environment and only requested interactively when missing. Keys that were
# previously stored in this cell should be treated as leaked and revoked.
#
# The Together integration used in this notebook is configured through
# TOGETHER_API_KEY (the same variable the first cell prompts for); a value found
# under the older TOGETHERAI_API_KEY name is accepted as a fallback.
if not os.environ.get("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = (
        os.environ.get("TOGETHERAI_API_KEY")
        or getpass.getpass("Enter API key for Together AI: ")
    )
# Keep the previously exported variable name populated for anything that reads it.
os.environ.setdefault("TOGETHERAI_API_KEY", os.environ["TOGETHER_API_KEY"])

os.environ["LANGCHAIN_TRACING_V2"] = "true"
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter API key for LangSmith: ")
os.environ["LANGCHAIN_PROJECT"] = "langchain-course"

# Database setup
DB_NAME = "rag_app.db"

def get_db_connection():
    """Open a new SQLite connection to the file named by DB_NAME.

    Rows are produced as sqlite3.Row objects so callers can read columns by
    name. The caller owns the returned connection and must close it.
    """
    connection = sqlite3.connect(DB_NAME)
    # sqlite3.Row enables row['column'] access in the query helpers.
    connection.row_factory = sqlite3.Row
    return connection

def create_application_logs():
    """Create the application_logs table if it does not exist yet.

    The table stores one row per exchange: session id, user query, model
    response, model name and an insertion timestamp. Calling this repeatedly
    is safe because of IF NOT EXISTS.

    The connection is always closed, even when the statement fails, so a
    schema error does not leak an open handle to the database file.
    """
    conn = get_db_connection()
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
                    (id INTEGER PRIMARY KEY AUTOINCREMENT,
                     session_id TEXT,
                     user_query TEXT,
                     gpt_response TEXT,
                     model TEXT,
                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
        conn.commit()
    finally:
        conn.close()

def insert_application_logs(session_id, user_query, gpt_response, model):
    """Append one query/response exchange to the application_logs table.

    Args:
        session_id: Identifier of the conversation the exchange belongs to.
        user_query: Text the user entered.
        gpt_response: Text returned to the user.
        model: Name of the model that produced the response.

    The insert uses bound parameters, and the connection is closed even if the
    insert or commit fails so a database error does not leak the handle.
    """
    conn = get_db_connection()
    try:
        conn.execute('INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
                     (session_id, user_query, gpt_response, model))
        conn.commit()
    finally:
        conn.close()

def get_chat_history(session_id):
    """Return the stored conversation for a session as role/content dicts.

    Args:
        session_id: Identifier of the conversation to load.

    Returns:
        A list alternating {"role": "human", ...} and {"role": "ai", ...}
        entries, oldest exchange first. Empty when the session has no rows.

    created_at only has one-second resolution (CURRENT_TIMESTAMP), so the
    autoincrement id is used as a tie-breaker to keep exchanges logged within
    the same second in insertion order. The connection is closed even when the
    query fails.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at, id',
            (session_id,),
        )
        messages = []
        for row in cursor.fetchall():
            messages.append({"role": "human", "content": row['user_query']})
            messages.append({"role": "ai", "content": row['gpt_response']})
        return messages
    finally:
        conn.close()

# Initialize model and embeddings
model = ChatTogether(
    model="meta-llama/Llama-3-70b-chat-hf",
)
embedding_function = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-8k-retrieval",
)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Global variables
# Nothing is loaded at start-up: no active paper, no vector store, no RAG chain.
current_paper = vectorstore = rag_chain = None
# Results of the most recent arXiv search (full result objects, with metadata).
last_arxiv_papers = list()
# Timestamp of the most recent user input.
last_input_time = time.time()
# True while the assistant is waiting for the user to trigger a PDF upload.
pdf_upload_mode = False

def process_uploaded_pdf(file_path: str) -> str:
    """Load an uploaded PDF, index it in Chroma and make it the active paper.

    Args:
        file_path: Path of the uploaded file; must exist and end in ".pdf".

    Returns:
        A success message with page/chunk counts, or a message starting with
        "Error:" when another paper is active, the path is invalid, or loading
        or indexing fails (exceptions are reported in the message, not raised).

    Side effects on success: replaces the global vectorstore, sets
    current_paper (marked as processed), rebuilds the RAG chain and leaves
    PDF upload mode.
    """
    global current_paper, vectorstore, rag_chain, pdf_upload_mode

    if current_paper:
        return "Error: Only one PDF can be processed at a time. Clear the current PDF with 'clear pdf'."

    if not os.path.exists(file_path) or not file_path.lower().endswith('.pdf'):
        return "Error: Invalid or missing PDF file. Please upload a valid PDF."

    try:
        documents = PyPDFLoader(file_path).load()
        if not documents:
            return "Error: Failed to load the PDF. Please upload a valid PDF."

        splits = text_splitter.split_documents(documents)
        if not splits:
            # e.g. a scanned PDF without a text layer: nothing to embed.
            return "Error: No extractable text found in the PDF. Please upload a valid PDF."

        # The collection is persisted on disk under a fixed name, so chunks from
        # previously indexed papers (including earlier notebook sessions) would
        # otherwise be mixed into this paper's retrieval results. Drop it first.
        Chroma(
            collection_name="research_paper",
            embedding_function=embedding_function,
            persist_directory="./chroma_db"
        ).delete_collection()

        vectorstore = Chroma.from_documents(
            collection_name="research_paper",
            documents=splits,
            embedding=embedding_function,
            persist_directory="./chroma_db"
        )
        current_paper = {"source": file_path, "processed": True}  # Mark as processed immediately
        # Discard any chain bound to the previous collection before rebuilding.
        rag_chain = None
        setup_rag_chain()
        pdf_upload_mode = False  # Reset mode after successful upload
        return f"PDF uploaded successfully! Loaded {len(documents)} page(s) and split into {len(splits)} chunks. What would you like to know about it?"
    except Exception as e:
        return f"Error: Failed to process PDF - {str(e)}. Please upload a valid PDF."

def fetch_arxiv_papers(query: str, max_results: int = 3) -> str:
    """Search arXiv for the newest submissions matching a query.

    The raw result objects are remembered in last_arxiv_papers so one of them
    can be chosen afterwards. Returns a formatted listing, a "no results"
    notice, or an error message; the search is refused while a processed
    paper is active.
    """
    global current_paper, last_arxiv_papers

    paper_already_loaded = bool(current_paper) and "processed" in current_paper
    if paper_already_loaded:
        return "Error: Clear the current paper or PDF before fetching new papers from arXiv."

    try:
        arxiv_client = arxiv.Client()
        newest_first = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        found = [result for result in arxiv_client.results(newest_first)]
        last_arxiv_papers = found  # Keep the full result objects for a later selection

        if len(found) == 0:
            return "No arXiv results found for your query."

        entries = []
        for position, paper in enumerate(found, start=1):
            author_names = ", ".join(person.name for person in paper.authors)
            link = paper.entry_id
            entries.append(f"""
            {position}. **{paper.title}**
               - **Abstract**: {paper.summary[:200]}...
               - **DOI**: {paper.doi or 'Not found'}
               - **Published**: {paper.published.year}
               - **Authors**: {author_names}
               - **arXiv Link**: {link}
               - Select this paper by saying 'select paper {position}'
            """)
        return "Here are some recent papers from arXiv:\n" + "".join(entries)
    except Exception as err:
        return f"Error fetching arXiv papers: {str(err)}"

def select_arxiv_paper(paper_number: int) -> str:
    """Make one of the last arXiv search results the current paper.

    paper_number is 1-based and refers to last_arxiv_papers. Only metadata is
    recorded here; the PDF itself is not downloaded or indexed. Returns a
    summary of the chosen paper, or an error message for an invalid number or
    any unexpected failure.
    """
    global current_paper, last_arxiv_papers

    try:
        nothing_listed = not last_arxiv_papers
        if nothing_listed or paper_number < 1 or paper_number > len(last_arxiv_papers):
            return "Error: Invalid paper selection."

        chosen = last_arxiv_papers[paper_number - 1]
        current_paper = dict(source=chosen.entry_id, title=chosen.title, metadata=chosen)

        author_line = ", ".join(person.name for person in chosen.authors)
        link = chosen.entry_id

        return f"""
        **Selected Paper #{paper_number}: {chosen.title}**
        - **Abstract**: {chosen.summary[:200]}...
        - **DOI**: {chosen.doi or 'Not found'}
        - **Publish Year**: {chosen.published.year}
        - **Authors**: {author_line}
        - **arXiv Link**: {link}

        This paper has been selected! What would you like to know about it? (Note: Content will be processed on your first question.)
        """
    except Exception as err:
        return f"Error selecting paper: {str(err)}"

def process_selected_paper(paper) -> bool:
    """Download the selected arXiv paper's PDF and index it in the vector store.

    Args:
        paper: The arXiv result object stored by select_arxiv_paper; its
            download_pdf() method is used to fetch the PDF.

    Returns:
        True when the paper was indexed and the RAG chain rebuilt, False on
        any failure (the reason is printed).

    Side effects on success: replaces the global vectorstore, marks
    current_paper as processed and rebuilds the RAG chain.
    """
    global vectorstore, current_paper, rag_chain
    try:
        # Load from the path the download actually wrote instead of assuming
        # the working directory is /content.
        file_path = paper.download_pdf(filename="temp_paper.pdf")
        documents = PyPDFLoader(file_path).load()
        if not documents:
            return False

        splits = text_splitter.split_documents(documents)
        if not splits:
            print("Error processing paper: no extractable text found in the PDF")
            return False

        # The collection is persisted on disk under a fixed name, so chunks from
        # previously indexed papers would otherwise be mixed into this paper's
        # retrieval results. Drop it before indexing the new paper.
        Chroma(
            collection_name="research_paper",
            embedding_function=embedding_function,
            persist_directory="./chroma_db"
        ).delete_collection()

        vectorstore = Chroma.from_documents(
            collection_name="research_paper",
            documents=splits,
            embedding=embedding_function,
            persist_directory="./chroma_db"
        )
        current_paper["processed"] = True
        # Discard any chain bound to the previous collection so that switching
        # papers never answers from the old retriever.
        rag_chain = None
        setup_rag_chain()
        return True
    except Exception as e:
        print(f"Error processing paper: {str(e)}")
        return False

def setup_rag_chain():
    """(Re)build the history-aware RAG chain on top of the current vector store.

    Does nothing when no vector store exists. Otherwise the chain is rebuilt on
    every call: callers invoke this right after creating a new vector store,
    and keeping an existing chain would leave it bound to the previous store's
    retriever.

    The chain first rewrites the latest question into a standalone one using
    the chat history, retrieves the 2 most similar chunks, and answers with
    those chunks supplied as context.
    """
    global vectorstore, rag_chain
    if vectorstore is None:
        return

    retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
    contextualize_q_prompt = ChatPromptTemplate.from_messages([
        ("system", "Given a chat history and the latest user question, formulate a standalone question."),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ])
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful AI assistant. Use the following context to answer the user's question."),
        ("system", "Context: {context}"),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ])
    history_aware_retriever = create_history_aware_retriever(model, retriever, contextualize_q_prompt)
    question_answer_chain = create_stuff_documents_chain(model, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# Whole-word patterns: substring checks would treat "trends", "recommender" or
# "novel" as exit / "no" keywords.
_EXIT_WORDS = re.compile(r"\b(?:exit|bye|quit|end)\b")
_NO_WORD = re.compile(r"\bno\b")


def _has_exit_intent(user_input: str, chat_history: List) -> bool:
    """Return True when the input names an exit keyword, or says "no" right after an exit-related message."""
    text = user_input.lower()
    if _EXIT_WORDS.search(text):
        return True
    # With no history there is no previous message for a "no" to refer to.
    if not chat_history or not _NO_WORD.search(text):
        return False
    return bool(_EXIT_WORDS.search(chat_history[-1]["content"].lower()))


def handle_input(user_input: str, session_id: str, chat_history: List) -> str:
    """Route one line of user input to the matching action and return the reply.

    Args:
        user_input: Raw text typed by the user (may be empty).
        session_id: Conversation id used for logging and history lookup.
        chat_history: Role/content dicts for this session. It is refreshed
            IN PLACE after every logged exchange so the caller's list stays
            in sync with the database.

    Returns:
        The text to show to the user; "Goodbye!" when exit was confirmed.

    Dispatch order: exit intent (asks for confirmation), PDF upload mode
    commands, empty input, 'switch to text', 'select paper N', 'clear pdf',
    question about the active paper (indexing a selected arXiv paper on first
    use), and finally an arXiv search with the input as the query.
    """
    global current_paper, vectorstore, rag_chain, last_arxiv_papers, last_input_time, pdf_upload_mode
    last_input_time = time.time()  # Update the last input time

    model_name = "meta-llama/Llama-3-70b-chat-hf"
    user_input = user_input or ""
    lowered = user_input.lower()
    command = lowered.strip()

    def record_exchange(query: str, reply: str) -> None:
        """Log the exchange and refresh the caller's history list in place."""
        insert_application_logs(session_id, query, reply, model_name)
        chat_history[:] = get_chat_history(session_id)

    # Detect exit intent
    if _has_exit_intent(user_input, chat_history):
        confirm = input("Are you sure you want to exit? (yes/no): ").strip().lower()
        if confirm == "yes":
            return "Goodbye!"
        if confirm == "no":
            return "Exit cancelled. How can I assist you further?"
        return "Please enter 'yes' or 'no' to confirm."

    # Handle PDF upload mode
    if command == "switch to pdf":
        pdf_upload_mode = True
        return "PDF upload mode activated. Enter 'upload pdf' to open the file dialog, or press Enter to upload."

    if pdf_upload_mode and command == "upload pdf":
        uploaded = files.upload()
        if not uploaded:
            # Upload mode stays active so the user can simply try again.
            return "No file uploaded. Please try again or enter 'switch to pdf' to restart."

        pdf_upload_mode = False  # Reset mode once a file was received
        # files.upload() stores the file under its name in the working
        # directory, so resolve it from there instead of assuming /content.
        file_path = os.path.abspath(next(iter(uploaded)))
        if not file_path.lower().endswith('.pdf'):
            return "Error: Only PDF files are supported. Please upload a valid PDF."

        response = process_uploaded_pdf(file_path)
        if "Error" not in response:
            record_exchange("uploaded PDF", response)
        return response

    # Check if input is empty or handle other cases
    if not command:
        if pdf_upload_mode:
            return "Please enter 'upload pdf' to open the file dialog."
        if current_paper and "processed" in current_paper:
            return "PDF is already uploaded. Enter a query (e.g., 'summary') or use 'clear pdf' to start over."
        return "Please enter text, use 'switch to pdf' to upload a PDF, or use 'clear pdf' to reset."

    if command == "switch to text":
        pdf_upload_mode = False
        return "Text input mode activated. Please enter your text or command."

    # Handle commands and queries
    select_prefix = "select paper "
    if lowered.lstrip().startswith(select_prefix):
        # Parse from the lower-cased text so "Select Paper 2" works as well.
        number_text = lowered.lstrip()[len(select_prefix):].strip()
        try:
            paper_num = int(number_text)
        except ValueError:
            return "Error: Please provide a valid paper number after 'select paper'."
        response = select_arxiv_paper(paper_num)
        if "Error" not in response:
            record_exchange(user_input, response)
        return response

    if command == "clear pdf":
        current_paper = None
        vectorstore = None
        rag_chain = None
        last_arxiv_papers.clear()
        pdf_upload_mode = False
        response = "Current paper or PDF cleared. You can upload a new one or enter a topic."
        record_exchange(user_input, response)
        return response

    # Answer from the active paper; a selected arXiv paper is indexed on first use
    if current_paper and ("processed" in current_paper or "metadata" in current_paper):
        if "processed" not in current_paper and not process_selected_paper(current_paper["metadata"]):
            return "Error: Failed to process the selected paper. Please try again or select another paper."
        response = rag_chain.invoke({"input": user_input, "chat_history": chat_history})["answer"]
        record_exchange(user_input, response)
        return response

    # Default to arXiv search if no PDF or processed paper
    return fetch_arxiv_papers(user_input)

# Example usage in Colab
create_application_logs()
session_id = str(uuid.uuid4())
chat_history = get_chat_history(session_id)
last_input_time = time.time()  # Initialize last input time

INACTIVITY_TIMEOUT_SECONDS = 600  # 600 seconds = 10 minutes

# Interactive loop with 10-minute timeout and exit intent detection
print("Start by entering text or uploading a PDF. Use 'switch to pdf' to upload a PDF.")
while True:
    prompt_shown_at = time.time()
    try:
        user_input = input("Human: ")
    except (EOFError, KeyboardInterrupt):
        # A closed or interrupted input stream can never recover; retrying
        # would loop forever printing the same error.
        print("\nInput stream closed. Exiting...")
        break

    # input() blocks, so inactivity is the time spent waiting at the prompt.
    # Comparing against a timestamp taken inside handle_input would only
    # measure how long the reply took to compute.
    if time.time() - prompt_shown_at > INACTIVITY_TIMEOUT_SECONDS:
        print("No input received for 10 minutes. Exiting...")
        break

    try:
        response = handle_input(user_input, session_id, chat_history)
        # Reload the stored conversation so the next turn sees this exchange.
        chat_history = get_chat_history(session_id)
    except Exception as e:
        print(f"Error: {str(e)}\n")
        continue

    if response == "Goodbye!":
        print("Goodbye!")
        break
    print(f"AI: {response}\n")

Start by entering text or uploading a PDF. Use 'switch to pdf' to upload a PDF.
Human: switch to pdf
AI: PDF upload mode activated. Enter 'upload pdf' to open the file dialog, or press Enter to upload.

Human: upload pdf


Saving RBI Grade B 2024 Phase 2 - Memory Based Paper.pdf to RBI Grade B 2024 Phase 2 - Memory Based Paper.pdf
AI: PDF uploaded successfully! Loaded 75 page(s) and split into 209 chunks. What would you like to know about it?

Human: summary in 3 line.
AI: Here is a summary in 3 lines:

In aiohttp, you can set a timeout for the entire session or for individual requests using the `ClientTimeout` object. If the request is not successful within the specified time, an `asyncio.TimeoutError` is raised. The `asyncio.gather` method can be used to cleanly handle multiple awaitables with timeouts.

Human: want to exit
Are you sure you want to exit? (yes/no): yes
Goodbye!
