In [12]:
# First, clear pip cache and install with --upgrade flag
!pip cache purge
!pip install --upgrade langchain langchain-community sentence-transformers chromadb pdfminer.six faiss-cpu transformers torch

# Now for the imports
import os
import sys
import logging
from pathlib import Path

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

# Transformers for running local models
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    pipeline
)

# Configure logging: INFO level, each record prefixed with a timestamp and
# its severity. The module-level logger is used by the chat helper below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Report the interpreter in use and confirm the imports above succeeded
print(f"Python version: {sys.version}")
print("All libraries imported successfully!")

# Step 3: Check for GPU Availability

# Pick CUDA when torch can see a GPU, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tell the user which device was selected
if device.type == "cuda":
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available, using CPU. This may be slower but will still work.")

# Step 4: Set Up Data Directory and Load PDF Files

# Lecture-note PDFs are read from this folder; create it on first run
DATA_DIR = Path("./data")
DATA_DIR.mkdir(exist_ok=True)

# Recursively collect every PDF under the data directory and list them
pdf_files = list(DATA_DIR.rglob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files in the data directory:")
for pdf_path in pdf_files:
    print(f" - {pdf_path}")

# Nothing to work with yet: explain how to proceed
if not pdf_files:
    print("\nNo PDF files found! Please add your lecture notes to the data directory.")
    print("You can add them now and rerun this cell.")

# Step 5: Load PDF Lecture Notes

# Loader that matches every PDF under DATA_DIR and hands each to PyPDFLoader
pdf_loader = DirectoryLoader(DATA_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader)

# Load everything; on failure report the error and carry on with an empty list
try:
    documents = pdf_loader.load()
except Exception as e:
    print(f"Error loading PDF documents: {e}")
    documents = []
else:
    print(f"Successfully loaded {len(documents)} pages from PDF files")

# Show the start of the first document (at most 300 characters) as a sanity check
if documents:
    print("\nSample document content:")
    print(f"Document 1 - Page {documents[0].metadata.get('page', 'unknown')}:")
    if len(documents[0].page_content) > 300:
        print(documents[0].page_content[:300] + "...")
    else:
        print(documents[0].page_content)

# Step 6: Process Documents for RAG - Split into Chunks

# Splitter producing chunks of up to 1000 characters (measured with len),
# with 200 characters shared between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=200,
    chunk_size=1000,  # Adjust based on your LLM's context window
)

# Break the loaded documents into chunks
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks")

# Show the start of the first chunk (at most 300 characters) as a sanity check
if chunks:
    print("\nSample chunk:")
    print(f"Chunk 1 - From page {chunks[0].metadata.get('page', 'unknown')}:")
    if len(chunks[0].page_content) > 300:
        print(chunks[0].page_content[:300] + "...")
    else:
        print(chunks[0].page_content)

# Step 7: Create Embeddings and Vector Store

# An empty chunk list cannot be indexed: FAISS would fail further down with an
# opaque error. Stop here with an actionable message, before the embedding
# model is loaded.
if not chunks:
    raise ValueError(
        "No document chunks to index. Add PDF files to the data directory "
        "and rerun the loading and splitting steps."
    )

# Initialize embedding model - this is free and runs locally.
# It is placed on the same device selected in the GPU check.
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': str(device)}
)
print("Embedding model loaded successfully!")

# Embed every chunk and build the FAISS index
print("Creating vector store...")
vectorstore = FAISS.from_documents(chunks, embeddings)
print(f"Vector store created successfully with {len(chunks)} chunks!")

# Save the vector store to disk for future use
VECTORSTORE_PATH = "./vectorstore"
vectorstore.save_local(VECTORSTORE_PATH)
print(f"Vector store saved to {VECTORSTORE_PATH}")

# Step 8: Set Up Retriever

# Create retriever from vector store
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}  # Return top 5 most relevant chunks
)

# Smoke-test the retriever with a sample query (only when there is data)
if chunks:
    test_query = "What are the main topics of CTSE?"
    # invoke() is the current retriever entry point; get_relevant_documents()
    # is deprecated in its favour.
    retrieved_docs = retriever.invoke(test_query)
    print(f"Retrieved {len(retrieved_docs)} relevant chunks for query: '{test_query}'")
    if retrieved_docs:
        # Show the start of the best match (at most 300 characters)
        print("\nSample retrieved chunk:")
        print(f"From page {retrieved_docs[0].metadata.get('page', 'unknown')}:")
        print(retrieved_docs[0].page_content[:300] + "..." if len(retrieved_docs[0].page_content) > 300 else retrieved_docs[0].page_content)

# Step 9: Load Language Model (LLM)

# Use a small, free model from Hugging Face
# For this example, we'll use Phi-2, which is small enough to run on most computers
# If you have GPU limitations, you could try smaller models like TinyLlama

print("Loading language model (this may take a few minutes depending on your internet speed and computer)...")

model_name = "microsoft/phi-2"  # A good small model that works well

# Load tokenizer and model.
# Half precision on GPU, full precision on CPU. device_map="auto" delegates
# weight placement to accelerate.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True
)

# Create a text generation pipeline.
# No `device` argument here: the model was already placed by device_map="auto",
# and passing an explicit device as well conflicts with that placement
# (transformers rejects or warns about the combination, depending on version).
# NOTE(review): temperature/top_p are sampling parameters; confirm whether
# do_sample=True is wanted so they take effect.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2048,
    temperature=0.1,  # Lower temperature for more focused answers
    top_p=0.95,
    repetition_penalty=1.15
)

# Create a LangChain wrapper around the pipeline
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

print("Language model loaded successfully!")
print(f"Using model: {model_name}")

# Step 10: Create Custom Prompt Template for QA

# Prompt text: instructions, then the retrieved context, then the question.
# {context} and {question} are the two placeholders filled in per query.
template = """
You are an intelligent chatbot specializing in Current Trends in Software Engineering (CTSE).
Use the following pieces of context from the lecture notes to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Keep your answer focused and relevant to the question.

Context:
{context}

Question: {question}

Answer:
"""

# Wrap the text in a PromptTemplate declaring both placeholders
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Step 11: Create the QA Chain

# Assemble the retrieval QA chain from the retriever, the LLM and the prompt
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # 'stuff' simply stuffs all retrieved documents into the prompt
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,  # Return source documents for reference
)

print("QA Chain created successfully!")

# Step 12: Create Chat Interface Function

def ask_question(question):
    """
    Ask a question to the CTSE chatbot

    Args:
        question (str): The question to ask. None, empty, and
            whitespace-only values are answered with a prompt to ask
            something, without calling the chain.

    Returns:
        dict: Always contains an "answer" key. On success it is the chain
            output (including "result" and "source_documents") with
            "answer" set to the generated text followed by a de-duplicated
            "Sources: ..." line when source documents were returned. On
            failure it holds only "answer" with an apology and the error.
    """
    # Guard first so blank input never reaches the (slow) LLM chain
    if not question or not question.strip():
        return {"answer": "Please ask a question about CTSE."}

    try:
        # Copy so the dict handed back by the chain is not mutated
        result = dict(qa_chain.invoke({"query": question}))

        # RetrievalQA stores the generated text under "result"; callers of
        # this helper read "answer", so expose it under that key.
        answer = result.get("result", "")

        # Build "Page N of file" labels, dropping duplicates but keeping order
        sources = list(dict.fromkeys(
            f"Page {doc.metadata.get('page', 'unknown')} of {doc.metadata.get('source', 'unknown file')}"
            for doc in result.get("source_documents", [])
        ))

        # Add source information to the answer
        if sources:
            answer += "\n\nSources: " + ", ".join(sources)

        result["answer"] = answer
        return result
    except Exception as e:
        # Top-level boundary for the chat UI: log with traceback and return a
        # readable message instead of propagating.
        logger.exception("Error processing question: %s", e)
        return {"answer": f"Sorry, I encountered an error while processing your question: {str(e)}"}

# Step 13: Test the Chatbot

# A few sample questions to exercise the whole pipeline end to end
test_questions = [
    "What are the key aspects of agile development?",
    "What is RAG in the context of LLMs?",
    "How can I implement CI/CD in a software project?",
]

# Ask each one in turn and print the question followed by the chatbot's answer
for question in test_questions:
    print(f"\n\nQuestion: {question}")
    result = ask_question(question)
    print(f"\nAnswer: {result['answer']}")

# Step 14: Interactive Chat Interface

from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# Build the UI pieces: a text box for the question, an "Ask" button,
# and an output area where the answer is rendered
question_input = widgets.Text(
    description='Question:',
    placeholder='Type your question about CTSE here...',
    value='',
    layout=widgets.Layout(width='80%'),
)

submit_button = widgets.Button(
    description='Ask',
    tooltip='Ask your question',
    button_style='primary',
)

output_area = widgets.Output()

# Click handler for the "Ask" button
def on_submit_button_clicked(b):
    """Answer the question currently in the text box inside the output area.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        # Replace whatever the previous click printed
        clear_output()
        question = question_input.value
        print(f"Question: {question}")
        # Blank input: ask for a question and skip the chatbot call
        if not question.strip():
            print("Please enter a question.")
            return
        result = ask_question(question)
        print(f"\nAnswer: {result['answer']}")

# Register the click handler on the "Ask" button
submit_button.on_click(on_submit_button_clicked)

# Print the heading, then render the input row and the answer area beneath it
print("\n\nCTSE Lecture Notes Chatbot")
print("Ask any question about Current Trends in Software Engineering")
display(widgets.HBox(children=[question_input, submit_button]))
display(output_area)

# You're done! You now have a fully functional CTSE lecture notes chatbot using free LLMs.

Files removed: 0 (0 bytes)






ModuleNotFoundError: No module named 'langchain'