
# MATLAB Troubleshooting RAG with FAISS

This notebook demonstrates building a Retrieval-Augmented Generation (RAG)
pipeline using LangChain to answer questions about MATLAB troubleshooting.
It loads crawled pages from a CSV file, converts each row into a document,
splits the documents into chunks, creates embeddings using a HuggingFace
sentence-transformers model, stores them in a FAISS vector store, and uses a
ConversationalRetrievalChain with a DeepSeek-R1 distilled model
(DeepSeek-R1-Distill-Qwen-32B) for question answering.


In [None]:
%%capture
!pip install -U langchain langchain-community langchain-huggingface
!pip install huggingface_hub
!pip install tiktoken
!pip install sentence_transformers
!pip install faiss-cpu
!pip install chromadb # Retained just in case, but FAISS is primary
!pip install pandas

In [None]:
# NOTE(review): langchain-huggingface is already installed by the first install
# cell of this notebook, so this cell looks redundant.
!pip install langchain-huggingface


Collecting langchain-huggingface
  Using cached langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Using cached langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [None]:
import os
import sys
import pandas as pd
import ast # For safely evaluating the string representation of the metadata dictionary
import warnings
from pathlib import Path
from pprint import pprint
import getpass # To securely get API key

from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document

warnings.filterwarnings("ignore")

In [None]:
# Resolve the Hugging Face token: try the Colab secret store first, then fall
# back to an interactive prompt. The environment variable is only written once
# a non-empty token is known.
hf_token = None
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
except ImportError:
    # Not running on Colab; the interactive prompt below is used instead.
    pass
except Exception as e:
    # The secret lookup itself failed (for example the secret is missing);
    # report it and fall back to the prompt rather than aborting the cell.
    print(f"Warning: Could not read 'HF_TOKEN' from Colab secrets: {e}")

if hf_token:
    print("Hugging Face API Token loaded from Colab secrets.")
else:
    hf_token = getpass.getpass("Please enter your Hugging Face Hub API Token: ")
    if hf_token:
        print("Hugging Face API Token set.")

if hf_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
else:
    print("Warning: Hugging Face API Token not provided. LLM calls will fail.")

Hugging Face API Token loaded from Colab secrets.


In [None]:
# Embedding Model Configuration
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA instead of failing when the embedding model is loaded.
try:
    import torch
    _embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    _embedding_device = "cpu"
if _embedding_device == "cpu":
    print("Note: CUDA is not available; embeddings will be computed on the CPU (slower).")
model_kwargs = {"device": _embedding_device}

# Text Splitter Configuration (both values are measured in characters, because
# the splitter is configured with length_function=len)
chunk_size = 1200
chunk_overlap = 200

# LLM Configuration
llm_repo_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
llm_model_kwargs = {"temperature": 0.6, "max_length": 1024}

# FAISS Retriever Configuration
search_k = 4 # Number of documents to retrieve

# CSV Data File Path
csv_file_path = '/content/drive/MyDrive/DL-Hackathon/crawl_results_for_depth_2.csv'

In [None]:
def parse_metadata_string(metadata_str):
    """Turn the textual form of a dictionary into an actual dictionary.

    The text is evaluated with ast.literal_eval, so only Python literals are
    accepted. Values that are None are replaced by the placeholder "N/A".

    Args:
        metadata_str: The text to parse.

    Returns:
        dict: The parsed dictionary, or an empty dictionary when the input is
        not a non-blank string, cannot be parsed, or does not describe a dict.
    """
    # Anything that is not a non-blank string cannot hold a dict literal.
    if not (isinstance(metadata_str, str) and metadata_str.strip()):
        return {}

    try:
        parsed = ast.literal_eval(metadata_str)
    except (ValueError, SyntaxError, TypeError) as e:
        print(f"Warning: Could not parse metadata string: {metadata_str}. Error: {e}")
        return {}

    if not isinstance(parsed, dict):
        print(f"Warning: Parsed metadata is not a dictionary: {metadata_str}")
        return {}

    cleaned = {}
    for key, value in parsed.items():
        cleaned[key] = "N/A" if value is None else value
    return cleaned

def load_and_process_csv(file_path):
    """Load rows from a CSV file and build one Document per usable row.

    Args:
        file_path: Path of the CSV file. It must contain the columns
            'URL', 'Extracted_Content' and 'metadata'.

    Returns:
        list: Document objects. Each page content starts with a "URL: ..."
        line, optionally followed by a "Title: ..." line, followed by the
        extracted content. Each metadata dict holds the parsed 'metadata'
        column plus a 'source_url' entry taken from the 'URL' column.
        An empty list is returned (after printing a message) when the file
        cannot be read or a required column is missing.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: CSV file not found at {file_path}. Please upload the file.")
        return []
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return []

    # Ensure required columns exist
    required_cols = ['URL', 'Extracted_Content', 'metadata']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: CSV missing one or more required columns: {required_cols}")
        return []

    docs = []
    for index, row in df.iterrows():
        url = row['URL']
        content = row['Extracted_Content']
        metadata_str = row['metadata']

        # Convert to text before calling .strip(): pandas may parse the column
        # as numbers, and numeric values have no .strip() method.
        content_text = "" if pd.isna(content) else str(content)

        # Skip rows with missing essential content
        if not content_text.strip():
            print(f"Warning: Skipping row {index+1} due to missing content.")
            continue

        # Parse the metadata string
        parsed_metadata = parse_metadata_string(metadata_str)

        # Combine URL with parsed metadata
        doc_metadata = {"source_url": url} # Use a distinct key for the primary URL
        doc_metadata.update(parsed_metadata) # Add fields from the parsed metadata string
        # The URL column is authoritative: re-assert it so a 'source_url' key
        # inside the parsed metadata cannot replace it.
        doc_metadata["source_url"] = url

        # Create the page content, adding the title line only when a real title exists
        page_content = f"URL: {url}\n"
        if doc_metadata.get('title', 'N/A') != 'N/A':
            page_content += f"Title: {doc_metadata['title']}\n\n"
        page_content += content_text

        # Create LangChain Document
        docs.append(Document(page_content=page_content, metadata=doc_metadata))

    print(f"Successfully loaded and processed {len(docs)} documents from CSV.")
    return docs

# Load the documents
raw_documents = load_and_process_csv(csv_file_path)

# Display sample document structure (optional)
if raw_documents:
    print("\nSample Document Structure:")
    # Show the document at position 20 when it exists; clamp to the last
    # document so a smaller CSV does not raise IndexError.
    pprint(raw_documents[min(20, len(raw_documents) - 1)])

Successfully loaded and processed 1947 documents from CSV.

Sample Document Structure:
Document(metadata={'source_url': 'https://in.mathworks.com/help/slrealtime/troubleshooting-in-slrt-target.html?s_tid=CRUX_topnav', 'title': 'Troubleshooting in Simulink Real-Time - MATLAB &amp; Simulink', 'description': 'Troubleshoot problems that you encounter while using the Simulink Real-Time product', 'keywords': 'N/A', 'author': 'N/A', 'og:image:url': 'https://in.mathworks.com/template-service/help/full-header-footer-offcanvas/_jcr_content/thumbnail.adapt.1200.medium.jpg/1744894899777.jpg', 'og:image:secure_url': 'https://in.mathworks.com/template-service/help/full-header-footer-offcanvas/_jcr_content/thumbnail.adapt.1200.medium.jpg/1744894899777.jpg', 'og:url': 'https://in.mathworks.com/help/slrealtime/troubleshooting-in-slrt-target.html', 'og:description': 'Troubleshoot problems that you encounter while using the Simulink Real-Time product', 'og:title': 'Troubleshooting in Simulink Real-Time -

In [None]:
def split_documents(docs, chunk_size=1000, chunk_overlap=100):
    """Break documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        docs: The documents to split.
        chunk_size: Maximum chunk length, measured with len().
        chunk_overlap: Overlap between consecutive chunks, measured with len().

    Returns:
        list: The chunks produced by the splitter, or an empty list when
        there is nothing to split.
    """
    if docs:
        splitter = RecursiveCharacterTextSplitter(
            add_start_index=True,  # Helpful for locating chunks if needed
            length_function=len,
            chunk_overlap=chunk_overlap,
            chunk_size=chunk_size,
        )
        chunks = splitter.split_documents(docs)
        print(f"Split {len(docs)} documents into {len(chunks)} chunks.")
        return chunks

    print("No documents to split.")
    return []

# Split only when documents were loaded; otherwise continue with no chunks.
split_docs = (
    split_documents(raw_documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    if raw_documents
    else []
)

# Display sample chunk (optional)
if split_docs:
    print("\nSample Chunk Structure:")
    pprint(split_docs[0])

Split 1947 documents into 60953 chunks.

Sample Chunk Structure:
Document(metadata={'source_url': 'https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html', 'title': 'Troubleshooting Basics - MATLAB &amp; Simulink', 'description': 'Investigate issues in a Simulink Real-Time installation.', 'keywords': 'N/A', 'author': 'N/A', 'og:image:url': 'https://ch.mathworks.com/template-service/help/full-header-footer-offcanvas/_jcr_content/thumbnail.adapt.1200.medium.jpg/1744894899777.jpg', 'og:image:secure_url': 'https://ch.mathworks.com/template-service/help/full-header-footer-offcanvas/_jcr_content/thumbnail.adapt.1200.medium.jpg/1744894899777.jpg', 'og:url': 'https://in.mathworks.com/help/slrealtime/ug/troubleshooting-basics.html', 'og:description': 'Investigate issues in a Simulink Real-Time installation.', 'og:title': 'Troubleshooting Basics - MATLAB &amp; Simulink', 'og:type': 'website', 'twitter:card': 'summary_large_image', 'twitter:site': '@MathWorks', 'twitter:image': '

In [None]:
def create_faiss_vector_store(docs_to_embed):
    """Embed document chunks and index them in a FAISS vector store.

    The embedding model is configured from the module-level names
    embedding_model_name and model_kwargs, with normalized embeddings.

    Args:
        docs_to_embed: The document chunks to embed.

    Returns:
        The FAISS vector store, or None when there are no chunks or when
        building the store raised an exception (the error is printed).
    """
    if not docs_to_embed:
        print("Error: No document chunks available to create vector store.")
        return None

    try:
        print(f"Initializing embedding model: {embedding_model_name}...")
        embedder = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs=model_kwargs,
            encode_kwargs={'normalize_embeddings': True},  # Normalize for better cosine similarity
        )
        print("Embedding model initialized.")

        # This step can take time depending on the number of chunks and embedding model
        chunk_count = len(docs_to_embed)
        print(f"Creating FAISS vector store with {chunk_count} chunks...")
        store = FAISS.from_documents(docs_to_embed, embedder)
        print("FAISS vector store created successfully.")
    except Exception as err:
        print(f"Error creating FAISS vector store: {err}")
        # Point at the most likely cause when the message mentions CUDA.
        if "CUDA" in str(err):
            print("Hint: Check if CUDA is available and correctly configured, or switch model_kwargs to {'device': 'cpu'}.")
        return None
    else:
        return store

# Build the vector store when chunks exist; without chunks it stays None.
vector_store = None
if not split_docs:
    print("Skipping vector store creation as there are no document chunks.")
else:
    vector_store = create_faiss_vector_store(split_docs)

Initializing embedding model: sentence-transformers/all-mpnet-base-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model initialized.
Creating FAISS vector store with 60953 chunks...
FAISS vector store created successfully.


In [None]:
# @title 7a. Save FAISS Vector Store to Disk (Google Drive Version)

import os
import pickle # FAISS save_local uses pickle for docstore/index_to_docstore_id

def save_vector_store(vector_store_instance, folder_path, index_name="index"):
    """
    Persist a FAISS vector store to a directory through its save_local method.

    Args:
        vector_store_instance: The vector store object to save. It must be
                               truthy and provide a 'save_local' method.
        folder_path (str): Target directory; it is created when missing.
        index_name (str, optional): Base name handed to save_local and used in
                                    the reported file names. Defaults to "index".

    Returns:
        bool: True when save_local completed without raising, False otherwise.
    """
    if not vector_store_instance:
        print("Error: Cannot save. Provided vector_store_instance is None.")
        return False

    if not hasattr(vector_store_instance, 'save_local'):
        print("Error: The provided object does not seem to be a FAISS vector store with a 'save_local' method.")
        return False

    try:
        # Create the target directory first; existing directories are fine.
        os.makedirs(folder_path, exist_ok=True)
        print(f"Saving FAISS vector store to: {folder_path} with index name '{index_name}'...")

        vector_store_instance.save_local(folder_path=folder_path, index_name=index_name)

        print("FAISS vector store saved successfully.")
        faiss_file = os.path.join(folder_path, index_name + '.faiss')
        pickle_file = os.path.join(folder_path, index_name + '.pkl')
        print(f"Files created: {faiss_file}, {pickle_file}")
    except Exception as err:
        print(f"Error saving FAISS vector store to {folder_path}: {err}")
        # This message is what a lost Drive mount typically reports.
        if "Transport endpoint is not connected" in str(err):
            print("Hint: Google Drive connection might have been lost. Try re-mounting.")
        return False

    return True

# --- Example Usage (Google Drive) ---

# Base folder of the mounted Google Drive. An earlier Drive-mount cell is
# expected to define 'drive_base_path'; when it did not, use the default
# mount location /content/drive/MyDrive/.
if 'drive_base_path' not in locals():
    print("Warning: 'drive_base_path' not defined. Attempting default '/content/drive/MyDrive/'.")
    drive_base_path = "/content/drive/MyDrive/" # Default if mount cell wasn't run

# *** IMPORTANT: Adjust this path as needed for your Drive structure ***
drive_save_path = os.path.join(drive_base_path, "MATLAB_RAG_Data/faiss_index")

# Save only when an earlier cell actually produced a vector store.
if 'vector_store' not in locals() or vector_store is None:
    print("\nSkipping vector store saving because it was not created successfully.")
else:
    print(f"\nAttempting to save the created vector store to Google Drive path: {drive_save_path}")

    # Only a warning: the save is still attempted with the given path.
    if not drive_save_path.startswith("/content/drive"):
        print(f"Warning: The save path '{drive_save_path}' does not appear to be inside the mounted Google Drive '/content/drive'. Saving locally instead?")

    save_successful = save_vector_store(vector_store, drive_save_path) # Use the drive path
    if save_successful:
        print(f"Check your Google Drive folder: {drive_save_path}")
    else:
        print("Vector store saving to Google Drive failed. Check errors above.")


Attempting to save the created vector store to Google Drive path: /content/drive/MyDrive/MATLAB_RAG_Data/faiss_index
Saving FAISS vector store to: /content/drive/MyDrive/MATLAB_RAG_Data/faiss_index with index name 'index'...
FAISS vector store saved successfully.
Files created: /content/drive/MyDrive/MATLAB_RAG_Data/faiss_index/index.faiss, /content/drive/MyDrive/MATLAB_RAG_Data/faiss_index/index.pkl
Check your Google Drive folder: /content/drive/MyDrive/MATLAB_RAG_Data/faiss_index


In [None]:
def classify_query_type(query):
    """Label a query by simple substring matching on its lower-cased text.

    Keyword groups are checked in order, so a troubleshooting keyword wins
    over a definition keyword. Matching is by substring, not by whole word.

    Args:
        query (str): The user's query.

    Returns:
        str: "troubleshooting_query", "definition_query", "exit_command"
        (empty query or an exit word), or "other_query".
    """
    normalized = query.lower().strip()

    keyword_groups = (
        ("troubleshooting_query", ("how to", "steps", "fix", "error", "troubleshoot", "resolve")),
        ("definition_query", ("what is", "explain", "define")),
    )
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in normalized:
                return label

    if normalized == "" or normalized in ("exit", "quit", "q", "bye"):
        return "exit_command"
    return "other_query"

In [None]:
# @title 8a. Define Custom Prompt for Combining Documents (Ultra-Concise Output)

from langchain.prompts import PromptTemplate

# Prompt text with two placeholders, {context} and {question}.
# Its instructions aim for the absolute minimal output from the LLM: only the
# final answer, without echoing the instructions, the context, or the question.
ultra_concise_prompt_template = """Follow these instructions to generate an answer based on the provided text blocks:
1. Analyze the first text block (context) and the second text block (question) about MATLAB.
2. Prioritize the context if it's relevant and helpful for answering the question.
3. If the context is irrelevant, missing, or insufficient, answer using your general MATLAB knowledge.
4. Provide clear, step-by-step guidance for troubleshooting questions where possible.
5. If you cannot answer confidently using either context or your knowledge, state only that you don't have enough information.
6. **CRITICAL: Your entire response must consist ONLY of the final answer to the question. Do NOT include these instructions, the context text, the question text, or any introductory phrases like 'Here is the answer:'. Start your response immediately with the answer content.**

--- CONTEXT ---
{context}
--- END CONTEXT ---

--- QUESTION ---
{question}
--- END QUESTION ---

--- ANSWER ---"""

# Build a PromptTemplate from the text above; a later cell passes it to
# setup_qa_chain as the prompt for the document-combining step.
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(ultra_concise_prompt_template)

In [None]:
# @title 8. Initialize LLM and Conversational Retrieval Chain

from langchain.prompts import PromptTemplate # Ensure PromptTemplate is imported here too

def setup_qa_chain(vector_store_instance, repo_id, model_kwargs_llm, combine_prompt):
    """Create the Hugging Face Hub LLM and wrap it in a ConversationalRetrievalChain.

    The retriever is built from the vector store and returns the number of
    documents given by the module-level name search_k.

    Args:
        vector_store_instance: Vector store that provides as_retriever().
        repo_id: Hugging Face Hub repository id of the LLM.
        model_kwargs_llm: Keyword arguments forwarded to the LLM.
        combine_prompt: Prompt used by the document-combining step.

    Returns:
        The configured chain, or None when the vector store is missing, the
        HUGGINGFACEHUB_API_TOKEN environment variable is unset or empty, or
        the setup raised an exception (the error is printed).
    """
    if vector_store_instance is None:
        print("Error: Cannot setup QA chain without a valid vector store.")
        return None

    api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    if not api_token:
        print("Error: Hugging Face API Token not set. Cannot initialize LLM.")
        return None

    try:
        print(f"Initializing LLM from Hugging Face Hub: {repo_id}...")
        hub_llm = HuggingFaceHub(repo_id=repo_id, model_kwargs=model_kwargs_llm)
        print("LLM initialized.")

        print("Setting up Conversational Retrieval Chain with custom prompt...")
        chain = ConversationalRetrievalChain.from_llm(
            llm=hub_llm,
            retriever=vector_store_instance.as_retriever(search_kwargs={'k': search_k}),
            return_source_documents=True,
            # The custom prompt is handed to the combine-documents step.
            combine_docs_chain_kwargs={"prompt": combine_prompt},
            verbose=False,  # Set to True for more detailed chain output
        )
        print("Conversational Retrieval Chain setup complete.")
    except Exception as err:
        print(f"Error setting up QA chain: {err}")
        return None

    return chain

# Build the chain when a vector store is available; otherwise it stays None.
qa_chain = None
if vector_store:
    qa_chain = setup_qa_chain(vector_store, llm_repo_id, llm_model_kwargs, CUSTOM_QUESTION_PROMPT)
else:
    print("Skipping QA chain setup as the vector store is not available.")


def format_output(response):
    """Format a QA chain response into a structured dictionary.

    Args:
        response (dict): Chain result. The keys 'answer', 'question' and
            'source_documents' are read; each may be missing. Source documents
            must expose 'metadata' (a dict) and 'page_content' (a string).

    Returns:
        dict: {"query": ..., "answer": ..., "evidence_list": [...]} where each
        evidence entry has the keys "title", "url" and "source_content_preview".
        Only the first document seen for each URL is listed.
    """
    import html  # local import: only needed to decode entities in page titles

    preview_limit = 300  # maximum number of characters shown in a preview

    answer = response.get('answer')
    if answer is None:
        # Covers both a missing key and an explicit None value.
        answer = 'No answer generated by the LLM.'
    source_documents = response.get('source_documents') or []
    query = response.get('question', 'N/A')

    evidence_list = []
    seen_urls = set() # To avoid duplicate entries from chunks of the same page

    for doc in source_documents:
        url = doc.metadata.get('source_url', 'Unknown URL')
        # Only add evidence if we haven't seen this URL before in this response
        if url in seen_urls:
            continue
        seen_urls.add(url)

        title = doc.metadata.get('title', 'N/A')
        if isinstance(title, str):
            # Titles may contain HTML entities such as "&amp;"; decode them
            # so they read naturally when displayed.
            title = html.unescape(title)

        # Add the ellipsis only when the content was actually cut short.
        preview = doc.page_content[:preview_limit]
        if len(doc.page_content) > preview_limit:
            preview += "..."

        evidence_list.append({
            "title": title,
            "url": url,
            "source_content_preview": preview,
        })

    return {
        "query": query,
        "answer": str(answer).strip(),
        "evidence_list": evidence_list
    }

Initializing LLM from Hugging Face Hub: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B...
LLM initialized.
Setting up Conversational Retrieval Chain with custom prompt...
Conversational Retrieval Chain setup complete.


In [None]:
def run_chat(qa_chain_instance):
    """Run the interactive command-line chat session.

    Reads prompts from standard input until the user types an exit word
    ('exit', 'quit', 'q' or 'bye'), interrupts the session, or input ends.
    Each question is sent to the chain together with the recent chat history,
    and the formatted answer and its evidence sources are printed.

    Args:
        qa_chain_instance: The QA chain. Its invoke() method is used when it
            has one; otherwise the object itself is called. It receives a dict
            with the keys 'question' and 'chat_history'.

    Returns:
        None
    """
    if not qa_chain_instance:
        print("\nERROR: The QA chain is not initialized. Cannot start chat.")
        print("Please check previous steps for errors (data loading, vector store, LLM setup).")
        return

    print("\n--- MATLAB Troubleshooting Assistant ---")
    print("Enter your MATLAB-related question or type 'exit'/'quit'/'q' to end.")

    exit_commands = ("exit", "quit", "q", "bye")
    max_history_len = 5  # Limit chat history size to avoid excessive token usage
    chat_history = []

    # LangChain deprecates calling a chain object directly in favour of
    # invoke(); plain callables are still supported as a fallback.
    ask = getattr(qa_chain_instance, "invoke", qa_chain_instance)

    while True:
        try:
            # Strip once so whitespace-only input is treated as empty.
            query = input('\nPrompt: ').strip()

            if query.lower() in exit_commands:
                print('\nExiting chatbot. Goodbye!')
                break

            if not query:
                print("Please enter a question.")
                continue

            # Call the QA chain
            print("Thinking...")
            result = ask({'question': query, 'chat_history': chat_history})

            # Format and display the output
            structured_output = format_output(result)

            print('\nAnswer:')
            print(structured_output['answer'])

            if structured_output['evidence_list']:
                print('\nEvidence Sources:')
                for i, evidence in enumerate(structured_output['evidence_list'], 1):
                    print(f"  {i}. Title: {evidence['title']}")
                    print(f"     URL: {evidence['url']}")
                    print() # Newline for better separation
            else:
                print("\nNo specific evidence documents were retrieved or cited for this answer.")

            # Update chat history, keeping only the most recent exchanges
            chat_history.append((query, structured_output['answer']))
            chat_history = chat_history[-max_history_len:]

        except (KeyboardInterrupt, EOFError):
            # EOFError means input has ended; without this branch the generic
            # handler below would catch it on every iteration and loop forever.
            print('\nExiting chatbot due to interrupt. Goodbye!')
            break
        except Exception as e:
            print(f"\nAn error occurred: {e}")
            print("Please try again or type 'exit' to quit.")
            # Start over with an empty history after a failed exchange.
            chat_history = []

# Start the interactive session, or explain why it cannot start.
if not qa_chain:
    print("\nChat interface cannot start because the QA chain failed to initialize.")
else:
    run_chat(qa_chain)