In [7]:
import pandas as pd

# Function to load and inspect the CSV
def load_csv(file_path):
    """
    Load a CSV file and return a DataFrame.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded data as a pandas DataFrame.
    """
    try:
        data = pd.read_csv(file_path)
        print("CSV loaded successfully!")
        return data
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None

# Path to the CSV file
file_path = 'sample_integration_data.csv'

# Load the CSV
integration_data = load_csv(file_path)

# Display the first few rows for inspection
if integration_data is not None:
    print("Sample Data:")
    display(integration_data.head())

CSV loaded successfully!
Sample Data:


Unnamed: 0,Consumer,Producer,Integration Type,Context-Domain
0,Payroll System,Research Database,REST-JSON-RPC,Research
1,Finance System,Analytics Dashboard,SFTP,Alumni Relations
2,Finance System,HR System,ETL,Admissions
3,IT Support System,Analytics Dashboard,SFTP,Student Services
4,Finance System,Inventory System,ETL,Finance


In [None]:
# !pip install -v qdrant_client

In [17]:
from sentence_transformers import SentenceTransformer
import qdrant_client
from qdrant_client.http.models import PointStruct

# Initialize vectorizer (e.g., SentenceTransformer)
vectorizer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to prepare and store data in the vector database
def setup_vector_db(graph):
    """
    Create and populate an in-memory vector database with graph data.

    Args:
        graph (dict): Graph representation with `nodes` and `edges`.

    Returns:
        QdrantClient: In-memory Qdrant client with indexed data.
    """
    try:
        client = qdrant_client.QdrantClient(":memory:")  # In-memory instance
        client.recreate_collection(
            collection_name="systems",
            vectors_config=qdrant_client.http.models.VectorParams(
                size=384, distance="Cosine"  # Embedding size from the model
            )
        )

        # Add graph data to the collection
        points = []
        for edge in graph["edges"]:
            text = f"{edge['source']} interacts with {edge['target']} via {edge['integration']}"
            embedding = vectorizer.encode(text).tolist()
            points.append(
                PointStruct(
                    id=len(points), vector=embedding,
                    payload={"source": edge["source"], "target": edge["target"],
                             "integration": edge["integration"], "context": edge["context"]}
                )
            )

        client.upsert(collection_name="systems", points=points)
        print(f"Vector database populated with {len(points)} points!")
        return client
    except Exception as e:
        print(f"Error setting up vector database: {e}")
        return None

# Set up the vector database
vector_db_client = setup_vector_db(graph)

# Confirm successful setup
if vector_db_client:
    print("Vector database setup successful!")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  client.recreate_collection(


Vector database populated with 100 points!
Vector database setup successful!


In [19]:
def retrieve_context_from_vector_db(client, query, top_k=5):
    """
    Retrieve relevant context for the query from the vector database.

    Args:
        client (QdrantClient): Qdrant client instance.
        query (str): The natural language query.
        top_k (int): Number of top results to retrieve.

    Returns:
        list: Retrieved context from the vector database.
    """
    try:
        query_embedding = vectorizer.encode(query).tolist()
        results = client.search(
            collection_name="systems",
            query_vector=query_embedding,
            limit=top_k
        )
        context = [
            result.payload for result in results
        ]
        print(f"Retrieved {len(context)} context items for query: '{query}'")
        return context
    except Exception as e:
        print(f"Error querying vector database: {e}")
        return []

# Example query
example_query = "What are the systems that interact with Finance System?"
retrieved_context = retrieve_context_from_vector_db(vector_db_client, example_query)

# Confirm successful retrieval
if retrieved_context:
    print("Context retrieval successful!")
    print("Retrieved Context:", retrieved_context)
else:
    print("No context retrieved.")

Retrieved 5 context items for query: 'What are the systems that interact with Finance System?'
Context retrieval successful!
Retrieved Context: [{'source': 'Finance System', 'target': 'Finance System', 'integration': 'REST-JSON-RPC', 'context': 'Reporting'}, {'source': 'Finance System', 'target': 'Inventory System', 'integration': 'ETL', 'context': 'Finance'}, {'source': 'HR System', 'target': 'Finance System', 'integration': 'CDC-Stream', 'context': 'Alumni Relations'}, {'source': 'Finance System', 'target': 'Inventory System', 'integration': 'SFTP', 'context': 'Student Services'}, {'source': 'Finance System', 'target': 'CRM System', 'integration': 'ETL', 'context': 'Finance'}]


In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Function to load the LLM
def load_llm(model_name="EleutherAI/gpt-neo-125M"):
    """
    Load a local LLM for processing queries.

    Args:
        model_name (str): The name of the Hugging Face model to load.

    Returns:
        tokenizer, model: The tokenizer and model for the LLM.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        print(f"Model '{model_name}' loaded successfully!")
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None

# Load the tokenizer and model
tokenizer, model = load_llm()

Model 'EleutherAI/gpt-neo-125M' loaded successfully!


In [77]:
def generate_json_with_prompt(context, query, tokenizer, model):
    """
    Generate a strict JSON response using a single example to guide the LLM.

    Args:
        context (list): Context retrieved from the vector database.
        query (str): User query.
        tokenizer: Tokenizer for the LLM.
        model: Loaded LLM.

    Returns:
        str: JSON response as a string.
    """
    try:
        # Limit context to the first 3 items for clarity
        limited_context = context[:3]
        formatted_context = "\n".join([
            f"Source: {item['source']}, Target: {item['target']}, Integration: {item['integration']}, Context: {item['context']}"
            for item in limited_context
        ])

        # Single example for guidance
        example = """
Example:
Context:
Source: Payroll System, Target: HR System, Integration: API, Context: Employee Data
Source: Payroll System, Target: Analytics Dashboard, Integration: FTP, Context: Reporting

Question: What are the systems that interact with Payroll System?

JSON Response:
{
  "query": "What are the systems that interact with Payroll System?",
  "nodes": [
    {"id": "Payroll System"},
    {"id": "HR System"},
    {"id": "Analytics Dashboard"}
  ],
  "edges": [
    {"source": "Payroll System", "target": "HR System", "integration": "API", "context": "Employee Data"},
    {"source": "Payroll System", "target": "Analytics Dashboard", "integration": "FTP", "context": "Reporting"}
  ]
}
"""

        # Combine example and actual query into the prompt
        prompt = f"""
{example}

Now, based on the following context and question, generate ONLY JSON:

Context:
{formatted_context}

Question: {query}

JSON Response:
"""

        # Generate response
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(
            inputs["input_ids"],
            max_length=1000,  # Allow up to 1000 tokens for input + output
            pad_token_id=tokenizer.eos_token_id
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("Raw LLM Response:", response)

        # Extract the JSON portion from the response
        start_index = response.find('{')
        end_index = response.rfind('}')
        if start_index != -1 and end_index != -1:
            response_json = response[start_index:end_index + 1]
            return response_json
        else:
            print("No valid JSON found in response.")
            return None

    except Exception as e:
        print(f"Error generating JSON response: {e}")
        return None

In [79]:
# Example query for the vector database
example_query = "What are the systems that interact with Finance System?"

# Step 1: Retrieve relevant context from the vector DB
retrieved_context = retrieve_context_from_vector_db(vector_db_client, example_query)
print("Retrieved Context:", retrieved_context)

# Step 2: Generate JSON response if context is retrieved
if retrieved_context:
    print("Testing JSON Generation with Updated Few-Shot Prompt...")
    
    # Call the updated generate_json_with_prompt function
    json_response = generate_json_with_prompt(retrieved_context, example_query, tokenizer, model)

    # Print the generated JSON response
    if json_response:
        print("Generated JSON Response:", json_response)
    else:
        print("Failed to generate JSON response.")
else:
    print("No context retrieved from Vector DB.")

Retrieved 5 context items for query: 'What are the systems that interact with Finance System?'
Retrieved Context: [{'source': 'Finance System', 'target': 'Finance System', 'integration': 'REST-JSON-RPC', 'context': 'Reporting'}, {'source': 'Finance System', 'target': 'Inventory System', 'integration': 'ETL', 'context': 'Finance'}, {'source': 'HR System', 'target': 'Finance System', 'integration': 'CDC-Stream', 'context': 'Alumni Relations'}, {'source': 'Finance System', 'target': 'Inventory System', 'integration': 'SFTP', 'context': 'Student Services'}, {'source': 'Finance System', 'target': 'CRM System', 'integration': 'ETL', 'context': 'Finance'}]
Testing JSON Generation with Updated Few-Shot Prompt...
Raw LLM Response: 

Example:
Context:
Source: Payroll System, Target: HR System, Integration: API, Context: Employee Data
Source: Payroll System, Target: Analytics Dashboard, Integration: FTP, Context: Reporting

Question: What are the systems that interact with Payroll System?

JSON R