In [1]:
import pandas as pd

# Function to load and inspect the CSV
def load_csv(file_path):
    """
    Load a CSV file and return a DataFrame.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded data as a pandas DataFrame.
    """
    try:
        data = pd.read_csv(file_path)
        print("CSV loaded successfully!")
        return data
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None

# Path to the CSV file
file_path = 'sample_integration_data.csv'

# Load the CSV
integration_data = load_csv(file_path)

# Display the first few rows for inspection
if integration_data is not None:
    print("Sample Data:")
    display(integration_data.head())

CSV loaded successfully!
Sample Data:


Unnamed: 0,Consumer,Producer,Integration Type,Context-Domain
0,Payroll System,Research Database,REST-JSON-RPC,Research
1,Finance System,Analytics Dashboard,SFTP,Alumni Relations
2,Finance System,HR System,ETL,Admissions
3,IT Support System,Analytics Dashboard,SFTP,Student Services
4,Finance System,Inventory System,ETL,Finance


In [2]:
# Extract nodes and edges from the DataFrame
nodes = list(set(integration_data['Consumer']).union(set(integration_data['Producer'])))
edges = integration_data.rename(
    columns={
        "Consumer": "source",
        "Producer": "target",
        "Integration Type": "integration",
        "Context-Domain": "context"
    }
).to_dict(orient="records")

# Create the graph structure
graph = {
    "nodes": [{"id": node} for node in nodes],
    "edges": edges
}

# Print the graph for verification
print("Graph Representation:")
print("Nodes:", graph["nodes"])
print("Edges:", graph["edges"])


Graph Representation:
Nodes: [{'id': 'HR System'}, {'id': 'CRM System'}, {'id': 'Admissions Portal'}, {'id': 'Catalog System'}, {'id': 'Alumni Portal'}, {'id': 'Finance System'}, {'id': 'Course Registration System'}, {'id': 'IT Support System'}, {'id': 'Analytics Dashboard'}, {'id': 'Research Database'}, {'id': 'Inventory System'}, {'id': 'Student Portal'}, {'id': 'Library System'}, {'id': 'Payroll System'}]
Edges: [{'source': 'Payroll System', 'target': 'Research Database', 'integration': 'REST-JSON-RPC', 'context': 'Research'}, {'source': 'Finance System', 'target': 'Analytics Dashboard', 'integration': 'SFTP', 'context': 'Alumni Relations'}, {'source': 'Finance System', 'target': 'HR System', 'integration': 'ETL', 'context': 'Admissions'}, {'source': 'IT Support System', 'target': 'Analytics Dashboard', 'integration': 'SFTP', 'context': 'Student Services'}, {'source': 'Finance System', 'target': 'Inventory System', 'integration': 'ETL', 'context': 'Finance'}, {'source': 'Alumni Port

In [3]:
# !pip install -v qdrant_client

In [4]:
from sentence_transformers import SentenceTransformer
import qdrant_client
from qdrant_client.http.models import PointStruct

# Initialize vectorizer (e.g., SentenceTransformer)
vectorizer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to prepare and store data in the vector database
def setup_vector_db(graph):
    """
    Create and populate an in-memory vector database with graph data.

    Args:
        graph (dict): Graph representation with `nodes` and `edges`.

    Returns:
        QdrantClient: In-memory Qdrant client with indexed data.
    """
    try:
        client = qdrant_client.QdrantClient(":memory:")  # In-memory instance
        client.recreate_collection(
            collection_name="systems",
            vectors_config=qdrant_client.http.models.VectorParams(
                size=384, distance="Cosine"  # Embedding size from the model
            )
        )

        # Add graph data to the collection
        points = []
        for edge in graph["edges"]:
            text = f"{edge['source']} interacts with {edge['target']} via {edge['integration']}"
            embedding = vectorizer.encode(text).tolist()
            points.append(
                PointStruct(
                    id=len(points), vector=embedding,
                    payload={"source": edge["source"], "target": edge["target"],
                             "integration": edge["integration"], "context": edge["context"]}
                )
            )

        client.upsert(collection_name="systems", points=points)
        print(f"Vector database populated with {len(points)} points!")
        return client
    except Exception as e:
        print(f"Error setting up vector database: {e}")
        return None

# Set up the vector database
vector_db_client = setup_vector_db(graph)

# Confirm successful setup
if vector_db_client:
    print("Vector database setup successful!")

  client.recreate_collection(


Vector database populated with 100 points!
Vector database setup successful!


In [5]:
def retrieve_context_from_vector_db(client, query, top_k=5):
    """
    Retrieve relevant context for the query from the vector database.

    Args:
        client (QdrantClient): Qdrant client instance.
        query (str): The natural language query.
        top_k (int): Number of top results to retrieve.

    Returns:
        list: Retrieved context from the vector database.
    """
    try:
        query_embedding = vectorizer.encode(query).tolist()
        results = client.search(
            collection_name="systems",
            query_vector=query_embedding,
            limit=top_k
        )
        context = [
            result.payload for result in results
        ]
        print(f"Retrieved {len(context)} context items for query: '{query}'")
        return context
    except Exception as e:
        print(f"Error querying vector database: {e}")
        return []

# Example query
example_query = "What are the systems that interact with Finance System?"
retrieved_context = retrieve_context_from_vector_db(vector_db_client, example_query)

# Confirm successful retrieval
if retrieved_context:
    print("Context retrieval successful!")
    print("Retrieved Context:", retrieved_context)
else:
    print("No context retrieved.")

Retrieved 5 context items for query: 'What are the systems that interact with Finance System?'
Context retrieval successful!
Retrieved Context: [{'source': 'Finance System', 'target': 'Finance System', 'integration': 'REST-JSON-RPC', 'context': 'Reporting'}, {'source': 'Finance System', 'target': 'Inventory System', 'integration': 'ETL', 'context': 'Finance'}, {'source': 'HR System', 'target': 'Finance System', 'integration': 'CDC-Stream', 'context': 'Alumni Relations'}, {'source': 'Finance System', 'target': 'Inventory System', 'integration': 'SFTP', 'context': 'Student Services'}, {'source': 'Finance System', 'target': 'CRM System', 'integration': 'ETL', 'context': 'Finance'}]


In [15]:
import os
from openai import OpenAI

# Initialize the OpenAI client with API key from environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def generate_response(messages, model="gpt-4o-mini", max_tokens=500, temperature=0.7):
    """
    Generate a response using the OpenAI API.

    Args:
        messages (list): List of messages for the conversation.
        model (str): Model to use for generation (e.g., "gpt-4o-mini").
        max_tokens (int): Maximum tokens for the output.
        temperature (float): Sampling temperature for randomness.

    Returns:
        str: Generated response content.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error using OpenAI API: {e}")
        return None

def generate_json_with_openai_few_shot(context, query):
    """
    Generate a strict JSON response using the OpenAI API with few-shot examples.

    Args:
        context (list): Context retrieved from the vector database.
        query (str): User query.

    Returns:
        str: JSON response as a string.
    """
    try:
        # Format context for the API using "Consumer" and "Producer"
        formatted_context = "\n".join([
            f"Consumer: {item['consumer']}, Producer: {item['producer']}, Integration: {item['integration']}, Context: {item['context']}"
            for item in context
        ])

        # Few-shot examples to guide the model
        few_shot_examples = """
Example 1:
Context:
Consumer: System A, Producer: System B, Integration: API, Context: Authentication
Consumer: System A, Producer: System C, Integration: Webhook, Context: Notifications

Question: What are the systems that interact with System A?

JSON Response:
{
  "query": "What are the systems that interact with System A?",
  "nodes": [
    {"id": "System A"},
    {"id": "System B"},
    {"id": "System C"}
  ],
  "edges": [
    {"consumer": "System A", "producer": "System B", "integration": "API", "context": "Authentication"},
    {"consumer": "System A", "producer": "System C", "integration": "Webhook", "context": "Notifications"}
  ]
}
"""

        # Combine examples with actual query
        messages = [
            {"role": "system", "content": "You are an assistant that generates JSON responses."},
            {"role": "user", "content": few_shot_examples},
            {
                "role": "user",
                "content": f"""
Now, based on the following context and question, generate ONLY JSON:

Context:
{formatted_context}

Question: {query}

JSON Response:
"""
            }
        ]

        # Generate the response using `generate_response`
        response_content = generate_response(messages, model="gpt-4o-mini", max_tokens=500, temperature=0.0)
        print("Raw OpenAI Response:", response_content)

        # Parse the JSON portion of the response
        start_index = response_content.find('{')
        end_index = response_content.rfind('}')
        if start_index != -1 and end_index != -1:
            response_json = response_content[start_index:end_index + 1]
            return response_json
        else:
            print("No valid JSON found in response.")
            return None

    except Exception as e:
        print(f"Error generating JSON response with OpenAI: {e}")
        return None

In [16]:
# Test the updated OpenAI API integration with Consumer and Producer
test_context = [
    {"source": "System A", "target": "System B", "integration": "API", "context": "Authentication"},
    {"source": "System A", "target": "System C", "integration": "Webhook", "context": "Notifications"}
]

# Rename keys in test_context to match "Consumer" and "Producer"
test_context = [
    {"consumer": item["source"], "producer": item["target"], "integration": item["integration"], "context": item["context"]}
    for item in test_context
]

test_query = "What are the systems that interact with System A?"

print("Testing updated OpenAI integration with few-shot prompts (Consumer/Producer)...")
response = generate_json_with_openai_few_shot(test_context, test_query)
if response:
    print("Generated JSON Response:")
    print(response)
else:
    print("Failed to generate a JSON response.")

Testing updated OpenAI integration with few-shot prompts (Consumer/Producer)...
Raw OpenAI Response: {
  "query": "What are the systems that interact with System A?",
  "nodes": [
    {"id": "System A"},
    {"id": "System B"},
    {"id": "System C"}
  ],
  "edges": [
    {"consumer": "System A", "producer": "System B", "integration": "API", "context": "Authentication"},
    {"consumer": "System A", "producer": "System C", "integration": "Webhook", "context": "Notifications"}
  ]
}
Generated JSON Response:
{
  "query": "What are the systems that interact with System A?",
  "nodes": [
    {"id": "System A"},
    {"id": "System B"},
    {"id": "System C"}
  ],
  "edges": [
    {"consumer": "System A", "producer": "System B", "integration": "API", "context": "Authentication"},
    {"consumer": "System A", "producer": "System C", "integration": "Webhook", "context": "Notifications"}
  ]
}
