In [152]:
import boto3
import yaml
from io import BytesIO
import os
from dotenv import load_dotenv

# Read settings (expected to live in a local .env file) into the environment
load_dotenv()

# Credentials and target bucket; each is None when the variable is not set
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
BUCKET_NAME = os.environ.get("BUCKET_NAME")

# S3 client built with the explicit key pair read above
_s3_credentials = {
    "aws_access_key_id": AWS_ACCESS_KEY,
    "aws_secret_access_key": AWS_SECRET_KEY,
}
s3 = boto3.client("s3", **_s3_credentials)



In [135]:
# Object key of the OpenAPI spec inside the bucket (replace with your own key)
s3_key = (
    "openapi-specs/"
    "ef0a6586-98c3-4d29-a587-849771eccb43_openapi.yaml"
)



In [136]:

# Download the spec object from S3, then parse its bytes as YAML
response = s3.get_object(Bucket=BUCKET_NAME, Key=s3_key)
spec_bytes = response["Body"].read()
yaml_content = yaml.safe_load(spec_bytes)

# Show which top-level sections the parsed spec contains
print(yaml_content.keys())

dict_keys(['openapi', 'info', 'servers', 'components', 'paths'])


In [137]:
# Character length of the repr of every path item, computed once and reused.
# The running total is deliberately NOT named `sum`: binding that name at cell
# level shadows the builtin and breaks any later `sum(...)` call in the kernel.
path_sizes = [len(str(content)) for content in yaml_content["paths"].values()]
total_size = sum(path_sizes)

print("sum:", total_size)
if path_sizes:
    print("avg:", total_size / len(path_sizes))
    print("max:", max(path_sizes))
    print("min:", min(path_sizes))
else:
    # avg/max/min are undefined for a spec without paths
    print("no paths in spec")

sum: 8122
avg: 1160.2857142857142
max: 1558
min: 485


In [138]:
# Dump every path item and collect the union of the keys used across them
unique_keys = set()
for route, item in yaml_content["paths"].items():
    item_keys = item.keys()
    print("path:", route)
    print("content:", item)
    print("content_keys:", item_keys)
    unique_keys |= set(item_keys)
    print("\n")
    


path: /tickets
content: {'get': {'summary': 'Get all tickets', 'description': 'Retrieve a list of all tickets with optional filtering', 'parameters': [{'name': 'status', 'in': 'query', 'schema': {'type': 'string', 'enum': ['open', 'in-progress', 'resolved', 'closed']}, 'description': 'Filter tickets by status'}, {'name': 'priority', 'in': 'query', 'schema': {'type': 'string', 'enum': ['low', 'medium', 'high', 'urgent']}, 'description': 'Filter tickets by priority'}, {'name': 'assignedTo', 'in': 'query', 'schema': {'type': 'string'}, 'description': 'Filter tickets by assigned user ID'}], 'responses': {'200': {'description': 'List of tickets', 'content': {'application/json': {'schema': {'type': 'array', 'items': {'$ref': '#/components/schemas/Ticket'}}}}}}}, 'post': {'summary': 'Create a new ticket', 'security': [{'basicAuth': []}], 'requestBody': {'required': True, 'content': {'application/json': {'schema': {'type': 'object', 'properties': {'title': {'type': 'string'}, 'description': {'

In [139]:
# Print each path as a single-entry mapping, followed by a blank-line gap
all_paths = yaml_content["paths"]
for route in all_paths:
    single_entry = {route: all_paths[route]}
    print(single_entry)
    print("\n")

{'/tickets': {'get': {'summary': 'Get all tickets', 'description': 'Retrieve a list of all tickets with optional filtering', 'parameters': [{'name': 'status', 'in': 'query', 'schema': {'type': 'string', 'enum': ['open', 'in-progress', 'resolved', 'closed']}, 'description': 'Filter tickets by status'}, {'name': 'priority', 'in': 'query', 'schema': {'type': 'string', 'enum': ['low', 'medium', 'high', 'urgent']}, 'description': 'Filter tickets by priority'}, {'name': 'assignedTo', 'in': 'query', 'schema': {'type': 'string'}, 'description': 'Filter tickets by assigned user ID'}], 'responses': {'200': {'description': 'List of tickets', 'content': {'application/json': {'schema': {'type': 'array', 'items': {'$ref': '#/components/schemas/Ticket'}}}}}}}, 'post': {'summary': 'Create a new ticket', 'security': [{'basicAuth': []}], 'requestBody': {'required': True, 'content': {'application/json': {'schema': {'type': 'object', 'properties': {'title': {'type': 'string'}, 'description': {'type': 'str

In [140]:
import yaml

def collect_all_parameters(path_item, operation):
    """Merge path-level and operation-level parameters for one operation.

    A parameter is identified by its (``name``, ``in``) pair. When the
    operation declares a parameter with the same identity as a path-level
    one, the operation-level definition replaces it instead of both being
    listed. Entries without a ``name`` (e.g. ``$ref`` objects) are never
    treated as overrides and are always kept.

    Args:
        path_item: Path item dict; its ``parameters`` may be missing or null.
        operation: Operation dict; its ``parameters`` may be missing or null.

    Returns:
        A new list: the surviving path-level parameters followed by all
        operation-level parameters, each group in its original order.
    """
    # `or []` also covers an explicit null value, which .get(key, []) would not
    path_params = path_item.get("parameters") or []
    op_params = operation.get("parameters") or []

    overridden = {
        (p.get("name"), p.get("in"))
        for p in op_params
        if p.get("name") is not None
    }
    inherited = [
        p for p in path_params
        if (p.get("name"), p.get("in")) not in overridden
    ]
    return inherited + op_params

def format_parameters(params):
    """Render OpenAPI parameter objects as a bulleted text block.

    Args:
        params: List of parameter dicts; the keys read are ``name``, ``in``,
            ``required``, ``description`` and ``schema`` (``type``, ``enum``).
            May be empty or ``None``.

    Returns:
        ``"None"`` when there are no parameters. Otherwise one line per
        parameter in the form
        ``- name (location, type, required|optional): description``,
        with `` — enum: a, b`` appended when the schema lists enum values.
    """
    if not params:
        return "None"
    lines = []
    for p in params:
        name = p.get("name")
        location = p.get("in")
        req_str = "required" if p.get("required", False) else "optional"
        description = p.get("description", "")
        # `or {}` also covers a schema that is present but null
        schema = p.get("schema") or {}
        type_ = schema.get("type", "unknown")
        enum = schema.get("enum")
        # Enum members may be ints/bools; str.join accepts strings only
        enum_str = f" — enum: {', '.join(map(str, enum))}" if enum else ""
        lines.append(f"- {name} ({location}, {type_}, {req_str}): {description}{enum_str}")
    return "\n".join(lines)

def format_request_body(request_body):
    """Render the JSON request-body schema as a bulleted list of properties.

    Only the ``application/json`` media type is inspected, and only the
    schema's direct ``properties`` are listed; a schema without inline
    properties yields an empty string.

    Args:
        request_body: OpenAPI request body dict, or a falsy value.

    Returns:
        ``"None"`` for a missing body, ``"Unsupported content type"`` when no
        ``application/json`` entry exists, otherwise one line per property in
        the form ``- name (type, required|optional)``, with
        `` — enum: a, b`` appended when the property lists enum values.
    """
    if not request_body:
        return "None"
    content = request_body.get("content") or {}
    if "application/json" not in content:
        return "Unsupported content type"

    # `or` fallbacks cover keys that are present but null in the YAML
    schema = (content["application/json"] or {}).get("schema") or {}
    props = schema.get("properties") or {}
    required = schema.get("required") or []

    lines = []
    for name, val in props.items():
        type_ = val.get("type", "unknown")
        enum = val.get("enum")
        # Enum members may be ints/bools; str.join accepts strings only
        enum_str = f" — enum: {', '.join(map(str, enum))}" if enum else ""
        req_str = "required" if name in required else "optional"
        lines.append(f"- {name} ({type_}, {req_str}){enum_str}")
    return "\n".join(lines)

def format_responses(responses):
    """Return one "- <status>: <description>" line per response entry.

    A response without a ``description`` is rendered with an empty
    description; an empty mapping produces an empty string.
    """
    return "\n".join(
        f"- {code}: {details.get('description', '')}"
        for code, details in responses.items()
    )

def describe_security(security, security_schemes):
    """Describe the security requirements of an operation, one per line.

    Each scheme name found in ``security`` is looked up in
    ``security_schemes`` and rendered as
    ``name (type: T[, scheme: S][, bearerFormat: F])``; a name with no
    definition gets type ``unknown``. Returns ``"None"`` when ``security``
    is empty or missing.
    """
    if not security:
        return "None"

    rendered = []
    for requirement in security:
        for scheme_name in requirement:
            definition = security_schemes.get(scheme_name, {})
            details = [f"type: {definition.get('type', 'unknown')}"]
            # Optional attributes, emitted in this fixed order when present
            for attribute in ("scheme", "bearerFormat"):
                if attribute in definition:
                    details.append(f"{attribute}: {definition[attribute]}")
            rendered.append(f"{scheme_name} ({', '.join(details)})")
    return "\n".join(rendered)

def create_openapi_chunks(yaml_content, integration_name="generic"):
    """Build one text chunk per (path, HTTP method) operation of an OpenAPI spec.

    Args:
        yaml_content: Parsed OpenAPI document (dict). The keys read are
            ``paths``, ``components.securitySchemes`` and the root-level
            ``security`` default.
        integration_name: Label written into every chunk and used as the
            first field of the chunk id.

    Returns:
        A list of dicts with keys ``id``
        (``"<integration_name>:<METHOD>:<path>"``), ``text`` (the rendered
        description of the operation) and ``summary``.
    """
    # Path-item keys that denote operations; other keys (parameters, summary,
    # servers, ...) are skipped.
    http_methods = ("get", "post", "put", "delete", "patch", "options", "head", "trace")

    chunks = []
    # `or` fallbacks cover sections that are present but null in the YAML
    paths = yaml_content.get("paths") or {}
    security_schemes = (yaml_content.get("components") or {}).get("securitySchemes") or {}
    # Root-level requirement applies to every operation that does not
    # declare its own `security`
    default_security = yaml_content.get("security") or []

    for path, path_item in paths.items():
        if not path_item:
            # A path declared with an empty/null body has no operations
            continue
        for method in path_item:
            if method not in http_methods:
                continue
            operation = path_item[method] or {}

            summary = operation.get("summary", "")
            description = operation.get("description", "")
            all_params = collect_all_parameters(path_item, operation)
            request_body = operation.get("requestBody")
            responses = operation.get("responses") or {}
            # An explicit operation-level `security` (even an empty list,
            # meaning "no auth") wins over the root-level default.
            if "security" in operation:
                security = operation["security"]
            else:
                security = default_security

            chunk = f"""Integration: {integration_name}
Authentication Required: {describe_security(security, security_schemes)}
Path: {path}
Method: {method.upper()}
Summary: {summary}
Description: {description}

Parameters:
{format_parameters(all_params)}

Request Body:
{format_request_body(request_body)}

Responses:
{format_responses(responses)}
"""
            chunks.append({
                "id": f"{integration_name}:{method.upper()}:{path}",
                "text": chunk,
                "summary": summary
            })

    return chunks


In [141]:
INTEGRATION_NAME = "TICKETING SYSTEM"
chunks = create_openapi_chunks(yaml_content, integration_name=INTEGRATION_NAME)

# Show each chunk: its id, then its text, then a blank-line gap
for chunk in chunks:
    print(chunk["id"], chunk["text"], "\n", sep="\n")
    

TICKETING SYSTEM:GET:/tickets
Integration: TICKETING SYSTEM
Authentication Required: None
Path: /tickets
Method: GET
Summary: Get all tickets
Description: Retrieve a list of all tickets with optional filtering

Parameters:
- status (query, string, optional): Filter tickets by status — enum: open, in-progress, resolved, closed
- priority (query, string, optional): Filter tickets by priority — enum: low, medium, high, urgent
- assignedTo (query, string, optional): Filter tickets by assigned user ID

Request Body:
None

Responses:
- 200: List of tickets



TICKETING SYSTEM:POST:/tickets
Integration: TICKETING SYSTEM
Authentication Required: basicAuth (type: http, scheme: basic)
Path: /tickets
Method: POST
Summary: Create a new ticket
Description: 

Parameters:
None

Request Body:
- title (string, required)
- description (string, required)
- priority (string, optional) — enum: low, medium, high, urgent

Responses:
- 201: Ticket created successfully
- 400: Invalid input
- 401: Unauthorized


In [142]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client; the API key is taken from the GOOGLE_API_KEY env variable
_google_api_key = os.getenv("GOOGLE_API_KEY")
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=_google_api_key,
)


In [143]:
from langchain.schema import Document

# Chunk ids have the form "<integration>:<METHOD>:<path>". A blind
# id.split(":") picks the wrong fields when the integration name contains ":"
# and truncates any path that contains ":", so the known integration prefix
# is removed first and the remainder is split only once.
_id_prefix = f"{INTEGRATION_NAME}:"


def _method_and_path(chunk_id):
    """Return the (method, path) pair encoded in a chunk id."""
    if chunk_id.startswith(_id_prefix):
        remainder = chunk_id[len(_id_prefix):]
    else:
        # Id built with another integration name: drop only the first field
        remainder = chunk_id.partition(":")[2]
    method, _sep, path = remainder.partition(":")
    return method, path


# One Document per chunk; metadata carries the fields used for filtering
all_docs = []
for chunk in chunks:
    chunk_method, chunk_path = _method_and_path(chunk["id"])
    all_docs.append(
        Document(
            page_content=chunk["text"],
            metadata={
                "id": chunk["id"],
                "method": chunk_method,
                "path": chunk_path,
                "integration": INTEGRATION_NAME,
                "summary": chunk["summary"],
            }
        )
    )


In [144]:
# Debug dump: prints the repr of every Document (metadata and page_content)
print(all_docs)

[Document(metadata={'id': 'TICKETING SYSTEM:GET:/tickets', 'method': 'GET', 'path': '/tickets', 'integration': 'TICKETING SYSTEM', 'summary': 'Get all tickets'}, page_content='Integration: TICKETING SYSTEM\nAuthentication Required: None\nPath: /tickets\nMethod: GET\nSummary: Get all tickets\nDescription: Retrieve a list of all tickets with optional filtering\n\nParameters:\n- status (query, string, optional): Filter tickets by status — enum: open, in-progress, resolved, closed\n- priority (query, string, optional): Filter tickets by priority — enum: low, medium, high, urgent\n- assignedTo (query, string, optional): Filter tickets by assigned user ID\n\nRequest Body:\nNone\n\nResponses:\n- 200: List of tickets\n'), Document(metadata={'id': 'TICKETING SYSTEM:POST:/tickets', 'method': 'POST', 'path': '/tickets', 'integration': 'TICKETING SYSTEM', 'summary': 'Create a new ticket'}, page_content='Integration: TICKETING SYSTEM\nAuthentication Required: basicAuth (type: http, scheme: basic)\n

In [153]:
from langchain_pinecone import PineconeVectorStore

# Vector store handle; index name and API key are read from the environment
_pinecone_settings = {
    "index_name": os.getenv("PINECONE_INDEX"),
    "pinecone_api_key": os.getenv("PINECONE_API_KEY"),
}
vector_store = PineconeVectorStore(embedding=embedding_model, **_pinecone_settings)


In [146]:
# vector_store.delete(delete_all=True)  # uncomment to clear the index before re-ingesting

In [147]:
# Bare last expression: the notebook displays the list of Document objects
all_docs

[Document(metadata={'id': 'TICKETING SYSTEM:GET:/tickets', 'method': 'GET', 'path': '/tickets', 'integration': 'TICKETING SYSTEM', 'summary': 'Get all tickets'}, page_content='Integration: TICKETING SYSTEM\nAuthentication Required: None\nPath: /tickets\nMethod: GET\nSummary: Get all tickets\nDescription: Retrieve a list of all tickets with optional filtering\n\nParameters:\n- status (query, string, optional): Filter tickets by status — enum: open, in-progress, resolved, closed\n- priority (query, string, optional): Filter tickets by priority — enum: low, medium, high, urgent\n- assignedTo (query, string, optional): Filter tickets by assigned user ID\n\nRequest Body:\nNone\n\nResponses:\n- 200: List of tickets\n'),
 Document(metadata={'id': 'TICKETING SYSTEM:POST:/tickets', 'method': 'POST', 'path': '/tickets', 'integration': 'TICKETING SYSTEM', 'summary': 'Create a new ticket'}, page_content='Integration: TICKETING SYSTEM\nAuthentication Required: basicAuth (type: http, scheme: basic)\

In [148]:
# Pass the deterministic chunk ids as vector ids so that re-running this cell
# overwrites the existing records instead of inserting duplicates under
# freshly generated ids.
doc_ids = [doc.metadata["id"] for doc in all_docs]
vector_store.add_documents(all_docs, ids=doc_ids)
print(f"Successfully stored {len(all_docs)} documents with full API details")


Successfully stored 13 documents with full API details


In [157]:
INTEGRATION_NAME = "TICKETING SYSTEM"
query = "which path will give me all the tickets"

# Top-5 nearest chunks, restricted to documents tagged with this integration
metadata_filter = {"integration": INTEGRATION_NAME}
results = vector_store.similarity_search(query, k=5, filter=metadata_filter)

for match in results:
    print(match.page_content)


Integration: TICKETING SYSTEM
Authentication Required: None
Path: /tickets
Method: GET
Summary: Get all tickets
Description: Retrieve a list of all tickets with optional filtering

Parameters:
- status (query, string, optional): Filter tickets by status — enum: open, in-progress, resolved, closed
- priority (query, string, optional): Filter tickets by priority — enum: low, medium, high, urgent
- assignedTo (query, string, optional): Filter tickets by assigned user ID

Request Body:
None

Responses:
- 200: List of tickets

Integration: TICKETING SYSTEM
Authentication Required: None
Path: /tickets/{ticketId}
Method: GET
Summary: Get ticket details
Description: 

Parameters:
- ticketId (path, string, required): ID of the ticket

Request Body:
None

Responses:
- 200: Ticket details
- 404: Ticket not found

Integration: TICKETING SYSTEM
Authentication Required: None
Path: /tickets/{ticketId}/priority
Method: GET
Summary: Get ticket priority
Description: 

Parameters:
- ticketId (path, strin