## Feature 1 — Ticket Classification

**What it does:**  
Automatically classifies support tickets using an AI model.

**How it works:**
- Loads tickets from a JSON file.
- Uses Google Generative AI to assign:
  - **Topic** (e.g., Product, Feedback)
  - **Sentiment** (e.g., Angry, Curious)
  - **Priority** (P0, P1, P2)

**Result:**  
Each ticket gets these three labels for easier management.


In [None]:
!pip install -U langchain-google-genai

In [None]:
!pip install -q langchain-community langchain-core

### Defining our LLM Model

In [None]:
import os
from getpass import getpass

from langchain_google_genai import ChatGoogleGenerativeAI

# Secrets must not live in the notebook source. Read the key from the
# GOOGLE_API_KEY environment variable and fall back to an interactive prompt,
# so nothing sensitive is saved with the file. Any key that was ever committed
# alongside this notebook should be considered leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Google API key: ")

# Gemini chat model shared by the classification and question-answering chains.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=GOOGLE_API_KEY)

### Defining our System Prompt

In [None]:
# PromptTemplate is provided by langchain-core (installed earlier in this
# notebook); the legacy `langchain.prompts` import path is deprecated.
from langchain_core.prompts import PromptTemplate

# Prompt sent to the model for every ticket. It asks for exactly three labels
# (topic, sentiment, priority) in a fixed "Key: value" layout.
# NOTE: the variable is spelled "ticker_subject" (sic); the same spelling is
# used for the keys of the input dicts, so the two must be renamed together.
classification_prompt = PromptTemplate(
    input_variables=["ticket_id","ticker_subject","ticket_text"],
    template=(
        "You are a ticket classification assistant.\n"
        "Given the user support ticket below, label it with:\n"
        "  - Topic: one of [How-to, Product, Connector, Feedback, ...]\n"
        "  - Sentiment: one of [Frustrated, Curious, Angry, Neutral]\n"
        "  - Priority: one of [P0/High, P1/Medium, P2/Low]\n"
        "Ticket Id: {ticket_id}\n"
        "Subject: {ticker_subject}\n"
        "Ticket:\n---\n{ticket_text}\n---\n"
        "Return format:\n"
        "Topic: <topic>\nSentiment: <sentiment>\nPriority: <priority>\n"
    )
)

### Creating LLM Chain

In [None]:
from langchain.chains import LLMChain

# Bind the classification prompt to the chat model; every ticket is sent
# through this chain.
classification_chain = LLMChain(prompt=classification_prompt, llm=llm)

### Loading the data from the `tickets_data.json` file

In [None]:
import json

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open('/content/tickets_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Split the tickets into three parallel lists: position i in each list
# belongs to the same ticket.
ids = [ticket['id'] for ticket in data]
subjects = [ticket['subject'] for ticket in data]
bodies = [ticket['body'] for ticket in data]

# Sanity check: the three counts should be equal.
print("IDs:", len(ids))
print("Subjects:", len(subjects))
print("Bodies:", len(bodies))

### Passing each ticket to the LLM chain and storing the results in a list

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed


def classify_ticket(ticket_input):
    """Classify one ticket and return the model's raw text answer.

    Args:
        ticket_input: dict with the prompt variables "ticket_id",
            "ticker_subject" and "ticket_text".

    Returns:
        The "text" field of the chain's output for this ticket.
    """
    return classification_chain.apply([ticket_input])[0]["text"]


# One prompt-variable dict per ticket, in the same order as ids/subjects/bodies.
inputs = [
    {"ticket_id": ticket_id, "ticker_subject": subject, "ticket_text": body}
    for ticket_id, subject, body in zip(ids, subjects, bodies)
]

# executor.map returns results in input order, so results[i] is the
# classification of ids[i]. Collecting futures with as_completed would append
# them in completion order and break that alignment.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = list(executor.map(classify_ticket, inputs))


In [None]:
results[1]

In [None]:
ids[1]

In [None]:
subjects[1]

In [None]:
bodies[1]

## Feature 2 — Documentation Crawler & Retriever

**What it does:**  
Crawls Atlan documentation sites to collect pages, builds a searchable knowledge base, and enables question-answering over the docs.

**How it works:**
- Crawls all relevant pages from given documentation URLs.
- Extracts and saves page text in chunks for processing.
- Converts text into embeddings and stores them using FAISS for fast retrieval.
- Provides a QA (Question Answering) interface powered by Google Generative AI, allowing users to ask questions and get answers based on the crawled documentation.

**Result:**  
You can search and answer queries from Atlan docs instantly using natural language, making it easy to find information.


In [None]:
from collections import deque

import requests
from bs4 import BeautifulSoup
from urllib.parse import urldefrag, urljoin, urlparse

START_URL = "https://docs.atlan.com/"
site_netloc = urlparse(START_URL).netloc  # the crawl never leaves this host

# `visited` holds every URL that has ever been queued, so a page is fetched
# at most once and a failing URL is not retried each time it is linked again.
visited = {START_URL}
to_visit = deque([START_URL])  # FIFO queue; popleft() is O(1), list.pop(0) is O(n)
all_urls_ = set()  # HTML pages that were fetched successfully

while to_visit:
    url = to_visit.popleft()
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()  # 4xx/5xx error pages are not documentation
    except requests.RequestException as e:
        print(f"Error visiting {url}: {e}")
        continue

    # Skip non-HTML responses (images, archives, ...): they hold no links and
    # no page text. A missing header is treated as HTML.
    if "html" not in resp.headers.get("Content-Type", "text/html"):
        continue
    all_urls_.add(url)

    soup = BeautifulSoup(resp.text, "html.parser")
    for link in soup.find_all("a", href=True):
        try:
            # Drop the "#fragment" so "page#a" and "page#b" both map to "page"
            # instead of being discarded.
            full_url, _fragment = urldefrag(urljoin(url, link["href"]))
            same_site = urlparse(full_url).netloc == site_netloc
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): ignore this link only.
            continue
        if not same_site or ".pdf" in full_url or full_url in visited:
            continue
        visited.add(full_url)
        to_visit.append(full_url)

print(f"Discovered {len(all_urls_)} documentation URLs.")

In [None]:
from collections import deque

import requests
from bs4 import BeautifulSoup
from urllib.parse import urldefrag, urljoin, urlparse

START_URL = "https://developer.atlan.com/"
site_netloc = urlparse(START_URL).netloc  # the crawl never leaves this host

# `visited` holds every URL that has ever been queued, so a page is fetched
# at most once and a failing URL is not retried each time it is linked again.
visited = {START_URL}
to_visit = deque([START_URL])  # FIFO queue; popleft() is O(1), list.pop(0) is O(n)
all_urls = set()  # HTML pages that were fetched successfully

while to_visit:
    url = to_visit.popleft()
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()  # 4xx/5xx error pages are not documentation
    except requests.RequestException as e:
        print(f"Error visiting {url}: {e}")
        continue

    # Skip non-HTML responses (images, archives, ...): they hold no links and
    # no page text. A missing header is treated as HTML.
    if "html" not in resp.headers.get("Content-Type", "text/html"):
        continue
    all_urls.add(url)

    soup = BeautifulSoup(resp.text, "html.parser")
    for link in soup.find_all("a", href=True):
        try:
            # Drop the "#fragment" so "page#a" and "page#b" both map to "page"
            # instead of being discarded.
            full_url, _fragment = urldefrag(urljoin(url, link["href"]))
            same_site = urlparse(full_url).netloc == site_netloc
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): ignore this link only.
            continue
        if not same_site or ".pdf" in full_url or full_url in visited:
            continue
        visited.add(full_url)
        to_visit.append(full_url)

print(f"Discovered {len(all_urls)} documentation URLs.")

In [None]:
all_urls

In [None]:
import json


def _dump_url_set(url_set, out_path):
    """Write the URLs in `url_set` to `out_path` as a JSON array."""
    with open(out_path, "w") as handle:
        json.dump(list(url_set), handle)


# developer.atlan.com crawl results
_dump_url_set(all_urls, "developer_atlan_urls.json")

# docs.atlan.com crawl results
_dump_url_set(all_urls_, "docs_atlan_urls.json")

print("URLs saved to developer_atlan_urls.json and docs_atlan_urls.json")

In [None]:
all_urls_

### Retrieve the text from each link and store it in the `documents` list

In [None]:
def fetch_page_text(url, timeout=10):
    """Download a page and return its readable text, one element per line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a single
            stalled server would block the whole run indefinitely.

    Returns:
        The text of every <p>, <li>, <h2> and <h3> element, joined with
        newlines, after <nav>, <footer>, <script> and <style> elements have
        been removed from the page.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Strip page chrome and non-content elements before extracting text.
    for tag in soup(['nav', 'footer', 'script', 'style']):
        tag.decompose()
    return '\n'.join(
        element.get_text(separator=' ', strip=True)
        for element in soup.find_all(['p', 'li', 'h2', 'h3'])
    )

def chunk_text(text, max_chunk_size=500):
    """Split `text` into chunks of at most `max_chunk_size` words.

    Words are whitespace-separated tokens; each chunk is its words re-joined
    with single spaces. Text without any words yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_chunk_size):
        window = tokens[start:start + max_chunk_size]
        pieces.append(' '.join(window))
    return pieces
# Build the corpus from the docs.atlan.com crawl: one record per text chunk,
# each tagged with the URL of the page it came from.
documents = []
for page_url in all_urls_:
    page_chunks = chunk_text(fetch_page_text(page_url))
    documents.extend({"text": piece, "source": page_url} for piece in page_chunks)


In [None]:
def fetch_page_text(url, timeout=10):
    """Download a page and return its readable text, one element per line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a single
            stalled server would block the whole run indefinitely.

    Returns:
        The text of every <p>, <li>, <h2> and <h3> element, joined with
        newlines, after <nav>, <footer>, <script> and <style> elements have
        been removed from the page.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Strip page chrome and non-content elements before extracting text.
    for tag in soup(['nav', 'footer', 'script', 'style']):
        tag.decompose()
    return '\n'.join(
        element.get_text(separator=' ', strip=True)
        for element in soup.find_all(['p', 'li', 'h2', 'h3'])
    )

def chunk_text(text, max_chunk_size=500):
    """Return `text` as a list of chunks holding up to `max_chunk_size` words each.

    The text is split on whitespace and every chunk is rebuilt by joining its
    words with a single space.
    """
    word_list = text.split()
    chunk_starts = range(0, len(word_list), max_chunk_size)
    return [
        ' '.join(word_list[begin:begin + max_chunk_size])
        for begin in chunk_starts
    ]

# Add the developer.atlan.com pages to the same corpus, one record per chunk.
for page_url in all_urls:
    page_chunks = chunk_text(fetch_page_text(page_url))
    documents.extend({"text": piece, "source": page_url} for piece in page_chunks)


In [None]:
len(documents)

In [None]:
!pip install -q faiss-cpu chromadb langchain

In [None]:
!pip install -q langchain-google-genai

In [None]:
# Chunk texts only, in the same order as `documents`.
texts = [record["text"] for record in documents]


In [None]:
# Per-chunk metadata (the page URL), in the same order as `documents`.
metadatas = [{"source": record["source"]} for record in documents]


In [None]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
# Load the all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every chunk in `texts`. The result's second axis (`shape[1]`) is read
# in the next cell as the vector dimension of the FAISS index.
# NOTE: the name `embeddings` is rebound to a LangChain embeddings object in a
# later cell, so this cell must be re-run before rebuilding the raw index.
embeddings = model.encode(texts)


In [None]:
import faiss
# Width of one embedding vector (second axis of the encoded matrix).
dimension = embeddings.shape[1]
# Flat L2-distance index sized to that width.
index = faiss.IndexFlatL2(dimension)
# Add all chunk vectors; row i corresponds to texts[i] / documents[i].
# NOTE(review): presumably faiss expects a float32 array here — confirm the encoder's output dtype.
index.add(embeddings)


In [None]:
import json

# Persist the raw FAISS index next to the chunk records it was built from, so
# a row number in the index can be mapped back to its text and source URL.
faiss.write_index(index, "faiss.index")
with open("metadata.json", "w") as meta_file:
    json.dump(documents, meta_file)

In [None]:
!pip install -q -U langchain-community

In [None]:
# Import from langchain-community (installed in the cell above); the
# `langchain.vectorstores` / `langchain.embeddings` paths are deprecated.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# NOTE: this rebinds `embeddings`, which previously held the encoded chunk
# matrix, to a LangChain embeddings object.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed every chunk, keep its source URL as metadata, and save the store to
# the "faiss_store" directory so it can be reloaded without re-embedding.
vectorstore = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
vectorstore.save_local("faiss_store")

In [None]:
# Reload the vector store from the "faiss_store" directory using the same
# embedding model it was built with.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# NOTE(security): allow_dangerous_deserialization=True opts in to loading
# pickled data. Only load a "faiss_store" directory that this notebook wrote
# itself; never one obtained from an untrusted source.
vectorstore = FAISS.load_local("faiss_store", embeddings,allow_dangerous_deserialization=True)

In [None]:
# Wrap the vector store as a retriever (default search settings) for the QA chain.
retriever = vectorstore.as_retriever()


In [None]:
# Prompt handed to the QA chain; it exposes two variables, {context} and
# {question}, and ends with an open "Answer:" for the model to complete.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="Context:\n{context}\n\nQuestion: {question}\n\nAnswer:",
)


### We create the `qa_chain`

In [None]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA: the retriever supplies documents, the custom prompt
# frames them for the model, and the retrieved documents are returned with
# the answer so their sources can be listed.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)


### Testing by passing a query

In [None]:
query = "Hi team, we're trying to set up our primary Snowflake production database as a new source in Atlan, but the connection keeps failing. We've tried using our standard service account, but it's not working. Our entire BI team is blocked on this integration for a major upcoming project, so it's quite urgent. Could you please provide a definitive list of the exact permissions and credentials needed on the Snowflake side to get this working? Thanks."
# Run the question through the QA chain and show the generated answer.
result = qa_chain({"query": query})
print("Answer:", result["result"])

# List where the supporting chunks came from, when the chain returned them.
if "source_documents" in result:
    print("\nSources:")
    for source_doc in result["source_documents"]:
        meta = source_doc.metadata
        # Prefer the "source" key; fall back to "url" when it is missing or empty.
        origin = meta.get("source") or meta.get("url")
        print(f"- {origin}")
