In [35]:
from langchain_openai import ChatOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

chatgpt = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, timeout=60, max_retries=8)
summarizer = chatgpt.bind(max_tokens=600)

In [38]:
import tiktoken
enc = tiktoken.get_encoding("o200k_base")
def clamp_tokens(text: str, max_tokens: int) -> str:
	toks = enc.encode(text)
	if len(toks) <= max_tokens:
		return text
	out = enc.decode(toks[:max_tokens])
	cut = max(out.rfind(". "), out.rfind("\n"), out.rfind(" "))
	if cut > 0:
		out = out[:cut+1]
	return out + " …"

In [39]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(8))
def summarize_chunk(chunk_text: str, title: str | None = None, page: int | None = None) -> str:
	prefix = f"Title: {title}\nPage: {page}\n" if title is not None else ""
	prompt_tmpl = ChatPromptTemplate.from_template(
		"You are an assistant for road-work document analysis.\n"
		"Summarize the following chunk in 2–3 sentences, suitable for retrieval context.\n"
		"Be specific (places, programs, dates), avoid generalities, no hallucinations.\n\n"
		"{pref}Chunk:\n{chunk}\n\nSummary:"
	)
	chain = prompt_tmpl | summarizer | StrOutputParser()
	# clamp the prompt input so each call has predictable token size
	return chain.invoke({"pref": prefix, "chunk": clamp_tokens(chunk_text, 2000)})

In [30]:
# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(8))
# def generate_chunk_context(paper_context: str, chunk: str) -> str:
#     prompt_tmpl = ChatPromptTemplate.from_template(
#         "You are an AI assistant for road-work paper analysis.\n"
#         "Using the summary and compressed document below, provide a brief 2–3 sentence context for the chunk.\n\n"
#         "{paper}\n\nChunk:\n{chunk}\n\nContext:"
#     )
#     chain = prompt_tmpl | chatgpt | StrOutputParser()
#     return chain.invoke({"paper": clamp_tokens(paper_context, 5200), "chunk": chunk})

In [40]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import time,uuid

def create_contextual_chunks(file_path, chunk_size=2000, chunk_overlap=200, neighbor_chars=300, throttle_s=0.25):
	# load + split
	loader = PyMuPDFLoader(file_path)
	doc_pages = loader.load()
	splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	doc_chunks = splitter.split_documents(doc_pages)

	contextual_chunks = []
	n = len(doc_chunks)
	for i, chunk in enumerate(doc_chunks):
		text = chunk.page_content

		# optional small neighbor window (keeps chunk-first, adds coherence without big token cost)
		if neighbor_chars and n > 1:
			prev_txt = doc_chunks[i-1].page_content[-neighbor_chars:] if i > 0 else ""
			next_txt = doc_chunks[i+1].page_content[:neighbor_chars] if i+1 < n else ""
			if prev_txt:
				text = f"(Prev) {prev_txt}\n\n{chunk.page_content}"
			if next_txt:
				text = f"{text}\n\n(Next) {next_txt}"

		meta = {
			'id': str(uuid.uuid4()),
			'page': chunk.metadata.get('page'),
			'source': chunk.metadata.get('source'),
			'title': chunk.metadata.get('source', '').split('/')[-1]
		}
		summary = summarize_chunk(text, title=meta['title'], page=meta['page'])

		# store both: prepend summary to content for embedding; also keep in metadata
		contextual_chunks.append(
			Document(
				page_content=f"{summary}\n\n{chunk.page_content}",
				metadata={**meta, "chunk_summary": summary}
			)
		)
		time.sleep(throttle_s)  # smooth TPM
	return contextual_chunks

In [41]:
from glob import glob

pdf_files = glob(r'C:\Users\DELL\Desktop\AI Projects\Langchain_proj\Logistic_Agent_Planner\data\road_knowledge\*.pdf')
pdf_files

['C:\\Users\\DELL\\Desktop\\AI Projects\\Langchain_proj\\Logistic_Agent_Planner\\data\\road_knowledge\\Chapter-2-Project-Rationale.pdf',
 'C:\\Users\\DELL\\Desktop\\AI Projects\\Langchain_proj\\Logistic_Agent_Planner\\data\\road_knowledge\\Hume-Region-Planning-for-Freight-Pilot-Strategy-Report.pdf',
 'C:\\Users\\DELL\\Desktop\\AI Projects\\Langchain_proj\\Logistic_Agent_Planner\\data\\road_knowledge\\Hume-Regional-Growth-Plan-May-2014.pdf']

In [42]:
try:
    chatgpt = ChatOpenAI(
        model_name="gpt-4o-mini",  # Changed from "gpt-4o-mini" to a valid model name
        temperature=0
    )
    
    # Test the connection with a simple prompt
    test_response = chatgpt.invoke("Hello, are you working?")
    print("API Connection Test:", test_response)
except Exception as e:
    print(f"API Connection Error: {str(e)}")

API Connection Test: content="Hello! Yes, I'm here and ready to help. What can I assist you with today?" additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 13, 'total_tokens': 32, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-C6b2FrO51MslxKCGMK83ft7PbmFNG', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--c9cf4ca4-1cd1-4e2d-9af9-73f13f2ff4ad-0' usage_metadata={'input_tokens': 13, 'output_tokens': 19, 'total_tokens': 32, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


In [43]:
paper_docs = []
for fp in pdf_files:
    try:
        print(f"Processing file: {fp}")
        chunks = create_contextual_chunks(file_path=fp)
        if chunks:  # Only extend if we got valid chunks
            paper_docs.extend(chunks)
            print(f"Successfully processed {len(chunks)} chunks")
        else:
            print(f"No chunks were generated for {fp}")
    except Exception as e:
        print(f"Error processing {fp}: {str(e)}")
        continue

Processing file: C:\Users\DELL\Desktop\AI Projects\Langchain_proj\Logistic_Agent_Planner\data\road_knowledge\Chapter-2-Project-Rationale.pdf
Successfully processed 52 chunks
Processing file: C:\Users\DELL\Desktop\AI Projects\Langchain_proj\Logistic_Agent_Planner\data\road_knowledge\Hume-Region-Planning-for-Freight-Pilot-Strategy-Report.pdf
Successfully processed 123 chunks
Processing file: C:\Users\DELL\Desktop\AI Projects\Langchain_proj\Logistic_Agent_Planner\data\road_knowledge\Hume-Regional-Growth-Plan-May-2014.pdf
Successfully processed 182 chunks


In [44]:
from langchain_openai import OpenAIEmbeddings

# details here: https://openai.com/blog/new-embedding-models-and-api-updates
openai_embed_model = OpenAIEmbeddings(model='text-embedding-3-small')

In [45]:
from langchain_chroma import Chroma

chroma_db = Chroma.from_documents(documents=paper_docs,
                                  collection_name='my_context_db',
                                  embedding=openai_embed_model,
                                  # need to set the distance function to cosine else it uses euclidean by default
                                  # check https://docs.trychroma.com/guides#changing-the-distance-function
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./my_context_db")

In [46]:
chroma_db = Chroma(persist_directory="./my_context_db",
                   collection_name='my_context_db',
                   embedding_function=openai_embed_model)

In [47]:
similarity_retriever = chroma_db.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 5})

In [48]:
from langchain_core.prompts import ChatPromptTemplate

rag_prompt = """You are an assistant who is an expert in question-answering tasks.
                Answer the following question using only the following pieces of retrieved context.
                If the answer is not in the context, do not make up answers, just say that you don't know.
                Keep the answer detailed and well formatted based on the information from the context.

                Question:
                {question}

                Context:
                {context}

                Answer:
            """

rag_prompt_template = ChatPromptTemplate.from_template(rag_prompt)

In [49]:
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

chatgpt = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {
        "context": (similarity_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
      |
    rag_prompt_template
      |
    chatgpt
)

In [None]:
from IPython.display import display, Markdown

query = "Travelling to sydney via hume route to deliver goods what should I keep in mind "
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))

When traveling to Sydney via the Hume route to deliver goods, there are several important considerations to keep in mind:

1. **Freight Corridors**: The Hume region serves as a critical corridor for freight movement in Australia, connecting major cities such as Melbourne, Sydney, and Brisbane. Understanding the significance of these corridors can help in planning your route effectively.

2. **Road and Rail Linkages**: The region has intensive management and investment in road and rail linkages to regional NSW and Queensland. Familiarize yourself with the key freight corridors, such as the Murray River and localities like Tocumwal, Wodonga, and Shepparton, which are essential for enhancing regional freight efficiency.

3. **Preferred Access Routes**: Utilize preferred access route maps for industrial and commercial precincts. These maps can assist in avoiding road space conflicts, minimizing localized congestion, and reducing wear on local roads.

4. **State-Managed Roads**: Prioritize state-managed roads over local routes due to limited funding and maintenance issues. This is crucial for ensuring that heavy vehicles have access to well-maintained routes.

5. **Rest Areas**: Be aware that there are 165 recorded rest areas along A and B roads, but only 35 are designated for truck parking. Plan your stops accordingly to ensure compliance with rest regulations and to avoid fatigue.

6. **Freight Management Solutions**: Collaborate with local councils and freight generators to understand the costs and benefits of route upgrades and maintenance. This collaboration can lead to better freight management solutions and improved delivery efficiency.

7. **Topography Considerations**: The topography of the Hume region can affect freight movement. Be prepared for variations in route conditions, especially in areas with alpine valleys or dispersed freight movement patterns.

8. **Future Infrastructure Developments**: Stay informed about ongoing and future infrastructure projects, such as the proposed high-speed rail system and upgrades to existing transport networks, which may impact your travel and delivery times.

By keeping these factors in mind, you can optimize your journey and ensure a successful delivery of goods to Sydney via the Hume route.