In [2]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from searchwebsite import Website
import gradio as gr

In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [4]:
#Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

In [36]:
def process_url(url: str):
    # instantiate your Website loader
    website = Website(url)
# print(website.title)
# print(website.text)
# website.links

# sending information to model first time

In [6]:
openai = OpenAI()

In [7]:
response = openai.responses.create(
    model="o4-mini",
    input=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": (
            "Here’s a list of URLs I’d like you to analyze:\n\n"
            f"{website.links}\n\n"
            "Can you please:\n"
            "1. Use all the links that will be useful for creating a profession resume\n"
            "2. Extract domain names.\n"
            "3. Summarize what kind of content lives at each URL (e.g. blog post, homepage, contact page)."
        )}
    ]
)

# 4. Print out the assistant’s reply
ass_response = response.output_text

In [8]:
ass_response

'Here’s an organized breakdown of your URLs:\n\n1. Links most useful when building a professional résumé/profile  \n   • /education  \n   • /Intellectual_Portfolio  \n   • /AI_hub  \n   • /contact  \n   • https://www.linkedin.com/in/premkora/  \n   • https://x.com/premkumarkora  \n\n2. Domain names extracted  \n   • (relative URLs): your personal site’s domain (e.g. “yourname.com”)  \n   • medium.com  \n   • linkedin.com  \n   • x.com  \n\n3. Brief summary of each URL’s content\n\n• #thememaincontent  \n  – Anchor/skip link to the main content area on a page.  \n  – Not a standalone page.\n\n• /  \n  – Homepage of your personal site.  \n  – Likely overview of who you are, key achievements, navigation to other sections.\n\n• /AI_hub  \n  – “AI Hub” section/page on your site.  \n  – Likely showcases AI projects, tools, articles or a portfolio of AI-related work.\n\n• /Intellectual_Portfolio  \n  – Dedicated portfolio of publications, patents, whitepapers or research.  \n  – Highlights yo

# A system prompt 

In [9]:
system_prompt = "You are an assistant that analyzes the contents of a website \
and provide a resume, look for infomation from other links provided \
Respond in markdown."

# User Prompt

In [10]:
user_prompt = f"You are looking at a website titled {website.title}"
user_prompt += "\nThe contents of this website is as follows; \
please create a professional resume pass ATS in markdown. \n\n"
user_prompt += website.text
user_prompt += ass_response

# sending information to model second time

In [11]:
response = openai.responses.create(
    model="o4-mini",
    reasoning={"effort": "medium"},
    input=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt }
    ]
)

#print(response.output_text)
#response.choices[0].message.content
display(Markdown(response.output_text))

```markdown
# PremKumar Kora  
Data Scientist & AI Architect  

Email: premkora@example.com • LinkedIn: https://www.linkedin.com/in/premkora/ • X: https://x.com/premkumarkora • Website: yourname.com  

---

## Professional Summary
Seasoned Data Scientist & AI Architect with 25+ years in software technology and artificial intelligence. Specializes in Generative AI, Large Language Models (LLMs), prompt engineering, NLP, predictive analytics, and cloud-based AI deployments. Proven track record designing and delivering enterprise-scale AI solutions, LLMOps frameworks, real-time data pipelines, and automated decision-making systems. Published author and inventor with a passion for mentoring the next generation of AI professionals and driving community-focused humanitarian initiatives.

---

## Core Competencies
• Generative AI & LLMOps  
• AI Architecture & System Design  
• Machine Learning & Deep Learning  
• Natural Language Processing (NLP)  
• Prompt Engineering & Transformer Models  
• Predictive Analytics & Model Evaluation  
• Real-Time Data Pipelines  
• Cloud AI Deployment (AWS, Azure, GCP)  
• AI Automation & MLOps  
• Technical Leadership & Mentoring  

---

## Professional Experience

**Senior Data Scientist & GenAI Architect (Independent Consultant)**  
2000 – Present  
- Architected and delivered end-to-end AI solutions across finance, healthcare, manufacturing and retail.  
- Designed LLMOps frameworks for fine-tuning, deployment and monitoring of large language models.  
- Built scalable data ingestion and processing pipelines enabling real-time analytics and automated decision systems.  
- Led cross-functional teams to integrate GenAI capabilities into CRM, ERP and custom enterprise applications.  
- Authored technical whitepapers and conducted workshops on prompt engineering, model bias mitigation and performance tuning.

---

## Patents & Intellectual Property
- 2+ Patents granted in AI-driven data analytics and model optimization (details available upon request).  

---

## Publications & Thought Leadership
22+ peer-reviewed articles and technical posts on Medium, covering:  
- “Specialized AI Intelligence — Vertical AI Agents”  
- “Fine-Tuning Foundational Models: A Guide to Customizing AI for Specific Needs”  
- “Evaluating Machine Learning and Deep Learning Models”  
- “Streaming Large Language Models: Architecture & Trade-Offs”  
- “Bias and Variance / Overfitting and Underfitting Trade-Offs”  
- “Data Lineage in Modern Data Management”  
- “Discriminative vs. Generative Models: Use Cases & Comparisons”  
- “Difference Between an ML Algorithm and an ML Model”  

---

## Mentoring & Leadership
- Mentored 100+ future AI professionals via technical guidance, career coaching and project-based learning.  
- Developed curriculum and delivered workshops on machine learning best practices, MLOps and AI ethics.  

---

## Community & Volunteering
- Organized Eye Camp: screened 300+ individuals; facilitated 70+ cataract surgeries for underprivileged communities.  
- Led Flood Relief Camp: coordinated relief materials, food, medical aid and rehabilitation for displaced families.  
- Hosted Medical Camp at Irrukkam Island: provided free check-ups and treatments for the local fisherman community.  

---

## Awards & Recognitions
- Best Mason of the Year, Pitt Macdonald Lodge No.1198, for outstanding charitable contributions.  
- Recognition for exceptional humanitarian service in disaster-relief and community health initiatives.  

---

## Education
- [Degree], [Major], [Institution], [Year]  
- Certifications: [List relevant AI/ML certifications]

---

## Links & Contact
- Portfolio & AI Projects: yourname.com/AI_hub  
- Intellectual Portfolio: yourname.com/Intellectual_Portfolio  
- Education Details: yourname.com/education  
- Contact Form: yourname.com/contact  

*References available upon request.*  
```

In [12]:
# Your text to write
text = response.output_text

# Open (or create) output.txt for writing, in UTF-8
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Text has been written to output.txt")

Text has been written to output.txt


In [13]:
# 1. Read your file
with open("output.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

# 2. Wrap it in a Document (so split_documents knows what to split)
doc = Document(page_content=full_text, metadata={"source": "output.txt"})

# 3. Create and run the splitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents([doc])

# `chunks` is now a list of Document objects, each with up to 1000 chars
for i, chunk in enumerate(chunks):
    print(f"── chunk {i} ({len(chunk.page_content)} chars) ──")
    print(chunk.page_content[:200].replace("\n", " "), "…\n")


── chunk 0 (795 chars) ──
```markdown # PremKumar Kora   Data Scientist & AI Architect    Email: premkora@example.com • LinkedIn: https://www.linkedin.com/in/premkora/ • X: https://x.com/premkumarkora • Website: yourname.com   …

── chunk 1 (418 chars) ──
---  ## Core Competencies • Generative AI & LLMOps   • AI Architecture & System Design   • Machine Learning & Deep Learning   • Natural Language Processing (NLP)   • Prompt Engineering & Transformer M …

── chunk 2 (841 chars) ──
---  ## Professional Experience  **Senior Data Scientist & GenAI Architect (Independent Consultant)**   2000 – Present   - Architected and delivered end-to-end AI solutions across finance, healthcare, …

── chunk 3 (764 chars) ──
---  ## Patents & Intellectual Property - 2+ Patents granted in AI-driven data analytics and model optimization (details available upon request).    ---  ## Publications & Thought Leadership 22+ peer- …

── chunk 4 (996 chars) ──
---  ## Mentoring & Leadership - Mentored 100+ future 

In [15]:
db_name = "scrape_website"

In [27]:
embeddings = OpenAIEmbeddings()
#If there is a old DB then delete it
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings
    ).delete_collection()

#create a new Vector DB
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 6 documents


In [28]:
MODEL="o4-mini"

In [29]:
llm = ChatOpenAI(temperature=1, model_name=MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [30]:
query = "Who is Prem"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Prem refers to PremKumar Kora, a seasoned Data Scientist and AI Architect with over 25 years of experience in software technology and artificial intelligence. He specializes in Generative AI, Large Language Models (LLMs), NLP, predictive analytics, real-time data pipelines and cloud-based AI deployments, and has a strong track record of designing enterprise-scale AI solutions, authoring technical whitepapers, mentoring emerging AI talent and leading community humanitarian initiatives.


In [31]:
query = "Where did Prem study"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

I’m sorry, but I don’t know where PremKumar Kora studied; the educational institution isn’t specified in the provided information.


In [34]:
# Wrapping that in a function

def chat(message, history):
    result = conversation_chain.invoke({"question": message})
    return result["answer"]

In [None]:
#view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

In [42]:
with gr.Blocks() as demo:
    url = gr.Textbox(label="Type your URL")
    greet_btn = gr.Button("Send your URL")
    greet_btn.click(fn=process_url, inputs=url)
    view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)
demo.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


* Running on local URL:  http://127.0.0.1:7865
* To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "C:\Users\premk\anaconda3\envs\llms\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\premk\anaconda3\envs\llms\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\premk\anaconda3\envs\llms\Lib\site-packages\gradio\blocks.py", line 2146, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\premk\anaconda3\envs\llms\Lib\site-packages\gradio\blocks.py", line 1664, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\premk\anaconda3\envs\llms\Lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return awai