Installing prerequisites

In [1]:
# Install the Python packages used by the rest of the notebook (PDF loading,
# embeddings, text generation, grammar correction and the Gradio UI).
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
!pip install transformers sentence-transformers langchain langchain_community pypdf gradio accelerate language-tool-python
# Install a headless Java 17 runtime quietly (stdout is discarded).
# Presumably needed by language-tool-python's LanguageTool backend — confirm.
!apt-get install openjdk-17-jre-headless -qq > /dev/null
import os
# Point JAVA_HOME at the Java 17 install directory for anything that looks it up.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"



Loading Document

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the source PDF from a hard-coded Colab path.
# NOTE(review): the file must already exist at this path before the cell runs.
loader = PyPDFLoader("/content/Documents.pdf")
documents = loader.load()

# Fail fast when the loader returned nothing.
if not documents:
    raise ValueError("No documents were loaded. Please check the PDF file path.")

# Split the loaded documents into overlapping chunks (size 1250, overlap 250).
# Units follow the splitter's length function — characters by default; confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1250, chunk_overlap=250)
texts = text_splitter.split_documents(documents)

# Fail fast when splitting produced no chunks.
if not texts:
    raise ValueError("No text chunks were generated. Please check the document content.")

 Embedding

In [3]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model and encode the text of every chunk in one call.
embedding_model = SentenceTransformer("intfloat/e5-large-v2")
# NOTE(review): the e5 model card asks for "passage: " / "query: " prefixes on
# inputs; no prefix is added here or where the query is encoded in rag_pipeline —
# confirm whether that is intentional.
embeddings = embedding_model.encode([text.page_content for text in texts])

# Stop here if nothing was embedded.
if len(embeddings) == 0:
    raise ValueError("No embeddings were generated. Please check the embedding model.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Cosine Similarity search

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class SimpleVectorStore:
    """In-memory store that ranks stored texts by cosine similarity to a query.

    Args:
        texts: Sequence of items to return from ``search``; item ``i`` belongs
            to row ``i`` of ``embeddings``.
        embeddings: Array-like of embedding vectors; converted to a NumPy array.

    Raises:
        ValueError: If ``texts`` and ``embeddings`` do not have the same length.
    """

    def __init__(self, texts, embeddings):
        self.texts = texts
        self.embeddings = np.array(embeddings)
        # A length mismatch would otherwise surface later as an IndexError in
        # search() or as texts that can never be returned.
        if len(self.texts) != len(self.embeddings):
            raise ValueError(
                f"texts and embeddings must have the same length "
                f"(got {len(self.texts)} and {len(self.embeddings)})."
            )

    def search(self, query_embedding, top_k=3):
        """Return up to ``top_k`` stored texts most similar to the query.

        Args:
            query_embedding: One embedding vector (array-like); reshaped to a
                single row before comparison.
            top_k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            A list of stored texts ordered from most to least similar. It is
            shorter than ``top_k`` when the store holds fewer items and empty
            when the store is empty.
        """
        # A slice of [-0:] selects the *whole* array, so a non-positive top_k
        # has to be handled before slicing. An empty store has nothing to rank.
        if top_k <= 0 or len(self.texts) == 0:
            return []

        query_embedding = np.array(query_embedding).reshape(1, -1)
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        # argsort is ascending: take the last top_k indices, then reverse them
        # so the best match comes first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [self.texts[i] for i in top_indices]

# Index the raw chunk strings (not the Document objects) next to their vectors.
vector_store = SimpleVectorStore([chunk.page_content for chunk in texts], embeddings)


RAG Pipeline

In [5]:
from transformers import pipeline
import language_tool_python

# Load the text-to-text generation model that rag_pipeline uses to produce answers.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

# LanguageTool instance configured for US English; used by correct_grammar.
tool = language_tool_python.LanguageTool('en-US')

def correct_grammar(text):
    """Return ``text`` after running it through the shared LanguageTool corrector."""
    corrected_text = tool.correct(text)
    return corrected_text

# Greetings that are answered directly without retrieval. Entries are stored in
# lowercase because the query is lowercased before the lookup.
_GREETINGS = frozenset({
    "hello", "salam", "hi", "hey", "what's up", "yo", "assalamualaikum",
})


def rag_pipeline(query):
    """Answer ``query`` from the indexed document chunks.

    Greetings get a canned reply. Any other query is embedded, the five most
    similar chunks are fetched from ``vector_store`` and joined into a prompt,
    and the generated text is grammar-corrected before being returned.

    Args:
        query: The user's question as a string.

    Returns:
        The reply text, or a fallback message when no chunks are retrieved or
        the generator returns nothing.
    """
    # Normalize case and surrounding whitespace so "Yo", " Hi " or "What's up"
    # match; comparing a lowercased query against mixed-case entries never could.
    if query.strip().lower() in _GREETINGS:
        return "Hello! How can I help you?"

    query_embedding = embedding_model.encode([query])[0]
    relevant_docs = vector_store.search(query_embedding, top_k=5)
    if not relevant_docs:
        return "No relevant documents found. Please try a different query."

    context = "\n\n".join(relevant_docs)
    prompt = f"""You are a helpful and professional assistant. Use the following context to answer the user's question clearly, concisely, and in grammatically correct English.
Context:
{context}
Question: {query}
Answer:"""

    response = generator(prompt, max_length=250)
    if not response:
        return "No response generated. Please try again."

    # Only the first generated candidate is used; fix its grammar before returning.
    raw_text = response[0]['generated_text']
    return correct_grammar(raw_text)



Device set to use cuda:0


Gradio web chat interface

In [None]:
import gradio as gr

# Build the Gradio UI. The css string styles the chat container, the user and
# assistant bubbles (referenced by class name in the HTML built by the chat
# handler), the header banner and a footer class.
with gr.Blocks(css="""
.chatbot-container {
    background: linear-gradient(to bottom right, #f8f4ff, #ffffff);
    padding: 1.2rem;
    border-radius: 1.25rem;
    max-height: auto;
    overflow-y: auto;
    font-family: 'Segoe UI', 'Inter', sans-serif;
    display: flex;
    flex-direction: column;
    box-shadow: 0 4px 24px rgba(78, 42, 132, 0.07);
}

.user-bubble, .assistant-bubble {
    opacity: 0;
    animation: fadeIn 0.5s ease-in-out forwards;
    transition: transform 0.3s ease;
    font-size: 1.125rem;
}

.user-bubble:hover, .assistant-bubble:hover {
    transform: translateY(-2px);
}

.user-bubble {
    background: linear-gradient(135deg, #fef6f0, #f5d4c4);
    color: #5a2a00;
    padding: 12px 18px;
    border-radius: 1rem 1rem 0 1rem;
    max-width: 100%;
    align-self: flex-end;
    margin: 6px 0;
    box-shadow: 0 3px 6px rgba(0,0,0,0);
}

.assistant-bubble {
    background: linear-gradient(135deg, #e6e1f4, #d9c7ff);
    color: #2c1b4f;
    padding: 12px 18px;
    border-radius: 1rem 1rem 1rem 0;
    max-width: 100%;
    align-self: flex-start;
    margin: 6px 0;
    border: 1px solid #cab8f5;
    box-shadow: 0 3px 12px rgba(0,0,0,0);
}

@keyframes fadeIn {
    0% {opacity: 0; transform: translateY(10px);}
    100% {opacity: 1; transform: translateY(0);}
}

.header {
    background: linear-gradient(90deg, #4c1d95 0%, #7c3aed 100%);
    padding: 20px;
    border-radius: 16px;
    color: white;
    text-align: center;
    box-shadow: 0 4px 16px rgba(0,0,0,0);
}

.footer {
    text-align: center;
    font-size: 13px;
    color: #7f6e9d;
    margin-top: 1rem;
}
""") as ui:

    # Header banner; the subtitle paragraph is currently empty.
    gr.Markdown("""
    <div class='header'>
        <h1 style='margin-bottom: 0; font-size: 2rem;'>Study Abroad Assistant</h1>
        <p style='margin-top: 6px; font-size: 1rem;'></p>
    </div>
    """)

    # Chat transcript area, styled through the .chatbot-container class.
    # NOTE(review): the saved cell output appears to show two warnings pointing
    # at this gr.Chatbot( call, but their text was not captured — check the
    # arguments against the installed Gradio version.
    chatbot = gr.Chatbot(
        bubble_full_width=False,
        height=630,
        show_copy_button=True,
        elem_classes="chatbot-container"
    )

    # Input row: a single-line text box and a Send button, scaled 9:1.
    with gr.Row():
        txt_input = gr.Textbox(
            placeholder="Ask about top universities, scholarships, or career guidance...",
            show_label=False,
            lines=1,
            scale=9
        )
        send_btn = gr.Button("Send", variant="primary", scale=1)

    def animated_chatbot(query, history):
        """Handle one chat turn and return the updated history plus a cleared input.

        Args:
            query: Text typed by the user.
            history: Current chat history as a list of [user, assistant] pairs;
                ``None`` is treated as an empty history.

        Returns:
            A ``(history, "")`` tuple. The history gains one pair of HTML
            bubbles unless the query is blank; the empty string clears the
            text box.
        """
        import html  # local import: only this handler needs it

        history = history or []
        if not query.strip():
            return history, ""
        response = rag_pipeline(query)
        # The bubbles are raw HTML, so escape both the user's text and the
        # model's reply; otherwise characters such as "<" or "&" would be
        # interpreted as markup instead of being shown literally.
        user_bubble = f"<div class='user-bubble'> {html.escape(query, quote=False)}</div>"
        assistant_bubble = f"<div class='assistant-bubble'>{html.escape(response, quote=False)}</div>"
        history = history + [[user_bubble, assistant_bubble]]
        return history, ""

    # Clicking Send runs the chat handler with the text box and current history,
    # then writes back the new history and the cleared text box.
    send_btn.click(
        fn=animated_chatbot,
        inputs=[txt_input, chatbot],
        outputs=[chatbot, txt_input]
    )

    # Pressing Enter in the text box is wired to the same handler.
    txt_input.submit(
        fn=animated_chatbot,
        inputs=[txt_input, chatbot],
        outputs=[chatbot, txt_input]
    )

# share=True exposes the app on a public URL; debug=True keeps the cell running
# so errors and logs stay visible (see the cell output).
ui.launch(share=True, debug=True)


  chatbot = gr.Chatbot(
  chatbot = gr.Chatbot(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://cfc1d2c3fa1504ef45.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
