#  import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity


# Import the embeddings

In [2]:
# CSV that holds the text chunks together with their serialized embeddings.
path = 'text_chunks_embeddings_2.csv'
df = pd.read_csv(path)

# Every "embedding" cell is a string containing a Python literal; parse it and
# store the result as a float32 NumPy vector in a new "vec" column.
df["vec"] = df["embedding"].map(
    lambda raw: np.asarray(ast.literal_eval(raw), dtype=np.float32)
)


In [3]:
# Centroid of each category: the element-wise mean of all "vec" arrays that
# share the same "category" value, collected into a {category: vector} dict.
vectors_by_category = df.groupby("category")["vec"]
centroids = vectors_by_category.apply(
    lambda group: np.stack(group.values).mean(axis=0)
).to_dict()


# Import the embedding model

In [4]:
from sentence_transformers import SentenceTransformer
import torch
# Run the sentence encoder on the GPU when torch can see one, else on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

embedding_model = SentenceTransformer('all-mpnet-base-v2', device=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# The retrieval function

In [5]:
def retrieve(query_text, model=embedding_model, top_categories=2, top_k=5):
    """Retrieve the best-matching chunks for a query with a two-stage cosine search.

    The query is embedded and compared against every centroid in the
    module-level ``centroids`` dict; the ``top_categories`` highest-scoring
    categories are kept.  Rows of the module-level ``df`` that belong to those
    categories are then scored against the query, and the ``top_k`` best rows
    of each selected category are returned.

    Args:
        query_text: Text handed to ``model.encode``.
        model: Object exposing ``encode``.  The default is ``embedding_model``
            as it was bound when this function was defined.
        top_categories: Number of highest-scoring categories to search in.
        top_k: Maximum number of chunks returned per selected category.

    Returns:
        A ``(results, category_scores)`` tuple.  ``results`` is a DataFrame
        with the columns ``text``, ``department``, ``category``, ``score`` and
        ``source``, ordered by category and then by descending score; it is
        empty when no row belongs to a selected category.  ``category_scores``
        maps every category to its cosine similarity with the query.
    """
    output_columns = ["text", "department", "category", "score", "source"]

    # 1) Embed the query once and keep it as a single-row matrix, the shape
    #    cosine_similarity is called with below.
    query_row = np.asarray(model.encode(query_text), dtype=np.float32).reshape(1, -1)

    # 2) Similarity between the query and each category centroid.
    category_scores = {
        cat: float(cosine_similarity(query_row, centroid.reshape(1, -1))[0][0])
        for cat, centroid in centroids.items()
    }

    # 3) Keep the names of the best-scoring categories.
    ranked = sorted(category_scores.items(), key=lambda item: item[1], reverse=True)
    selected_categories = [cat for cat, _ in ranked[:top_categories]]

    # 4) Restrict the chunk table to those categories (copy: a column is added).
    filtered_df = df[df["category"].isin(selected_categories)].copy()

    if filtered_df.empty:
        # np.stack raises ValueError on an empty sequence, so return an empty
        # result with the usual columns instead of crashing the caller.
        filtered_df["score"] = np.empty(0, dtype=np.float32)
        return filtered_df[output_columns], category_scores

    # 5) Score every remaining chunk against the query in one call.
    matrix = np.stack(filtered_df["vec"].values)
    filtered_df["score"] = cosine_similarity(query_row, matrix)[0]

    # 6) Best `top_k` chunks of each selected category.
    results = (
        filtered_df
        .sort_values(["category", "score"], ascending=[True, False])
        .groupby("category")
        .head(top_k)
    )

    return results[output_columns], category_scores

# login to hugging face

In [3]:
import huggingface_hub
from huggingface_hub import login
login() # called without a token argument: provide your Hugging Face access token when prompted

#  importing the model

In [8]:


import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the tokenizer. `token=True` asks transformers to use the locally stored
# Hugging Face credential (the one saved by `login()`); it replaces the
# deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

# Load the weights in half precision and let `device_map="auto"` decide where
# they are placed. `dtype` is the current name of this argument; the installed
# transformers release reports `torch_dtype` as deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    device_map="auto",
)

print("Model Loaded Successfully!")

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



Model Loaded Successfully!


#  generate the output for the given query

In [9]:
def generate_response(prompt, max_tokens=512, temperature=0.7, top_p=0.9):
    """Sample a continuation of ``prompt`` from the module-level ``model``.

    Args:
        prompt: Text tokenized with the module-level ``tokenizer``.
        max_tokens: Upper bound on newly generated tokens; forwarded as
            ``max_new_tokens``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The first output sequence decoded with special tokens skipped.  The
        whole sequence is decoded, not only the new tokens; the caller in this
        notebook removes the prompt part itself.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the padding id explicitly.  When the tokenizer defines no pad token,
    # fall back to EOS, which is what generate() otherwise does on its own
    # while logging a warning on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=pad_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Add the prompt to input text

In [10]:
def get_prompt( retrived , message , history ) :
  """Assemble the LLM prompt from retrieved context, the question and chat history.

  Args:
    retrived: Context text placed under the "Context:" heading.
    message: The user's current question, placed under "Question:".
    history: Earlier turns.  Either an iterable of ``(user_msg, bot_msg)``
      pairs, or an iterable of message dicts with ``"role"`` and ``"content"``
      keys (Gradio's "messages" history format).  ``None`` means no history.

  Returns:
    The prompt string, which ends with the "Answer:" cue.
  """
  role_labels = {"user": "User", "assistant": "Assistant"}
  conversation = ""

  for turn in history or []:
    if isinstance(turn, dict):
      # Role/content message: one line per message.  Unpacking a dict as a
      # pair would bind its keys, not its text, so it is handled separately.
      role = turn.get("role")
      speaker = role_labels.get(role, str(role).capitalize())
      conversation += f"{speaker}: {turn.get('content')}\n"
    else:
      # Pair format: one (user, assistant) exchange per entry.
      user_msg, bot_msg = turn
      conversation += f"User: {user_msg}\nAssistant: {bot_msg}\n"

  # The template text is part of the prompt the model sees; keep it verbatim.
  prompt = f"""
  You are an assistant answering questions about Panimalar Engineering Institution.

  Use only the context below.
  Conversation so far:
  { conversation }

  Context:
  {retrived}

  Question:
  {message}

  Answer:
  """
  return prompt


# Chat interface preprocessing function

In [11]:
def chat_interface(message, history):
  """Chat callback: retrieve context, prompt the LLM and return only its answer.

  Args:
    message: The user's current question.
    history: Earlier turns, passed through to ``get_prompt`` unchanged.

  Returns:
    The generated answer with the prompt removed and surrounding whitespace
    stripped.
  """
  results, _category_scores = retrieve(message, embedding_model)
  content = "\n\n".join(results["text"].tolist())
  prompt = get_prompt(content, message, history)
  result = generate_response(prompt)

  # generate_response returns the prompt and the completion decoded together.
  # Normal case: the decoded text starts with the exact prompt.
  if result.startswith(prompt):
    return result[len(prompt):].strip()

  # The decoded text is not guaranteed to reproduce the prompt character for
  # character, so a fixed character offset can cut into the answer.  Instead,
  # skip as many "Answer:" cues as the prompt contains and keep what follows
  # the last of them.
  marker = "Answer:"
  cue_count = prompt.count(marker)
  if cue_count:
    pieces = result.split(marker, cue_count)
    if len(pieces) == cue_count + 1:
      return pieces[-1].strip()

  # Last resort: the previous offset-based behaviour.
  return result[len(prompt) - 1:].strip()

# The Gradio user interface

In [12]:
import gradio as gr



# Chat UI: `chat_interface(message, history)` is the callback for each user turn.
demo = gr.ChatInterface(
    fn=chat_interface,
    # The model loaded in this notebook is Meta-Llama-3-8B-Instruct, so the
    # title names the 8B variant.
    title="🎓 College RAG Chatbot (LLaMA 3 8B)",
    description="Ask questions about the college.",
)

# share=True publishes a temporary public gradio.live URL; debug=True keeps the
# cell running so errors and logs stay visible (set debug=False to return).
demo.launch(share=True , debug=True )

  self.chatbot = Chatbot(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a6211e7ab3e6317633.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a6211e7ab3e6317633.gradio.live




In [13]:
# NOTE(review): bare string expression that nothing else in the notebook uses;
# looks like a leftover scratch cell. Consider deleting it.
'hi'

'hi'