In [1]:
!git clone https://github.com/AayushSharma-1/SRM_RAG.git

Cloning into 'SRM_RAG'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 11 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 44.39 KiB | 733.00 KiB/s, done.


<img src="https://gradientflow.com/wp-content/uploads/2023/10/newsletter87-RAG-simple.png" alt="RAG Example" width="1000">


            +-------------------------+
            |                         |
            |    Start of Program     |
            |                         |
            +------------+------------+
                         |
                         v
          +----------------------------------+
          |                                  |
          |       Step 1: Data for RAG       |
          |     (Web Scraping, PDF, etc.)    |
          +------------+---------------------+
                       |
                       v
          +----------------------------------+
          |                                  |
          |     Step 2: Chunking of Data     |
          |     (Sentence Tokenization)      |
          +------------+---------------------+
                       |
                       v
          +----------------------------------+
          |                                  |
          |      Step 3: Making Embeddings   |
          |      of Chunks (Sentence Vectors)|
          +------------+---------------------+
                       |
                       v
          +----------------------------------+
          |                                  |
          |     Step 4: Importing Large      |
          |     Language Model (LLM)         |
          |       and Passing Embeddings     |
          |        through LLM               |
          +------------+---------------------+
                       |
                       v
          +----------------------------------+
          |                                  |
          |      Step 5: Generating Output   |
          |      (Text Generation)           |
          +------------+---------------------+
                       |
                       v
            +---------------------------+
            |                           |
            |      End of Program       |
            |                           |
            +---------------------------+



# Step 1: **Data for RAG**

In [2]:
def loader(path):
    """
    Loads data from different folders and organizes into identifiable variables
    Args:
        path (str): Base path containing the folders

    Returns:
        dict: Dictionary containing the data organized by departments and categories
    """
    import os

    # Dictionary to store all data
    data = {
        'departments': {},
        'about_us': {},
        'facility': {}
    }

    # Read departments data
    dept_path = os.path.join(path, 'departments')
    if os.path.exists(dept_path):
        with open(os.path.join(dept_path, 'data.txt'), 'r', encoding='utf-8') as f:
            data['departments'] = f.read()

    # Read about_us data
    about_path = os.path.join(path, 'about_us')
    if os.path.exists(about_path):
        with open(os.path.join(about_path, 'data.txt'), 'r', encoding='utf-8') as f:
            data['about_us'] = f.read()

    # Read facility data
    facility_path = os.path.join(path, 'facility')
    if os.path.exists(facility_path):
        with open(os.path.join(facility_path, 'data.txt'), 'r', encoding='utf-8') as f:
            data['facility'] = f.read()

    # Read RAG notebook

    return data

In [3]:
# Example usage
data = loader("/content/SRM_RAG")

# Access different sections
dept_data_unstructured = data['departments']
about_data_unstructured = data['about_us']
facility_data_unstructured = data['facility']

## Convert the unstructured data into structured data **(Preprocessing)**

In [4]:
import os
import textwrap
import re

def clean_text(text):
    """
    Cleans text by removing special characters and extra whitespace.
    """
    text = re.sub(r'[\n\r\t]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s\.\,\-\']', ' ', text)
    return text.strip()

def align_text(text, width=80):
    """
    Aligns text to specified width.
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    return '\n'.join(wrapped_lines)

def preprocess_unstructured_data(data):
    """
    Preprocesses unstructured data from different sources.
    """
    processed_text = []


    cleaned_text = clean_text(data)
    # Align the text
    aligned_text = align_text(cleaned_text)
    processed_text.append(aligned_text)

    return '\n\n'.join(processed_text)

# Example usage:
# Process department data
dept_data_structured = preprocess_unstructured_data(dept_data_unstructured)

# Process about us data
about_text_structured = preprocess_unstructured_data(about_data_unstructured)

# Process facility data
facility_text_structured = preprocess_unstructured_data(facility_data_unstructured)

# Combine all text if needed
all_text_structured = f"{dept_data_structured}\n\n{about_text_structured}\n\n{facility_text_structured}"

print(f"Processed text sample: {all_text_structured[:200]}")

Processed text sample: --- Start of btechCE.txt --- Title  Civil Engineering College in Lucknow URL
Source  https   srmcem.ac.in btechCE.aspx Markdown Content  RESULT GALLERY
Admission Tollfree  18001035298 APPLY NOW GRIEVA


In [5]:
print(dept_data_structured)
print(about_text_structured)
print(facility_text_structured)

--- Start of btechCE.txt --- Title  Civil Engineering College in Lucknow URL
Source  https   srmcem.ac.in btechCE.aspx Markdown Content  RESULT GALLERY
Admission Tollfree  18001035298 APPLY NOW GRIEVANCE REDRESSAL ABOUT US
DEPARTMENTS ACADEMICS ADMISSIONS TRAINING   PLACEMENTS CAMPUS LIFE FACILITIES
LIBRARY LOGIN Department of Civil Engineering  CE  Civil Engineering Department
Duration  4 Years Seats  60 Seats  B.Tech  Civil Engg.  APPLY NOW Duration  2
Years 09 Seats  M.Tech  Civil Engg.  M.Tech  Environmental Engg.  APPLY NOW
About the Department Civil Engineering is rightly considered the most versatile
amongst all engineering branches. It encompasses a rich miscellany of streams,
ranging from Geotechnical Engineering to Structural Engineering, Environmental
Engineering to Hydraulics Engineering, and Construction to Hydrology. In terms
of scope and application, Civil Engineering can be considered the largest core
engineering branch. The primary endeavor of the Civil Engineering dep

# Step 2: **Chunking of Data**

In [6]:
!pip install -qU langchain-text-splitters

In [7]:
from  langchain_text_splitters import RecursiveCharacterTextSplitter

def chunker(data):
    """
    Splits the input data into chunks of specified size with overlap.

    This function uses the RecursiveCharacterTextSplitter to divide the input
    text into chunks of a specified size, with a specified overlap between chunks.

    Args:
        data (str): The input text data to be split into chunks.

    Returns:
        list: A list of text chunks.

    Example:
        >>> data = "This is a sample text that needs to be split into chunks."
        >>> chunks = chunker(data)
        >>> print(chunks)
        ['This is a sample text that needs to be split into chunks.']

    Note:
        - The chunk_size is set to 500 characters.
        - The chunk_overlap is set to 100 characters.
        - The length_function is set to len.
        - The is_separator_regex is set to False.
    """


    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                               chunk_overlap=100,
                                               length_function = len,
                                               is_separator_regex=False)
    return text_splitter.split_text(data)

In [8]:
dept_chunks = chunker(dept_data_structured)
print("Number of chunks for dept_data: ", len(dept_chunks))

# Similar chunking can be done for other data sections
about_chunks = chunker(about_text_structured)
facility_chunks = chunker(facility_text_structured)

print("Number of chunks for about_data: ", len(about_chunks))
print("Number of chunks for facility_data: ", len(facility_chunks))

Number of chunks for dept_data:  248
Number of chunks for about_data:  86
Number of chunks for facility_data:  14


# Step 3: **Making Embeddings of Chunks**

<img src="https://www.nvidia.com/content/nvidiaGDC/us/en_US/glossary/vector-database/_jcr_content/root/responsivegrid/nv_container_1795650/nv_image_copy.coreimg.100.1070.jpeg/1710829331227/vector-database-embedding-1920x1080.jpeg" alt="Vector Database Embedding" width="500">


In [9]:
!pip install sentence_transformers



In [10]:
from sentence_transformers import SentenceTransformer
"""
This script initializes a SentenceTransformer model using the 'paraphrase-MiniLM-L6-v2' pre-trained model.
Modules:
    sentence_transformers: A library for applying state-of-the-art sentence, text, and image embeddings.
Classes:
    SentenceTransformer: A class from the sentence_transformers library used to load pre-trained models for generating embeddings.
Usage:
    The script imports the SentenceTransformer class from the sentence_transformers library and initializes a model named 'paraphrase-MiniLM-L6-v2'. This model can be used to generate embeddings for sentences, which can be useful for various NLP tasks such as semantic search, clustering, and paraphrase mining.
"""

# Initialize a SentenceTransformer model named 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

<img src="https://aitechtrend.com/wp-content/uploads/2023/02/Cosine-similarity.jpg" alt="Cosine Similarity Example" width="500">


In [11]:
import numpy as np

def cosine_similarity(a, b):
    """
    Compute the cosine similarity between two matrices of vectors.

    Args:
    a (numpy.ndarray): The first matrix of vectors.
    b (numpy.ndarray): The second matrix of vectors.

    Returns:
    numpy.ndarray: The cosine similarity matrix, where each element (i, j) represents
    the cosine similarity between the ith vector in 'a' and the jth vector in 'b'.
    """
    # Compute the dot product of the matrices and divide by the product of their norms
    return np.dot(a, b.T) / (np.linalg.norm(a, axis=1)[:, np.newaxis] * np.linalg.norm(b, axis=1))


In [12]:
# Generate embeddings for dept_data
dept_embeddings = model.encode(dept_chunks)

# Generate embeddings for about_us
about_embeddings = model.encode(about_chunks)

# Generate embeddings for facility
facility_embeddings = model.encode(facility_chunks)

print("Embeddings generated for dept_data, about_us, and facility.")

Embeddings generated for dept_data, about_us, and facility.


In [13]:
# Display the shape and a snippet of the embeddings for dept_data
print("Shape of dept_embeddings:", dept_embeddings.shape)
print("Snippet of dept_embeddings:\n", dept_embeddings[:2])  # Display first 2 embeddings for brevity

Shape of dept_embeddings: (248, 384)
Snippet of dept_embeddings:
 [[-1.88035443e-01  1.90352142e-01 -1.09716475e-01 -3.90454717e-02
  -6.32309690e-02  1.91812426e-01 -1.25484750e-01  2.31407419e-01
  -2.12347239e-01  1.42696917e-01 -8.13355073e-02 -5.10341823e-01
   2.20214114e-01 -3.88532877e-01 -2.21620481e-02 -1.23370364e-01
   1.13991071e-02 -1.54173702e-01  3.71501744e-01 -2.38458708e-01
  -1.68640614e-01  1.03417918e-01 -4.67959434e-01  8.79289024e-03
   2.21540540e-01 -3.76255274e-01 -2.40775838e-01 -1.45669535e-01
   6.23760402e-01 -3.24722946e-01  1.09930187e-01  1.17339708e-01
  -1.79661706e-01  3.34374815e-01  2.94498205e-01  3.73752356e-01
  -1.17331870e-01 -9.99128371e-02  1.17736600e-01  1.45918459e-01
  -8.32439363e-02 -2.86601931e-01  8.59783590e-02 -1.21165238e-01
   1.05482452e-01  8.42243340e-03 -4.33853492e-02 -8.86643007e-02
  -2.29910780e-02  3.03670168e-01  8.28145966e-02  1.17467813e-01
  -6.08969592e-02  1.81657776e-01 -2.11543709e-01  2.96032995e-01
  -7.26243

In [14]:
# Display the shape and a snippet of the embeddings for about_data
print("\nShape of about_embeddings:", about_embeddings.shape)
print("Snippet of about_embeddings:\n", about_embeddings[:2])  # Display first 2 embeddings for brevity


Shape of about_embeddings: (86, 384)
Snippet of about_embeddings:
 [[-3.31109583e-01  2.60959923e-01 -2.12160140e-01  3.84646431e-02
  -8.92190188e-02 -1.58494011e-01 -5.67425549e-01  9.70378816e-02
  -1.07221939e-01 -1.65254802e-01 -6.80764019e-02 -2.13327840e-01
   1.29558682e-01 -1.20220939e-02 -1.62084669e-01 -5.44008873e-02
  -1.38197914e-01 -2.09403131e-02  1.30539030e-01  2.02691510e-01
  -2.90156007e-01 -1.66741759e-02 -1.13468999e-02 -2.35739633e-01
   2.59651065e-01 -5.75700309e-03  2.40244064e-02 -4.76194806e-02
   3.13539445e-01 -2.14270994e-01  3.09720576e-01 -1.67441338e-01
  -2.11802218e-02  1.14894755e-01 -6.01824485e-02  5.46995461e-01
  -1.17266975e-01  3.96049805e-02 -3.81052382e-02  5.40397242e-02
  -1.05355814e-01  5.21817170e-02  3.52585554e-01  2.23195255e-01
  -5.21997400e-02  1.27693638e-01  6.15289472e-02  2.85467561e-02
  -9.91406217e-02  1.62444457e-01 -2.99397022e-01  1.00175235e-02
  -1.70648575e-01 -2.17542395e-01  1.70455128e-01  1.06724620e-01
  -1.735

In [15]:
# Display the shape and a snippet of the embeddings for facility_data
print("\nShape of facility_embeddings:", facility_embeddings.shape)
print("Snippet of facility_embeddings:\n", facility_embeddings[:2])  # Display first 2 embeddings for brevity


Shape of facility_embeddings: (14, 384)
Snippet of facility_embeddings:
 [[-0.14669727 -0.05164526 -0.17155635 -0.20275058  0.04860336  0.19477037
  -0.38394982  0.23385201  0.17216036  0.02994411  0.30791125  0.17565729
   0.3837982  -0.17528518 -0.25833222 -0.23635432 -0.1664553  -0.09151947
   0.29258642 -0.19510977 -0.07379653 -0.3951105  -0.05445313 -0.30846596
   0.4006054   0.2632607  -0.08410691 -0.10418082  0.18947193 -0.32439765
   0.56788456 -0.05410272  0.30703583 -0.13971165  0.06805867  0.310168
  -0.05640415 -0.17937241 -0.2021486   0.05874828 -0.22434829 -0.08153923
   0.20341708  0.04656869  0.4468728  -0.04641501 -0.18784606 -0.06144275
  -0.15847762 -0.2593032   0.38595867 -0.26497135  0.30528885 -0.19821717
  -0.19356517  0.3593257   0.04207664  0.17394443 -0.14308928  0.04463715
   0.18640192 -0.2780435   0.07262978  0.09438272 -0.1480493  -0.04965033
  -0.03961108  0.13664307  0.22083655 -0.28591162 -0.54459786 -0.00849921
  -0.01037845  0.02252799  0.27018008  0

# Step 4: **Importing Large Language Model (LLM) and Passing Embeddings through LLM**

In [23]:
!pip install -qU google-generativeai groq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/108.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
import google.generativeai as genai
import os


class GeminiModel:
    def __init__(self, api_key=None, model_name="gemini-1.5-pro"):
        # Configure the API with the provided key
        if not api_key:
            # api_key = os.getenv("GOOGLE_API_KEY")
            api_key = "********************" 
            # generate api key from pinecone api

        genai.configure(api_key=api_key)

        # Default configuration settings; can be customized further if needed
        generation_config = {
            "temperature": 0.00001,
            "top_p": 1,
            "top_k": 1,
            "max_output_tokens": 8192,
        }

        safety_settings = [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_ONLY_HIGH"},
            {
                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "threshold": "BLOCK_ONLY_HIGH",
            },
            {
                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                "threshold": "BLOCK_ONLY_HIGH",
            },
        ]

        # Set up the model with the provided model name
        self.model = genai.GenerativeModel(
            model_name=model_name,
            generation_config=generation_config,
            safety_settings=safety_settings,
        )
    def generate_content(self, prompts):
        # Generate content based on the provided prompts
        response = self.model.generate_content([prompts])
        return response.text
Gemini= GeminiModel()

In [24]:


import os
from getpass import getpass
from groq import Groq

class GroqWrapper:
    def __init__(self, api_key=None, model_name="gemma2-9b-it"):
        """
        Initialize the Groq Wrapper with the API key and default model name.
        """
        # Retrieve the API key from the environment variable or prompt the user
        if api_key is None:
            api_key = getpass("Please enter your Groq API key: ")  # Securely prompt for API key
            if not api_key:
                raise ValueError("Groq API key not found. Please provide it explicitly or set 'GROQ_API_KEY' as an environment variable.")

        self.api_key = api_key
        self.model_name = model_name

        # Configure the Groq client
        self.client = Groq(api_key=self.api_key)


    def generate_content(self, prompt):
        """
        Generate a chat completion for the given prompt using the configured model.
        """
        try:


            # Call the Groq API
            chat_completion = self.client.chat.completions.create(
                messages=[
                    {"role": "user", "content": prompt}
                ],
                model=self.model_name
            )

            # Extract and return the generated content
            generated_text = chat_completion.choices[0].message.content

            return generated_text

        except Exception as e:
            return f"Error: {str(e)}"

# Instantiate and test the wrapper
groq = GroqWrapper()  # This will prompt for the API key



Please enter your Groq API key: ··········


# Step 4: **Generating Output (Text Generation)**

In [25]:
def combine_data(embeddings_list, chunks_list):
    """
    Combines multiple embedding arrays and chunk lists into single arrays

    Args:
        embeddings_list (list): List of embedding arrays
        chunks_list (list): List of text chunk lists

    Returns:
        tuple: Combined embeddings array and chunks list
    """
    combined_embeddings = np.vstack(embeddings_list)
    combined_chunks = [chunk for chunks in chunks_list for chunk in chunks]
    return combined_embeddings, combined_chunks

def run_rag(ip, embeddings_array, chunks):
    """
    Runs the RAG process for a given input query.

    Args:
        ip (str): The input query
        embeddings_array (numpy.ndarray): Combined embeddings array
        chunks (list): Combined text chunks list

    Returns:
        str: Generated output from Gemini model
    """
    queries = [ip]
    embedded_queries = model.encode(queries)

    for i, query_vec in enumerate(embedded_queries):
        similarities = cosine_similarity(query_vec[np.newaxis, :], embeddings_array)
        top_indices = np.argsort(similarities[0])[::-1][:10]
        top_doct = [chunks[index] for index in top_indices]
        print(top_doct)

        argumented_prompt = f'''You are an expert information system about SRM. I'll give you a question and context which is regarding SRM CEM (Shri Ramswaroop Memorial College of Engineering & Management), and you'll return the answer in English. Your name is LAILA a female Ai agent. Query: {queries[i]}. Context: {top_doct}'''

        model_output = groq.generate_content(argumented_prompt)
        return model_output

# Combine embeddings and chunks from all sources
all_embeddings_list = [dept_embeddings, about_embeddings, facility_embeddings]
all_chunks_list = [dept_chunks, about_chunks, facility_chunks]

combined_embeddings, combined_chunks = combine_data(all_embeddings_list, all_chunks_list)


In [18]:
# Example usage with combined data:
output = run_rag("What is the vision of SRMCEM?", combined_embeddings, combined_chunks)
print(output)

['information technology. The MCA programme at SRMCEM is a specialized training\ncourse enabling students to come to terms with application-based knowledge of IT\nfield. During the programme, the student may opt for various electives, such as\nAdvanced Database Management Systems, Mobile Computing, Network, Pattern\nRecognition, Cyber LAWS, and Compiler Design. Courses Offered MCA -  Seats  120\nAbout the Department Vision and Mission Faculty Members Time Table Laboratories', '11  https   srmcem.ac.in images AdmistrativeBlock.jpg     Image 12  SRMCEM\nhttps   srmcem.ac.in Updatedsrmcemlogo.png   https   srmcem.ac.in index.aspx\nShri Ramswaroop Memorial College of Engineering   Management, Lucknow is one of\nthe leading institutes of professional education in UP. SRMCEM is affiliated to\nDr. A.P.J. Abdul Kalam Technical University and recognized by AICTE.      Get In\nTouch   Tiwariganj, Faizabad Road, Lucknow  UP  - 226028   757-000-3074', 'human values and ethos, compassion for ecosys

In [26]:
output = run_rag("Tell me about the history and facilities of SRMCEM.", combined_embeddings, combined_chunks)
print(output)

['leading institutes of professional education in UP. SRMCEM is affiliated to Dr.\nA.P.J. Abdul Kalam Technical University and recognized by AICTE.      Get In\nTouch   Tiwariganj, Faizabad Road, Lucknow  UP  - 226028   757-000-3074\n983-901-0407   admissions srmcem.ac.in      Other Links    List of Holidays\nhttps   srmcem.ac.in ListOfHolidays.aspx     Downloads  https   srmcem.ac.in\ndownloadssrcem.aspx     Newsletters  https   srmcem.ac.in newsletter.aspx', 'human values and ethos, compassion for ecosystem and obligation towards society\nand the nation.     To provide an environment conducive to continuous learning,\nand all-round development of college fraternity.    Image 12  SRMCEM  https\nsrmcem.ac.in Updatedsrmcemlogo.png   https   srmcem.ac.in index.aspx Shri\nRamswaroop Memorial College of Engineering   Management, Lucknow is one of the\nleading institutes of professional education in UP. SRMCEM is affiliated to Dr.', '11  https   srmcem.ac.in images AdmistrativeBlock.jpg    

## Gradio Application

In [20]:
!pip install -qU gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.2/320.2 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.2/168.2 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [27]:
import gradio as gr
import time

# UI Code
with gr.Blocks() as demo:
    # Adding a heading using Markdown
    gr.Markdown("<h1 style='text-align: center;'>SRM Rag Chatbot</h1>")  # Centered heading

    # Add a radio button to select language style
    style_selector = gr.Radio(
        choices=["English", "Lucknowi Hinglish"],
        value="English",
        label="Select Response Style",
    )

    chatbot = gr.Chatbot(type="messages")  # Chatbot component to display messages
    msg = gr.Textbox(label="Your Message")  # Textbox to input user message
    clear = gr.Button("Clear")  # Button to clear chat history

    # Function to process user input and update history
    def user(user_message, history: list):
        history.append({"role": "user", "content": user_message})
        return "", history  # Clear the input box and return updated history

    # Function to process bot responses
    def bot(history: list, style):
        # Get the last user message from the history
        user_message = history[-1]["content"]

        # Generate the RAG response with the selected style
        bot_message = run_rag(user_message, combined_embeddings, combined_chunks)  # Pass the style (English or Lucknowi Hinglish)

        history.append({"role": "assistant", "content": ""})
        for character in bot_message:
            history[-1]['content'] += character
            time.sleep(0.05)  # Simulate typing effect
            yield history  # Yield the updated history

    # Link the user message submission and bot response flow
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, style_selector], chatbot
    )

    # Clear the chat history when the clear button is clicked
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5b2bef653ebcfbd567.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
