### Necessary imports

In [1]:
!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.0/817.0 kB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.4/37.4 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m19.

### Dependencies

In [2]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

### Load quantized Mistal 7B

In [3]:
#################################################################
# Tokenizer
################

model_name='mistralai/Mistral-7B-Instruct-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# if compute_dtype == torch.float16 and use_4bit:
#     major, _ = torch.cuda.get_device_capability()
#     if major >= 8:
#         print("=" * 80)
#         print("Your GPU supports bfloat16: accelerate training with bf16=True")
#         print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


### Count number of trainable parameters

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


### Build Mistral text generation pipeline

In [5]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

In [6]:
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [7]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

### Load and chunk documents. Load chunked documents into FAISS index

In [8]:
!playwright install
!playwright install-deps

Downloading Chromium 121.0.6167.57 (playwright build v1097)[2m from https://playwright.azureedge.net/builds/chromium/1097/chromium-linux.zip[22m
[1G152.8 MiB [] 0% 10.1s[0K[1G152.8 MiB [] 0% 23.8s[0K[1G152.8 MiB [] 0% 19.6s[0K[1G152.8 MiB [] 0% 10.1s[0K[1G152.8 MiB [] 0% 6.6s[0K[1G152.8 MiB [] 1% 5.5s[0K[1G152.8 MiB [] 1% 5.2s[0K[1G152.8 MiB [] 2% 4.6s[0K[1G152.8 MiB [] 2% 4.8s[0K[1G152.8 MiB [] 3% 4.6s[0K[1G152.8 MiB [] 3% 4.5s[0K[1G152.8 MiB [] 4% 4.1s[0K[1G152.8 MiB [] 4% 4.2s[0K[1G152.8 MiB [] 5% 4.1s[0K[1G152.8 MiB [] 5% 3.9s[0K[1G152.8 MiB [] 6% 3.8s[0K[1G152.8 MiB [] 7% 3.6s[0K[1G152.8 MiB [] 7% 3.5s[0K[1G152.8 MiB [] 8% 3.4s[0K[1G152.8 MiB [] 9% 3.4s[0K[1G152.8 MiB [] 9% 3.5s[0K[1G152.8 MiB [] 10% 3.3s[0K[1G152.8 MiB [] 11% 3.3s[0K[1G152.8 MiB [] 12% 3.1s[0K[1G152.8 MiB [] 13% 3.0s[0K[1G152.8 MiB [] 14% 3.0s[0K[1G152.8 MiB [] 14% 2.9s[0K[1G152.8 MiB [] 15% 2.9s[0K[1G152.8 MiB [] 16% 2.8s[0K[1G152.8 MiB [] 17% 2.7s[0K

In [None]:
import pandas as pd
# import io
# from google.colab import files
# uploaded = files.upload()
df = pd.read_csv('/content/remarks&subject_content.csv')

In [None]:
print(df)


                                         remarks_text  \
0   \r\nमहोदय/महोदया, \r\nहम इस संदर्भ में आपको अव...   
1    Customer concern is regarding slow browsing c...   
2    \tPlease provide the name of ISP of internet ...   
3    \r\nThe Grievance has been sent to the State ...   
4           The petitioner has been suitably advised.   
5           The petitioner has been suitably advised.   
6           The petitioner has been suitably advised.   
7                    As per report of the Department.   
8                    As per report of the Department.   
9                    As per report of the Department.   
10                   As per report of the Department.   
11                   As per report of the Department.   
12                         Contact to circle office     
13     Debit card has been issued and satisfaction...   
14     The Role of General Administration Departme...   
15                  Action taken by the concerned AD.   
16    As per psp system remarks

In [None]:
df.head(24)

Unnamed: 0,remarks_text,subject_content_text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,"\r\nमहोदय/महोदया, \r\nहम इस संदर्भ में आपको अव...",Financial Services (Banking Division) >> Fraud...,,,
1,Customer concern is regarding slow browsing c...,Telecommunications >> Broadband Related >> Oth...,,,
2,\tPlease provide the name of ISP of internet ...,My call drop and internet is not working for t...,,,
3,\r\nThe Grievance has been sent to the State ...,Food and Public Distribution >> Public Distrib...,,,
4,The petitioner has been suitably advised.,December month pension not come.,,,
5,The petitioner has been suitably advised.,Though I have diposited my life certificate in...,,,
6,The petitioner has been suitably advised.,वित्तीय सेवा विभाग (बैंकिंग प्रभाग) >> Miscell...,,,
7,As per report of the Department.,Agriculture and Farmers Welfare >> Crops relat...,,,
8,As per report of the Department.,Agriculture and Farmers Welfare >> PMKISAN rel...,,,
9,As per report of the Department.,सर् नमस्कार मै PM KISAN SAMMAN NIDHI का लाभार्...,,,


In [None]:
# with open("/content/remarks_subject_content.txt", "r") as f:
#   # Read the entire content into a variable
#   text_content = f.read()
# print(text_content)

remarks_text             subject_content_text
                         Financial Services (Banking
                         Division) >> Fraud

                         Department/Bank/Financial
                         Institute : Bank of India
                         -----------------------
                         I have already written to Bank of
                         India Kalyanpur Kanpur in
                         response to Grievance-Redressal
                         Cell HDFC Bank reply dated
                         27.12.2022 and lodge a complaint
                         against the beneficiaries with law
                         enforcement agency.( Cybercrime
                         Cell on 13.08.2022 whose case
                         number is 1537/ 2022/dated
                         13.08.2022)


                         TO
                         BRANCH MANAGER
                         BANK OF INDIA
                         KALYANPUR KANPUR

               

In [9]:
!pip install PyPDF2
import PyPDF2
import os
file_paths = [
  '/content/Grievances.Dataset.pdf'
]

text = ""
for i in file_paths:
  pdfFile = open(i,'rb')
  Reader = PyPDF2.PdfReader(pdfFile)

  for page in range(len(Reader.pages)):
    pageObj = Reader.pages[page]
    text += pageObj.extract_text()

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m122.9/232.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# Save the extracted PDF text in a Doc called 'extracted_grievances'
with open('extracted_grievances','w') as f:
  f.write(text)

In [None]:
print(text)

remarks_text             subject_content_text  
                         Financial Services (Banking  
                         Division) >> Fraud  
  
                         Department/Bank/Financial  
                         Institute : Bank of India  
                         -----------------------  
                         I have already written to Bank of  
                         India Kalyanpur Kanpur in  
                         response to Grievance-Redressal  
                         Cell HDFC Bank reply dated  
                         27.12.2022 and lodge a complaint  
                         against the beneficiaries with law  
                         enforcement agency.( Cybercrime  
                         Cell on 13.08.2022 whose case  
                         number is 1537/ 2022/dated  
                         13.08.2022)  
  
  
                         TO  
                         BRANCH MANAGER  
                         BANK OF INDIA  
              

In [10]:
class Document:
    def __init__(self, content):
        self.page_content = content
        self.metadata= {"source":"gmail"}

In [12]:
# !pip install PyPDF2
import PyPDF2
import nest_asyncio
import imaplib
import email
nest_asyncio.apply()

attachment=""
# Articles to index
articles = [
            " https://www.pgportal.gov.in/",
            "https://www.pgportal.gov.in/Home/Faq",
            "https://www.pgportal.gov.in/Home/AboutUs",
            "https://www.pgportal.gov.in/Home/ContactUs",


]



# Scrapes the blogs above
loader = AsyncChromiumLoader(articles)
docs = loader.load()
print(docs)
    # Converts HTML to plain text
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
print(docs_transformed)
  # Chunk text

text_splitter = CharacterTextSplitter(chunk_size=100,
                                        chunk_overlap=0)
#TRY
# from langchain_core.documents.base import Document
# import PyPDF2

# file_paths = ['/content/grievance.pdf']

# documents = []

# for file_path in file_paths:
#     pdfFile = open(file_path, 'rb')
#     pdf_reader = PyPDF2.PdfReader(pdfFile)

#     text = ""
#     for page in range(len(pdf_reader.pages)):
#         page_obj = pdf_reader.pages[page]
#         text += page_obj.extract_text()

#     document = Document()
#     document.text = text
#     documents.append(document)

# Now 'documents' is a list of Document objects with the extracted text

# read grievances pdf document for RAG
file_paths = [
  '/content/Grievances.Dataset.pdf'
]

text = ""
for i in file_paths:
  pdfFile = open(i,'rb')
  Reader = PyPDF2.PdfReader(pdfFile)

  for page in range(len(Reader.pages)):
    pageObj = Reader.pages[page]
    text += pageObj.extract_text()

# Save the extracted PDF text in a Doc called 'extracted_grievances'
# with open('/content/grievance.pdf','w') as f:
#   f.write(text)


import langchain_core
# Prepare Document object (modify based on actual requirements)
document = langchain_core.documents.base.Document(page_content=text)  # Assuming this is the correct way to create a Document object
# document.text = text  # Assuming there's a "text" attribute in the Document class
docs_transformed.append(document)
print(docs_transformed)


# convert text into langchain_core.documents.base.Document
# docs_transformed.append(text)
chunked_documents=text_splitter.split_documents(docs_transformed)

#   # Load chunked documents into the FAISS index
db = FAISS.from_documents(chunked_documents,
                            HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))

retriever = db.as_retriever()




[Document(page_content="  * भारत सरकार  Government of India\n  * कार्मिक, लोक शिकायत और पेंशन मंत्रालय Ministry of Personnel, Public Grievances & Pensions\n\n  * Home\n  * Contact Us\n  * About Us\n  * FAQs/Help\n  * Site Map\n  * 2024 Holiday List\n\n# CPGRAMS\n\nCentralized Public Grievance Redress And Monitoring System\n\n\n\n  * View Status  Grievance Status Appeal Status\n  * Nodal PG Officers  Central Government State Government\n  * Redress Process  Redress Process Flow\n  * Grievance  Lodge Public Grievance Lodge Pension Grievance View Status Reminder Clarification Rate Grievance\n  * __Nodal Authority for Appeal(current)\n  * Mobile App \n\nLanguage :\n\nEnglish\n\nEnglish हिंदी (Hindi) ગુજરાતી (Gujarati) मराठी (Marathi) বাংলা (Bangala)\nతెలుగు (Telugu) অসমীয়া (Assamese) ଓଡିଆ (Odia) தமிழ் (Tamil) മലയാളം (Malayalam)\n(Urdu) اردو Sindhi बोडो (Bodo) कोंकणी (Konkani) नेपाली (Nepali) Manipuri\nਪੰਜਾਬੀ (Punjabi) ಕನ್ನಡ (Kannada) डोगरी (Dogri) मैथिली (Maithili) کشمیر\n(Kashmiri) संस्कृ



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
print(chunked_documents)

[Document(page_content='* भारत सरकार  Government of India\n  * कार्मिक, लोक शिकायत और पेंशन मंत्रालय Ministry of Personnel, Public Grievances & Pensions', metadata={'source': ' https://www.pgportal.gov.in/'}), Document(page_content='* Home\n  * Contact Us\n  * About Us\n  * FAQs/Help\n  * Site Map\n  * 2024 Holiday List\n\n# CPGRAMS', metadata={'source': ' https://www.pgportal.gov.in/'}), Document(page_content='Centralized Public Grievance Redress And Monitoring System', metadata={'source': ' https://www.pgportal.gov.in/'}), Document(page_content='* View Status  Grievance Status Appeal Status\n  * Nodal PG Officers  Central Government State Government\n  * Redress Process  Redress Process Flow\n  * Grievance  Lodge Public Grievance Lodge Pension Grievance View Status Reminder Clarification Rate Grievance\n  * __Nodal Authority for Appeal(current)\n  * Mobile App', metadata={'source': ' https://www.pgportal.gov.in/'}), Document(page_content='Language :\n\nEnglish', metadata={'source': '

In [14]:
print(docs_transformed)

[Document(page_content="  * भारत सरकार  Government of India\n  * कार्मिक, लोक शिकायत और पेंशन मंत्रालय Ministry of Personnel, Public Grievances & Pensions\n\n  * Home\n  * Contact Us\n  * About Us\n  * FAQs/Help\n  * Site Map\n  * 2024 Holiday List\n\n# CPGRAMS\n\nCentralized Public Grievance Redress And Monitoring System\n\n\n\n  * View Status  Grievance Status Appeal Status\n  * Nodal PG Officers  Central Government State Government\n  * Redress Process  Redress Process Flow\n  * Grievance  Lodge Public Grievance Lodge Pension Grievance View Status Reminder Clarification Rate Grievance\n  * __Nodal Authority for Appeal(current)\n  * Mobile App \n\nLanguage :\n\nEnglish\n\nEnglish हिंदी (Hindi) ગુજરાતી (Gujarati) मराठी (Marathi) বাংলা (Bangala)\nతెలుగు (Telugu) অসমীয়া (Assamese) ଓଡିଆ (Odia) தமிழ் (Tamil) മലയാളം (Malayalam)\n(Urdu) اردو Sindhi बोडो (Bodo) कोंकणी (Konkani) नेपाली (Nepali) Manipuri\nਪੰਜਾਬੀ (Punjabi) ಕನ್ನಡ (Kannada) डोगरी (Dogri) मैथिली (Maithili) کشمیر\n(Kashmiri) संस्कृ

### Create PromptTemplate and LLMChain

In [15]:
prompt_template = """
### [INST] Instruction: You Are a Centralized Public Grievance Redress And Monitoring System Knowledge ChatBot, Reply to each query accurately by giving detailed answers relevant to the context provided. Here is the context to help:

{context}

### QUESTION:
{question} [/INST]
 """

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

### Build RAG Chain

In [16]:

rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)
result = rag_chain.invoke("List some grievances faced?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [17]:
result['context']

[Document(page_content='10\\. What are the types of grievances which are not taken up for redress by\nthe Department?', metadata={'source': 'https://www.pgportal.gov.in/Home/Faq'}),
 Document(page_content='6\\. What happens to the grievances? How are the grievances dealt with in\nCentral Ministries/Departments?', metadata={'source': 'https://www.pgportal.gov.in/Home/Faq'}),
 Document(page_content='15\\. What can a citizen do if he is not satisfied with the redressal of his\ngrievance?', metadata={'source': 'https://www.pgportal.gov.in/Home/Faq'}),
 Document(page_content='2\\. Where can the grievances be sent?\n\nThe grievances can be sent to :', metadata={'source': 'https://www.pgportal.gov.in/Home/Faq'})]

In [18]:
print(result['text'])

1. Grievances related to corruption and maladministration
 2. Grievances related to non-delivery of public services
 3. Grievances related to violation of human rights
 4. Grievances related to environmental degradation
 5. Grievances related to social justice and equality
 6. Grievances related to health and education
 7. Grievances related to employment and labor issues
 8. Grievances related to taxation and financial matters
 9. Grievances related to security and defense issues
 10. Grievances related to foreign policy and diplomacy issues


In [19]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.19.2-py3-none-any.whl (16.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.10.1 (from gradio)
  Downloading gradio_client-0.10.1-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.9/307.9 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [20]:
from transformers import pipeline
p = pipeline("automatic-speech-recognition")

No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 55bb623 (https://huggingface.co/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [21]:
import gradio as gr

In [22]:
# first function
def transcribe(audio):
    transcribed_text = p(audio)["text"]
    result = rag_chain.invoke(transcribed_text)
    return result['text']

In [23]:
# second function
def chat(chat_history, user_input):

  result = rag_chain.invoke(user_input)

  response = ""
  for letter in ''.join(result['text']): #[bot_response[i:i+1] for i in range(0, len(bot_response), 1)]:
      response += letter + ""
      yield chat_history + [(user_input, response)]

In [None]:
# !pip install --upgrade gradio
import gradio as gr
with gr.Blocks() as demo:
    with gr.Tab("Text"):
          chatbot = gr.Chatbot()
          message = gr.Textbox ("What is this document about?")
          message.submit(chat,[chatbot, message], chatbot)
    with gr.Tab ("Audio"):
      gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath")

    ],
    outputs=[
        "textbox"

    ], live=True)
    demo.queue().launch(debug = True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://ba4c91f45dc6482337.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 64, in wrapped_app
    raise exc
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 74, in app
    response = await func(request)
  File "/usr/local/lib/python3.10/dist-packages/fastapi/routing.py", line 278, in app
    raw_response = await run_endpoint_function(
  File "/usr/local/lib/python3.10/dist-packages/fastapi/routing.py", line 191, in run_endpoint_function
    return await dependant.call(**values)
  File "/usr/local/lib/python3.10/dist-packages/gradio/routes.py", line 786, in upload_file
    form = await multipart_parser.parse()
  File "/usr/local/lib/python3.10/dist-packages/gradio/route