In [3]:
from langchain.llms import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEndpoint

from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 10},
)

import gradio as gr
import os
from dotenv import load_dotenv
load_dotenv()
# Set Hugging Face API Token (Replace with your actual token)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Suppress warnings
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

## LLM using Hugging Face
def get_llm():
    llm = llm
    return llm

## Document loader with debugging
def document_loader(file):
    loader = PyPDFLoader(file.name)
    docs = loader.load_and_split()
    for doc in docs:
        print(doc.page_content)
    return docs


## Text splitter with debugging
def text_splitter(data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_documents(data)
    if not chunks:
        raise ValueError("Text splitting failed: No chunks were created from the document.")
    print(f"Generated {len(chunks)} text chunks.")
    return chunks

## Embedding model using Hugging Face
def huggingface_embedding():
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding_model

## Vector database with debugging
def vector_database(chunks):
    embedding_model = huggingface_embedding()
    vectordb = Chroma.from_documents(chunks, embedding_model,persist_directory="./chroma_db")
    vectordb.persist()
    print("Vector database created successfully.")
    return vectordb

## Retriever with debugging
def retriever(file):
    splits = document_loader(file)
    chunks = text_splitter(splits)
    vectordb = vector_database(chunks)
    retriever = vectordb.as_retriever()
    return retriever

## QA Chain
def retriever_qa(file, query):
    llm = get_llm()
    retriever_obj = retriever(file)
    qa = RetrievalQA.from_chain_type(llm=llm, 
                                     chain_type="stuff", 
                                     retriever=retriever_obj, 
                                     return_source_documents=False)
    response = qa.invoke(query)
    return response['result']

# Create Gradio interface
rag_application = gr.Interface(
    fn=retriever_qa,
    allow_flagging="never",
    inputs=[
        gr.File(label="Upload PDF File", file_count="single", file_types=['.pdf'], type="filepath"),  # Drag and drop file upload
        gr.Textbox(label="Input Query", lines=2, placeholder="Type your question here...")
    ],
    outputs=gr.Textbox(label="Output"),
    title="RAG Chatbot",
    description="Upload a PDF document and ask any question. The chatbot will try to answer using the provided document."
)

# Launch the app
rag_application.launch(server_name="0.0.0.0", server_port=7860)


ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of AriaTextConfig, BambaConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, DiffLlamaConfig, ElectraConfig, Emu3Config, ErnieConfig, FalconConfig, FalconMambaConfig, FuyuConfig, GemmaConfig, Gemma2Config, GitConfig, GlmConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GraniteConfig, GraniteMoeConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MllamaConfig, MoshiConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NemotronConfig, OlmoConfig, Olmo2Config, OlmoeConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PhimoeConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, ZambaConfig.

In [2]:
!pip install chromadb

Collecting chromadb
  Using cached chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Using cached chroma_hnswlib-0.7.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.11.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting importlib-metadata<=8.5.0,>=6.0 (from opentelemetry-api>=1.2.0->chromadb)
  Downloading importlib_metadata-8.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting protobuf (from onnxruntime>=1.14.1->chromadb)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb)
  Using cached monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb)
  Using cached backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb)
 

In [3]:
gr.close_all()

Closing server running on port: 7860


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_id = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

input_text = "Translate English to French: Hello, how are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
2025-02-06 22:34:59.084385: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-06 22:34:59.091169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-06 22:34:59.096786: E exte

<extra_id_0>


In [5]:
!pip install python-dotenv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [14]:
def document_loader(file):
    loader = PyPDFLoader(file)
    for page in loader.load_and_split():
        return page

In [16]:
print(document_loader("/home/kronos/Desktop/UAI.pdf"))

None


In [25]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("/home/kronos/Desktop/hindu.pdf")
pages = loader.load()

In [26]:
len(pages)

22

In [27]:
page = pages[0]

In [33]:
print(page.page_content)

AHY-HYE CM
YK
WEDNESDAY
January 29, 2025
HYDERABAD
CITY EDITION
18 Pages /uni20B9 8.00www.thehindu.com
Printed at » Chennai» Coimbatore » Bengaluru » Hyderabad » Madurai» Noida» Visakhapatnam » Thiruvananthapuram » Kochi» Vijayawada » Mangaluru » Tiruchirapalli » Kolkata» Hubballi» Mohali» Malappuram » Mumbai» Tirupati» Lucknow » Cuttack» PatnaVol. 50 /L50539No. 24https://newsth.live/fb
https://newsth.live/x
https://newsth.live/ig
RNI No. TELENG/1976/49963
‘Good policies
drew companies’ 
HYDERABAD
Chief Minister A. Revanth
Reddy has asserted that
several global majors came
forward to sign MoUs with the
State government at Davos
because of the progressive
policies that have been put in
place by the Congress
government. »Page 6
Skyhigh airfares
to Maha Kumbh
HYDERABAD
Pilgrimage to Maha Kumbh
Mela in Prayagraj from
Hyderabad has been
overshadowed by exorbitant
airfares, with prices ranging up
to /uni20B91.02 lakh per person for a
one-way journey. »Page 4NEARBY
/L50301
MISRI VISIT
Had fra

In [30]:
page.metadata

{'source': '/home/kronos/Desktop/hindu.pdf', 'page': 0}

In [24]:
page

Document(metadata={'source': '/home/kronos/Desktop/UAI.pdf', 'page': 0}, page_content='')

In [12]:
from langchain.llms import HuggingFaceHub


In [1]:
!pip install llama-index


Collecting llama-index
  Downloading llama_index-0.12.16-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.5-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.16 (from llama-index)
  Downloading llama_index_core-0.12.16.post1-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.4-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_llms_openai-0.3.18-py3-none-any.whl.metadata (3.3 kB)


In [None]:
import os
from getpass import getpass
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
HF_Token = getpass()

In [7]:
login(token = HF_Token)

In [8]:
from llama_index.core.agent import ReActAgent
from llama_index.llms.openai import OpenAI
from llama_index.core.tools import FunctionTool

In [9]:
def multiply(a: float, b: float) -> float:
    """Multiply two numbers and returns the product"""
    return a * b


multiply_tool = FunctionTool.from_defaults(fn=multiply)


def add(a: float, b: float) -> float:
    """Add two numbers and returns the sum"""
    return a + b


add_tool = FunctionTool.from_defaults(fn=add)

In [1]:
from sqlalchemy import create_engine

In [16]:
dbEngine = create_engine('sqlite:////home/kronos/Desktop/raman.db')

In [17]:
import pandas as pd

In [30]:
r = pd.read_sql('select name from sqlite_master',dbEngine)

In [48]:
from sqlalchemy import text

def schema(x: str) -> str:
    sql = f"SELECT sql FROM sqlite_master WHERE name = '{x}';"
    with dbEngine.connect() as conn:
        result = conn.execute(text(sql))
        return result.fetchall()


In [49]:
r['schema'] = r['name'].map(schema)

In [39]:
r[r.loc[:,'name']=='WPA_all_time_connect']

Unnamed: 0,name
0,WPA_all_time_connect


In [52]:
for i in r.schema:
    print(i)

[('CREATE TABLE "WPA_all_time_connect" (\n\t"_id"\tTEXT,\n\t"collectionType"\tTEXT,\n\t"eventName"\tTEXT,\n\t"createdAt"\tTEXT,\n\t"partnerRegion"\tTEXT,\n\t"sessionId"\tTEXT,\n\t"data_careerId"\tTEXT,\n\t"data_jobId"\tTEXT,\n\t"data_programId"\tTEXT,\n\t"data_resourceId"\tTEXT,\n\t"WPA_id"\tTEXT\n)',)]
[('CREATE TABLE "WAP_site_resources" (\n\t"data_careerId"\tTEXT,\n\t"featured"\tTEXT,\n\t"providerName"\tTEXT,\n\t"updatedAt"\tTEXT,\n\t"resource_tag"\tTEXT\n)',)]
[('CREATE TABLE "WPA_individual" (\n\t"rural_type"\tTEXT,\n\t"lastLoggedInAt"\tTEXT,\n\t"myPathCompleted"\tTEXT,\n\t"preferredRegion"\tTEXT,\n\t"preferr ... (487 characters truncated) ... tTEXT,\n\t"completed_jobs_milestone"\tTEXT,\n\t"completed_training_milestone"\tTEXT,\n\t"gender_3"\tTEXT,\n\t"birth_year"\tTEXT,\n\t"WPA_id"\tTEXT\n)',)]
[('CREATE TABLE "WPA_pulse_survey" (\n\t"version"\tTEXT,\n\t"year_quarter"\tTEXT,\n\t"children"\tTEXT,\n\t"concern_finances"\tTEXT,\n\t"concern_health"\ ... (2910 characters truncated) ... 

In [10]:
agent = ReActAgent.from_tools([multiply_tool, add_tool], llm=llm, verbose=True)

NameError: name 'llm' is not defined