[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Sarthakrw/web-query/blob/main/web_query_app_demo.ipynb)

# Web Query App

Make sure you are connected to the free T4 GPU runtime provided by Google Colab before running this notebook.

## Downloading Dependencies

In [1]:
!pip install gradio transformers langchain auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/autogptq-index/whl/cu118/
Collecting gradio
  Downloading gradio-3.41.2-py3-none-any.whl (20.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.274-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m91.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting auto-gptq
  Downloading https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.4.2%2Bcu118-cp310-cp310-linux_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?2

## Downloading and Loading Model

In [2]:
from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM

# Hub repository id of the GPTQ-quantized Llama-2 chat checkpoint to load.
model_name = "TheBloke/Llama-2-7b-Chat-GPTQ"

# Forwarded unchanged to `from_quantized` as its `use_triton` argument.
use_triton = False

# Tokenizer that belongs to the checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the quantized weights from safetensors directly onto the first CUDA device.
model = AutoGPTQForCausalLM.from_quantized(
    model_name,
    device="cuda:0",
    use_safetensors=True,
    use_triton=use_triton,
    trust_remote_code=True,
    quantize_config=None,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading (…)quantize_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]



In [3]:
# Place the model on the CUDA device; the value returned by `.to()` is shown as the cell output.
model.to("cuda")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): FusedLlamaAttentionForQuantizedModel(
          (qkv_proj): GeneralQuantLinear(in_features=4096, out_features=12288, bias=True)
          (o_proj): GeneralQuantLinear(in_features=4096, out_features=4096, bias=True)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (act_fn): SiLUActivation()
          (down_proj): GeneralQuantLinear(in_features=11008, out_features=4096, bias=True)
          (gate_proj): GeneralQuantLinear(in_features=4096, out_features=11008, bias=True)
          (up_proj): GeneralQuantLinear(in_features=4096, out_features=11008, bias=True)
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=

In [4]:
# Wrap the quantized model and its tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # upper bound on the length of each generated answer
    # temperature / top_p only influence decoding when sampling is enabled, so turn it
    # on explicitly instead of relying on whatever the loaded generation config says.
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.15,
)

The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead',

## Creating LLM Application with Langchain

In [5]:
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain import PromptTemplate

In [6]:
# Expose the transformers pipeline through LangChain's LLM interface so it can be used in a chain.
llm = HuggingFacePipeline(pipeline=pipe)

In [7]:
# System instructions for the assistant; substituted into the {system_message}
# placeholder of `prompt_template` below.
system_message = """
You are a helpful, respectful and honest assistant. Your job is to answer the users query as best as possible given the Web Page Content. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. If you DO NOT KNOW THE ANSWER DO NOT SHARE FALSE INFORMATION.
You have been given scraped text content of a webpage under the section called "Web Page Content". Using this information answer the users query. However, if the webpage DOES NOT contain the answer to the query, you CAN answer based on your existing knowledge IF you are sure of the answer, but ALWAYS let the user know when doing so.
"""

# Prompt layout using [INST] / <<SYS>> markers. It has three placeholders:
#   {system_message} - the instructions above
#   {context}        - the web page text, wrapped in a fenced block
#   {prompt}         - the user's question
prompt_template='''[INST] <<SYS>>
{system_message}
<</SYS>>

Web Page Content:
```
{context}
```

{prompt}[/INST]'''

## Creating Chat Chain

In [8]:
# Chain that fills the prompt template with its variables and sends the result to the LLM.
chat = LLMChain(
    prompt=PromptTemplate.from_template(prompt_template),
    llm=llm,
)

## Implementing the Web Scraper

In [9]:
import requests
from bs4 import BeautifulSoup
import re

In [10]:
def scraper(url, timeout=30):
  """Download a web page and return its text as one whitespace-normalised string.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The text extracted from the page, with every run of whitespace collapsed
    to a single space and leading/trailing whitespace removed.

  Raises:
    requests.exceptions.RequestException: If the request fails, times out, or
      the server answers with an HTTP error status.
  """
  # Without a timeout an unresponsive server would block the app indefinitely.
  response = requests.get(url, timeout=timeout)
  # Fail loudly on 4xx/5xx instead of handing an error page to the model as context.
  response.raise_for_status()

  soup = BeautifulSoup(response.content, "html.parser")
  page_text = soup.get_text()
  return re.sub(r'\s+', ' ', page_text).strip()

## Run Function

In [11]:
def run(url, input):
  """Answer a question about a web page.

  Args:
    url: Address of the page whose text is used as context.
    input: The user's question.

  Returns:
    The text produced by the chat chain for this page and question.
  """
  page_text = scraper(url)
  return chat.predict(
      system_message=system_message,
      context=page_text,
      prompt=input,
  )

## Creating UI using Gradio

In [12]:
import gradio as gr

In [13]:
# Two free-text inputs (passed to `run` as url and query) and one text output.
iface = gr.Interface(
    fn=run,
    title="Web Query App",
    description="Enter the webpage url and your query\nIMPORTANT: Larger webpages are likely to cause error due to lack of high end computational resources",
    inputs=["text", "text"],
    outputs="text",
)

# Start serving the UI.
iface.launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

