# Resources

* https://python.langchain.com/docs/integrations/document_loaders/recursive_url

In [None]:
# !pip install -r requirements.txt

# Imports

In [2]:
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader

# Parsing Data

In [3]:
from bs4 import BeautifulSoup as Soup

url = "https://www.linkedin.com/legal/privacy-policy"
loader = RecursiveUrlLoader(
    url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
)
docs = loader.load()

In [4]:
docs[0].metadata['source']

'https://www.linkedin.com/legal/privacy-policy'

In [5]:
import tiktoken
tokenizer = tiktoken.get_encoding('cl100k_base')
# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = tiktoken_len
)

In [7]:
# !pip install -q -U faiss-cpu tiktoken sentence-transformers

In [8]:
documents = text_splitter.transform_documents(docs)

In [9]:
documents[0].metadata

{'source': 'https://www.linkedin.com/legal/privacy-policy',
 'title': 'LinkedIn Privacy Policy',
 'language': 'en'}

In [10]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore('./cache/')
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
core_embeddings_model = HuggingFaceEmbeddings(
    model_name = embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(documents, embedder)

In [11]:
query = 'What is the data collection policy of linkedin?'
query_vector = core_embeddings_model.embed_query(query)
embedded_docs = vector_store.similarity_search_by_vector(query_vector, k = 4)

for doc in embedded_docs:
    print(doc.metadata)

{'source': 'https://www.linkedin.com/legal/privacy-policy', 'title': 'LinkedIn Privacy Policy', 'language': 'en'}
{'source': 'https://www.linkedin.com/legal/privacy-policy?trk=content_footer-privacy-policy', 'title': 'LinkedIn Privacy Policy', 'language': 'en'}
{'source': 'https://www.linkedin.com/legal/privacy-policy', 'title': 'LinkedIn Privacy Policy', 'language': 'en'}
{'source': 'https://www.linkedin.com/legal/privacy-policy?trk=content_footer-privacy-policy', 'title': 'LinkedIn Privacy Policy', 'language': 'en'}


In [12]:
# !pip install huggingface-hub -q

In [13]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
import torch
import transformers
model_id = "meta-llama/Llama-2-7b-chat-hf"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)

model.eval()

Downloading config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [16]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

Downloading tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [17]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    temperature=0.0,
    max_new_tokens=256
)

In [18]:
# !pip install xformers

In [19]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [20]:
retriever = vector_store.as_retriever()

In [21]:
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler

handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True
)

In [22]:
qa_with_sources_chain({"query" : "What is the data collection policy of linkedin?"})

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



[1m> Finished chain.[0m


{'query': 'What is the data collection policy of linkedin?',
 'result': '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nup\nup\nup\nup\n\nup\nup\n\n\n\n\n\n\n\n\n\n    \n\n\n\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ness\ness\ness\ness\ness\ness\nup\nup\nup\nup\n                                    \n        \nesset\n    \n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nup\nup\n up\n up\n\n\n\nup up up up up up up up up up up up up\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 'source_documents': [Document(page_content='LinkedIn Privacy Policy\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n \n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          User Agreement\n        \n\n\n\n\n\n\n\nSummary of User Agreement\n\n\n\n\n\n\n\n\n\n\n\n          Privacy Policy\n        \n \n\n\n\n\n\n\n          Professiona