<a href="https://colab.research.google.com/github/ShekSingh/AlgorithmsPartOne/blob/master/Abhishek_LangChain_Running_HuggingFace_Models_Locally.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install langchain huggingface_hub transformers sentence_transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m846.5/846.5 kB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m111.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.7 MB/s[0m 

## HuggingFace

There are two Hugging Face LLM wrappers, one for a local pipeline and one for a model hosted on Hugging Face Hub. Note that these wrappers only work for models that support the following tasks: text2text-generation, text-generation


In [2]:
import os


os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

## Use the HuggingFaceHub

In [3]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [4]:
llm_chain = LLMChain(prompt=prompt, 
                     llm=HuggingFaceHub(repo_id="google/flan-t5-xl", 
                                        model_kwargs={"temperature":0, 
                                                      "max_length":64}))

In [6]:
question = "What is the capital of France?"

print(llm_chain.run(question))

KeyboardInterrupt: ignored

In [None]:
question = "What area is best for growing wine in France?"

print(llm_chain.run(question))

## BlenderBot

Doesn't work on the Hub

In [None]:
blenderbot_chain = LLMChain(prompt=prompt, 
                     llm=HuggingFaceHub(repo_id="facebook/blenderbot-1B-distill", 
                                        model_kwargs={"temperature":0, 
                                                      "max_length":64}))

In [None]:
# question = "What is the capital of France?"
# question = "What area is best for growing wine in France?"

# print(blenderbot_chain = LLMChain(prompt=prompt, 
# .run(question))

In [None]:
pip install accelerate

## With Local model from HF 

### Why would you want to use local mode?

- fine-tuned models
- GPU hosted etc
- some models only work locally

## T5-Flan - Encoder-Decoder

In [None]:
pip install bitsandbytes

In [None]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-large'# go for a smaller model if you dont have the VRAM
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True,device_map='auto')

pipe = pipeline(
    "text2text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=100
)

local_llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
print(local_llm('What is the capital of France? '))

In [14]:
llm_chain = LLMChain(prompt=prompt, 
                     llm=local_llm
                     )

question = "Can you tell me in words what is the following code doing?  def test(a,b): return a+b"

print(llm_chain.run(question))

Test(a,b): return a+b


## GPT2-medium - Decoder Only Model

microsoft/DialoGPT-large

In [15]:
model_id = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    max_length=100
)

local_llm = HuggingFacePipeline(pipeline=pipe)

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [17]:
llm_chain = LLMChain(prompt=prompt, 
                     llm=local_llm
                     )

question = "What is the capital of France?"

print(llm_chain.run(question))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 The code is testing some boolean statement. 

The code starts adding a and checking to see if the value of a matches the given boolean statement.

The code then starts adding b as return value to the function.

It then moves on to the function that we just tested, the following


## BlenderBot - Encoder-Decoder

In [5]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = 'facebook/blenderbot-1B-distill'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=100
)

local_llm = HuggingFacePipeline(pipeline=pipe)

Downloading (…)neration_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

In [8]:
llm_chain = LLMChain(prompt=prompt, 
                     llm=local_llm
                     )

# question = "What area is best for growing wine in France?"
question = "Can you tell me what the following code is doing?  def test(a,b): return a+b"
print(llm_chain.run(question))

 I'm not sure what you mean by that. Can you give me an example of what you're trying to say?


## SentenceTransformers

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"

hf = HuggingFaceEmbeddings(model_name=model_name)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
hf.embed_query('this is an embedding')

[0.010657310485839844,
 -0.09967269003391266,
 -0.026967095211148262,
 0.06531771272420883,
 0.021005000919103622,
 0.04262349754571915,
 0.011534060351550579,
 -0.006229334976524115,
 0.05175827071070671,
 0.007306730840355158,
 0.021353425458073616,
 0.04269154742360115,
 0.023143811151385307,
 0.009952709078788757,
 0.05646306648850441,
 -0.06137978285551071,
 0.0527438223361969,
 0.024683939293026924,
 -0.013267776928842068,
 -0.007051161024719477,
 0.02665640041232109,
 -0.005913516040891409,
 0.004097455181181431,
 0.0384124331176281,
 -0.014230639673769474,
 0.023023542016744614,
 -0.007326607592403889,
 -0.03562537580728531,
 -0.017934124916791916,
 -0.013930189423263073,
 0.011977523565292358,
 -0.007365939673036337,
 0.024451466277241707,
 -0.06637256592512131,
 1.5677629789934144e-06,
 0.01821723021566868,
 0.00197486556135118,
 -0.018329503014683723,
 -0.014930644072592258,
 -0.00539350276812911,
 -0.011222291737794876,
 0.015792978927493095,
 -0.027141856029629707,
 -0.015

In [None]:
hf.embed_documents(['this is an embedding','this another embedding'])

[[0.010657318867743015,
  -0.09967268258333206,
  -0.02696709893643856,
  0.06531770527362823,
  0.021004999056458473,
  0.042623501271009445,
  0.011534065939486027,
  -0.006229353602975607,
  0.0517583042383194,
  0.007306722458451986,
  0.021353380754590034,
  0.04269153252243996,
  0.023143835365772247,
  0.00995270162820816,
  0.056463032960891724,
  -0.06137979403138161,
  0.0527438260614872,
  0.024683943018317223,
  -0.013267838396131992,
  -0.007051167543977499,
  0.02665640041232109,
  -0.005913490429520607,
  0.004097461700439453,
  0.038412418216466904,
  -0.01423065084964037,
  0.023023542016744614,
  -0.007326596416532993,
  -0.03562536463141441,
  -0.017934132367372513,
  -0.013930188491940498,
  0.011977534741163254,
  -0.007365899626165628,
  0.024451464414596558,
  -0.06637255847454071,
  1.5677629789934144e-06,
  0.018217233940958977,
  0.0019748930353671312,
  -0.01832951232790947,
  -0.014930643141269684,
  -0.005393484607338905,
  -0.011222314089536667,
  0.015792

In [None]:


hf = HuggingFaceHubEmbeddings(
    repo_id=model_name,
    task="feature-extraction",
    # huggingfacehub_api_token="my-api-key",
)