<a href="https://colab.research.google.com/github/ThienAnTrinh/LangChainLlama2ProductSearch/blob/master/product_search_langchain_llama2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain==0.0.277
!pip install transformers==4.31.0 accelerate==0.21.0 bitsandbytes==0.40.2 datasets==2.14.5
!pip install sentence-transformers==2.2.2 InstructorEmbedding==1.0.1
!pip install chromadb



## Model

In [None]:
# Load llm and tokenizer
# Loads the Llama-2 chat checkpoint with a 4-bit quantization config, its
# tokenizer, and the generation config stored alongside the checkpoint.

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
import torch

model_name = "NousResearch/Llama-2-7b-chat-hf"
# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# NOTE(review): use_cache=False is usually a fine-tuning setting; for pure
# inference it may only slow generation down — confirm it is intended.
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_cache=False
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): right padding is typical for training; batched generation with
# decoder-only models usually expects left padding — confirm before batching.
tokenizer.padding_side = "right"

# Generation settings loaded from the same model name; passed to the pipeline below.
generation_config = GenerationConfig.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Build a Hugging Face text-generation pipeline around the loaded model and
# wrap it so LangChain can use it as an LLM.

from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# NOTE(review): max_length is expected to bound prompt + generated tokens
# together — confirm this is the intended limit rather than max_new_tokens.
# NOTE(review): temperature / top_p only matter when sampling is enabled;
# do_sample is not set here, so that depends on generation_config — confirm.
pipe = pipeline(
    task="text-generation",
    model=llm,
    tokenizer=tokenizer,
    generation_config=generation_config,
    max_length=2048,
    repetition_penalty=1.15,
    temperature=0.2,
    top_p=0.8,
)

llm_pipeline = HuggingFacePipeline(pipeline=pipe)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


## Data

In [None]:
from datasets import load_dataset

# Validation split of the "cameras_medium" configuration of wdc/products-2017.
dataset = load_dataset(
    path="wdc/products-2017",
    name="cameras_medium",
    split="validation",
)

In [None]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
    num_rows: 1051
})

In [None]:
def concat_title_description(example):
    """Derive flat product fields from the left-hand side of a pair record.

    Adds the keys ``id``, ``title``, ``description``, ``text``, ``brand`` and
    ``price`` to ``example`` (modifying it in place) and returns it, so the
    function can be passed to ``Dataset.map``.

    Args:
        example: mapping with at least ``id_left``, ``title_left``,
            ``description_left``, ``brand_left`` and ``price_left``.

    Returns:
        The same ``example`` mapping with the derived keys added. Missing
        (``None`` or empty) source values become ``""``. ``text`` is the
        description followed by the title, separated by one space; an empty
        part is skipped so ``text`` never carries stray leading or trailing
        whitespace.

    Raises:
        AssertionError: if ``id_left`` is ``None``.
    """
    assert example["id_left"] is not None, "NULL ID"

    example["id"] = example["id_left"]
    example["title"] = example["title_left"] or ""
    example["description"] = example["description_left"] or ""
    # Join only the non-empty parts: a missing description or title must not
    # leave a dangling space (or a whitespace-only document) to be embedded.
    example["text"] = " ".join(
        part for part in (example["description"], example["title"]) if part
    )
    example["brand"] = example["brand_left"] or ""
    example["price"] = example["price_left"] or ""

    return example

In [None]:
# Apply concat_title_description to every row; the result is bound to a new
# name so the originally loaded `dataset` stays available.
updated_dataset = dataset.map(concat_title_description)
updated_dataset

Dataset({
    features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right', 'id', 'title', 'description', 'text', 'brand', 'price'],
    num_rows: 1051
})

### Data embedding

In [None]:
from langchain.docstore.document import Document

# Build one Document per distinct product id.
# The source rows are product *pairs*, so the same left-hand product can occur
# in several rows; embedding each occurrence would fill the search results
# with repeated hits for the same product.
docs = []
seen_ids = set()

# Iterate row by row and use descriptive names so the builtin `id` is not
# shadowed in the notebook namespace after this cell has run.
for row in updated_dataset:
    assert row["text"] is not None, "null document"

    product_id = row["id"]
    if product_id in seen_ids:
        continue
    seen_ids.add(product_id)

    docs.append(
        Document(
            page_content=row["text"],
            metadata={
                "id": product_id,
                "title": row["title"],
                "description": row["description"],
                "brand": row["brand"],
                "price": row["price"]
            }
        )
    )

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma


EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"
VECTOR_STORE_PATH = "data/vectorstore"

embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Key every document by its product id. Without explicit ids the store assigns
# a fresh random id to each document, so every re-run of this cell against the
# persisted directory adds another copy of the whole catalogue. Stable ids are
# meant to make re-runs overwrite entries instead of appending to them.
# The dict also collapses repeated product ids, since ids must be unique
# within one insert.
docs_by_id = {str(doc.metadata["id"]): doc for doc in docs}

db = Chroma.from_documents(
    list(docs_by_id.values()),
    embeddings,
    ids=list(docs_by_id.keys()),
    persist_directory=VECTOR_STORE_PATH
)

load INSTRUCTOR_Transformer
max_seq_length  512


## Query

In [None]:
# Open the vector store from its persist directory with the same embedding
# function. This rebinds `db`, so the query cells below run against the
# store on disk rather than the object returned when it was built.
db = Chroma(
    persist_directory=VECTOR_STORE_PATH,
    embedding_function=embeddings
)

In [None]:
import pandas as pd
import numpy as np


def search(query, k=10):
    """Return the products most similar to ``query`` as a DataFrame.

    Args:
        query: free-text search string passed to the vector store.
        k: maximum number of hits to request (defaults to 10).

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``brand``, ``title``,
        ``description`` and ``price``, one row per hit in the order the store
        returned them. The columns are present even when there are no hits.
    """
    columns = ["id", "brand", "title", "description", "price"]
    hits = db.similarity_search(query, k=k)
    rows = [{column: hit.metadata[column] for column in columns} for hit in hits]

    # Passing `columns` keeps the schema stable for an empty result list.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Example free-text query against the product index.
# NOTE(review): the saved output lists the same product id several times,
# which suggests duplicate documents in the vector store.
search("multislot portable lightweight quality cheap")

Unnamed: 0,id,brand,title,description,price
0,8051451,,"""SDCFXPS-256G-X46""@en ""SDCFXPS-256G-X46-NR | ...","""\n COMPACT FLASH CARD 256G...",
1,8051451,,"""SDCFXPS-256G-X46""@en ""SDCFXPS-256G-X46-NR | ...","""\n COMPACT FLASH CARD 256G...",
2,8051451,,"""SDCFXPS-256G-X46""@en ""SDCFXPS-256G-X46-NR | ...","""\n COMPACT FLASH CARD 256G...",
3,17401560,"""Sandisk""@en","""SanDisk 32GB Extreme CompactFlash Memory Car...","""\n32GB Data Storage Capacity\n400x Speed Rati...",
4,17401560,"""Sandisk""@en","""SanDisk 32GB Extreme CompactFlash Memory Car...","""\n32GB Data Storage Capacity\n400x Speed Rati...",
5,17401560,"""Sandisk""@en","""SanDisk 32GB Extreme CompactFlash Memory Car...","""\n32GB Data Storage Capacity\n400x Speed Rati...",
6,8679099,,"""Sandisk Ultra SDXC SD Memory Card Class 10 -...","""Ideal for compact-to-midrange point-and-shoo...","""GBP""@en, ""48.69""@en"
7,8679099,,"""Sandisk Ultra SDXC SD Memory Card Class 10 -...","""Ideal for compact-to-midrange point-and-shoo...","""GBP""@en, ""48.69""@en"
8,8679099,,"""Sandisk Ultra SDXC SD Memory Card Class 10 -...","""Ideal for compact-to-midrange point-and-shoo...","""GBP""@en, ""48.69""@en"
9,17424720,"""Pelican""@en","""Pelican 0945 Compact Flash Memory Card Case""...","""\nStores 6 Compact Flash Cards\nRemovable Ins...",
