In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

import openai
import os
# openai.api_key = ""
os.environ['OPENAI_API_KEY'] = ""

In [2]:
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader('data/TextBooks', glob="./dsc250fall2023*", show_progress=True) ### many doc loaders
docs = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(docs)

100%|██████████| 12/12 [01:16<00:00,  6.36s/it]
Created a chunk of size 2460, which is longer than the specified 1000
Created a chunk of size 1791, which is longer than the specified 1000
Created a chunk of size 1614, which is longer than the specified 1000
Created a chunk of size 1389, which is longer than the specified 1000
Created a chunk of size 1066, which is longer than the specified 1000
Created a chunk of size 1544, which is longer than the specified 1000
Created a chunk of size 1019, which is longer than the specified 1000
Created a chunk of size 1229, which is longer than the specified 1000


In [3]:
# embeddings = OpenAIEmbeddings() ### many embedding functions
embeddings = HuggingFaceInstructEmbeddings(query_instruction="Represent the query for retrieval: ", model_kwargs={'device':'cuda:0'}, encode_kwargs={'batch_size':32})


  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [4]:
embeddings.embed_documents()

HuggingFaceInstructEmbeddings(client=INSTRUCTOR(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: T5EncoderModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (3): Normalize()
), model_name='hkunlp/instructor-large', cache_folder=None, model_kwargs={'device': 'cuda:0'}, encode_kwargs={'batch_size': 32}, embed_instruction='Represent the document for retrieval: ', query_instruction='Represent the query for retrieval: ')

In [5]:
vectorstore = FAISS.from_documents(documents, embeddings) ### choose from FAISS, Chroma, 
vectorstore.save_local('data/TextBooks/vectorstore_DSC250.vs')
FAISS.load_local("data/TextBooks/vectorstore_DSC250.vs", embeddings=embeddings)

<langchain.vectorstores.faiss.FAISS at 0x7fb8cdfe6ee0>

In [6]:
documents[0]

Document(page_content='DSC250: Advanced Data Mining\n\nLanguage Models\n\nZhiting Hu Lecture 9, October 26, 2023\n\nLast lecture\n\nNeural language models:\n\n! Embedding: one-hot vectors -> embedding vectors\n\n! Neural networks\n\n2\n\nNeural Architectures of LMs\n\n3\n\nOutline\n\nRecurrent Networks (RNNs)\n\n! Long-range dependency, vanishing gradients ! LSTM ! RNNs in different forms\n\nAttention Mechanisms ! (Query, Key, Value) ! Attention on Text and Images\n\nTransformers: Multi-head Attention\n\n! Transformer ! BERT\n\n4\n\nOutline\n\nRecurrent Networks (RNNs)\n\n! Long-range dependency, vanishing gradients ! LSTM ! RNNs in different forms\n\nAttention Mechanisms ! (Query, Key, Value) ! Attention on Text and Images\n\nTransformers: Multi-head Attention\n\n! Transformer ! BERT\n\n5\n\nConvNets v.s. Recurrent Networks (RNNs)\n\nSpatial Modeling vs. Sequential Modeling ● Fixed vs. variable number of computation steps.\n\nℎ"\n\nℎ#\n\nℎ$\n\nℎ!\n\nℎ!\n\n=\n\n𝑥"\n\n𝑥#\n\n𝑥$\n\n𝑥!\n\n

In [5]:
query = "What is ML algorithm?"
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

An ML algorithm is a set of precise instructions telling the computer how to learn from data.

This is because real world data has “structure”.

An ML algorithm is a set of precise instructions telling the computer how to learn from data.

Spoiler: the algorithms are usually pretty simple. It’s the data that does the real work.

An ML algorithm is a set of precise instructions telling the computer how to learn from data.

Spoiler: the algorithms are usually pretty simple. It’s the data that does the real work.

This is because real world data has “structure”.

Problem: On a scale from 1-10, how happy is this person?

Recall: Least Squares Regression

▶ Example: predict the price of a laptop.

▶ Choose some features:

▶ CPU speed, amount of RAM, weight (kg).

▶ Prediction function (weighted “vote”):

(price) = 𝑤0 + 𝑤1 × (cpu) + 𝑤2 × (ram) + 𝑤3 × (weight)

▶ Learn 𝑤𝑖 by minimizing squared error.

Representations

▶ Computers don’t understand the concept of a

laptop.


In [6]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [None]:
from langchain.llms import HuggingFacePipeline
import torch
llm = HuggingFacePipeline.from_model_id(
                model_id="lmsys/vicuna-7b-v1.5",
                task="text-generation",#"text2text-generation",#,
                model_kwargs={"temperature": 0.7, "max_length": 32, "torch_dtype": torch.float16},
                pipeline_kwargs={'max_new_tokens':32},
                device="cuda:0"
            )
qa = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), memory=memory, return_source_documents=True)

In [None]:
query = "What is ML algorithm?"
result = qa({"question": query})

In [None]:
result

In [None]:
from model.llm_langchain_tutor import LLMLangChainTutor

In [None]:
lmtutor = LLMLangChainTutor(openai_key="")
lmtutor.load_vector_store("data/TextBooks/vectorstore_DSC140B-Lec01")
lmtutor.conversational_qa_init()

In [None]:
lmtutor.conversational_qa('Is there a discussion session this week?')

In [None]:
from model.llm_langchain_tutor import LLMLangChainTutor

In [None]:
lmtutor = LLMLangChainTutor(embedding='instruct_embedding', llm='hf_facebook/bart-large-cnn', device='cuda:0')

In [None]:
lmtutor.load_document(doc_path="data/TextBooks", glob='./DSC140B-Lec01.pdf', chunk_size=100, chunk_overlap=10)
lmtutor.generate_vector_store()
# lmtutor.conversational_qa_init()

In [None]:
lmtutor.similarity_search("is there a discussion session?")[0].page_content

In [None]:
lmtutor.conversational_qa('is there a discussion session this week?')

# LLama based embedding generation

In [1]:
import torch

In [2]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
model_name = "lmsys/vicuna-13b-v1.5"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)


Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

No sentence-transformers model found with name /home/reventh/.cache/torch/sentence_transformers/lmsys_vicuna-13b-v1.5. Creating a new one with MEAN pooling.


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "meta-llama/Llama-2-13b-chat-hf"
token = "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"
model_kwargs = {'use_auth_token' : token} #your token to use the models
embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

#Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` 
#or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})
embedding_model.client.tokenizer.pad_token =  embedding_model.client.tokenizer.eos_token

embedding = embedding_model.embed_query("your text")


Downloading .gitattributes:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading LICENSE.txt:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading (…)nsible-Use-Guide.pdf:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading USE_POLICY.md:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

In [1]:
import argparse
import os

from dataset import prepare_data, get_parsed_data
from model.llm_langchain_tutor import LLMLangChainTutor
from utils import get_cache_dir, get_document_folder, get_vector_file
from metrics import EmbeddingModelMetrics
from loguru import logger
from tqdm import tqdm
import torch

Found cached dataset quac (/home/reventh/.cache/huggingface/datasets/quac/plain_text/1.1.0/4170258e7e72d7c81bd6441b3f3489ea1544f0ff226ce61e22bb00c6e9d01fb6)


  0%|          | 0/2 [00:00<?, ?it/s]

Available splits: ['train', 'validation']
**Train** data is used


Found cached dataset json (/home/reventh/.cache/huggingface/datasets/b-mc2___json/b-mc2--sql-create-context-21a3552632daf3cf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Available splits: ['train']
**Train** data is used


In [2]:
base_data_dir = get_cache_dir()
doc_folder = get_document_folder(base_data_dir, 'squad', True)


In [3]:
doc_folder

'~/.cache/squad-debug/train/documents'

In [9]:
lmtutor = LLMLangChainTutor(
        embedding='hf_stas/tiny-random-llama-2',
        llm="hf_stas/tiny-random-llama-2",
        embed_device='cpu',
        llm_device='cpu',
        cache_dir=base_data_dir,
        debug=True,
        token = "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"
    )

Downloading config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/211k [00:00<?, ?B/s]

Some weights of LlamaModel were not initialized from the model checkpoint at stas/tiny-random-llama-2 and are newly initialized: ['model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading tokenizer_config.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/64.2k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
`splitted_documents = lmtutor._load_document(
            doc_folder, "*.txt", chunk_size=1, chunk_overlap=1
        )

100%|██████████| 21/21 [00:00<00:00, 23.86it/s]


In [6]:
splitted_documents

[Document(page_content='Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', metadata={'source': '~/.cache/squad-debug/train/documents/13.txt'}),
 Document(page_content='This Main Building, and the library collection, was entirely destroyed by a fire in April 1879, and the school closed immediately and students were sent home. The university founder, F

In [7]:
docs = lmtutor.doc_loader(
            doc_folder,
            glob="*.txt",
            show_progress=True,
            use_multithreading=False,
            max_concurrency=16,
        ).load()

100%|██████████| 21/21 [00:00<00:00, 195.13it/s]


In [8]:
docs

[Document(page_content='The College of Engineering was established in 1920, however, early courses in civil and mechanical engineering were a part of the College of Science since the 1870s. Today the college, housed in the Fitzpatrick, Cushing, and Stinson-Remick Halls of Engineering, includes five departments of study – aerospace and mechanical engineering, chemical and biomolecular engineering, civil engineering and geological sciences, computer science and engineering, and electrical engineering – with eight B.S. degrees offered. Additionally, the college offers five-year dual degree programs with the Colleges of Arts and Letters and of Business awarding additional B.A. and Master of Business Administration (MBA) degrees, respectively.', metadata={'source': '~/.cache/squad-debug/train/documents/8.txt'}),
 Document(page_content='As of 2012[update] research continued in many fields. The university president, John Jenkins, described his hope that Notre Dame would become "one of the pre

In [5]:
from model.llm_encoder import LLMBasedEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [17]:
llm = AutoModelForCausalLM.from_pretrained(
                'stas/tiny-random-llama-2',
                torch_dtype=torch.float16,
                cache_dir='.cache',
                output_hidden_states=True,
            ).to('cuda:0')
tokenizer = AutoTokenizer.from_pretrained(
                'stas/tiny-random-llama-2', device='cuda:0'
            )

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at stas/tiny-random-llama-2 and are newly initialized: ['model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
embedding_model = LLMBasedEmbeddings(
                model=llm,
                tokenizer=tokenizer,
                device='cuda:0',
                aggregation='mean',
                hidden_state_id=-1,
            )

2023-11-21 18:12:09.556 | INFO     | model.llm_encoder:__init__:25 - Initialized LLMBasedEmbeddings. with hidden_state_id: -1 and aggregation: mean


In [19]:
embedding_model.embed_query("How are you?")

array([-0.2231 ,  0.2179 ,  0.5757 ,  0.538  , -0.2148 ,  0.1919 ,
       -0.3562 , -0.1221 ,  0.2183 ,  0.06445, -0.556  , -0.0528 ,
        0.0895 , -0.1726 ,  0.1979 , -0.2522 ], dtype=float16)

In [21]:
embedding_model.embed_query("How are you?"*100)

array([-0.1273  ,  0.1805  ,  0.4082  ,  0.3765  , -0.1032  ,  0.2081  ,
       -0.3352  , -0.14    ,  0.072   ,  0.3286  , -0.1765  , -0.3413  ,
        0.002716, -0.1946  ,  0.1428  , -0.2119  ], dtype=float16)

In [14]:
texts = list(map(lambda x: x.replace("\n", " "), ["How are you?"]))
texts

['How are you?']

In [16]:

embedding_model.model(
                    embedding_model.tokenizer(texts[0], return_tensors="pt")["input_ids"].to(
                        embedding_model.device
                    )
                )

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.1000, -0.1898,  0.0226,  ...,  0.0487, -0.0326, -0.0445],
         [-0.0657, -0.1553, -0.0657,  ...,  0.0950, -0.0635,  0.0444],
         [-0.0210,  0.0241, -0.0523,  ...,  0.1320,  0.1636,  0.0373],
         ...,
         [-0.0420, -0.0106,  0.0022,  ...,  0.0060, -0.0757,  0.0936],
         [-0.0029,  0.0093,  0.0581,  ..., -0.0197, -0.1307, -0.0053],
         [-0.1171, -0.1307,  0.0906,  ...,  0.0430, -0.0152, -0.0641]]],
       device='cuda:0', grad_fn=<ToCopyBackward0>), past_key_values=((tensor([[[[-1.7944e-01,  1.3252e-02,  1.4551e-01, -1.1711e-02],
          [ 3.6865e-02,  5.9692e-02, -4.0817e-03,  6.3232e-02],
          [ 3.1830e-02,  9.8938e-02,  9.9609e-02, -1.0931e-01],
          [-1.0620e-02,  2.8214e-02, -1.8921e-01, -3.0548e-02],
          [ 1.8762e-01, -1.1816e-01,  4.1992e-02,  4.3152e-02],
          [ 6.7505e-02,  2.0828e-02,  2.1042e-02, -7.2327e-02],
          [ 1.7700e-03,  3.0075e-02,  1.5637e-01,  1.1725e-01],