##### $Name:\,\color{blue}{Christopher\,J.\,Watson,\,Joseph\,Binny,\,Viktor\,Veselov}$
##### $School:\,\color{blue}{Marcos\,School\,of\,Engineering,\,University\,of\,San\,Diego}$
##### $Research:\,\color{blue}{MSAAI\,Machine\,Learning\,\,TA}$
##### $Date:\,\color{blue}{1/18/2024}$
##### $Revision:\,\color{blue}{1}$

In [1]:
#Basic Imports
import os
from time import time

import joblib
import torch
from torch import cuda, bfloat16

#Transformers Library
import transformers
from transformers import AutoTokenizer

#GPTQ Libraries
from optimum.gptq import GPTQQuantizer, load_quantized_model

#Pipelining-Langchain
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice so the saved notebook output records which hardware ran it.
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU not available, using CPU.")

# Make the selected device PyTorch's default for the rest of the notebook.
torch.set_default_device(device)

Using GPU: NVIDIA GeForce RTX 4080


In [3]:
# Hugging Face access token used when downloading `model_id`.
# It is read from the HF_TOKEN environment variable so the secret never has to be
# typed into (or committed with) the notebook. Set it before starting the kernel,
# e.g. `export HF_TOKEN=hf_...`.
# NOTE(review): when HF_TOKEN is unset this is None; transformers is expected to
# fall back to the token cached by `huggingface-cli login` in that case - confirm.
hf_auth = os.environ.get("HF_TOKEN")

# Checkpoint loaded by the tokenizer/model cell below.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
# Build a 4-bit NF4 bitsandbytes config only when a CUDA GPU is available;
# without a GPU no quantization config is used and the model loads unquantized.
if torch.cuda.is_available():
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        #load_in_8bit =True, # Linux Setting
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
        #llm_int8_threshold = 6.0 # Linux Setting
    )
else:
    bnb_config = None

# Tokenizer for the same checkpoint. The access token is passed here as well so
# the tokenizer download is authenticated the same way as the model download.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to
# run locally - confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    token=hf_auth
)

# Load the causal LM, applying the quantization config chosen above (if any) and
# letting `device_map='auto'` decide weight placement.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
    token=hf_auth #This line needs to correspond to your key
)

# Wrap the model and tokenizer that were just loaded in a text-generation pipeline.
# Passing the objects (rather than `model_id`) reuses the already loaded, possibly
# quantized, weights instead of loading a second copy of the checkpoint.
# `torch_dtype`/`device_map` are omitted because dtype and placement were already
# fixed when `model` was loaded.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

In [5]:
# Ask the model a question directly (no retrieval) and return the generated text.
def AskMeAnything(query, max_length=200, top_k=10, num_return_sequences=1):
    """Generate text for ``query`` with the notebook-level ``pipeline``.

    Sampling is always enabled (``do_sample=True``) and generation stops at the
    tokenizer's EOS token id.

    Args:
        query: Prompt passed to the text-generation pipeline.
        max_length: Value forwarded as the pipeline's ``max_length``
            (defaults to 200, the original hard-coded setting).
        top_k: Value forwarded as the pipeline's ``top_k`` (defaults to 10).
        num_return_sequences: Number of sequences requested from the pipeline
            (defaults to 1).

    Returns:
        A list with the ``'generated_text'`` field of every sequence the
        pipeline returned, in the order returned.
    """
    sequences = pipeline(
        query,
        do_sample=True,
        top_k=top_k,
        num_return_sequences=num_return_sequences,
        eos_token_id=tokenizer.eos_token_id,
        max_length=max_length,
    )

    return [seq['generated_text'] for seq in sequences]

In [6]:
# Smoke-test the bare model (no retrieval) with a single question.
# `results` is the list of generated strings returned by AskMeAnything.
results = AskMeAnything("What is data science?")
print(results)

["What is data science?\nIn today's data-driven world, data science has become an essential tool for organizations to gain insights, make informed decisions, and drive innovation. But what exactly is data science? In this article, we will explore the definition of data science, its key components, and its applications in various industries.\n\nDefinition of Data Science\n\nData science is a field that combines elements of computer science, statistics, and domain-specific knowledge to extract insights and knowledge from data. It involves using various techniques and tools to analyze, interpret, and visualize data to gain insights and make informed decisions. Data science is a multidisciplinary field that requires a combination of technical skills and domain expertise to be successful.\n\nKey Components of Data Science\n\nThere are several key components of data science that are essential to understand:\n\n1. Data Acquisition: This involves collecting and gathering data"]


In [8]:
# Load the text file used as the retrieval corpus for RAG.
# The file is read as UTF-8; the path is relative to the notebook's working directory.
loader = TextLoader("./TA_Data/Mod1-Presentations.txt",
                    encoding="utf8")
docs = loader.load()

In [9]:
# Basic chunking; this only works for plain text.
# Split the loaded documents with chunk_size=1000 and chunk_overlap=20
# (presumably measured in characters - confirm against the splitter's length function).
text_chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
# `chunk_stack` holds the resulting chunks that are indexed in the vector store later.
chunk_stack = text_chunks.split_documents(docs)

In [10]:
# Hugging Face embeddings. This could be done with Llama 2 embeddings, but this model is much more lightweight.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Hand the embedding model the same `device` selected in the CUDA setup cell.
model_kwargs = {"device": device}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [11]:
# Index the chunks in a Chroma vector store, embedded with `embeddings` and
# persisted under "chroma_db" (in-process for now rather than a standalone instance).
# NOTE(review): re-running this cell against an existing "chroma_db" directory may
# add the same chunks again - confirm and clear the directory if duplicates appear.
vectordb = Chroma.from_documents(
    documents=chunk_stack,
    embedding=embeddings,
    persist_directory="chroma_db",
)

# Expose the store as a retriever for the QA chain.
retriever = vectordb.as_retriever()

# Adapt the transformers text-generation pipeline to LangChain's LLM interface.
hfm = HuggingFacePipeline(pipeline=pipeline)

# Retrieval-augmented QA chain: the "stuff" chain type with verbose logging enabled.
qa = RetrievalQA.from_chain_type(llm=hfm, chain_type="stuff", retriever=retriever, verbose=True)

In [None]:
# Testing RAG: run one question through the retrieval QA chain.
# `result` holds the chain's answer and is displayed in the next cell.
query= "What is Diagnostic analytics?" 
result = qa.run(query)

In [13]:
# Show the answer produced by the RAG chain (bare expression so the notebook displays it).
result

' Diagnostic analytics is a type of analytics that is used to understand why something happened in the past. It is also known as causal analytics and is used to assess posts, followers, page views, likes and reviews to identify what worked and what did not work in past campaigns. Diagnostic analytics uses statistical techniques, such as correlation, to measure the degree of relationship between linear variables.'

In [7]:
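#TODO - Quantize the model with GPTQ (optimum) as an alternative to the bitsandbytes config used above.
# NOTE(review): the sketch below is disabled; confirm that `block_name_to_quantize` matches this model's layer path before enabling it.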
#quantizer = GPTQQuantizer(bits=4, dataset="c4", block_name_to_quantize = "model.decoder.layers", model_seqlen = 2048)
#quantized_model = quantizer.quantize_model(model, tokenizer)