In [1]:
from ctransformers import AutoModelForCausalLM, AutoTokenizer

llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF")


# get model and tokenizer from ctransformers
model = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", hf=True)
tokenizer = AutoTokenizer.from_pretrained(model)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]



In [2]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain

In [3]:
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 512,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

In [4]:
import json
import textwrap

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""



def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")



def generate(text):
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=512,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs#, outputs

def parse_text(text):
        wrapped_text = textwrap.fill(text, width=100)
        print(wrapped_text +'\n\n')
        # return assistant_text

In [5]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})

In [5]:
system_prompt = "You are an advanced assistant that excels at summarization and finding useful infomation from text. "
instruction = """"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
template = get_prompt(instruction, system_prompt)

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

NameError: name 'get_prompt' is not defined

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://caesar.web.engr.illinois.edu/")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
all_splits = text_splitter.split_documents(data)

persist_directory = 'docs/chroma/'



from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
OPENAI_API_KEY = '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'
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# Create the vector store
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding,
    persist_directory=persist_directory
)

print(vectordb._collection.count())

5201


In [61]:
def scrape_website(url):
    import requests
    from bs4 import BeautifulSoup
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # text_data = [element.get_text().replace('\n', ' \n ') for element in soup.find_all(['body'])]
    text_data = [element.get_text() for element in soup.find_all(['body'])]

    return " ".join(text_data)

In [62]:
txt = scrape_website("https://caesar.web.engr.illinois.edu/")

In [64]:
indices = [(m.start(0), m.end(0)) for m in re.finditer(r'cAlumni|Students|Teaching', txt)]

# Create a list to store the sections
sections = []

# Extract the sections based on the indices of the headings
for i, (start, end) in enumerate(indices):
    if i < len(indices) - 1:
        sections.append(txt[start:indices[i+1][0]])
    else:
        sections.append(txt[start:])

# Print each section
for i, section in enumerate(sections):
    print(f"Section {i+1}:\n{section}\n")

Section 1:
Matthew Caesar
Professor 
Department of Computer Science
University of Illinois at Urbana-Champaign
Urbana, IL, 61801

Email: caesar (at) cs (dot) illinois (dot) edu
Office: Room 3118, Siebel Center 
Phone: 847-323-2968


Links: 
[ Publications ]
[ Bio ]





I am a Professor in the Department of Computer Science at UIUC.
I am also an Affiliate Professor in the  Department of Electrical and Computer Engineering, an Affiliate Research Professor in the Coordinated Science Laboratory, Affiliate Professor in the School of Information Sciences, and a member of the Information Trust Institute. 
I currently serve as the Vice Chair of ACM SIGCOMM, and the co-chair of The Networking Channel, an online community talk series for the computer systems and networking community. 
I co-founded and previously served as the Chief Science Officer and President of Veriflow (sold to VMware in 2019). 
I received my Ph.D. in Computer Science from UC Berkeley.  


My research focuses on the design,

In [49]:
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(' \n\n')
chunks = splitter.split_text(txt)

In [50]:
chunks[0]

'Matthew Caesar\nProfessor \nDepartment of Computer Science\nUniversity of Illinois at Urbana-Champaign\nUrbana, IL, 61801\n\nEmail: caesar (at) cs (dot) illinois (dot) edu\nOffice: Room 3118, Siebel Center \nPhone: 847-323-2968\n\n\nLinks: \n[ Publications ]\n[ Bio ]\n\n\n\n\n\nI am a Professor in the Department of Computer Science at UIUC.\nI am also an Affiliate Professor in the  Department of Electrical and Computer Engineering, an Affiliate Research Professor in the Coordinated Science Laboratory, Affiliate Professor in the School of Information Sciences, and a member of the Information Trust Institute. \nI currently serve as the Vice Chair of ACM SIGCOMM, and the co-chair of The Networking Channel, an online community talk series for the computer systems and networking community. \nI co-founded and previously served as the Chief Science Officer and President of Veriflow (sold to VMware in 2019). \nI received my Ph.D. in Computer Science from UC Berkeley.  \n\n\nMy research focuse

In [33]:
recombined_chunks = ['\n\n'.join(chunks[i:i+4]) for i in range(0, len(chunks), 4)]

for i, chunk in enumerate(recombined_chunks):
    print(f"Chunk {i+1}:\n{chunk}\n")
    print(f"//////////////////\n")

Chunk 1:
Matthew Caesar
Professor 
Department of Computer Science
University of Illinois at Urbana-Champaign
Urbana, IL, 61801

Email: caesar (at) cs (dot) illinois (dot) edu
Office: Room 3118, Siebel Center 
Phone: 847-323-2968


Links: 
[ Publications ]
[ Bio ]

I am a Professor in the Department of Computer Science at UIUC.
I am also an Affiliate Professor in the  Department of Electrical and Computer Engineering, an Affiliate Research Professor in the Coordinated Science Laboratory, Affiliate Professor in the School of Information Sciences, and a member of the Information Trust Institute. 
I currently serve as the Vice Chair of ACM SIGCOMM, and the co-chair of The Networking Channel, an online community talk series for the computer systems and networking community. 
I co-founded and previously served as the Chief Science Officer and President of Veriflow (sold to VMware in 2019). 
I received my Ph.D. in Computer Science from UC Berkeley.  


My research focuses on the design, analy

In [9]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [12]:
from langchain.prompts import PromptTemplate

# Bprompt takes in the documents and the question and passes it to a language model.

template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!".
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)# Run chain


# Initilaize chain
# Set return_source_documents to True to get the source document
# Set chain_type to prompt template defines
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type='stuff',
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [19]:
# Pass question to the qa_chain
question = "Who is the professor?"
result = qa_chain({"query": question})
result["result"]

' Thanks for asking! Matthew Caesar is the professor.'

In [13]:
import json
import textwrap

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""



def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")



def generate(text):
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=512,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs#, outputs

def parse_text(text):
        wrapped_text = textwrap.fill(text, width=100)
        print(wrapped_text +'\n\n')
        # return assistant_text

In [11]:
from langchain import PromptTemplate,  LLMChain
system_prompt = "You are an advanced assistant that excels at summarization and understanding. "
instruction = "Use the following pieces of context to summarize. If you don't know the answer, just say that you don't know, don't try to make up an answer. \n\n {context} \n\n "
template = get_prompt(instruction, system_prompt)
print(template)

prompt = PromptTemplate(template=template, input_variables=["context"])
llm_chain = LLMChain(prompt=prompt, llm=llm, verbose=True, vectordb=vectordb)

[INST]<<SYS>>
You are an advanced assistant that excels at summarization and understanding. 
<</SYS>>

Use the following pieces of context to summarize. If you don't know the answer, just say that you don't know, don't try to make up an answer. 

 {context} 

 [/INST]


ValidationError: 1 validation error for LLMChain
vectordb
  extra fields not permitted (type=value_error.extra)

In [None]:
text = "who is the professor?"
output = llm_chain.run(text)

parse_text(output)