In [1]:
from langchain.document_loaders import DataFrameLoader
from langchain.llms             import GPT4All
from langchain.prompts          import PromptTemplate

from pandas import read_csv, concat, DataFrame

In [2]:
def _callback(token_id: int, response: bytes) -> bool:
    """Collect streamed tokens and stop generation after four sentences.

    Meant to be installed as the model's ``_response_callback`` hook, which
    hands over each generated token as raw bytes.

    Args:
        token_id: Identifier of the generated token (unused).
        response: UTF-8 encoded bytes of the generated token.

    Returns:
        ``True`` to keep generating while fewer than four ``'.'`` tokens have
        been seen, ``False`` once the fourth one arrives.

    Side effects (module-level globals):
        responses: the decoded token is appended on every call.
        counter: incremented on each ``'.'`` token; reset to 0 when stopping.
        four_sentences: ``True`` only when the four-sentence limit was hit.
    """
    global counter, responses, four_sentences

    # Decode once. A single token can carry only part of a multi-byte UTF-8
    # character, so substitute undecodable bytes instead of raising from
    # inside the generation callback.
    token = response.decode('utf-8', errors='replace')

    # save the summary tokens
    responses.append(token)

    if token == '.':
        # every time a . is detected is a new sentence
        counter += 1

    # fewer than four sentences so far: continue the generation process
    if counter < 4:
        four_sentences = False
        return True

    # limit reached: reset the sentence counter and stop further generation
    counter = 0
    four_sentences = True
    return False

In [3]:
# sentence-tracking state shared with _callback
counter, responses, four_sentences = 0, [], False

# course names and overviews that will be summarized
data = read_csv('course-data-small.csv')[['course_name', 'overview']]

# wrap the local model weights with the LangChain GPT4All LLM
model_path = '/app/ggml-gpt4all-j-v1.3-groovy.bin'
llm = GPT4All(
    model=model_path,
    verbose=False,
    n_threads=8,
    seed=42,
    temp=0.3,
    streaming=False,
    use_mlock=True,
)

# replace the underlying model's response callback with _callback
llm.client.model._response_callback = _callback

# Summarization prompt; {overview} is filled in per course.
# Spelled with explicit newlines so the trailing space after "In summary,"
# stays visible and cannot be stripped by an editor.
template = (
    "\n"
    "    Summarize this overview in as few words as possible.\n"
    "    {overview}\n"
    "    In summary, \n"
)
prompt = PromptTemplate(template=template, input_variables=["overview"])

Found model file at  /app/ggml-gpt4all-j-v1.3-groovy.bin
gptj_model_load: loading model from '/app/ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
gptj_model_load: n_vocab = 50400
gptj_model_load: n_ctx   = 2048
gptj_model_load: n_embd  = 4096
gptj_model_load: n_head  = 16
gptj_model_load: n_layer = 28
gptj_model_load: n_rot   = 64
gptj_model_load: f16     = 2
gptj_model_load: ggml ctx size = 5401.45 MB
gptj_model_load: kv self size  =  896.00 MB
gptj_model_load: ................................... done
gptj_model_load: model size =  3609.38 MB / num tensors = 285


In [4]:
# Upper bound on follow-up generations per row when the model stops before
# four sentences were produced; keeps the continuation loop from spinning
# forever if the model keeps returning without reaching the limit.
MAX_CONTINUATIONS = 10

temp = {
    'course_name': [], 
    'o_summarized': [], 
    'overview': []
}

# start the output file with only the header row; batches are appended below
result_df = DataFrame(temp)
result_df.to_csv('gpt-summarized-info.csv', index=False)
row_counter = 0

for idx, doc in data.iterrows():
    temp['course_name'].append(doc.course_name)
    temp['overview'].append(doc.overview)

    # Use a separate name for the formatted text: rebinding `prompt` itself
    # would replace the template with a plain string, and every later row
    # would then be summarized from the first row's overview.
    row_prompt = prompt.format(overview=doc.overview)

    # first pass; _callback fills `responses` and sets `four_sentences`
    llm.predict(row_prompt)

    # keep extending the partial summary until four sentences were generated
    continuations = 0
    while not four_sentences and continuations < MAX_CONTINUATIONS:
        llm.predict(row_prompt + ''.join(responses))
        continuations += 1

    temp['o_summarized'].append(''.join(responses))

    row_counter += 1

    # Save the data to the CSV every 5 rows
    if row_counter % 5 == 0:
        DataFrame(temp).to_csv('gpt-summarized-info.csv', mode='a', index=False, header=False)
        temp = {'course_name': [], 'o_summarized': [], 'overview': []}
        print(f'\rCompleted IDX: {idx} \t| Counter: {counter}', end='', flush=True)

    # reset the per-row callback state (counter may be non-zero if the
    # continuation cap was hit before the fourth sentence)
    responses = []
    four_sentences = False
    counter = 0

# write out the rows left over when the total is not a multiple of 5
if temp['course_name']:
    DataFrame(temp).to_csv('gpt-summarized-info.csv', mode='a', index=False, header=False)
    temp = {'course_name': [], 'o_summarized': [], 'overview': []}






GPT-J: reached the end of the context window so resizing





GPT-J: reached the end of the context window so resizing




Completed IDX: 4 	| Counter: 0

{'course_name': [], 'o_summarized': [], 'overview': []}

---

!pip list

from gpt4all import GPT4All

model = GPT4All('/app/ggml-gpt4all-j-v1.3-groovy.bin',)
model.model.set_thread_count(8)

help(model.model._response_callback)

%%time
model.generate(prompt.format(overview=data.overview[0]))

from pandas import read_csv, concat


def _callback(token_id: int, response: bytes) -> bool:
    """Print streamed tokens and stop generation after two sentences.

    NOTE: this redefines the ``_callback`` declared in the earlier cell (the
    four-sentence collector); whichever definition ran last is the one that
    gets installed.

    Args:
        token_id: Identifier of the generated token (unused).
        response: UTF-8 encoded bytes of the generated token.

    Returns:
        ``True`` to keep generating while fewer than two ``'.'`` tokens have
        been seen, ``False`` once the second one arrives.

    Side effects:
        Increments the module-level ``counter`` on each ``'.'`` token and
        resets it to 0 when stopping; prints every token to stdout.
    """
    global counter

    # Decode once. A single token can carry only part of a multi-byte UTF-8
    # character, so substitute undecodable bytes instead of raising from
    # inside the generation callback.
    token = response.decode('utf-8', errors='replace')

    if token == '.':
        # every time a . is detected is a new sentence
        counter += 1

    if counter < 2:
        # print them / process them - this will become calls to the /animateSay endpoint
        print(token, end='')

        # continue generation process
        return True

    # second sentence finished: reset the counter, close the line and stop
    counter = 0
    print('.')
    return False


counter = 0
data = read_csv('course-data-small.csv')[['course_name', 'overview']]
model.model._response_callback = _callback

%%time
model.generate(
    prompt=prompt.format(overview=data.overview[0]), 
    streaming=True, temp=0.3,
)

from langchain.chains           import RetrievalQA
from langchain.document_loaders import DataFrameLoader
from langchain.llms             import GPT4All
from langchain.embeddings       import HuggingFaceEmbeddings
from langchain.text_splitter    import CharacterTextSplitter
from langchain.vectorstores     import Chroma
from langchain.prompts          import PromptTemplate

from langchain.chains.summarize import load_summarize_chain
from langchain.callbacks.base   import BaseCallbackHandler


import pandas as pd
from pandas import read_csv, concat

def capitalize(text):
    """Return ``text`` with each whitespace-separated word capitalized.

    Words are re-joined with single spaces, so runs of whitespace and any
    leading or trailing whitespace collapse. Each word gets an upper-case
    first character and lower-case remainder (``str.capitalize``).
    """
    return " ".join(map(str.capitalize, text.split()))


class MainCallback(BaseCallbackHandler):
    """Callback handler that aborts a run after ``max_sentences`` sentences.

    A sentence is counted every time a streamed token is exactly ``'.'``.
    """

    def __init__(self, max_sentences=5, *args, **kwargs):
        """Set up the sentence counter.

        Args:
            max_sentences: Number of ``'.'`` tokens after which
                ``on_llm_new_token`` raises to stop the run.
            *args: Forwarded to ``BaseCallbackHandler``.
            **kwargs: Forwarded to ``BaseCallbackHandler``.
        """
        super().__init__(*args, **kwargs) 
        self.max_sentences = max_sentences
        self.num_sentences = 0
        # NOTE(review): presumably makes LangChain propagate exceptions raised
        # by this handler instead of swallowing them - confirm against the
        # installed LangChain version.
        self.raise_error = True

    def on_llm_new_token(self, token: str, **kwargs):
        """Count sentence-ending tokens and raise once the limit is reached.

        Args:
            token: The newly generated token.

        Raises:
            Exception: when ``num_sentences`` reaches ``max_sentences``.
        """
        # tokens other than a lone period need no processing
        if token != '.':
            return

        self.num_sentences += 1
        if self.num_sentences >= self.max_sentences:
            print('Max senteces reached: ', self.num_sentences)
            # Abort only once the limit is hit; raising on every '.' would end
            # the run after the first sentence and make max_sentences moot.
            raise Exception('Dumb Exception.')

    def on_llm_end(self, response, *, run_id, parent_run_id=None, **kwargs):
        """Report how many sentences have been counted so far."""
        print('The number of sentences is: ', self.num_sentences)
        


# NOTE(review): `callback` is created here but is not passed to `llm` below.
callback = MainCallback(max_sentences=5)

data = read_csv('course-data-small.csv')[['course_name', 'overview']]
data['content'] = data['course_name'] + ' ' + data.overview
# Drop typographic apostrophes and turn carets into spaces. '^' is a regex
# metacharacter and the default of `regex` differs between pandas versions,
# so regex=False pins both replacements to literal matching.
data['content'] = (
    data.content
    .str.replace('’', '', regex=False)
    .str.replace('^', ' ', regex=False)
)
loader = DataFrameLoader(data[['content']], page_content_column='content')
document = loader.load_and_split()

# embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
# db = Chroma.from_documents(document, embeddings,)
model_path = '/app/ggml-gpt4all-j-v1.3-groovy.bin'
llm = GPT4All(model=model_path, verbose=False, n_threads=8, seed=42, temp=0.3, streaming=False, use_mlock=True)

# call the model's response callback by hand six times with a dummy token
for call_no in range(5):
    llm.client.model._response_callback(0, 'Test'.encode())
# the sixth call stays a bare expression so the cell still shows its return value
llm.client.model._response_callback(0, 'Test'.encode())

from langchain import PromptTemplate

# Summarization prompt; {overview} is filled in per course.
# Spelled with explicit newlines so the trailing space after "In summary,"
# stays visible and cannot be stripped by an editor.
template = (
    "\n"
    "    Summarize this overview in as few words as possible.\n"
    "    {overview}\n"
    "    In summary, \n"
)
prompt = PromptTemplate(template=template, input_variables=["overview"])

# preview the rendered prompt for the first course
prompt.format(overview=data['overview'][0])

# reload the two columns of interest and renumber the rows from 0
data = read_csv('course-data-small.csv')[['course_name', 'overview']].reset_index(drop=True)

# column buffers that are written to disk in batches
temp = {column: [] for column in ('course_name', 'o_summarized', 'overview')}

# create the output file from the still-empty buffers
result_df = pd.DataFrame(temp)
result_df.to_csv('gpt-summarized-info.csv', index=False)

counter = 0
for idx, doc in data.iterrows():
    temp['course_name'].append(doc['course_name'])
    temp['overview'].append(doc['overview'])
    temp['o_summarized'].append(llm.predict(prompt.format(overview=doc['overview'])))

    counter = counter + 1

    # every fifth row: append the batch to the CSV and start fresh buffers
    if not counter % 5:
        pd.DataFrame(temp).to_csv('gpt-summarized-info.csv', mode='a', index=False, header=False)
        temp = {column: [] for column in ('course_name', 'o_summarized', 'overview')}
        print(f'\rCompleted IDX: {idx} \t| Counter: {counter}', end='', flush=True)

    # only the first row is processed: the loop exits unconditionally here
    break

temp