<a href="https://colab.research.google.com/github/TheOneTrueGuy/DigitalDayDream/blob/main/Ded_Simpl_LLM_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

All the extra comments in these cells are an attempt to use step-back prompting to improve the output of the Colab AI assistant. Results with that particular LLM have been mixed; Zephyr 7B seems a good bit smarter. I will be putting some effort into trying out various models and CPU/GPU combinations in the near future.

In [None]:
# cell 0/zero
# this jupyter notebook project is to implement a dead simple large language model chat interface
# for the Zephyr 7B quantized model and other such LLM models

# What do these installs tell us about the code we are going to be writing?
# How will the functions and their operation depend on these dependencies?
# What methods, variables, objects and functions from these libraries will be used to achieve these ends?
# Watch closely for the import and from-import statements that call on these libraries.
!pip install transformers tiktoken cohere autoawq gradio
!apt install cuda-cudart-12-3

In [None]:
# cell 1/one
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Pick the checkpoint to chat with. Exactly one model_name assignment should be
# left uncommented; the others are kept as alternatives that were tried.
#model_name="uwnlp/llama-2-70b-qlora-openorca" # a lot of these bigger models just chew down the T4 and V100 gpus.
model_name = "TheBloke/zephyr-7B-alpha-AWQ"
#model_name="TheBloke/CausalLM-14B-GPTQ"
#model_name = "EleutherAI/gpt-neo-2.7B"  # Choose the model you want this one and zephyr are the only T4 or less models I've found so far
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model weights; device_map="cuda:0" requests placement on the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0")
# Alternative kept for reference: load first, then move the model to the GPU explicitly.
#model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda:0")

In [None]:
# cell 2/two
# set up privileges and Drive access for saving chat history in text form to Google Drive
# NOTE(review): pass_inputs() opens "dialog.txt" with a relative path, so the log
# only lands on the mounted drive if the working directory is under /content/drive - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# cell 3/three
# demo message placeholder cell
# A hand-written prompt that marks each turn with <|im_start|>ROLE ... <|im_end|>:
# a system turn, a user turn, and an opened assistant turn left for the model to
# complete. The reference generation cell (cell 5) reads this `msg` global.

msg="""
<|im_start|>system
You are a master python coder here to help write python code<|im_end|>
<|im_start|>user
I need help writing python code that outputs a .wav file<|im_end|>
<|im_start|>assistant
"""


In [None]:
# cell 4/four
# what do each of these imports bring with them in terms of properties, methods, functions and function call variables?
import json
import os
import random
import re
import sys
import time
from collections import deque
import gradio as gr
import numpy as np
import math
import datetime as dt

# this class defines a stack object that is used to keep track of the ongoing chat exchange between assistant and user
class Stack:
    """Bounded, oldest-first history of chat exchanges.

    Every entry is a three-item list ``[string1, string2, string3]``. Once
    ``max_depth`` entries are stored, pushing another one discards the oldest
    entry (``deque(maxlen=...)`` semantics).

    Attributes:
        stack: The underlying ``deque`` of entries.
        count: Number of entries currently stored. It always equals
            ``len(self.stack)``, so ``range(count)`` visits every entry.
    """

    def __init__(self, max_depth=20):
        """Create an empty history that keeps at most ``max_depth`` entries."""
        self.stack = deque(maxlen=max_depth)
        self.count = 0

    def push(self, string1, string2, string3):
        """Append one entry, evicting the oldest one when the history is full."""
        self.stack.append([string1, string2, string3])
        # Derive the count from the deque instead of keeping a separate
        # counter: the old counter was clamped to 19 while the deque holds 20
        # entries, so iterating range(count) skipped the newest entry once the
        # history filled up.
        self.count = len(self.stack)

    def get_stack(self):
        """Return a list copy of all stored entries, oldest first."""
        return list(self.stack)

    def get_item(self, index):
        """Return the entry at ``index``, or ``None`` if it is out of range.

        Negative indexes are treated as out of range.
        """
        if 0 <= index < len(self.stack):
            return self.stack[index]
        return None

# Notebook-wide chat history; pass_inputs() pushes one
# [system_text, user_input, result] entry per request.
stacky=Stack()

# similarly, this MessageBuilder class makes it easier to construct the messages to be sent to the LLM

class MessageBuilder:
    """Collects chat lines as ``{"role": ..., "content": ...}`` dicts.

    Lines are held in a ``deque`` limited to ``max_depth`` items, so adding a
    line to a full builder drops the oldest line.
    """

    def __init__(self, max_depth=20):
        """Create an empty builder holding at most ``max_depth`` lines."""
        self.content_template = {"role": None, "content": None}
        self.stack = deque(maxlen=max_depth)

    def add_line(self, role, content):
        """Append a new line for ``role`` carrying ``content``."""
        # Start from a copy of the template so its key order is kept and the
        # shared template itself is never modified.
        entry = dict(self.content_template)
        entry.update(role=role, content=content)
        self.stack.append(entry)

    def get_message(self):
        """Return the collected lines as a new list, oldest first."""
        return [*self.stack]

    def clear(self):
        """Remove every collected line."""
        self.stack.clear()

# Shared builder instance; pass_inputs() clears and refills it on every request.
builder = MessageBuilder()

def msg_llm(msg, frequency_penalty, presence_penalty):
    """Generate a completion for ``msg`` and return only the newly generated text.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        msg: Either a ready-made prompt string, or a list of
            ``{"role": ..., "content": ...}`` dicts, which is rendered to a
            prompt with the tokenizer's chat template.
        frequency_penalty: Non-negative number; 0 (or None) means no penalty.
        presence_penalty: Non-negative number; 0 (or None) means no penalty.

    Returns:
        The decoded completion, without the prompt and without special tokens.
    """
    if isinstance(msg, str):
        prompt = msg
    else:
        # Render role/content dicts with the model's own chat template and
        # leave the assistant turn open for the model to complete.
        prompt = tokenizer.apply_chat_template(
            msg, tokenize=False, add_generation_prompt=True
        )

    # Follow the model's device instead of hard-coding "cuda:0".
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # transformers' generate() has no frequency/presence penalty arguments and
    # rejects unknown keyword arguments, so the two values are folded into
    # repetition_penalty, where 1.0 means "no penalty".
    # NOTE(review): this is only an approximation of the OpenAI-style
    # penalties - confirm the scaling suits the model in use.
    repetition_penalty = (
        1.0 + float(frequency_penalty or 0) + float(presence_penalty or 0)
    )

    output = model.generate(
        input_ids,
        # max_new_tokens limits only the reply; max_length also counted the
        # prompt, leaving no room once the chat history grew.
        max_new_tokens=250,
        num_return_sequences=1,
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Slice off the prompt tokens rather than string-replacing the prompt:
    # decoding with skip_special_tokens=True changes the prompt text, so a
    # textual replace does not reliably remove it.
    new_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


In [None]:
# cell 5/five
# this cell is kept for reference: a minimal, standalone generation run
# It reads the `msg` global (the demo prompt from cell 3) plus the `tokenizer`
# and `model` globals loaded in cell 1.
input_text = msg #"Once upon a time"
# Tokenize the prompt into a PyTorch tensor of token ids.
input_ids = tokenizer.encode(input_text, return_tensors="pt")
# Move the ids to the same GPU the model was loaded on.
input_ids = input_ids.to("cuda:0")
output = model.generate(input_ids, max_length=250, num_return_sequences=1)
# Decode the first (only) returned sequence, dropping special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

In [None]:
# cell 6/six
# define and declare the remaining functions to be called by a Gradio interface
# function to get data from gradio, place it in stack, format with messagebuilder and earlier stack entries
# then send to LLM via msg_llm()
def pass_inputs(system_text, user_input, frequency_penalty, presence_penalty):
    """Gradio callback: send one user turn to the LLM and return its reply.

    Rebuilds the conversation (system prompt, earlier exchanges from
    ``stacky``, then the new user input), renders it to a prompt string with
    the tokenizer's chat template, calls ``msg_llm()``, records the exchange
    in ``stacky`` and appends it to ``dialog.txt``.

    Args:
        system_text: System prompt for this request.
        user_input: The new user message.
        frequency_penalty: Forwarded unchanged to ``msg_llm()``.
        presence_penalty: Forwarded unchanged to ``msg_llm()``.

    Returns:
        The text returned by ``msg_llm()``.
    """
    # Python has no `static` locals, so the last logged system prompt is kept
    # as an attribute on the function object. The system prompt is written to
    # the log only when it differs from the previous request's.
    if getattr(pass_inputs, "_last_system_text", None) != system_text:
        pass_inputs._last_system_text = system_text
        with open("dialog.txt", "a", encoding="utf-8") as dialog:
            dialog.write(f"system: {system_text} \n :end_system \n")

    builder.clear()
    builder.add_line("system", system_text)

    # The builder is a bounded deque that drops its OLDEST line when full.
    # Replay only as many past exchanges as fit next to the system line and
    # the new user line, so the system prompt is never the line evicted.
    history = stacky.get_stack()
    capacity = builder.stack.maxlen
    if capacity is not None:
        pairs = max((capacity - 2) // 2, 0)
        history = history[-pairs:] if pairs else []

    # Replay each past exchange in conversational order: user, then assistant.
    for _past_system, past_user, past_reply in history:
        builder.add_line("user", past_user)
        builder.add_line("assistant", past_reply)
    builder.add_line("user", user_input)

    # msg_llm() works on prompt text, so render the role/content dicts with
    # the model's chat template and open the assistant turn.
    # NOTE(review): requires a tokenizer that ships a chat template.
    prompt = tokenizer.apply_chat_template(
        builder.get_message(), tokenize=False, add_generation_prompt=True
    )
    result = msg_llm(prompt, frequency_penalty, presence_penalty)
    stacky.push(system_text, user_input, result)

    # Append the timestamp, the user input and the reply to the chat log.
    with open("dialog.txt", "a", encoding="utf-8") as dialog:
        dialog.write(f"dtn: {dt.datetime.now()} :dtn \n")
        dialog.write(f"user: {user_input} \n :end_user \n")
        dialog.write(f"assistant: {result} \n :end_assistant \n")
    return result


# Create the Gradio interface.
# The four inputs map, in order, onto pass_inputs(system_text, user_input,
# frequency_penalty, presence_penalty). gr.Interface supplies its own submit
# button, so no gr.Button is listed: every entry in `inputs` becomes a
# positional argument to `fn`, and a fifth one would not match the callback.
# Initial values are set with `value=`; components do not take `default=`.
iface = gr.Interface(
    fn=pass_inputs,
    inputs=[
        gr.Textbox(label="System prompt", type="text", value=""),
        gr.Textbox(label="User Input", type="text", value=""),
        gr.Number(label="Frequency Penalty", value=0, minimum=0, maximum=1),
        gr.Number(label="Presence Penalty", value=0, minimum=0, maximum=1),
    ],
    outputs=gr.Textbox(label="Generated Text", type="text"),
)

# Launch the interface
iface.launch()