First, download Ollama, either from the command line or through the website: https://ollama.com/

It is also recommended that you create and use a virtual environment.

In [None]:
!pip install ollama

The requests package is also needed.

In [None]:
!pip install requests

These are the needed import statements:

In [None]:
import ollama
import requests
import json
import os

Define the base URL of your local LLM server. This is the URL for Ollama:

In [None]:
# Base URL of the local Ollama server; the functions below build their
# /api/... endpoint URLs from it.
BASE_URL = 'http://localhost:11434'

This is the function to pull the models from the Ollama server:

In [None]:
def pull_model(model_name):
    """
    Pull the specified model from the library to the local server.

    Sends a POST request to ``{BASE_URL}/api/pull`` and prints every JSON
    progress object as the server emits it.

    Args:
        model_name: Name (and optional tag) of the model, e.g. ``'llama3:8b'``.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    pull_url = f"{BASE_URL}/api/pull"
    payload = {"name": model_name}

    # stream=True lets each progress line be printed as it arrives instead of
    # only after the whole pull has been buffered in memory. The ``with``
    # block closes the connection when the loop ends or an error is raised.
    with requests.post(pull_url, json=payload, stream=True) as response:
        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code}, {response.text}")

        print("Model pull initiated.")
        # The endpoint streams by default, so we expect multiple JSON objects,
        # one per line.
        for line in response.iter_lines():
            if line:  # filter out keep-alive new lines
                print(json.loads(line.decode('utf-8')))


Here, you can select the models you wish to install and use:

In [None]:
# Pull the models.
# Each active line below downloads one model through pull_model(); remove the
# leading '#' from a line to pull that model as well, or add it to skip one.
#pull_model('mistral:7b')
#pull_model('mistral:instruct')
#pull_model('mistral:text')
#pull_model('wizardlm2:7b')
#pull_model('gemma:7b')
#pull_model('llama2:13b')
#pull_model('llama2:70b')
pull_model('llama3:8b')
#pull_model('llama3:instruct')
pull_model('llama3:text')

This is an example of how to customize and tweak the models with Modelfiles. The below builds the mario example from the Ollama documentation.

In [None]:
!ollama create mario -f ./Modelfile

Here is an example of a LLaMA3 model modified with a Modelfile to have a lowered temperature for a more coherent and less creative response type and a custom system prompt to enhance its focus on the specific task.

In [None]:
!ollama create summary_llama3 -f ./Modelfile_Summary

Define which model is used for each role. Currently there are two roles: the conversational LLM that you interact with, and the summary LLM that provides a summary of the conversation history.

You can see your installed and created models with the following:

In [None]:
!ollama list

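This code deletes an installed model. Skip this cell if you want to keep `summary_llama3`, which is selected as the summary model in the next cell: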

In [None]:
!ollama rm summary_llama3

In [None]:
# Define the model names.
# MODEL_NAME is the conversational model (the default model of
# generate_completion); SUMMARY_MODEL_NAME is the summarization model (the
# default model of get_summary). Both defaults are bound when those functions
# are defined, so re-run their cells after changing a name here.
MODEL_NAME = 'llama3:8b'
SUMMARY_MODEL_NAME = 'summary_llama3'
#SUMMARY_MODEL_NAME = 'llama3:text'

This function initializes the memory file if it does not already exist.

In [None]:
def init_memory(file_path):
    """Initialize the memory file with an empty JSON list when needed.

    The file is written when it does not exist, and also when it exists but
    is empty (zero bytes): an empty file is not valid JSON and would make the
    functions that load the memory fail. A file that already has content is
    left untouched.

    Args:
        file_path: Path of the JSON memory file.
    """
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        return  # Existing memory is kept as-is.
    with open(file_path, 'w') as file:
        json.dump([], file)  # Initialize with an empty list.

These functions read from and write to the memory file.

In [None]:
def read_memory(file_path, person):
    """Look up the stored conversation for ``person`` in the memory file.

    Returns the ``'conversation'`` value of the first entry whose ``'person'``
    equals ``person``, or ``None`` when no entry matches.
    """
    with open(file_path, 'r') as handle:
        entries = json.load(handle)
    matches = (entry['conversation'] for entry in entries if entry['person'] == person)
    return next(matches, None)

def write_memory(file_path, person, user_input, llm_response):
    """Record one user/LLM exchange for ``person`` in the memory file.

    The exchange is appended to the person's existing conversation text, or a
    new entry is created when the person has none. The file is then rewritten
    in place with the updated list.
    """
    exchange = f"You: {user_input} LLM: {llm_response}"
    with open(file_path, 'r+') as handle:
        records = json.load(handle)
        for record in records:
            if record['person'] == person:
                record['conversation'] += f" {exchange}"  # Extend the stored text
                break
        else:
            # No entry for this person yet.
            records.append({'person': person, 'conversation': exchange})
        # Rewind and clear the file before dumping the updated list.
        handle.seek(0)
        handle.truncate()
        json.dump(records, handle, indent=4)

This function handles querying the chosen conversational LLM and receiving the response.

In [None]:
def generate_completion(prompt, model_name=MODEL_NAME, stream=True, context=None, conversation_history=None):
    """
    Send a prompt to the local LLM and return the new context and the response.

    Args:
        prompt: The user's text for this turn.
        model_name: Name of the model to query.
        stream: Whether to request a streamed (line-by-line JSON) response.
        context: The ``context`` value returned by a previous call, or None.
            It is forwarded only in the request's ``context`` field.
        conversation_history: Optional text of earlier conversation that is
            placed in front of the prompt.

    Returns:
        A ``(next_context, full_response)`` tuple: the ``context`` value found
        in the server's reply (None if absent) and the complete response text.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    # Only text belongs in the prompt. ``context`` is not text, so it is sent
    # in its own payload field below and never interpolated here (doing so
    # would insert the string "None" or a stringified list into the prompt).
    if conversation_history:
        full_prompt = f"{conversation_history}\n{prompt}"
    else:
        full_prompt = prompt

    generate_url = f"{BASE_URL}/api/generate"
    payload = {
        "model": model_name,
        "prompt": full_prompt,
        "stream": stream,  # We are explicitly asking for streaming responses
        "options": {"num_ctx": 4096},
    }
    if context is not None:
        payload["context"] = context  # Include context if available

    # The ``with`` block closes the connection even though the streaming loop
    # stops reading as soon as the final chunk is seen.
    with requests.post(generate_url, json=payload, stream=stream) as response:
        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code}, {response.text}")

        if not stream:
            # If not streaming, we expect one JSON object
            response_data = response.json()
            return response_data.get('context'), response_data.get('response', '')

        # If streaming, each non-empty line is one JSON object
        pieces = []
        next_context = None
        for line in response.iter_lines():
            if not line:
                continue
            response_data = json.loads(line.decode('utf-8'))
            if 'response' in response_data:
                pieces.append(response_data['response'])  # Accumulate the response
            if 'context' in response_data:  # Capture the context if present
                next_context = response_data['context']
            if response_data.get('done'):  # End of the response
                break

    return next_context, "".join(pieces)  # Return the context and full response

This function uses the Summary LLM to provide a concise summary of the conversation history when needed.

In [None]:
def get_summary(prompt, summary_model_name=SUMMARY_MODEL_NAME):
    """
    Send a conversation to the summarization LLM and return its summary.

    Args:
        prompt: The conversation text to summarize.
        summary_model_name: Name of the model used for summarization.

    Returns:
        The complete summary text, assembled from the streamed response.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    summary_instruction = "Please provide a concise summary of the following conversation: "
    summary_prompt = f"{summary_instruction} {prompt}"

    generate_url = f"{BASE_URL}/api/generate"
    payload = {
        "model": summary_model_name,
        "prompt": summary_prompt,
        "stream": True,  # The reply is read line by line below
    }

    # The ``with`` block closes the connection even though the loop stops
    # reading as soon as the final chunk is seen.
    with requests.post(generate_url, json=payload, stream=True) as response:
        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code}, {response.text}")

        pieces = []
        for line in response.iter_lines():
            if not line:  # filter out keep-alive new lines
                continue
            response_data = json.loads(line.decode('utf-8'))
            if 'response' in response_data:
                pieces.append(response_data['response'])  # Accumulate the response
            if response_data.get('done'):  # End of the response
                break

    return "".join(pieces)  # Return the full summary

This function is the main driver for the Chat with Memory program.

In [None]:
def cli_llm():
    """
    Run the interactive Chat with Memory loop.

    Asks for the user's name, loads that user's stored conversation from
    ``memory.json`` (optionally printing a summary of it), then repeatedly
    reads a prompt, queries the conversational LLM, prints the reply and
    saves the exchange to the memory file. Typing ``exit`` ends the loop.

    The LLM context is kept in the module-level variable ``context``, which
    must be defined (e.g. set to None) before this function is called.
    """
    global context  # Use the global context variable
    memory_file = 'memory.json'  # Define the path to the memory file
    gen_model = MODEL_NAME
    summary_model_name = SUMMARY_MODEL_NAME

    init_memory(memory_file)  # Initialize memory file if it doesn't exist
    print("Chat with Memory is running. You are chatting with: " + MODEL_NAME + ". Type 'exit' to quit.")

    user = input("Please enter your name to start: ")
    conversation_history = read_memory(memory_file, user)

    if conversation_history:
        print(f"Welcome back, {user}! You have previous conversations recorded.")
        if input("Would you like a summary of your last conversation(s)? (yes/no): ").lower().strip() == 'yes':
            summary = get_summary(conversation_history, summary_model_name)
            print("Here is the summary of your last conversation(s):")
            print(summary)
        else:
            print("Continuing without a summary.")
    else:
        print(f"Hello, {user}! Starting a new conversation.")

    while True:
        user_input = input("You: ")
        # Ignore surrounding whitespace, like the yes/no prompt above does.
        if user_input.strip().lower() == 'exit':
            break
        print(f"You: {user_input}")  # Echo the user input
        context, response = generate_completion(
            prompt=user_input,
            model_name=gen_model,
            stream=True,
            context=context,
            # The stored history is only sent while no context exists yet;
            # once the server has returned a context, that context already
            # covers the earlier prompt (history included), so re-sending
            # the history every turn would duplicate it.
            conversation_history=conversation_history if context is None else None,
        )
        print(f"LLM: {response}")  # Print LLM's response
        write_memory(memory_file, user, user_input, response)  # Update memory with the new input and response


This launches the program.

In [None]:
# Call the function to start the command line interface.
# cli_llm() declares `context` as a global, so the variable must exist at
# module level before the call; None starts the chat without a prior context.
context = None
cli_llm()