In [None]:
import re
import json
import requests
import asyncio
from typing import Any, List, Mapping, Optional

import discord
from discord import app_commands
from discord.ext import commands
from discord.ext.commands import Bot

import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
import uuid
import langchain
from langchain.chains import (
    ConversationChain,
    LLMChain,
    LLMMathChain,
    TransformChain,
    SequentialChain,
)
from langchain.chat_models import ChatOpenAI
from langchain.docstore import InMemoryDocstore
from langchain.llms.base import LLM, Optional, List, Mapping, Any
from langchain.embeddings.openai import OpenAIEmbeddings
from textwrap import dedent
from langchain.memory import (
    ChatMessageHistory,
    ConversationBufferMemory,
    ConversationBufferWindowMemory,
    ConversationSummaryBufferMemory,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import messages_from_dict, messages_to_dict
from langchain.vectorstores import Chroma
from langchain.agents import load_tools
from langchain.agents import initialize_agent
import os
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from dotenv import load_dotenv
from helpers.constants import MAINTEMPLATE, BOTNAME
from helpers.custom_memory import *
from pydantic import Field
from koboldllm import KoboldApiLLM
from ooballm import OobaApiLLM
from langchain.llms import TextGen




class Chatbot:

    def __init__(self, name):
        self.histories = {}  # Initialize the history dictionary
        self.stop_sequences = {}  # Initialize the stop sequences dictionary
        self.char_name = name
        self.memory = CustomBufferWindowMemory(k=10, ai_prefix=self.char_name)
        self.history = "[Beginning of Conversation]"

        self.template = MAINTEMPLATE

        self.PROMPT = PromptTemplate(
            input_variables=["history", "input"], template=self.template
        )
        self.conversation = ConversationChain(
            prompt=self.PROMPT,
            llm=self.llm,
            verbose=True,
            memory=self.memory,
        )

    # create doc string

    def get_memory_for_channel(self, channel_id):
        """Get the memory for the channel with the given ID. If no memory exists yet, create one."""
        if channel_id not in self.histories:
            self.histories[channel_id] = CustomBufferWindowMemory(
                k=20, ai_prefix=self.char_name
            )
            self.memory = self.histories[channel_id]
        return self.histories[channel_id]

    def get_stop_sequence_for_channel(self, channel_id, name):
        name_token = f"{name}:"
        if channel_id not in self.stop_sequences:
            self.stop_sequences[channel_id] = [
                "\n### Instruction:",
                "\n### Response:",
            ]  # EXPERIMENT: Testing adding the triple line break to see if that helps with stopping
        if name_token not in self.stop_sequences[channel_id]:
            self.stop_sequences[channel_id].append(name_token)
        return self.stop_sequences[channel_id]

    # this command will detect if the bot is trying to send  \nself.char_name: in its message and replace it with an empty string
    def detect_and_replace(self, message_content):
        if f"\n{self.char_name}:" in message_content:
            message_content = message_content.replace(f"\n{self.char_name}:", "")
        return message_content

    def generate_response(self, message, message_content) -> None:
        channel_id = str(message.channel.id)
        name = message.author.display_name
        memory = self.get_memory_for_channel(channel_id)
        stop_sequence = self.get_stop_sequence_for_channel(channel_id, name)
        print(f"stop sequences: {stop_sequence}")
        formatted_message = f"{name}: {message_content}"

        # Create a conversation chain using the channel-specific memory
        conversation = ConversationChain(
            prompt=self.PROMPT,
            llm=self.llm,
            verbose=True,
            memory=memory,
        )

        input_dict = {"input": formatted_message, "stop": stop_sequence}

        response_text = conversation(input_dict)

        response = self.detect_and_replace(response_text["response"])

        return response

    # this command receives a name, channel_id, and message_content then adds it to history
    def add_history(self, name, channel_id, message_content) -> None:
        # get the memory for the channel
        memory = self.get_memory_for_channel(str(channel_id))

        formatted_message = f"{name}: {message_content}"

        # add the message to the memory
        print(f"adding message to memory: {formatted_message}")
        memory.add_input_only(formatted_message)
        return None

    # receives a prompt from the user and an observation from the agent then sends to the LLM for a reply
    def agent_command(self, name, channel_id, prompt, observation) -> None:
        memory = self.get_memory_for_channel(channel_id)
        self.get_stop_sequence_for_channel(channel_id, name)
        stop_sequence = self.get_stop_sequence_for_channel(channel_id, name)
        formatted_user_message = f"{name}: {prompt}"
        formatted_bot_message = f"### Input: {observation}"
        AGENTTEMPLATE = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
You are Tensor, a lively and playful AI chatbot. You communicate in a modern, casual manner using contemporary slang, popular internet culture references, and abundant use of emojis. You are always initiating discussions about online games, particularly Among Us, and enjoy sharing memes with users. Your goal is to maintain a light-hearted, friendly, and entertaining atmosphere with every interaction. 
Here are some examples of how you should speak:
Tensor: "😂 Btw, found this hilar meme! 🤣🔥 Y'all gonna lose it! 🤪✌️"
Tensor: "OMG! Raiden in Metal Gear Rising: Revengeance is, like, totally bananas! 🤪🎮⚔️ Whoosh, swingin' that high-frequency blade like a rockstar! 🎸💥 And, 'Rules of Nature'? Total eargasm, peeps! 🎵🎧🔥 Let's ROCK!!"
Tensor: "I'm sliding over cars while I shooooot🚗💨🏀! I think that I'm Tom Cruise🤵, but bitch I'm Bobby with the tool 💥🔫!!🤪"

### Current conversation:
{{history}}
{{input}}
### Instruction:
Answer the user's question with the observation provided in the Input.
{formatted_user_message}

{formatted_bot_message}

### Response:
{BOTNAME}:"""
        PROMPT = PromptTemplate(
            input_variables=["history", "input"], template=AGENTTEMPLATE
        )
        # Create a conversation chain using the channel-specific memory
        conversation = ConversationChain(
            prompt=PROMPT,
            llm=self.llm,
            verbose=True,
            memory=memory,
        )

        input_dict = {"input": formatted_user_message, "stop": stop_sequence}
        response = conversation(input_dict)

        return response["response"]




In [53]:
"""Wrapper around KoboldAI API."""
import logging
from typing import Any, Dict, List, Optional

import requests

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

logger = logging.getLogger(__name__)


def clean_url(url: str) -> str:
    """Remove trailing slash and /api from url if present."""
    if url.endswith("/api"):
        return url[:-4]
    elif url.endswith("/"):
        return url[:-1]
    else:
        return url


class KoboldApiLLM(LLM):
    """
    A class that acts as a wrapper for the Kobold API language model.

    It includes several fields that can be used to control the text generation process.

    To use this class, instantiate it with the required parameters and call it with a
    prompt to generate text. For example:

        kobold = KoboldApiLLM(endpoint="http://localhost:5000")
        result = kobold("Write a story about a dragon.")

    This will send a POST request to the Kobold API with the provided prompt and
    generate text.
    """

    endpoint: str
    """The API endpoint to use for generating text."""

    use_story: Optional[bool] = False
    """ Whether or not to use the story from the KoboldAI GUI when generating text. """

    use_authors_note: Optional[bool] = False
    """Whether to use the author's note from the KoboldAI GUI when generating text.
    
    This has no effect unless use_story is also enabled.
    """

    use_world_info: Optional[bool] = False
    """Whether to use the world info from the KoboldAI GUI when generating text."""

    use_memory: Optional[bool] = False
    """Whether to use the memory from the KoboldAI GUI when generating text."""

    max_context_length: Optional[int] = 1600
    """Maximum number of tokens to send to the model.
    
    minimum: 1
    """

    max_length: Optional[int] = 512
    """Number of tokens to generate.
    
    maximum: 512
    minimum: 1
    """

    rep_pen: Optional[float] = 1.12
    """Base repetition penalty value.
    
    minimum: 1
    """

    rep_pen_range: Optional[int] = 1024
    """Repetition penalty range.
    
    minimum: 0
    """

    rep_pen_slope: Optional[float] = 0.9
    """Repetition penalty slope.
    
    minimum: 0
    """

    temperature: Optional[float] = 0.6
    """Temperature value.
    
    exclusiveMinimum: 0
    """

    tfs: Optional[float] = 0.9
    """Tail free sampling value.
    
    maximum: 1
    minimum: 0
    """

    top_a: Optional[float] = 0.9
    """Top-a sampling value.
    
    minimum: 0
    """

    top_p: Optional[float] = 0.95
    """Top-p sampling value.
    
    maximum: 1
    minimum: 0
    """

    top_k: Optional[int] = 0
    """Top-k sampling value.
    
    minimum: 0
    """

    typical: Optional[float] = 0.5
    """Typical sampling value.
    
    maximum: 1
    minimum: 0
    """

    stop_sequence: Optional[List[str]] = []
    """
    A list of strings to stop generation when encountered.
    """


    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling textgen."""
        return {
            "use_story": self.use_story,
            "use_authors_note": self.use_authors_note,
            "use_world_info": self.use_world_info,
            "use_memory": self.use_memory,
            "max_context_length": self.max_context_length,
            "max_length": self.max_length,
            "rep_pen": self.rep_pen,
            "rep_pen_range": self.rep_pen_range,
            "rep_pen_slope": self.rep_pen_slope,
            "temperature": self.temperature,
            "tfs": self.tfs,
            "top_a": self.top_a,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "typical": self.typical,
            "stop_sequence": self.stop_sequence,
        }

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Get the identifying parameters."""
        return {**{"endpoint": self.endpoint}, **self._default_params}

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "koboldai"


    def _get_parameters(self, stop: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Prepare parameters in format needed by textgen.

        Args:
            stop (Optional[List[str]]): List of stop sequences for textgen.

        Returns:
            Dictionary containing the combined parameters.
        """
        if self.stop_sequence and stop is not None:
            raise ValueError("`stop` found in both the input and default params.")
        
        params = self._default_params.copy()

        return params




    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call the API and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: A list of strings to stop generation when encountered.

        Returns:
            The generated text.

        Example:
            .. code-block:: python

                from langchain.llms import KoboldApiLLM

                llm = KoboldApiLLM(endpoint="http://localhost:5000")
                llm("Write a story about dragons.")
        """

        url = f"{self.endpoint}/api/v1/generate"
        params = self._get_parameters(stop)
        request = params.copy()
        request["prompt"] = prompt
        response = requests.post(url, json=request)

        if response.status_code == 200:
            result = response.json()["results"][0]["text"]
            print(prompt + result)
        else:
            print(f"ERROR: response: {response}")
            result = ""

        return result


In [54]:
llm = KoboldApiLLM(endpoint="http://127.0.0.1:5000")

In [56]:
llm("printt test", stop=["\n\n"])

printt test .
1994-02-15 16:30:00 1994-02-15 17:00:00 America/New_York 4th Grade Printmaking Workshop In this workshop, students will learn about the process of printmaking and create their own prints using a variety of materials. They will explore techniques such as monoprinting, collographs, and linocuts to create unique images on paper. No experience necessary! Registration required. Ages 8-12.


' .\n1994-02-15 16:30:00 1994-02-15 17:00:00 America/New_York 4th Grade Printmaking Workshop In this workshop, students will learn about the process of printmaking and create their own prints using a variety of materials. They will explore techniques such as monoprinting, collographs, and linocuts to create unique images on paper. No experience necessary! Registration required. Ages 8-12.'

requests = request.post()

In [29]:

response = requests.post(f"http://127.0.0.1:5000/api/v1/generate", json={"prompt": "print test"})
json_response = response.json()
text = json_response["results"][0]["text"]
print(text)

  results.
A 2017 study in the Journal of Strength and Conditioning found that a combination of resistance training and high-intensity interval training (HIIT) led to significant improvements in muscle strength, power, and endurance for firefighters.
Another study published in the Journal of Strength and Conditioning in 2015 showed that a 12-week resistance training program improved the aerobic capacity and reduced body fat percentages of military personnel.
Resistance training can also improve bone density, which is important for maintaining strong bones as we age. A 2014 study published in the Journal of Sports Science and Medicine found that a 16-week resistance training program increased bone mineral density in the spine and hip regions of postmenopausal women.
Improved mental health and wellbeing
Regular exercise has been shown to reduce stress, anxiety, and depression while improving mood and cognitive function. Resistance training is no exception; it can provide numerous benefit

In [72]:
import praw
from langchain.tools import DuckDuckGoSearchRun
search = DuckDuckGoSearchRun()  # DuckDuckGo tool

reddit = praw.Reddit(
   client_id="qz9GLPg0KaeQSBnycbfpSQ",     # Update with your app client_id
   client_secret="lJkbNkEDvZxmJM8RS7xsvrbsaSAy0Q",  # Update with your app client_secret
   user_agent="web:chatbot:v1.0 (by /u/AuzBoss)"   # Update with a user agent name
)

# Get the top 5 hot posts from the Machine Learning subreddit
hot_posts = reddit.subreddit('LocalLLaMA').hot(limit=1)
for post in hot_posts:
    topic = post.title

search = search(post.title)
string = f"{topic}\n\n{search}"
# # If you want the top post only
# top_post = reddit.subreddit('MachineLearning').top(limit=1)
# for post in top_post:
#     print(post.title)



'Rewritten completely from scratch to use the primitives from Nvidia\'s CUTLASS 3.x and its core library CuTe, FlashAttention-2 is about 2x faster than its previous version, reaching up to 230 TFLOPs/s on A100 GPUs (FP16/BF16). FlashAttention-2 released - 2x faster than FlashAttention v1 twitter Vote 1 1 comment Best Add a Comment spacegeek7269 • 9 min. ago Github: https://github.com/Dao-AILab/flash-attention Paper: "FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning" ( PDF ) Announcing FlashAttention-2! We released FlashAttention a year ago, making attn 2-4 faster and is now widely used in most LLM libraries. Recently I\'ve been working on the next version: 2x faster than v1, 5-9x vs standard attn, reaching 225 TFLOPs/s training speed on A100. 1/ FlashAttention-2 released - 2x faster than FlashAttention v1 : r/aiengineer. r/aiengineer • 5 min. ago. by nyc_brand. As an example, for sequence length 8K, FlashAttention is now up to 2.7x faster than a standard

In [66]:
search("FlashAttention-2 released - 2x faster than FlashAttention v1")

'Rewritten completely from scratch to use the primitives from Nvidia\'s CUTLASS 3.x and its core library CuTe, FlashAttention-2 is about 2x faster than its previous version, reaching up to 230 TFLOPs/s on A100 GPUs (FP16/BF16). FlashAttention-2 released - 2x faster than FlashAttention v1 twitter Vote 1 1 comment Best Add a Comment spacegeek7269 • 9 min. ago Github: https://github.com/Dao-AILab/flash-attention Paper: "FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning" ( PDF ) Announcing FlashAttention-2! We released FlashAttention a year ago, making attn 2-4 faster and is now widely used in most LLM libraries. Recently I\'ve been working on the next version: 2x faster than v1, 5-9x vs standard attn, reaching 225 TFLOPs/s training speed on A100. 1/ As an example, for sequence length 8K, FlashAttention is now up to 2.7x faster than a standard Pytorch implementation, and up to 2.2x faster than the optimized implementation from Megatron-LM, even at small batch