In [6]:
import os

import discord
import openai
import functools
import typing
import asyncio
from dotenv import load_dotenv
import datetime
import re
import tiktoken
import time

load_dotenv()
TOKEN = os.getenv('DISCORD_TOKEN')
GUILD = os.getenv('DISCORD_GUILD')

DEFAULT_PROMPT = os.getenv('DEFAULT_PROMPT')

openai.api_key = os.getenv("OPENAI_API_KEY")

intents = discord.Intents.default()
intents.message_content = True
intents.members = True

client = discord.Client(intents=intents)

In [7]:
@client.event
async def on_ready():
    for guild in client.guilds:
        if guild.name == GUILD:
            break

    print(
        f'{client.user} is connected to the following guild:\n'
        f'{guild.name}(id: {guild.id})'
    )


In [8]:
@client.event
async def on_message(message):
    if message.author.bot:
            return
    try:
        guild_id = message.guild.id
        if not ('bot' in str(message.channel) or 'gpt' in str(message.channel) or 'chat' in str(message.channel)):
            return
    except:
        guild_id = 'dm'
    channel = message.channel
    channel_id = message.channel.id

    intent_id = str(channel_id) + '_' + str(guild_id)
    global channel_intents
    if intent_id not in channel_intents:
        channel_intents[intent_id] = init_intent()
    print(f'{message.author}: {message.content}')

In [13]:
with open('test_text.txt') as f:
    test_text = f.read()

In [22]:
import re
import markdown
def find_codeblocks(text):
    idx_list = []
    for m in re.finditer('```', text):
        idx_list.append(m.start())
    return idx_list
            
def split_code_sections(text):
    codeblock_marker_indices = find_codeblocks(text)
    
find_codeblocks(test_text)

[562, 2195]

In [23]:
def separate_sections(article_string):
    sections = []
    pattern = r'(#{1,6}\s.*|>\s.*|\*.*|```[a-zA-Z]+\n[\s\S]*?\n```|`[\s\S]*?`|[\s\S]+?(?=#|\n>|\n\*|\n```|\n`))'
    matches = re.findall(pattern, article_string, re.MULTILINE | re.DOTALL)
    
    for match in matches:
        if match.startswith('#'):
            sections.append(('Heading', match.strip('#').strip()))
        elif match.startswith('>'):
            sections.append(('Block Quote', match.strip('> ').strip()))
        elif match.startswith('*'):
            sections.append(('List', match.strip('* ').strip()))
        elif match.startswith('```'):
            code_block = match.strip('`')
            language = code_block.split('\n', 1)[0].strip('`')
            code = code_block.split('\n', 1)[1].strip()
            code = code.replace('`', '\\`')
            sections.append(('Code Block', f'Language: {language}\n{code}'))
        elif match.startswith('`'):
            sections.append(('Inline Code', match.strip('`').replace('`', '\\`')))
        else:
            sections.append(('Text', match.strip()))
    
    return sections
markdown.markdown(test_text)

'<p>I apologize for the confusion. The <code>pytorch_tabular</code> library does not have a built-in option to specify metrics in the <code>TrainerConfig</code>. The metrics are automatically determined based on the task (classification or regression).</p>\n<p>For regression tasks, the loss function used is the Mean Squared Error (MSE), and the metrics reported are MSE and R2 Score. Unfortunately, there\'s no straightforward way to report the Root Mean Squared Error (RMSE) directly using <code>pytorch_tabular</code>.</p>\n<p>However, you can calculate RMSE manually after making predictions with the model:</p>\n<p>```python\nfrom pytorch_tabular import TabularModel\nfrom pytorch_tabular.models import CategoryEmbeddingModelConfig\nfrom pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig\nfrom sklearn.metrics import mean_squared_error\nimport numpy as np</p>\n<h1>Assume we have a DataFrame \'df\' with categorical features \'cat\', numerical features 

In [4]:
import re
import discord_markdown_ast_parser
with open('test_text.txt') as f:
    test_text = f.read()

def parse_text_segment(text_segment):
    node_type = 'TEXT'
    code_lang = ''
    parsed_text = ''
    # Markdown stylings
    if text_segment['node_type'] == 'TEXT':
        parsed_text = text_segment['text_content']
    elif text_segment['node_type'] == 'ITALIC':
        parsed_text = '_'
        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
        parsed_text += '_'
    elif text_segment['node_type'] == 'BOLD':
        parsed_text = '**'
        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
        parsed_text += '**'
    elif text_segment['node_type'] == 'UNDERLINE':
        parsed_text = '__'
        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
        parsed_text += '__'
    elif text_segment['node_type'] == 'STRIKETHROUGH':
        parsed_text = '~~'
        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
        parsed_text += '~~'
    elif text_segment['node_type'] == 'SPOILER':
        parsed_text = '||'
        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
        parsed_text += '||'
    elif text_segment['node_type'] == 'CODE_INLINE':
        parsed_text = '`'
        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
        parsed_text += '`'
        
    # Blocks
    elif text_segment['node_type'] == 'QUOTE_BLOCK':
        node_type = 'QUOTE_BLOCK'
        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
    elif text_segment['node_type'] == 'CODE_BLOCK':
        node_type = 'CODE_BLOCK'
        try:
            code_lang = text_segment['code_lang']
        except:
            pass

        for child in text_segment['children']:
            parsed_text += parse_text_segment(child)['text_content']
            
    return {
        'node_type': node_type,
        'code_lang': code_lang,
        'text_content': parsed_text
    }

def segment_block(segment, pattern):
    split_segments = []
    double_split_indices = [0]
    
    text = segment['text_content']
    
    for m in re.finditer(pattern, text):
        double_split_indices.append(m.start())
    double_split_indices.append(len(text))
    for idx, split_idx in enumerate(double_split_indices):
        if idx < len(double_split_indices) - 1:
            split_segments.append({
                'node_type': segment['node_type'],
                'code_lang': segment['code_lang'],
                'text_content': text[double_split_indices[idx]: double_split_indices[idx + 1]]
                
            })

    return split_segments

def segment_block_by_length(segment, max_length):
    split_segments = []
    idx = 0
    text = segment['text_content']
    
    markdown_length = 0
    if segment['node_type'] == 'QUOTE_BLOCK':
        markdown_length += len('> ')
    elif segment['node_type'] == 'CODE_BLOCK':
        markdown_length += len('```')*2 + len(segment['code_lang'])
    
    while True:
        print(idx)
        if idx >= len(text):
            break
        else:
            split_segments.append({
                'node_type': segment['node_type'],
                'code_lang': segment['code_lang'],
                'text_content': text[idx:min(idx + max_length - markdown_length, len(text))]
            })
        idx += max_length - markdown_length
        
    print(split_segments)
    return split_segments

def merge_block(segment1, segment2):
    

def get_block_length(segment):
    length = len(segment['text_content'])
    if segment['node_type'] == 'QUOTE_BLOCK':
        length += len('> ')
    elif segment['node_type'] == 'CODE_BLOCK':
        length += len('```')*2 + len(segment['code_lang'])
        
    return length
        
def get_merged_block_length(segment1, segment2):
    length = len(segment1['text_content']) + len(segment2['text_content'])
    if segment1['node_type'] == 'QUOTE_BLOCK':
        length += len('> ')
    elif segment1['node_type'] == 'CODE_BLOCK':
        length += len('```')*2 + len(segment1['code_lang'])
        
    return length
    
def segment_markdown(text, max_length):
    # Segment text to blocks according to text or code/quote blocks
    text_segments = []
    try:
        markdown_segments = discord_markdown_ast_parser.parse_to_dict(text)
        add_newline = True
        for segment in markdown_segments:
            if segment['node_type'] == 'QUOTE_BLOCK' or segment['node_type'] == 'CODE_BLOCK':
                text_segments.append(parse_text_segment(segment))
                add_newline = True
            else: 
                if add_newline:
                    text_segments.append(parse_text_segment(segment))
                    add_newline = False
                else:
                    text_segments[-1]['text_content'] += parse_text_segment(segment)['text_content']
    except Exception as e:
        # If segmentation unsuccessful, fall back to treating text as a single normal text block
        print(e)
        text_segments.append({
            'node_type': 'TEXT',
            'code_lang': '',
            'text_content': text
        })
        
    # Segment text to smallest possible blocks
    trimmed_segments = []
    for segment in text_segments:
        sub_segments = segment_block(segment, '\n\n')
        for sub_segment in sub_segments:
#             print(sub_segment)
            trimmed_segments.append(sub_segment)
            
    # Add up blocks to make each of them the maximum size that's still under the max_length limit
    # Uses greedy to add up
    # if single block is too big, subsegment using single \n
    # if newlines don't work, subsegment using white space
    # if none works, chop the blocks disregarding anything
    maxed_segments = []
    cumulated_segment = {}
    cumulated_length = 0
    for segment in trimmed_segments:
        current_length = get_block_length(segment)
            
        if cumulated_segment == {}:
            cumulated_segment = {
                'node_type': segment['node_type'],
                'code_lang': '',
                'text_content': '',
            } 
        # if segment too long, subsegment using newline
        if current_length >= max_length:
            sub_segments = segment_block(segment, '\n')
            
            for sub_segment in sub_segments:
                current_length = get_block_length(sub_segment)
                
                # if segment too long, subsegment using space
                if current_length >= max_length:
                    sub2_segments = segment_block(sub_segment, ' ')
                    
                    for sub2_segment in sub2_segments:
                        current_length = get_block_length(sub2_segment)
                        
                        # if segment too long, subsegment disregarding whitespaces
                        if current_length >= max_length:
                            sub3_segments = segment_block_by_length(sub2_segment, max_length)
                            for sub3_segment in sub3_segments:
                                current_length = get_block_length(sub3_segment)
                                cumulated_length = current_length
                                maxed_segments.append(sub3_segment)
                                
                                print(current_length, sub3_segment)
                        elif cirrent_length + cumulated_length:
                            if 
                            print(current_length, sub2_segment)
                else:
                    print(current_length, sub_segment)
            
#         print(current_length, segment['text_content'])
            
    return trimmed_segments
markdown_segments = segment_markdown(test_text, 20)

1 {'node_type': 'TEXT', 'code_lang': '', 'text_content': 'I'}
10 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' apologize'}
4 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' for'}
4 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' the'}
11 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' confusion.'}
4 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' The'}
18 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' `pytorch_tabular`'}
8 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' library'}
5 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' does'}
4 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' not'}
5 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' have'}
2 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' a'}
9 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' built-in'}
7 {'node_type': 'TEXT', 'code_lang': '', 'text_content': ' option'}
3 {'node_type': 'TEXT', 'code_lang': '', 'text_cont