## Using Gradio to show chunking.


In [1]:
!pip install gradio==4.19.1
!pip install langchain==0.1.7
!pip install transformers==4.37.0
!pip install datasets

Collecting gradio==4.19.1
  Downloading gradio-4.19.1-py3-none-any.whl (16.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio==4.19.1)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio==4.19.1)
  Downloading fastapi-0.110.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio==4.19.1)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.10.0 (from gradio==4.19.1)
  Downloading gradio_client-0.10.0-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio==4.19.1)
  Downloading httpx-0.27.0-py3-none-any.whl

In [None]:
import datasets
from datasets import load_dataset

In [None]:
# Load the 'botchagalupe/ProfoundDeming' dataset with `datasets.load_dataset`;
# its 'train' split is read in the next cell.
hf_data = load_dataset('botchagalupe/ProfoundDeming')

In [None]:
# Flatten the 'text' field of every record in the train split into one long
# string. Each record is followed by a single space (the last one included).
dataset = "".join(record['text'] + " " for record in hf_data['train'])

print("Data Ingested")

Data Ingested


In [None]:
### Utilities to get overlap between strings

def get_overlap_length(left: str, right: str):
    """
    Find the longest suffix of `left` that is also a prefix of `right`.

    Returns a tuple (overlap_length, overlap). When the strings do not overlap
    (or either one is empty) the result is (0, "").

    Candidate lengths run from 1 up to and including the length of the shorter
    string, so a string that is entirely contained in the overlap is detected.
    """
    # Check the longest candidate first: the first match is the answer.
    for size in range(min(len(left), len(right)), 0, -1):
        candidate = right[:size]
        if left[-size:] == candidate:
            return size, candidate
    return 0, ""

def get_overlap_list(strings):
    """
    Compute the overlap between every pair of neighbouring strings.

    Returns a list of (overlap_length, overlap) tuples, one per consecutive
    pair, so the result has len(strings) - 1 entries (none for fewer than two strings).
    """
    return [
        get_overlap_length(current, following)
        for current, following in zip(strings, strings[1:])
    ]

def unoverlap_list(strings):
    """
    Split a sequence of overlapping strings into non-overlapping pieces.

    Returns a list of (content, is_overlap) tuples. Text shared by two
    neighbouring strings is emitted once with is_overlap=True, between the
    remainders of those strings, which are emitted with is_overlap=False.
    """
    overlaps = get_overlap_list(strings)
    last_position = len(strings) - 1
    pieces = []
    for position, current in enumerate(strings):
        if position > 0:
            left_length, shared_text = overlaps[position - 1]
            # Emit the text shared with the previous string before this string's own part.
            if len(shared_text) > 0:
                pieces.append((shared_text, True))
        else:
            left_length = 0

        # The last string has no right-hand neighbour to overlap with.
        right_length = overlaps[position][0] if position < last_position else 0

        # Keep only what belongs to this string alone.
        own_text = current[left_length:len(current) - right_length]
        pieces.append((own_text, False))
    return pieces

In [None]:
import gradio as gr
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    Language,
)
from transformers import AutoTokenizer
# from overlap import unoverlap_list  # not needed: the overlap helpers are defined in an earlier cell

In [None]:
# Labels offered in the splitter dropdown; the same values are compared against
# the current selection to pick a splitter and to toggle the separator controls.
LABEL_TEXTSPLITTER = "🦜🔗 LangChain's CharacterTextSplitter"
LABEL_RECURSIVE = "🦜🔗 LangChain's RecursiveCharacterTextSplitter"

# Tokenizer used by length_tokens() when chunk length is measured in tokens.
bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

def length_tokens(txt):
    """Return the number of tokens `bert_tokenizer` produces for `txt`."""
    tokens = bert_tokenizer.tokenize(txt)
    return len(tokens)


def extract_separators_from_string(separators_str):
    """
    Parse the separators textbox content into a list of separator strings.

    `separators_str` is expected to look like "['sep_1', 'sep_2', ...]".
    Typed escape sequences (backslash followed by n, t or another backslash)
    are converted to the characters they stand for, the surrounding brackets
    are dropped, the rest is split on ", " and all quote characters are removed.

    Raises gr.Error (chained to the underlying exception) when parsing fails.
    """
    try:
        # Turn typed escape sequences into the real characters.
        unescaped = separators_str.replace("\\n", "\n").replace("\\t", "\t").replace("\\\\", "\\")
        # Drop the enclosing brackets, then split the comma-separated items.
        separators = unescaped[1:-1].split(", ")
        return [separator.replace('"', "").replace("'", "") for separator in separators]
    except Exception as e:
        # Report the value exactly as received, not the partially processed one.
        raise gr.Error(f"""
        Did not succeed in extracting separators from string: {separators_str} due to: {str(e)}.
        Please type it in the correct format: "['separator_1', 'separator_2', ...]"
        """) from e

def change_split_selection(split_selection):
    """
    Show or hide the separator controls depending on the chosen splitter.

    Returns two component updates, in the order of the event's outputs (the
    separators textbox, then the preset radio). Both are visible only while
    the recursive splitter is selected.

    Uses `gr.update(...)` because the per-component `update()` class methods
    (`gr.Textbox.update`, `gr.Radio.update`) are not available in Gradio 4,
    which this notebook pins.
    """
    show_recursive_options = split_selection == LABEL_RECURSIVE
    return (
        gr.update(visible=show_recursive_options),
        gr.update(visible=show_recursive_options),
    )

def chunk(text, length, splitter_selection, separators_str, length_unit_selection, chunk_overlap):
    """
    Split `text` with the selected LangChain splitter and label the pieces.

    Args:
        text: the text to split.
        length: target chunk size, in the selected unit.
        splitter_selection: LABEL_TEXTSPLITTER or LABEL_RECURSIVE.
        separators_str: separators textbox content, parsed by
            extract_separators_from_string (only used by the recursive splitter).
        length_unit_selection: chunk lengths are measured in BERT tokens when
            this contains "token" (case-insensitive), otherwise in characters.
        chunk_overlap: overlap between consecutive chunks, in the selected unit.

    Returns:
        A list of (text, label) tuples where label is 'Overlap' for text shared
        by two neighbouring chunks and "Chunk <i>" otherwise.

    Raises:
        gr.Error: if the separators cannot be parsed or the splitter selection
            is not one of the two known labels.
    """
    separators = extract_separators_from_string(separators_str)
    length_function = length_tokens if "token" in length_unit_selection.lower() else len

    # Options shared by both splitters. Both sizes are cast to int so a float
    # coming from a slider is handled the same way for length and overlap.
    common_options = dict(
        chunk_size=int(length),
        chunk_overlap=int(chunk_overlap),
        length_function=length_function,
        strip_whitespace=False,
    )
    if splitter_selection == LABEL_TEXTSPLITTER:
        text_splitter = CharacterTextSplitter(
            is_separator_regex=False,
            separator=" ",
            **common_options,
        )
    elif splitter_selection == LABEL_RECURSIVE:
        text_splitter = RecursiveCharacterTextSplitter(
            separators=separators,
            **common_options,
        )
    else:
        # Without this branch `text_splitter` would be unbound below.
        raise gr.Error("Choice of splitter not recognized.")

    splits = text_splitter.create_documents([text])
    text_splits = [split.page_content for split in splits]
    unoverlapped_text_splits = unoverlap_list(text_splits)
    # `i` counts every piece, overlaps included, so chunk labels are not consecutive.
    output = [
        (content, 'Overlap') if is_overlap else (content, f"Chunk {str(i)}")
        for i, (content, is_overlap) in enumerate(unoverlapped_text_splits)
    ]
    return output

def change_preset_separators(choice):
    """
    Return the separator list associated with a preset name.

    "Default" maps to a fixed list; "Markdown" and "Python" are looked up via
    RecursiveCharacterTextSplitter.get_separators_for_language.
    Raises gr.Error for any other choice.
    """
    splitter = RecursiveCharacterTextSplitter()
    if choice == "Default":
        return ["\n\n", "\n", " ", ""]

    language_presets = (
        ("Markdown", Language.MARKDOWN),
        ("Python", Language.PYTHON),
    )
    for preset_name, language in language_presets:
        if choice == preset_name:
            return splitter.get_separators_for_language(language)

    raise gr.Error("Choice of preset not recognized.")


# Build the UI: a text input, the splitter controls, and a highlighted view of the chunks.
with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_hue=gr.themes.colors.green)) as demo:
    text = gr.Textbox(label="Your text 🪶", value=dataset)
    with gr.Row():
        split_selection = gr.Dropdown(
            choices=[
                LABEL_TEXTSPLITTER,
                LABEL_RECURSIVE,
            ],
            value=LABEL_RECURSIVE,
            label="Method to split chunks 🍞",
        )
        separators_selection = gr.Textbox(
            elem_id="textbox_id",
            # A Textbox holds text, so pass the list's string form explicitly; this is
            # the "['sep_1', 'sep_2', ...]" format extract_separators_from_string parses.
            value=str(["\n\n", "\n", " ", ""]),
            info="Separators used in RecursiveCharacterTextSplitter",
            show_label=False, # or set label to an empty string if you want to keep its space
            visible=True,
        )
        separator_preset_selection = gr.Radio(
            ['Default', 'Python', 'Markdown'],
            label="Choose a preset",
            info="This will apply a specific set of separators to RecursiveCharacterTextSplitter.",
            visible=True,
        )
    with gr.Row():
        length_unit_selection = gr.Dropdown(
            choices=[
                "Character count",
                "Token count (BERT tokens)",
            ],
            value="Character count",
            label="Length function",
            info="How should we measure our chunk lengths?",
        )
        slider_count = gr.Slider(
            50, 500, value=200, step=1, label="Chunk length 📏", info="In the chosen unit."
        )
        chunk_overlap = gr.Slider(
            0, 50, value=10, step=1, label="Overlap between chunks", info="In the chosen unit."
        )
    out = gr.HighlightedText(
        label="Output",
        show_legend=True,
        show_label=False,
        color_map={'Overlap': '#DADADA'}
    )

    # Inputs of chunk(), in its parameter order; shared by the change and load events.
    chunk_inputs = [text, slider_count, split_selection, separators_selection, length_unit_selection, chunk_overlap]

    # The separator controls follow the choice of splitter.
    split_selection.change(
        fn=change_split_selection,
        inputs=split_selection,
        outputs=[separators_selection, separator_preset_selection],
    )
    # Choosing a preset overwrites the separators textbox.
    separator_preset_selection.change(
        fn=change_preset_separators,
        inputs=separator_preset_selection,
        outputs=separators_selection,
    )
    # Re-chunk whenever any input changes.
    gr.on(
        [text.change, length_unit_selection.change, separators_selection.change, split_selection.change, slider_count.change, chunk_overlap.change],
        chunk,
        chunk_inputs,
        outputs=out
    )
    # Also chunk once when the page loads so the output is not empty at start.
    demo.load(chunk, inputs=chunk_inputs, outputs=out)
# `share=True` requests a public share link; `shadow` is not a launch() parameter.
demo.launch(share=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://9ae9ad83d646021684.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




That's all! Go ahead and open that share link in a new tab. Check out our [getting started](https://gradio.app/getting_started.html) page for more complicated demos.