## truncate.ipynb
This notebook implements the following functionality:
1. Loads a multilingual corpus and splits long documents into smaller chunks using the `RecursiveCharacterTextSplitter` from LangChain.
2. Generates unique chunk IDs for each split and maps them to the corresponding text.
3. Saves the chunked document data as a JSON file for later use.

### Output
- A JSON file (`chunk_doc_<chunk_size>.json`) containing document chunks with unique IDs.
- (Optional) A histogram of document lengths (in words) before splitting; the plotting cell is currently commented out.

### Notes
- Adjust the `chunk_size` and `chunk_overlap` arguments passed to `recursive()` to suit your specific use case.
- The full-corpus run uses a chunk size of 512 characters with a 40-character overlap; the single-document example uses 512 characters with no overlap.
- The notebook is designed to handle large multilingual corpora efficiently.

In [None]:
# !pip install gensim
# !pip install nltk

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from gensim.models import FastText
from tqdm import tqdm

# This splitter will try to keep the sentence structure and keep the whole words.
from langchain.text_splitter import RecursiveCharacterTextSplitter

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15163\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\15163\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:

# Location of the corpus file; it is expected to hold a list of records,
# each with a 'docid' and a 'text' key.
corpus_path = '../dataset/corpus.json'

# Decode explicitly as UTF-8. Without an explicit encoding, open() falls back
# to the platform's locale encoding (e.g. cp1252 on Windows), which can fail
# or silently garble non-ASCII text in a multilingual corpus.
with open(corpus_path, 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# Parallel lists: doc_ids[i] is the identifier of doc_texts[i].
doc_ids = [doc['docid'] for doc in corpus]
doc_texts = [doc['text'] for doc in corpus]

In [3]:
# doc_lengths = {doc['docid']: len(doc['text'].split()) for doc in corpus}
# lengths = list(doc_lengths.values())

# # Plot the distribution of text lengths
# plt.figure(figsize=(10, 6))
# plt.hist(lengths, bins=100, edgecolor='black', alpha=0.7)
# plt.title('Distribution of Document Lengths (in words)')
# plt.xlabel('Document Length (Number of Words)')
# plt.ylabel('Frequency')
# plt.show()

In [4]:
def recursive(chunk_size, chunk_overlap):
    """Create a LangChain recursive character splitter.

    Args:
        chunk_size: Maximum chunk size, forwarded to the splitter as-is.
        chunk_overlap: Overlap between consecutive chunks, forwarded as-is.

    Returns:
        A `RecursiveCharacterTextSplitter` configured with the two values.
    """
    return RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [None]:
# Try the splitter on the first corpus document only
# (chunk size 512, no overlap) before running it on everything.
doc_id, doc_text = doc_ids[0], doc_texts[0]

chunk_doc_index = {}
chunks = recursive(512, 0).split_text(doc_text)
# Each entry maps a unique ID "<docid>_chunk_<n>" to the chunk's text.
chunk_doc_index.update(
    (f"{doc_id}_chunk_{position}", piece)
    for position, piece in enumerate(chunks)
)

In [11]:
chunk_size = 512     # maximum chunk length handed to the splitter (also used in the output file name)
chunk_overlap = 40   # overlap between consecutive chunks of the same document

# Build the splitter once: its configuration does not change between
# documents, so re-creating it for each one is wasted work.
splitter = recursive(chunk_size, chunk_overlap)

# Build the chunk index: the key is a unique chunk ID of the form
# "<docid>_chunk_<n>" and the value is the text of that chunk.
chunk_doc_index = {}
for doc_id, doc_text in tqdm(zip(doc_ids, doc_texts), total=len(doc_ids), desc="Processing Documents"):
    chunks = splitter.split_text(doc_text)
    for i, chunk in enumerate(chunks):
        chunk_id = f"{doc_id}_chunk_{i}"  # n restarts at 0 for every document
        chunk_doc_index[chunk_id] = chunk

# Show the first few chunks
# list(chunk_doc_index.items())[:3]

Processing Documents: 100%|██████████| 268022/268022 [09:03<00:00, 492.99it/s] 


In [12]:
# Write the chunk index to disk; the file name records the chunk size used.
output_path = f'./chunk_doc_{chunk_size}.json'
with open(output_path, 'w') as out_file:
    json.dump(chunk_doc_index, out_file)

In [9]:
# %%time

# tokenized_chunks = chunk_doc_index.values()

# # Create the BM25 model
# bm25 = BM25Okapi(tokenized_chunks)

CPU times: user 6min 7s, sys: 16.6 s, total: 6min 23s
Wall time: 6min 23s


In [11]:
# %%time

# # Function to retrieve top N chunks for a query
# def bm25_retrieve(query, top_n=1000):
#     tokenized_query = word_tokenize(query)
#     scores = bm25.get_scores(tokenized_query)
#     ranked_chunks = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
#     top_chunks = [list(chunk_doc_index.keys())[idx] for idx, score in ranked_chunks[:top_n]]
#     return top_chunks

# # Example: Retrieve top 1000 chunks for a query
# query = "How to do efficient retrieval in large multilingual corpus for long texts"
# top_chunks = bm25_retrieve(query, top_n=1000)

# # Output the top chunks
# top_chunks[:5]

CPU times: user 4min 42s, sys: 39.5 s, total: 5min 22s
Wall time: 5min 22s


['doc-en-376830_chunk_1',
 'doc-en-461662_chunk_0',
 'doc-en-632778_chunk_5',
 'doc-en-6846_chunk_9',
 'doc-en-278325_chunk_3']