In [2]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

In [3]:
# Hugging Face model identifier whose tokenizer is loaded on the next line.
checkpoint = "CohereForAI/aya-23-8b"
# Tokenizer for `checkpoint`; the splitting cell passes it to split_text to
# encode and decode text.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# Load the input records; the splitting code below reads the 'id' and 'text'
# columns of each row.
silver_test_df = pd.read_csv('../data/test.csv')  # Path is relative to the notebook; replace with your actual file path

In [6]:
# Token budget per chunk; passed as `max_tokens` to split_text when the
# splitter is applied to the DataFrame.
MAX_TOKENS = 500

# Step 3: Define the Splitting Function
def split_text(row, tokenizer, max_tokens=500):
    """Split one record's text into chunks of at most ``max_tokens`` tokens.

    The text is split on newline characters and consecutive lines are greedily
    packed into a chunk (re-joined with a newline) for as long as the packed
    text encodes to no more than ``max_tokens`` tokens. A single line that
    exceeds the budget on its own is cut into consecutive windows of
    ``max_tokens`` tokens, so no part of the text is discarded.

    Args:
        row: Mapping-like record (for example a pandas row) providing
            ``row['text']`` and ``row['id']``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            returning a sliceable token sequence, and
            ``decode(tokens, clean_up_tokenization_spaces=True)`` returning a
            string.
        max_tokens: Maximum number of tokens allowed per chunk; must be >= 1.

    Returns:
        A list of ``{'id': ..., 'text': ...}`` dicts in document order, each
        carrying the id of the source row. The list is empty when the text is
        empty or is not a string (for example a missing value).

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    text = row['text']
    original_id = row['id']

    # A missing value read from CSV is not a string and cannot be split;
    # such a record simply yields no chunks.
    if not isinstance(text, str):
        return []

    chunks = []
    current_chunk = ""

    for line in text.split('\n'):
        # Tentatively add the line to the current chunk. While the chunk is
        # still empty the line starts it, so blank lines seen before any
        # content are not carried into the chunk.
        tentative_chunk = f"{current_chunk}\n{line}" if current_chunk else line
        tokens = tokenizer.encode(tentative_chunk, add_special_tokens=False)

        if len(tokens) <= max_tokens:
            current_chunk = tentative_chunk
            continue

        # The line does not fit: finalize what has been packed so far.
        if current_chunk:
            chunks.append({'id': original_id, 'text': current_chunk})
            # With an empty chunk, `tokens` above already encodes the line alone.
            line_tokens = tokenizer.encode(line, add_special_tokens=False)
        else:
            line_tokens = tokens

        if len(line_tokens) <= max_tokens:
            # The line fits on its own and starts the next chunk.
            current_chunk = line
            continue

        # The line alone exceeds the budget: emit every max_tokens-sized
        # window instead of keeping only the first one.
        # NOTE(review): each window is decoded independently; whether a window
        # boundary can fall inside a multi-token character depends on the
        # tokenizer -- confirm for the tokenizer in use.
        for start in range(0, len(line_tokens), max_tokens):
            window = line_tokens[start:start + max_tokens]
            chunks.append({
                'id': original_id,
                'text': tokenizer.decode(window, clean_up_tokenization_spaces=True)
            })
        current_chunk = ""

    # Append any remaining chunk
    if current_chunk:
        chunks.append({'id': original_id, 'text': current_chunk})

    return chunks

# Step 4: Apply the Splitting Function to the DataFrame
# NOTE: this cell rebinds `silver_test_df` from the loaded records to the
# chunked records, so re-running it without re-running the load cell will
# split the already-split chunks again.
split_data = []

# iterrows() is a generator without a length, so pass `total` explicitly to let
# tqdm render a percentage and ETA instead of a bare iteration counter.
for _, row in tqdm(silver_test_df.iterrows(), total=len(silver_test_df)):
    split_chunks = split_text(row, tokenizer, MAX_TOKENS)
    split_data.extend(split_chunks)

# Create a new DataFrame from the split data. The columns are pinned so the
# frame still has 'id' and 'text' when no chunks were produced, which keeps
# the filtering below from failing on a missing column.
silver_test_df = pd.DataFrame(split_data, columns=['id', 'text'])

# Step 5: Handle Edge Cases and Ensure Data Integrity
# Remove chunks whose text is empty or whitespace-only and renumber the rows
silver_test_df = silver_test_df[silver_test_df['text'].str.strip() != ''].reset_index(drop=True)

# Log the number of chunks produced per original id
split_counts = silver_test_df['id'].value_counts()
print("Number of splits per original id:")
print(split_counts)

silver_test_df

169it [00:04, 39.36it/s] 

Number of splits per original id:
id
f1d0b2b9ee882b29d1fb11414e662cbcdc74eece062e599ff1d93ae521f3f2e4    10
4b15a0d182a85fd225d74b18fc5a6b2d6ebc9cb1401c2efcb89fbdff74e815e0    10
9e4efdea7c59818f5fa27e497347a9827724f8caf32bb5af7d293ad9956bbd2e    10
b3499523741aece73ba78eb872626c365732d9a663bf03b05d4501a58138a08d    10
770f40e00a5840e7b5e727547a063582a00020f57b1b73b22463a7f911e20100    10
                                                                    ..
7261e630306cad5095c75c3444d7dc54952cd0f594ccad5903ac48e1ee12056a     1
2a4886ba8e84b6c0f099fd111ecd8d6ada8864c9d1a120883339f064676f7655     1
ac462db1dbcb330104e6aa92266ac56f0f73c005ae1ca8907a511a225b64cc7d     1
408a2cf9d7a70ab87154eae8bf7d0d48dfba307cd66c4c863975288d62b389a4     1
d2291754a5c5878fa471d750ed1145643d0369db349dba31b00a5993615afd37     1
Name: count, Length: 169, dtype: int64





Unnamed: 0,id,text
0,e29896ab781b5dbb97ae3f3f7862fa681e9d70a5e63866...,"Цікаву і пізнавальну подорож місцями , де наро..."
1,e29896ab781b5dbb97ae3f3f7862fa681e9d70a5e63866...,А кожну музейну залу прикрашають репродукції к...
2,d67655fe3fe45e95cd63613c2189fe86728293bd0b8d3c...,Нещодавно на шаховій Олімпіаді українська жіно...
3,d67655fe3fe45e95cd63613c2189fe86728293bd0b8d3c...,Стародавня гра активно культивувалася й в інши...
4,b2c68352957ed341637aa9dd25b8a8ac00c32e8129d689...,Виходець із козацько-старшинського роду А . Жд...
...,...,...
496,dcb588274235339ab44950ee26872ed5cfc3fbb503a1a3...,"Нагадаємо, минулого тижня в Києві відбулось за..."
497,b2a11bba5aeca217085bafa76544f734475c6eeb79ec1d...,23 жовтня Служба автомобільних доріг у Кіровог...
498,b2a11bba5aeca217085bafa76544f734475c6eeb79ec1d...,З початку року Azvirt MMC виграла 17 тендерів ...
499,685bc4e34d71522430a08a12f3f1fcef65c3fc01c51fba...,Заступник Головного військового прокурора Гене...


In [7]:
# Step 6: Final DataFrame Preparation
# Renumber the rows from zero, then write the chunked records to disk without
# the index column.
silver_test_df = silver_test_df.reset_index(drop=True)
silver_test_df.to_csv('../data/silver_test.csv', index=False)