# In [1]:
import random

# Function to convert text into desired format
def convert_to_format(text, label):
    """Render one sample as a single tab-separated line (no trailing newline).

    Each item of ``text`` becomes one column holding the item twice,
    separated by a space; ``label`` is appended as the final column.

    Args:
        text: Iterable of string items (a list of tokens, or a plain string,
            in which case every character is treated as an item).
        label: Value written in the last column.

    Returns:
        The formatted line, e.g. ``["a", "b"], "X"`` -> ``"a a\\tb b\\tX"``.
    """
    # NOTE(review): every item is emitted twice ("tok tok"). Confirm this
    # duplication is really the format the downstream consumer expects.
    columns = []
    for token in text:
        columns.append(" ".join((token, token)))
    joined_columns = "\t".join(columns)
    return "{}\t{}".format(joined_columns, label)

# Split data into train, test, and validate sets
def split_data(data, train_ratio=0.7, test_ratio=0.15):
    """Randomly partition ``data`` into train, test and validation subsets.

    The items are shuffled with the module-level ``random`` generator (seed
    it beforehand for a reproducible split) and then cut into three
    consecutive slices.

    Args:
        data: Sequence of samples. It is NOT modified; a shuffled copy is
            made internally, so immutable sequences (e.g. tuples) work too.
        train_ratio: Fraction of the samples assigned to the training set.
        test_ratio: Fraction of the samples assigned to the test set.

    Returns:
        A ``(train_data, test_data, validate_data)`` tuple of lists. The
        train and test sizes are truncated to whole numbers; the validation
        set receives every remaining sample.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    shuffled = list(data)
    random.shuffle(shuffled)

    train_end = int(train_ratio * len(shuffled))
    test_end = train_end + int(test_ratio * len(shuffled))

    train_data = shuffled[:train_end]
    test_data = shuffled[train_end:test_end]
    # Whatever is left after the train and test slices is the validation set.
    validate_data = shuffled[test_end:]
    return train_data, test_data, validate_data

# Function to read data from text file
def read_data(file_path):
    """Load tab-separated samples from a UTF-8 text file.

    Each non-blank line is stripped of surrounding whitespace and split on
    tabs: the last field is the label, all preceding fields form the text.
    Blank (or whitespace-only) lines are skipped so that, for example, a
    trailing empty line does not turn into an empty sample.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of ``(text, label)`` tuples, where ``text`` is a list of
        strings (empty when the line has a single field) and ``label`` is a
        string.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            *text, label = stripped.split("\t")
            data.append((text, label))
    return data

# Function to save data into files
def save_data(data, filename):
    """Write samples to ``filename`` as UTF-8, one formatted line per sample.

    Args:
        data: Iterable of ``(text, label)`` pairs; each pair is rendered with
            ``convert_to_format`` and terminated by a newline.
        filename: Destination path. The file is opened in write mode, so any
            existing content is replaced.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            convert_to_format(text, label) + "\n" for text, label in data
        )

# Read data from text file
data = read_data("annotated.txt")

# Partition the samples using the default ratios of split_data
train_data, test_data, validate_data = split_data(data)

# Write each partition to its own output file
for _subset, _out_path in (
    (train_data, "train.txt"),
    (test_data, "test.txt"),
    (validate_data, "validate.txt"),
):
    save_data(_subset, _out_path)
