# In [None]:
from sklearn.model_selection import train_test_split

def write_data(datapath, line_sen_list):
    """Write the given lines to a UTF-8 text file, replacing existing content.

    datapath: Path of the file to write; it is opened in 'w' mode.
    line_sen_list: Iterable of strings to write. The items are concatenated
        as-is, so each one must already carry its own line break.
    """
    with open(datapath, mode='w', encoding='utf-8') as out_file:
        # Joined with an empty separator: nothing is inserted between items.
        text = ''.join(line_sen_list)
        out_file.write(text)

def main():
    """Split the raw comment corpus into training, validation and test files.

    Reads ``Comments.txt``, drops blank lines, keeps at most the first
    100000 remaining lines, and writes a seeded random 8:1:1 split to
    ``train_80000.txt``, ``validate_10000.txt`` and ``test_10000.txt``
    (all paths relative to the current working directory).

    The sizes in the output file names are only accurate when the input
    contains at least 100000 non-blank lines; with fewer lines the same
    8:1:1 ratio is applied to whatever is available.
    """
    raw_data_path = 'Comments.txt'
    train_data_path = 'train_80000.txt'
    validate_data_path = 'validate_10000.txt'
    test_data_path = 'test_10000.txt'
    max_samples = 100000

    # Read TXT files
    with open(raw_data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Tidy data: drop blank lines and make sure every kept line ends with a
    # newline. The last line of the input may lack one; after shuffling it
    # could land in the middle of an output file, where write_data (which
    # concatenates items verbatim) would glue it to the following sample.
    lines = [
        line if line.endswith('\n') else line + '\n'
        for line in lines
        if line.strip()
    ]

    # Select a fixed size dataset: the first max_samples non-blank lines
    line_sen_list = lines[:max_samples]

    # First, randomly divide into 1. training set and 2. validation + test
    # set according to an 8:2 ratio. No label array is needed: the split
    # depends only on the number of samples and random_state.
    X_train, X_validate_test = train_test_split(
        line_sen_list, test_size=0.2, random_state=42
    )
    # Then randomly divide 1. validation set and 2. test set in a 1:1 ratio
    X_validate, X_test = train_test_split(
        X_validate_test, test_size=0.5, random_state=42
    )

    # Write the divided training set, validation set, and test set to the
    # specified files separately
    write_data(train_data_path, X_train)
    write_data(validate_data_path, X_validate)
    write_data(test_data_path, X_test)

    print(f"The training set data has been saved as {train_data_path}")
    print(f"The validation set data has been saved as {validate_data_path}")
    print(f"The testing set data has been saved as {test_data_path}")

# Run the split only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
