In [1]:
import csv

def load_data(filepath, encoding='utf-8'):
    """Read a whole text file and return its contents as a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.
            NOTE(review): assumes the dataset file is UTF-8 -- pass a
            different encoding if that turns out not to be the case.

    Returns:
        The decoded file contents as one ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    # Without an explicit encoding, open() falls back to the locale default,
    # which can mis-decode or reject non-ASCII text on some systems.
    with open(filepath, encoding=encoding) as f:
        return f.read()

def preprocess_data(data):
    """Parse ``label<TAB>text`` lines into parallel lists of texts and labels.

    ``data`` is split on ``'\\n'``; empty lines are skipped. Each remaining
    line is divided at its FIRST tab, so a message that itself contains tab
    characters is kept intact instead of breaking the parse.

    Args:
        data: The raw file contents as one string.

    Returns:
        A two-element list ``[input_lines, labels]`` where ``input_lines`` is
        the list of message texts and ``labels`` holds ``1`` for lines whose
        label is exactly ``'spam'`` and ``0`` for every other label.

    Raises:
        ValueError: If a non-empty line contains no tab separator.
    """
    input_lines = []
    labels = []
    for line_number, line in enumerate(data.split('\n'), start=1):
        if line == '':
            continue

        # partition() cuts at the first tab only; any further tabs stay in
        # the message text.
        label, separator, input_line = line.partition('\t')
        if not separator:
            raise ValueError(
                'line %d has no tab separator: %r' % (line_number, line)
            )

        input_lines.append(input_line)
        labels.append(1 if label == 'spam' else 0)

    return [input_lines, labels]

def prepare_train_validation_test_split(X, Y, train_frac=0.7, validation_frac=0.15):
    """Split parallel sequences into train / validation / test partitions.

    The split is positional: the first ``train_frac`` of the items form the
    training set, the next ``validation_frac`` the validation set, and the
    remainder the test set. Boundaries are truncated to integers with
    ``int()``. No shuffling is performed here, so the caller is responsible
    for the input order being suitable for a positional split.

    Args:
        X: Sequence of inputs (must support ``len`` and slicing).
        Y: Sequence of labels, the same length as ``X``.
        train_frac: Fraction of items assigned to the training set.
        validation_frac: Fraction of items assigned to the validation set.

    Returns:
        ``[[train_x, train_y], [validation_x, validation_y], [test_x, test_y]]``.

    Raises:
        ValueError: If ``X`` and ``Y`` differ in length, if a fraction is
            negative, or if the two fractions sum to more than 1.
    """
    n = len(Y)
    if len(X) != n:
        # Slicing both by len(Y) would silently misalign inputs and labels.
        raise ValueError(
            'X and Y must have the same length (got %d and %d)' % (len(X), n)
        )
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1:
        raise ValueError(
            'train_frac and validation_frac must be non-negative and sum to at most 1'
        )

    # Compute each boundary once. With the defaults the second boundary is
    # int(0.85 * n), because 0.7 + 0.15 evaluates to exactly the float 0.85.
    train_end = int(train_frac * n)
    validation_end = int((train_frac + validation_frac) * n)

    return [
        [X[:train_end], Y[:train_end]],
        [X[train_end:validation_end], Y[train_end:validation_end]],
        [X[validation_end:], Y[validation_end:]],
    ]

def save_data(train, validation, test):
    """Write the three splits to ``train.csv``, ``validation.csv`` and ``test.csv``.

    Each split is a two-element sequence ``[x, y]`` of parallel sequences. One
    CSV row ``x[i],y[i]`` is written per item, into files created in the
    current working directory (existing files are overwritten).

    The CSV dialect uses ``,`` as delimiter and ``|`` as the quote character
    with minimal quoting, so fields containing special characters (such as a
    comma or ``|``) are wrapped in ``|``. Readers must pass ``quotechar='|'``.

    Args:
        train: ``[x, y]`` for the training split.
        validation: ``[x, y]`` for the validation split.
        test: ``[x, y]`` for the test split.

    Raises:
        ValueError: If ``x`` and ``y`` of any split differ in length. This is
            checked for all splits before any file is opened.
        OSError: If a file cannot be opened for writing.
    """
    filenames_and_data = [
        ('train.csv', train),
        ('validation.csv', validation),
        ('test.csv', test)
    ]

    # Validate everything up front so a bad split cannot leave a partially
    # written set of files behind.
    for filename, (x, y) in filenames_and_data:
        if len(x) != len(y):
            raise ValueError(
                '%s: inputs and labels differ in length (%d vs %d)'
                % (filename, len(x), len(y))
            )

    for filename, (x, y) in filenames_and_data:
        # newline='' lets the csv module control line endings; the explicit
        # UTF-8 encoding keeps non-ASCII text writable regardless of locale.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(zip(x, y))

In [2]:
# Run the pipeline end to end: read the raw file, parse it into texts and
# labels, cut it into the three splits, and write each split to its CSV file.
data = load_data('smsspamcollection/SMSSpamCollection')
X, Y = preprocess_data(data)
train, validation, test = prepare_train_validation_test_split(X, Y)
save_data(train, validation, test)