In [1]:
import csv

def load_data(filepath, encoding=None):
    """Read an entire text file into a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the previous behaviour (the platform's locale-dependent
            default); pass e.g. ``'utf-8'`` to decode the file the same way
            on every machine.

    Returns:
        The complete contents of the file as one ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in the
            encoding that is used.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

def preprocess_data(data):
    """Parse raw ``label<TAB>message`` text into messages and binary labels.

    Args:
        data: The raw text, one record per newline-separated line. Each
            non-empty line must hold a label, a tab character, and then the
            message text. Empty lines are skipped.

    Returns:
        A two-element list ``[input_lines, labels]``. ``input_lines[i]`` is
        the message text of the i-th record and ``labels[i]`` is ``1`` when
        that record's label is exactly ``'spam'`` and ``0`` otherwise.

    Raises:
        ValueError: If a non-empty line contains no tab character.
    """
    input_lines = []
    labels = []
    for line in data.split('\n'):
        if line == '':
            continue
        # Split on the first tab only: a message that itself contains a tab
        # would otherwise yield more than two parts and break the unpacking.
        label, input_line = line.split('\t', 1)
        input_lines.append(input_line)
        labels.append(1 if label == 'spam' else 0)

    return [input_lines, labels]

def prepare_train_validation_test_split(X, Y):
    """Cut X and Y into three consecutive train / validation / test portions.

    The data is sliced in its existing order (nothing is shuffled). Both cut
    positions are derived from ``len(Y)`` and truncated to integers: the
    training portion ends at ``int(0.7 * n)`` and the validation portion ends
    at ``int(0.85 * n)``; the test portion is whatever remains.

    Args:
        X: Sliceable sequence of inputs.
        Y: Sliceable sequence of labels; its length fixes the cut positions.

    Returns:
        ``[[train_x, train_y], [validation_x, validation_y], [test_x, test_y]]``.
    """
    total = len(Y)
    train_end = int(0.7 * total)
    validation_end = int(0.85 * total)

    train_part = [X[:train_end], Y[:train_end]]
    validation_part = [
        X[train_end:validation_end],
        Y[train_end:validation_end],
    ]
    test_part = [X[validation_end:], Y[validation_end:]]

    return [train_part, validation_part, test_part]

def save_raw_data(X, Y):
    """Write the pairs ``(X[i], Y[i])`` as rows of ``raw_data.csv``.

    The file is created (or overwritten) in the current working directory.
    Fields are comma-separated and, when quoting is needed, wrapped in ``|``.
    One row is written for every index of ``X``.

    Args:
        X: Indexable sequence providing the first column of each row.
        Y: Indexable sequence providing the second column of each row.
    """
    with open('raw_data.csv', 'w', newline='') as csv_file:
        row_writer = csv.writer(
            csv_file,
            delimiter=',',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL,
        )
        # Rows are produced lazily, so they are written one at a time.
        row_writer.writerows([X[i], Y[i]] for i in range(len(X)))

def save_data(train, validation, test):
    """Write each split to its own CSV file in the current working directory.

    ``train`` goes to ``train.csv``, ``validation`` to ``validation.csv`` and
    ``test`` to ``test.csv``; existing files are overwritten. Fields are
    comma-separated and, when quoting is needed, wrapped in ``|``.

    Args:
        train: Two-element ``[x, y]`` pair of indexable sequences.
        validation: Two-element ``[x, y]`` pair of indexable sequences.
        test: Two-element ``[x, y]`` pair of indexable sequences.

    For every split one row ``x[i], y[i]`` is written per index of ``y``.
    """
    target_files = ('train.csv', 'validation.csv', 'test.csv')
    splits = (train, validation, test)

    for filename, split in zip(target_files, splits):
        with open(filename, 'w', newline='') as csv_file:
            x, y = split
            row_writer = csv.writer(
                csv_file,
                delimiter=',',
                quotechar='|',
                quoting=csv.QUOTE_MINIMAL,
            )
            row_writer.writerows([x[i], y[i]] for i in range(len(y)))

In [2]:
# Run the whole preparation pipeline: read the raw corpus, parse it into
# inputs (X) and 0/1 labels (Y), cut it into consecutive train / validation /
# test portions, then write the full set and each portion to CSV files.
data = load_data('smsspamcollection/SMSSpamCollection')
X, Y = preprocess_data(data)
train, validation, test = prepare_train_validation_test_split(X, Y)
save_raw_data(X, Y)
save_data(train, validation, test)

In [3]:
# Before update
import csv

def read_labels(filename):
    """Return the labels stored in the second column of a split CSV file.

    Args:
        filename: Path of a CSV file whose rows each have exactly two
            fields, comma-separated with ``|`` as the quote character.

    Returns:
        A list with the second field of every row converted to ``int``.

    Raises:
        ValueError: If a row does not have exactly two fields or its second
            field is not an integer literal.
    """
    with open(filename, newline='') as f:
        reader = csv.reader(f, delimiter=',', quotechar='|')
        return [int(label) for [_, label] in reader]


train_y = read_labels('train.csv')
validation_y = read_labels('validation.csv')
test_y = read_labels('test.csv')

# Report the class balance of every split, in the same format for each one.
for title, split_labels in [
    ('Training data:', train_y),
    ('Validation data:', validation_y),
    ('Testing data:', test_y),
]:
    print(title)
    print('Number of 0s: ', split_labels.count(0))
    print('Number of 1s: ', split_labels.count(1))

Training data:
Number of 0s:  3382
Number of 1s:  519
Validation data:
Number of 0s:  718
Number of 1s:  118
Testing data:
Number of 0s:  727
Number of 1s:  110


In [None]:
# After update
import csv

# Read the label column (second field) of every split file with one loop
# instead of three copies of the same reader code.
labels_by_file = {}
for filename in ('train.csv', 'validation.csv', 'test.csv'):
    with open(filename, newline='') as f:
        reader = csv.reader(f, delimiter=',', quotechar='|')
        # Each row must have exactly two fields; the second one is the label.
        labels_by_file[filename] = [int(label) for [_, label] in reader]

train_y = labels_by_file['train.csv']
validation_y = labels_by_file['validation.csv']
test_y = labels_by_file['test.csv']

# Report the class balance of every split, in the same format for each one.
for title, split_labels in [
    ('Training data:', train_y),
    ('Validation data:', validation_y),
    ('Testing data:', test_y),
]:
    print(title)
    print('Number of 0s: ', split_labels.count(0))
    print('Number of 1s: ', split_labels.count(1))