# Split Dataset #

### Splitting the dataset so that (for example) two-thirds of it is used to train the model, after which we measure the model's performance on the remaining third

In [11]:
import random
from typing import *

X = TypeVar('X') # Generic type to represent a data point #
Y = TypeVar('Y') # Generic type to represent output variables #

In [12]:
def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly split data into fractions [prob, 1 - prob].

    The caller's list is left untouched: a shallow copy is shuffled with
    ``random.shuffle`` and cut at ``int(len(data) * prob)``, so the size of
    the first part is rounded down.

    Args:
        data: The items to split.
        prob: Fraction of the items that goes into the first part; must
            lie in the closed interval [0, 1].

    Returns:
        A ``(first, second)`` tuple of lists that together contain every
        item of ``data`` exactly once, in shuffled order.

    Raises:
        ValueError: If ``prob`` is not within [0, 1].
    """
    if not 0 <= prob <= 1:
        # A negative prob would yield a negative cut index and slice from
        # the wrong end; a prob above 1 would silently act like 1.
        raise ValueError(f"prob must be between 0 and 1, got {prob!r}")

    shuffled = data[:]  # copy so the caller's ordering is preserved
    random.shuffle(shuffled)
    cut = int(len(shuffled) * prob)
    return shuffled[:cut], shuffled[cut:]

In [13]:
data = list(range(1000))
train, test = split_data(data, 0.75)

# A 75/25 split of 1000 items must produce 750 and 250 elements #
assert (len(train), len(test)) == (750, 250)

# Both parts together must hold exactly the original items, #
# regardless of the order the shuffle left them in #
assert sorted([*train, *test]) == data

### Often, we'll have paired input variables and output variables. In that case, we need to make sure to put corresponding values together in either the training data or the test data:

In [16]:
def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X],
                                               List[Y], List[Y]]:
    """Split paired inputs and outputs into training and test sets.

    The split is done on shuffled indices (via ``split_data``) rather than
    on the values, so ``xs[i]`` and ``ys[i]`` always land in the same set.

    Args:
        xs: Input data points.
        ys: Output values; ``ys[i]`` corresponds to ``xs[i]``.
        test_pct: Fraction of the pairs to place in the test set; the
            remaining ``1 - test_pct`` goes to the training set.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``xs`` and ``ys`` differ in length.
    """
    if len(xs) != len(ys):
        # Without this check a longer ys is silently truncated and a
        # shorter one fails later with an opaque IndexError.
        raise ValueError(
            f"xs and ys must have the same length, "
            f"got {len(xs)} and {len(ys)}"
        )

    # Generate the indices and split them #
    train_idxs, test_idxs = split_data(list(range(len(xs))), 1 - test_pct)

    return ([xs[i] for i in train_idxs],  # x_train #
            [xs[i] for i in test_idxs],   # x_test #
            [ys[i] for i in train_idxs],  # y_train #
            [ys[i] for i in test_idxs])   # y_test #

In [17]:
xs = list(range(1000))              # xs are 0 ... 999
ys = [2 * value for value in xs]    # each y_i is twice x_i
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)

# A 25% test share of 1000 pairs leaves 750 for training and 250 for testing
assert len(x_train) == 750 and len(y_train) == 750
assert len(x_test) == 250 and len(y_test) == 250

# Every input must still sit next to its own output after the split
assert all(target == 2 * feature
           for feature, target in zip(x_train, y_train))
assert all(target == 2 * feature
           for feature, target in zip(x_test, y_test))

### After that, you can do something like:

```
model = SomeKindOfModel()
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.33)
model.train(x_train, y_train)
performance = model.test(x_test, y_test)
```