# Train Test Split
## Introduction
Train Test Split is a utility function that splits the features (X) and target (y) datasets into train and test subsets in a reproducible way. This is useful for comparing different models and hyperparameters on the same data. The function takes the two datasets together with a train size or a test size and returns the train and test subsets. It also accepts a random state to ensure reproducibility.

## Use Case
Train Test Split divides a dataset into a train set, which is used to fit a model, and a test set, which is used to evaluate it. Different models can be trained on the same train set and evaluated on the same test set, which makes it possible to compare models and hyperparameters.

In [112]:
# importing required libraries
import numpy as np
import pandas as pd

In [113]:
# shuffle datasets
def _shuffle_train_test(X, y, random_state):
    if random_state is not None:
        np.random.seed(random_state)

    X_len = len(X)
    random_indices = np.arange(X_len)
    np.random.shuffle(random_indices)

    X_copy = X.copy()
    y_copy = y.copy()

    for i in range(X_len):
        rand_index = random_indices[i]

        # a simple hack to make X_copy and y_copy indexable
        if type(X_copy) == pd.DataFrame:
            X_copy.iloc[i] = X_copy.iloc[rand_index]
            y_copy.iloc[i] = y_copy.iloc[rand_index]

        else:  # supposing X_copy is a numpy array or python list or pd.Series
            X_copy[i] = X[rand_index]
            y_copy[i] = y[rand_index]

    return X_copy, y_copy


# get train and test data lengths
def _get_train_test_len(X_len, train_size=None, test_size=None):
    # if train_size and test_size are not given, then split the dataset into 75% train and 25% test
    if train_size is None and test_size is None:
        train_size = 0.75
        test_size = 0.25

    else:
        # if train size is given, then calculate test size
        if train_size is not None:
            test_size = 1 - train_size

        # if test size is given, then calculate train size
        if test_size is not None:
            train_size = 1 - test_size

        # check if train_size is given and if it is valid
        assert (
            train_size is not None and train_size > 0 and train_size < 1
        ), "train_size must be between 0 and 1"

        # check if train_size is given and if it is valid
        assert (
            test_size is not None and test_size > 0 and test_size < 1
        ), "test_size must be between 0 and 1"

        # check if train_size and test_size are given and they sum upto 1
        assert (
            train_size is not None
            and test_size is not None
            and train_size + test_size == 1
        ), "train_size and test_size must sum upto 1"

    # gettting train and test lengths
    train_len = int(X_len * train_size)
    test_len = X_len - train_len

    return train_len, test_len


# train_test_split function
def train_test_split(
    X, y, train_size=None, test_size=None, random_state=None, shuffle=True
):
    """Split features and target into train and test parts.

    Parameters
    ----------
    X, y : sliceable containers of equal length
        Features and target.
    train_size, test_size : float, optional
        Fractions forwarded to ``_get_train_test_len``, which turns them
        into the number of train rows.
    random_state : int or None
        Forwarded to ``_shuffle_train_test`` when ``shuffle`` is true.
    shuffle : bool
        Shuffle X and y together before cutting; when false the original
        order is kept.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    AssertionError
        If X and y differ in length.
    """
    n_samples = len(X)
    assert n_samples == len(y), "X and y must have same length"

    if shuffle:
        X, y = _shuffle_train_test(X, y, random_state)

    # Only the train count is needed: everything after it is the test part.
    cut, _ = _get_train_test_len(n_samples, train_size, test_size)

    return X[:cut], X[cut:], y[:cut], y[cut:]

# Testing

Using Python lists

In [118]:
X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

# Each scenario is (label, random_state, shuffle) and is run twice in a row
# where the point is to compare two consecutive runs:
#   - if shuffle is False, the order of X and y will be preserved
#   - shuffling without a seed gives a different order on every call
#   - shuffling with the same seed gives the same order on every call
scenarios = [
    ('Unshuffled\t', None, False),
    ('Unshuffled\t', None, False),
    ('Shuffled\t', None, True),
    ('No Random Seed\t', None, True),
    ('Random Seed 42\t', 42, True),
    ('Random Seed 42\t', 42, True),
]

# NOTE(review): X is passed as the target as well, so `y` above is unused and
# the printed y_train / y_test mirror X_train / X_test. Passing `y` would
# exercise the target split, but the recorded output would then differ.
for label, seed, do_shuffle in scenarios:
    X_train, X_test, y_train, y_test = train_test_split(
        X, X, test_size=0.3, random_state=seed, shuffle=do_shuffle
    )
    print(label, X_train, X_test, y_train, y_test)

# types of the parts returned by the last call
print(type(X_train), type(X_test), type(y_train), type(y_test))

Unshuffled	 [1, 2, 3, 4, 5, 6, 7] [8, 9, 10] [1, 2, 3, 4, 5, 6, 7] [8, 9, 10]
Unshuffled	 [1, 2, 3, 4, 5, 6, 7] [8, 9, 10] [1, 2, 3, 4, 5, 6, 7] [8, 9, 10]
Shuffled	 [1, 2, 9, 6, 4, 5, 8] [10, 7, 3] [1, 2, 9, 6, 4, 5, 8] [10, 7, 3]
No Random Seed	 [10, 3, 1, 7, 9, 6, 4] [8, 2, 5] [10, 3, 1, 7, 9, 6, 4] [8, 2, 5]
Random Seed 42	 [9, 2, 6, 1, 8, 3, 10] [5, 4, 7] [9, 2, 6, 1, 8, 3, 10] [5, 4, 7]
Random Seed 42	 [9, 2, 6, 1, 8, 3, 10] [5, 4, 7] [9, 2, 6, 1, 8, 3, 10] [5, 4, 7]
<class 'list'> <class 'list'> <class 'list'> <class 'list'>


Using NumPy arrays

In [119]:
# ten random integer features; the target is twice the feature value
X = np.random.randint(low=1, high=100, size=10)
y = 2 * X

# 80% train / 20% test, shuffled with a fixed seed
parts = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = parts
print(*parts)
print(*(type(part) for part in parts))

[53 87 24 83 22 75  2 88] [75  3] [106 174  48 166  44 150   4 176] [150   6]
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>


Using Pandas DataFrames and Series

In [120]:
# importing dataset
# load the fish market dataset from the repository's datasets folder
dataset = pd.read_csv('./../../datasets/fish_market/Fish.csv')
dataset.head()

# target: the 'Weight' column
y = dataset['Weight']
# features: every column except 'Species' and the target
X = dataset.drop(columns=['Species', 'Weight'])

In [121]:
# 80% train split of the DataFrame / Series pair, shuffled with a fixed seed
parts = train_test_split(X, y, train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = parts

# sizes, then container types, of the four returned parts
print(*(len(part) for part in parts))
print(*(type(part) for part in parts))

127 32 127 32
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
