<a href="https://colab.research.google.com/github/Raunakaga2004/HOML-ch2/blob/main/TrainTestSplit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Equivalent in spirit to scikit-learn's train_test_split.

# Shuffles the rows randomly, splits them according to test_ratio, and returns the (train, test) pair.

import numpy as np;

def shuffle_and_split_data(data, test_ratio, seed = 42) :
  """Randomly split ``data`` into a train set and a test set.

  Args:
    data: pandas-style object supporting ``len()`` and ``.iloc``.
    test_ratio: fraction of rows to place in the test set; the test set
      gets ``int(len(data) * test_ratio)`` rows (rounded down).
    seed: seed for NumPy's global random number generator (default 42),
      so the same input always yields the same split.

  Returns:
    A ``(train, test)`` tuple of row subsets of ``data``.

  Note:
    This reseeds NumPy's global RNG as a side effect.
  """
  np.random.seed(seed)  # fixed seed => reproducible split

  n_rows = len(data)

  # Shuffle row *positions* (0..n_rows-1), not index labels: ``.iloc`` below
  # is positional, so permuting labels breaks on any non 0..n-1 index.
  shuffled_positions = np.random.permutation(n_rows)

  # Number of rows that go to the test set
  test_rows = int(n_rows * test_ratio)

  # The first ``test_rows`` shuffled positions form the test set,
  # the remainder forms the train set.
  test_positions = shuffled_positions[:test_rows]
  train_positions = shuffled_positions[test_rows:]

  return data.iloc[train_positions], data.iloc[test_positions]


# Limitations :
  # Without a fixed seed, every run produces a different train/test split. Over several runs the model (and you) would eventually see the whole dataset, which defeats the purpose of holding out a test set for validation.

  # => this is why the function above fixes the seed

# Solutions :
  # Save the test set on the first run, then load it in subsequent runs
  # Set the random number generator's seed

# Limitations of Solutions :
  # Both break when the dataset is updated: adding or removing rows changes the split, so rows can move between train and test

# Solution :
  # For a split that stays stable across dataset updates, hash each row's unique identifier and use the hash to decide whether the row goes to the train set or the test set

In [7]:
# Hash based splitting of train and test set of dataset

import numpy as np;
from zlib import crc32; # zlib is the library used for data compression. crc32 is a type of checksum algorithm, crc32 calculates a 32-bit checksum value based on the input data.

def is_id_in_test_set(identifier, test_ratio):
  """Decide from a row's identifier whether the row belongs to the test set.

  The identifier is converted to a 64-bit integer and hashed with CRC32;
  the row is in the test set when the checksum falls below
  ``test_ratio * 2**32``.

  Args:
    identifier: value identifying the row; must be convertible by ``np.int64``.
    test_ratio: fraction of the 32-bit checksum range assigned to the test set.

  Returns:
    True if the row goes to the test set, False if it goes to the train set.
  """
  checksum = crc32(np.int64(identifier))
  threshold = test_ratio * 2 ** 32
  return checksum < threshold

def split_data_with_hash(data, test_ratio, id_column):
  """Split ``data`` into train and test sets using a hash of each row's id.

  Args:
    data: table indexable by column name and by boolean mask via ``.loc``.
    test_ratio: fraction passed to ``is_id_in_test_set`` for every row.
    id_column: name of the column holding each row's identifier
      (intended to be unique per row, like a primary key).

  Returns:
    A ``(train, test)`` tuple: rows whose id is not selected for the test
    set, then rows whose id is.
  """
  def _goes_to_test(row_id):
    return is_id_in_test_set(row_id, test_ratio)

  # Boolean mask: True marks a test row, False marks a train row
  test_mask = data[id_column].apply(_goes_to_test)

  train_set = data.loc[~test_mask]
  test_set = data.loc[test_mask]
  return train_set, test_set

# Benefits :
  # A row's assignment depends only on its identifier, so existing rows stay in the same set on every run, even after new data is added (provided identifiers are unique and never change)

In [None]:
# Another option is scikit-learn's StratifiedShuffleSplit, which groups the data into strata (bins) based on a chosen column and then randomly samples the test_ratio fraction from each stratum

# Benefits :
  # The test set keeps the same distribution of that column as the full dataset
  # so it stays representative of the overall population with respect to that column