# Preparation of Colab runtime environment
Mount Google Drive, change into the course data folder, and list its contents.

In [None]:
import os
from google.colab import drive
from google.colab import files

drive.mount('/content/gdrive/')
print("-"*80)


# Change to this source code folder
os.chdir("/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/2. Data")
print("Current dir: ", os.getcwd())
print("-"*40, "Contents", "-"*40)
!ls "/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/2. Data"

# Splitting Data
* Python: Importing a single name (function or class) from a package
* Python: numpy array shapes


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#=================================================================
class CDataSet(object):
  """Container for a tabular data set loaded from a delimited text file.

  Attributes:
    IDs:         first column of the loaded file (one entry per sample).
    Targets:     second column of the loaded file.
    Features:    remaining columns, converted to a float32 numpy array.
    CategoryIDs: dict mapping each distinct categorical value to its
                 integer code.
    NextID:      next unused integer code for CategoryIDs.
    Training:    CDataSet with the training subset (set by Split).
    Validation:  CDataSet with the validation subset (set by Split).
  """
  # --------------------------------------------------------
  def __init__(self):
    self.IDs = None
    self.Targets = None
    self.Features = None
    self.CategoryIDs = dict()
    self.NextID = 0

    self.Training = None
    self.Validation = None
  # ------------------------------------------------------
  def LoadFromFile(self, p_sFileName, p_sDelimiter=";", p_nCategoryColumn=2):
    """Load samples from a delimited file and encode one categorical feature.

    Column 0 becomes IDs, column 1 becomes Targets and all remaining
    columns become Features. The feature column p_nCategoryColumn is
    replaced by integer codes before Features is cast to float32.

    Args:
      p_sFileName:       path or file-like object handed to pandas.read_csv.
      p_sDelimiter:      column delimiter of the file.
      p_nCategoryColumn: index (within Features) of the categorical column
                         to encode; None skips the encoding step.
    """
    oDataFrame = pd.read_csv(p_sFileName, delimiter=p_sDelimiter)
    oData = oDataFrame.to_numpy()

    # Slicing
    self.IDs        = oData[:, 0]
    self.Targets    = oData[:, 1]
    self.Features   = oData[:, 2:]

    if p_nCategoryColumn is not None:
      for nIndex, sTextValue in enumerate(self.Features[:, p_nCategoryColumn]):
        if sTextValue not in self.CategoryIDs:
          self.CategoryIDs[sTextValue] = self.NextID
          self.NextID += 1
        # Store the code assigned to THIS value (not the next free code), so
        # equal values always receive the same code.
        self.Features[nIndex, p_nCategoryColumn] = self.CategoryIDs[sTextValue]

    self.Features = self.Features.astype(np.float32)
  # ------------------------------------------------------
  def Preprocess(self):
    """Min-max scale every feature column of Features in place.

    Each column is mapped to (value - min) / (max - min). A column whose
    values are all equal has a zero range; it is scaled to 0 instead of
    producing NaN from a division by zero.
    """
    nMinimums = np.min(self.Features, axis=0)  # Vector of minimum values for each feature
    nMaximums = np.max(self.Features, axis=0)  # Vector of maximum values for each feature
    print("Maximum values of features:", nMaximums)
    print("Minimum values of features:", nMinimums)

    nRanges = nMaximums - nMinimums
    nRanges[nRanges == 0] = 1  # constant feature: avoid 0/0

    # Broadcasting applies the per-feature vectors to every sample row;
    # assigning through [...] keeps the existing array object and dtype.
    self.Features[...] = (self.Features - nMinimums) / nRanges

  # ------------------------------------------------------
  def Split(self, p_nValidationSamplesPercentage, p_nRandomSeed=2022):
    """Randomly split the samples into Training and Validation subsets.

    Args:
      p_nValidationSamplesPercentage: forwarded as test_size to
                                      train_test_split.
      p_nRandomSeed:                  forwarded as random_state, making the
                                      split reproducible.

    The subsets are stored as new CDataSet objects in self.Training and
    self.Validation. Sample IDs are split alongside features and targets
    when they are available with one ID per sample.
    """
    oArrays = [self.Features, self.Targets]
    bSplitIDs = self.IDs is not None and len(self.IDs) == len(self.Features)
    if bSplitIDs:
      oArrays.append(self.IDs)

    # train_test_split returns a (train, validation) pair per input array
    oParts = train_test_split(*oArrays
                              , test_size=p_nValidationSamplesPercentage
                              , random_state=p_nRandomSeed)

    self.Training  = CDataSet()
    self.Training.Features = oParts[0]
    self.Training.Targets = oParts[2]

    self.Validation = CDataSet()
    self.Validation.Features = oParts[1]
    self.Validation.Targets = oParts[3]

    if bSplitIDs:
      self.Training.IDs = oParts[4]
      self.Validation.IDs = oParts[5]

  # ------------------------------------------------------
#=================================================================


# Build the data set: load the raw file, scale the features, then hold out
# one sixth of the samples for validation.
oDataset = CDataSet()
oDataset.LoadFromFile("SomeRawData.csv")
oDataset.Preprocess()
oDataset.Split(1.0 / 6.0)


# Report the array shapes of both subsets, training first.
for sTitle, oSubset in (("Training Set Shapes", oDataset.Training),
                        ("Validation Set Shapes", oDataset.Validation)):
  sRule = "-" * 30
  print(sRule, sTitle, sRule)
  print(oSubset.Features.shape)
  print(oSubset.Targets.shape)


