# Preparation of Colab runtime environment
Mount GDrive, change directory and check contents of folder.

In [3]:
import os
from google.colab import drive
from google.colab import files

drive.mount('/content/gdrive/')
print("-"*80)


# Change to this source code folder
os.chdir("/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/2. Data")
print("Current dir: ", os.getcwd())
print("-"*40, "Contents", "-"*40)
!ls "/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/2. Data"

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).
--------------------------------------------------------------------------------
Current dir:  /content/gdrive/My Drive/Colab Notebooks/CS345_SP22/2. Data
---------------------------------------- Contents ----------------------------------------
'2.1 BasicData.ipynb'	   '2.5 SerializingDataWithJSON.ipynb'
'2.2 LoadingData.ipynb'    'First Notebook.ipynb'
'2.3 Normalization.ipynb'   SomeRawData.csv
'2.4 DataSplitting.ipynb'


# Loading and Preprocessing Data: Min-Max Normalization
* Python object methods
* Numpy calculations with vectors

In [4]:
import numpy as np
import pandas as pd

#=================================================================
class CDataSet(object):
  # --------------------------------------------------------
  def __init__(self):
    self.IDs = None
    self.Targets = None
    self.Features = None
    self.CategoryIDs = dict()
    self.NextID = 0
  # ------------------------------------------------------
  def LoadFromFile(self, p_sFileName, p_sDelimiter=";"):
    oDataFrame = pd.read_csv(p_sFileName, delimiter=p_sDelimiter)
    oData = oDataFrame.to_numpy();

    # Slicing
    self.IDs        = oData[:, 0]
    self.Targets    = oData[:, 1]
    self.Features   = oData[:, 2:]

    for nIndex,sTextValue in enumerate(self.Features[:,2]):
      if not sTextValue in self.CategoryIDs:
        self.CategoryIDs[sTextValue] = self.NextID
        self.NextID += 1
      nID = self.CategoryIDs[sTextValue]
      self.Features[nIndex, 2] = self.NextID
    
    self.Features = self.Features.astype(np.float32)
  # ------------------------------------------------------
  def Preprocess(self):
    nMinimums = np.min(self.Features, axis=(0))  # Vector of minimum values for each feature
    nMaximums = np.max(self.Features, axis=(0))  # vector of maximum values for each feature
    print("Maximum values of features:", nMinimums)
    print("Minimum values of features:", nMaximums)

    for nSampleIndex, nSample in enumerate(self.Features):
      nUnscaledFeatureValues = self.Features[nSampleIndex, :]
      self.Features[nSampleIndex, :] = (nUnscaledFeatureValues - nMinimums) / (nMaximums - nMinimums)

  # ------------------------------------------------------
#=================================================================


oDataset = CDataSet()
oDataset.LoadFromFile("SomeRawData.csv")

print("-"*30, "Targets", "-"*30)
print(oDataset.Targets)
print("-"*30, "Features", "-"*30)
print(oDataset.Features)
print(oDataset.Features.dtype)   

print("-"*30, "Features (Normalized)", "-"*30)
oDataset.Preprocess()
print(oDataset.Features)



------------------------------ Targets ------------------------------
[1.0 0.0 1.0 0.0 0.0 1.0]
------------------------------ Features ------------------------------
[[ 4.2 -1.2  1. ]
 [ 2.3  0.5  1. ]
 [10.2 -0.8  2. ]
 [ 4.5  1.   3. ]
 [ 2.3 -4.   3. ]
 [ 1.   2.   3. ]]
float32
------------------------------ Features (Normalized) ------------------------------
Maximum values of features: [ 1. -4.  1.]
Minimum values of features: [10.2  2.   3. ]
[[0.34782606 0.46666667 0.        ]
 [0.14130434 0.75       0.        ]
 [1.         0.53333336 0.5       ]
 [0.38043478 0.8333333  1.        ]
 [0.14130434 0.         1.        ]
 [0.         1.         1.        ]]
