# 3.1 Creating a Random Dataset for Machine Learning 
This example illustrates how to create a random dataset with a given number of samples, each having a given number of features. The dataset is split into training and validation sets, which are then visualized.


In [None]:
# Mount GDrive, change directory and check contents of folder.

import os
from google.colab import drive
from google.colab import files

# Project folder on the mounted Google Drive. Defined once and reused below so
# the path cannot drift between the constant and the chdir call.
PROJECT_FOLDER = "/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/3. Neurons"

# Mount Google Drive, then make the project folder the working directory.
drive.mount('/content/gdrive/')
os.chdir(PROJECT_FOLDER)
print("Current dir: ", os.getcwd())

# Declare the dataset class
#### [Data]
* Seeding the random number generators for reproducibility of experiments.
* Splitting a dataset.
* Normalizing the features.

#### [Python]
* Import from a package that exists inside the project folder.
* Class declaration that implements a random dataset.
* Default constructor parameters.

In [None]:
import numpy as np                      # use the package (a.k.a. namespace) with the alias "np"
from sklearn import datasets            # import a single object/subpackage from the package
from sklearn.model_selection import train_test_split    # import a standalone function from the package
from mllib.utils import RandomSeed                      # custom python package, file inside the package.
 

# ====================================================================================================
class CRandomDataset(object):
  """Random two-class dataset with two features per sample, plus a training/validation split.

  The samples and labels are generated in the constructor with
  ``sklearn.datasets.make_classification``. ``Split()`` fills the training-set
  (``TS*``) and validation-set (``VS*``) fields.

  Fields:
    Samples, Labels        : the full dataset as returned by ``make_classification``.
    SampleCount            : number of samples requested at construction.
    TSSamples, TSLabels    : training subset (``None`` until ``Split()`` is called).
    TSSampleCount          : number of training samples (0 until ``Split()`` is called).
    VSSamples, VSLabels    : validation subset (``None`` until ``Split()`` is called).
    VSSampleCount          : number of validation samples (0 until ``Split()`` is called).
  """
  # --------------------------------------------------------------------------------------
  # Constructor
  def __init__(self, p_nSampleCount=200, p_nRandomSeed=2022):
    """Generate the random dataset.

    Parameters:
      p_nSampleCount : number of samples to generate.
      p_nRandomSeed  : value passed to the project helper ``RandomSeed`` before
                       the data is generated. The default (2022) reproduces the
                       previously hard-coded seed.
    """
    # ................................................................
    # // Fields \\
    self.Samples   = None
    self.Labels    = None
    self.SampleCount = p_nSampleCount

    self.TSSamples = None
    self.TSLabels  = None
    self.TSSampleCount = 0

    self.VSSamples = None
    self.VSLabels  = None
    self.VSSampleCount = 0
    # ................................................................

    # make_classification is called without random_state, so reproducibility
    # relies on this call.
    # NOTE(review): presumably RandomSeed seeds the global random generators - confirm in mllib.utils.
    RandomSeed(p_nRandomSeed)
    self.Samples, self.Labels = datasets.make_classification(
        n_features=2,
        n_classes=2,
        n_samples=self.SampleCount,
        n_redundant=0,
        n_clusters_per_class=1
    )
  # --------------------------------------------------------------------------------------
  # Method
  def DebugPrint(self):
    """Print the shape, dtype and labels of the dataset.

    Side effect: ``self.Samples`` is replaced by a float32 version of itself.
    The conversion is kept here (rather than moved elsewhere) so that existing
    callers that rely on it keep working.
    """
    print("Shape of sample matrix", self.Samples.shape)
    print('.'*80)

    print("Datatype of sample matrix before convertion: %s" % str(self.Samples.dtype))
    # Convert the data to 32bit floating point numbers (default for faster computations)
    self.Samples = np.asarray(self.Samples, dtype=np.float32)
    print("Datatype of sample matrix after convertion: %s" % str(self.Samples.dtype))
    print('.'*80)

    # Classification into 2 classes == Binary classification
    print("Class labels")
    print(self.Labels)
    print('.'*80)
  # --------------------------------------------------------------------------------------
  def Split(self, p_nValidationFraction=0.10, p_nRandomState=2021):
    """Split the dataset into a training set and a validation set.

    Parameters:
      p_nValidationFraction : passed to ``train_test_split`` as ``test_size``.
                              The default (0.10) reproduces the previously
                              hard-coded value.
      p_nRandomState        : passed to ``train_test_split`` as ``random_state``
                              so the split is repeatable. The default (2021)
                              reproduces the previously hard-coded value.

    Fills the ``TS*`` and ``VS*`` fields and prints the size of each subset.
    """
    self.TSSamples, self.VSSamples, self.TSLabels, self.VSLabels = train_test_split(
        self.Samples, self.Labels,
        test_size=p_nValidationFraction, random_state=p_nRandomState)

    self.TSSampleCount = self.TSSamples.shape[0]
    self.VSSampleCount = self.VSSamples.shape[0]
    print("%d ssamples in the Training Set" % self.TSSampleCount)
    print("%d ssamples in the Validation Set"%  self.VSSampleCount)
    print('.'*80)
  # --------------------------------------------------------------------------------------
# ====================================================================================================



### Use the dataset class from a file


In [None]:
from Dataset import CRandomDataset

# Create random dataset 
This cell creates the dataset, splits it into a training set (TS) and a validation set (VS), and conditionally normalizes the feature values.

In [None]:
from sklearn import preprocessing   

# When True, the features are min-max scaled using statistics of the training set.
IS_MINMAX_NORMALIZED = True

oDataset = CRandomDataset(200)
oDataset.DebugPrint()

# Split BEFORE fitting the scaler: fitting on the full dataset would let the
# validation samples influence the min/max used for preprocessing (data leakage).
oDataset.Split()

# Scale the features to 0 .. 1 (range is exact for the training set only)
if IS_MINMAX_NORMALIZED:
    print("Unscaled sample #1:", oDataset.Samples[0])
    # Fit on the training set only, then apply the same transform everywhere.
    # Validation samples may therefore fall slightly outside 0 .. 1.
    oScaler = preprocessing.MinMaxScaler().fit(oDataset.TSSamples)
    oDataset.Samples = oScaler.transform(oDataset.Samples)
    oDataset.TSSamples = oScaler.transform(oDataset.TSSamples)
    oDataset.VSSamples = oScaler.transform(oDataset.VSSamples)
    print("Minmax normalized sample #1:", oDataset.Samples[0])

# Visualizing the feature space
We can visualize the samples that have only 2 features, because the feature space is 2D.

#### [Python]
* Using a class that encapsulates plots done with the matplotlib library.

In [None]:
from mllib.visualization import CPlot    

# Plot the full dataset, then the training and validation subsets, one figure each.
for sTitle, oSamples, oLabels in (
    ("Dataset", oDataset.Samples, oDataset.Labels),
    ("Training Set", oDataset.TSSamples, oDataset.TSLabels),
    ("Validation Set", oDataset.VSSamples, oDataset.VSLabels),
):
    oPlot = CPlot(sTitle, oSamples, oLabels)
    oPlot.Show(IS_MINMAX_NORMALIZED)