# Generating a Synthetic Dataset with additional columns containing random values

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes of the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
from sklearn.datasets import make_blobs

# Generate 5,000 two-feature points around two centers (one per class
# label). The first cluster is spread wider (std 6) than the second (std 4);
# random_state pins the draw so the data is the same on every run.
X, y = make_blobs(
    n_samples=5000,
    n_features=2,
    centers=2,
    cluster_std=[6, 4],
    random_state=42,
)

In [None]:
# Row count of the generated feature matrix; shown as the cell output.
n_samples = X.shape[0]
n_samples

In [None]:
import numpy as np

# Use a seeded generator so the noise columns are identical on every
# Restart & Run All, in line with the fixed random_state used for the blobs
# and for the train/validation/test splits.
rng = np.random.default_rng(42)

# Two columns of uniform random integers in [-100, 100) (upper bound is
# exclusive). They are drawn independently of X and y, so they carry no
# information about the label.
r1 = rng.integers(low=-100, high=100, size=n_samples)
r2 = rng.integers(low=-100, high=100, size=n_samples)

In [None]:
import pandas as pd

# Assemble the full dataset: the label comes first, followed by the two
# blob features (a, b) and the two random-integer columns (c, d).
all_dataset = pd.DataFrame({
    'label': y,
    'a': X[:, 0],
    'b': X[:, 1],
    'c': r1,
    'd': r2,
})

print(all_dataset)

In [None]:
from matplotlib import pyplot

# One colour per class label so the two groups can be told apart.
colors = {0: 'red', 1: 'blue'}
fig, ax = pyplot.subplots()
grouped = all_dataset.groupby('label')

# Draw every label group onto the same axes, column a against column b.
for label_value, members in grouped:
    members.plot(
        kind='scatter',
        x='a',
        y='b',
        ax=ax,
        label=label_value,
        color=colors[label_value],
    )

pyplot.show()

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 20% of all rows as the test set.
train_val, test = train_test_split(
    all_dataset, test_size=0.2, random_state=0
)

# Then carve 25% out of the remaining 80% for validation, which yields a
# 60/20/20 training/validation/test split of the original rows.
training, validation = train_test_split(
    train_val, test_size=0.25, random_state=0
)

In [None]:
# Create the local tmp directory that receives the CSV files
# (-p: do not fail if it already exists).
!mkdir -p tmp

In [None]:
# S3 destination (bucket and key prefix) used by the `aws s3 cp` cells in
# this notebook.
# NOTE(review): the bucket name looks like a placeholder - presumably it must
# be replaced with a bucket you own before running; confirm.
s3_bucket_name = "sagemaker-cookbook-bucket"
prefix = "chapter06/input"

In [None]:
# Write every split to tmp/ twice: once with a header row and once without.
# The header-less copies (tmp/<split>_data_no_header.csv) are the files the
# second `aws s3 cp` cell uploads, so they have to be created here; without
# them those uploads fail because the local files do not exist.
# index=False keeps the pandas row index out of the files, so `label` stays
# the first column.
for split_name, split_df in [('training', training),
                             ('validation', validation),
                             ('test', test)]:
    split_df.to_csv(f'tmp/{split_name}_data.csv',
                    header=True, index=False)
    split_df.to_csv(f'tmp/{split_name}_data_no_header.csv',
                    header=False, index=False)

In [None]:
# Upload the three CSV files that include a header row to the configured
# bucket and prefix. The braces are IPython shell interpolation of the
# Python variables s3_bucket_name and prefix.
!aws s3 cp tmp/training_data.csv s3://{s3_bucket_name}/{prefix}/training_data.csv
!aws s3 cp tmp/validation_data.csv s3://{s3_bucket_name}/{prefix}/validation_data.csv
!aws s3 cp tmp/test_data.csv s3://{s3_bucket_name}/{prefix}/test_data.csv

In [None]:
# Upload the header-less CSV variants to the same bucket and prefix.
# NOTE(review): the *_no_header.csv files must already have been written to
# tmp/ before this cell runs, otherwise these copies fail.
!aws s3 cp tmp/training_data_no_header.csv s3://{s3_bucket_name}/{prefix}/training_data_no_header.csv
!aws s3 cp tmp/validation_data_no_header.csv s3://{s3_bucket_name}/{prefix}/validation_data_no_header.csv
!aws s3 cp tmp/test_data_no_header.csv s3://{s3_bucket_name}/{prefix}/test_data_no_header.csv

In [None]:
# Persist the bucket name and prefix with IPython's %store magic so another
# notebook/kernel can restore them with `%store -r`.
%store s3_bucket_name
%store prefix

In [None]:
# Display the S3 URI of the training CSV (the copy with a header row).
f"s3://{s3_bucket_name}/{prefix}/training_data.csv"