# Generating a Synthetic Dataset for Deep Learning Experiments

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains code to help readers work through one of the recipes in the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
import numpy as np

In [None]:
def formula(x):
    """Piecewise-linear target function.

    Returns ``x`` unchanged when ``x`` is at least -2000; otherwise returns
    ``-x - 4000``. The two pieces meet at ``x = -2000``, where both give -2000.
    """
    return x if x >= -2000 else -x - 4000

In [None]:
# Sanity check: 100 is not below -2000, so formula returns it unchanged.
formula(100)

In [None]:
def generate_synthetic_data(n_samples=1000,
                            start=-5000,
                            end=5000,
                            seed=42):
    """Generate a noisy one-dimensional regression dataset.

    Integer inputs are drawn uniformly from ``[start, end)`` and passed
    through ``formula``; Gaussian noise with mean 150 and standard deviation
    150 is then added to form the targets.

    Parameters
    ----------
    n_samples : int
        Number of (x, y) pairs to generate. Zero yields two empty arrays.
    start : int
        Lowest value that may be drawn for x (inclusive).
    end : int
        Upper bound for x (exclusive).
    seed : int or None
        Seed applied to NumPy's global random generator before sampling.
        The default of 42 reproduces the dataset produced when the seed was
        fixed inside the function. Pass ``None`` to have NumPy reseed itself
        from fresh entropy (results are then not reproducible).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``x`` (integers, shape ``(n_samples,)``) and ``y`` (floats, same
        shape).
    """
    # NOTE: this reseeds the *global* NumPy generator, so it also affects any
    # later np.random.* call made elsewhere in the notebook.
    np.random.seed(seed)

    x = np.random.randint(low=start,
                          high=end,
                          size=(n_samples,)).astype(int)

    # Without an explicit otypes, np.vectorize has to call the function on
    # the first element to infer the output dtype and therefore raises on an
    # empty input. formula maps integers to integers, so x's dtype is reused.
    clean_y = np.vectorize(formula, otypes=[x.dtype])(x)
    noise = np.random.normal(150, 150, n_samples)
    y = clean_y + noise

    return (x, y)

In [None]:
# Generate the dataset using the function's default arguments.
X, y = generate_synthetic_data()

In [None]:
# Peek at the first ten input values.
X[:10]

In [None]:
# Peek at the first ten (noisy) target values.
y[:10]

In [None]:
from matplotlib import pyplot
# Enlarge the default figure size for this and any later figures.
pyplot.rcParams["figure.figsize"] = (10,8)
# s=1 keeps the markers tiny so individual points stay distinguishable.
pyplot.scatter(X,y,s=1)
# Label the figure so it can be read on its own when the notebook is skimmed.
pyplot.xlabel("x")
pyplot.ylabel("y")
pyplot.title("Synthetic dataset: noisy target y versus input x")
pyplot.show()

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Step 2: split the remaining samples again, keeping 20% of them for
# validation. X_train / y_train are rebound to the part that is left.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

In [None]:
print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)

In [None]:
# Create the local tmp directory for the CSV files; -p makes this a no-op if it already exists.
!mkdir -p tmp

In [None]:
import pandas as pd

# Build one two-column frame per dataset. 'y' is listed before 'x', so the
# target ends up in the first CSV column and the input in the second.
df_all_data = pd.DataFrame({'y': y, 'x': X})
df_training_data = pd.DataFrame({'y': y_train, 'x': X_train})
df_validation_data = pd.DataFrame({'y': y_validation, 'x': X_validation})
df_test_data = pd.DataFrame({'y': y_test, 'x': X_test})

# Write the raw values only: no header row and no index column.
df_all_data.to_csv('tmp/all_data.csv', header=False, index=False)
df_training_data.to_csv('tmp/training_data.csv', header=False, index=False)
df_validation_data.to_csv('tmp/validation_data.csv', header=False, index=False)
df_test_data.to_csv('tmp/test_data.csv', header=False, index=False)

In [None]:
# Destination bucket for the uploads below: replace the placeholder with the name of your own bucket.
s3_bucket = '<insert s3 bucket name here>'
# Key prefix under which the uploaded files are placed inside the bucket.
prefix = "chapter03"

In [None]:
# Upload the complete (unsplit) dataset. The source must be tmp/all_data.csv;
# copying the training split here would store it under the all_data.csv key.
!aws s3 cp tmp/all_data.csv \
s3://{s3_bucket}/{prefix}/synthetic/all_data.csv

In [None]:
# Upload the training split; {s3_bucket} and {prefix} are filled in from the Python variables of the same names.
!aws s3 cp tmp/training_data.csv \
s3://{s3_bucket}/{prefix}/synthetic/training_data.csv

In [None]:
# Upload the validation split to the same bucket and prefix.
!aws s3 cp tmp/validation_data.csv \
s3://{s3_bucket}/{prefix}/synthetic/validation_data.csv

In [None]:
# Upload the held-out test split to the same bucket and prefix.
!aws s3 cp tmp/test_data.csv \
s3://{s3_bucket}/{prefix}/synthetic/test_data.csv