# Generating a Synthetic Dataset for Analysis and Transformation

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes of the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
import random
from numpy.random import seed as np_seed
from random import randint

In [None]:
# Seed Python's `random` module and NumPy's global generator with the same
# fixed value so the random draws made below are repeatable across runs.
random.seed(42)
np_seed(42)

In [None]:
def generate_x_value():
    """Return a random integer between 500 and 2000 (both ends inclusive)."""
    lowest, highest = 500, 2000
    return randint(lowest, highest)

def generate_y_value():
    """Return a random integer between 20 and 50 (both ends inclusive)."""
    lowest, highest = 20, 50
    return randint(lowest, highest)

In [None]:
# Draw 100 x samples, one generate_x_value() call per sample.
x_values = [generate_x_value() for _ in range(100)]

# Preview the first five samples.
x_values[:5]

In [None]:
# Draw 100 y samples, one generate_y_value() call per sample.
y_values = [generate_y_value() for _ in range(100)]

# Preview the first five samples.
y_values[:5]

In [None]:
# x2 is a linear function of x: x2 = 2 * x + 7000.
x2_values = [value * 2 + 7000 for value in x_values]
x2_values[:5]

In [None]:
# x3 is a linear function of x: x3 = 3 * x - 20.
x3_values = [value * 3 - 20 for value in x_values]
x3_values[:5]

In [None]:
# y2 is a linear function of y: y2 = 2 * y + 1000.
y2_values = [value * 2 + 1000 for value in y_values]
y2_values[:5]

In [None]:
import pandas as pd

# Collect the generated lists into one table, one column per list.
feature_columns = {
    "x": x_values,
    "x2": x2_values,
    "x3": x3_values,
    "y": y_values,
    "y2": y2_values,
}
df = pd.DataFrame(feature_columns)

df

In [None]:
import numpy as np

In [None]:
# A row is labeled 1 only when both x > 1000 and y > 35; every other row is 0.
is_positive = (df["x"] > 1000) & (df["y"] > 35)
df["label"] = is_positive.astype("int64")
df

In [None]:
# Flag the rows that fall in one of two regions: the region used for label 1
# (x > 1000 and y > 35), or the low corner (x < 800 and y < 30).
upper_region = (df["x"] > 1000) & (df["y"] > 35)
lower_region = (df["x"] < 800) & (df["y"] < 30)
df["keep"] = upper_region | lower_region
df

In [None]:
# Retain only the rows whose "keep" flag is True.
df = df.loc[df["keep"]]
df.head()

In [None]:
# Remove the helper "keep" column in place; df was already filtered on it.
del df["keep"]
df.head()

In [None]:
import matplotlib.pyplot as plt

# Split the rows by label so each label value is plotted as its own series.
groups = df.groupby("label")

for label_value, rows in groups:
    # Markers only (empty linestyle): one point per row at (x, y).
    plt.plot(
        rows["x"],
        rows["y"],
        marker="o",
        linestyle="",
        label=label_value,
    )

plt.legend()

In [None]:
import os

# Create the output directory with the standard library rather than a shell
# command, so the cell does not depend on a POSIX `mkdir` being available.
# exist_ok=True makes re-running the cell a no-op, like `mkdir -p`.
os.makedirs("tmp", exist_ok=True)

In [None]:
# Save the filtered, labeled rows as a CSV file under tmp/.
df.to_csv("tmp/synthetic.all.labeled.csv")

In [None]:
labeled_df = df
%store labeled_df

In [None]:
# Remove the "label" column in place, leaving only x, x2, x3, y and y2 in df.
del df["label"]
df.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the columns of df (cast to float) and transform them.
scaler = MinMaxScaler()
scaled_array = scaler.fit_transform(df.astype(float))

# Wrap the scaled array in a DataFrame that reuses df's column names and
# row index.
normalized_df = pd.DataFrame(
    scaled_array,
    columns=df.columns,
    index=df.index,
)

display(normalized_df.head())

In [None]:
# Save the scaled, unlabeled rows as a CSV file under tmp/.
normalized_df.to_csv("tmp/synthetic.all_normalized.unlabeled.csv")

In [None]:
# Bind the normalized frame to a more descriptive name (same object, no copy)
# and persist it with IPython's %store magic.
unlabeled_normalized_df = normalized_df
%store unlabeled_normalized_df