# Performing the Train-Test Split on a Time Series Dataset

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes of the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [None]:
def load_data_from_json(filename):
    """Load a time-series description from JSON and index its values by date.

    The parsed JSON object must provide the keys "t0", "freq", "length"
    and "data".

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed dict, with "data" replaced by a ``pandas.Series`` whose
        index is a date range starting at "t0", stepping by "freq" and
        containing "length" entries.
    """
    with open(filename) as source:
        payload = json.load(source)

    # Build the timestamps first, then wrap the raw values with them.
    timestamps = pd.date_range(
        start=payload["t0"],
        periods=payload["length"],
        freq=payload["freq"],
    )
    payload["data"] = pd.Series(payload["data"], index=timestamps)

    return payload

In [None]:
# Parse the complete series from disk, then echo it as the cell output.
time_series_data = load_data_from_json("tmp/all.json")
time_series_data

In [None]:
def train_test_split(data, ratio=0.9):
    """Split a chronologically ordered series into training and held-out parts.

    The first ``int(len(data) * ratio)`` observations become the training
    set; the number of remaining observations is the prediction length.

    Args:
        data: Sliceable sequence of observations (the notebook passes a
            ``pandas.Series``).
        ratio: Fraction of ``data`` assigned to the training set. Must lie
            in the closed interval [0, 1].

    Returns:
        dict with the keys:
            "prediction_length": number of observations held out.
            "training_dataset": the first ``train_length`` observations.
            "target_dataset": the held-out observations, preceded by the
                last training observation when one exists.
            "test_dataset": the complete, unsplit ``data``.

    Raises:
        ValueError: If ``ratio`` is outside [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    total_length = len(data)
    train_length = int(total_length * ratio)
    prediction_length = total_length - train_length

    # Slice from the front: ``data[:-prediction_length]`` would yield an
    # empty training set when nothing is held out, because ``-0 == 0``.
    training_dataset = data[:train_length]

    # The target deliberately starts one step before the split, so it shares
    # the last training observation (presumably so the two plotted curves
    # join). Clamp at 0 so an empty training set does not wrap around to
    # ``data[-1:]``.
    target_start = max(train_length - 1, 0)
    target_dataset = data[target_start:]

    test_dataset = data

    return {
        "prediction_length": prediction_length,
        "training_dataset": training_dataset,
        "target_dataset": target_dataset,
        "test_dataset": test_dataset
    }

In [None]:
# Split the loaded series with the default ratio and report the horizon size.
full_series = time_series_data["data"]
results = train_test_split(full_series)
print(results["prediction_length"])

In [None]:
training_dataset = results["training_dataset"]
target_dataset = results["target_dataset"]

# Draw both portions on one set of axes via the explicit figure/axes API.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(training_dataset.index, training_dataset, label="training")
ax.plot(target_dataset.index, target_dataset, label="target")
ax.grid(True)
ax.set_xlabel("DATE")
ax.set_ylabel("VALUE")
ax.legend()
plt.show()

In [None]:
def series_to_object(data):
    """Convert an indexed series into a ``{"start", "target"}`` dict.

    Args:
        data: Series-like object exposing ``.index`` and iterable values.

    Returns:
        dict whose "start" is the string form of the first index entry and
        whose "target" is the list of values.
    """
    first_stamp = data.index[0]
    values = list(data)
    return {"start": str(first_stamp), "target": values}

In [None]:
def series_to_jsonline(data):
    """Return the JSON text for the start/target object of ``data``."""
    record = series_to_object(data)
    return json.dumps(record)

In [None]:
def save_data_to_jsonlines(data, filename):
    """Write one series to ``filename`` as a single UTF-8 encoded JSON line.

    Args:
        data: Series accepted by ``series_to_jsonline``.
        filename: Destination path; an existing file is overwritten.
    """
    # Serialize before opening the file, so a serialization failure cannot
    # leave a truncated or empty output file behind.
    line = series_to_jsonline(data) + "\n"

    # Binary mode keeps the bytes (including the "\n" terminator) identical
    # on every platform.
    with open(filename, 'wb') as file:
        file.write(line.encode("utf-8"))

In [None]:
# Training file: only the training portion of the series.
save_data_to_jsonlines(results["training_dataset"], "tmp/training.jsonl")
# Test file: the "test_dataset" entry of the split results.
save_data_to_jsonlines(results["test_dataset"], "tmp/test.jsonl")

In [None]:
# S3 bucket name and key prefix; change these to match your own account.
s3_bucket = "sagemaker-cookbook-bucket"
prefix = "chapter08"

In [None]:
# Copy both JSON Lines files to s3://<s3_bucket>/<prefix>/input/ with the AWS CLI.
# {s3_bucket} and {prefix} are interpolated from the Python variables of the same name.
!aws s3 cp tmp/training.jsonl s3://{s3_bucket}/{prefix}/input/training.jsonl
!aws s3 cp tmp/test.jsonl s3://{s3_bucket}/{prefix}/input/test.jsonl


In [None]:
# Persist the held-out horizon size; another notebook can restore it with `%store -r`.
prediction_length = results["prediction_length"]
%store prediction_length

In [None]:
# Persist the series frequency string that was read from the input JSON.
freq = time_series_data["freq"]
%store freq

In [None]:
# Persist the training portion, then display it as the cell output.
training_dataset = results["training_dataset"]
%store training_dataset
training_dataset

In [None]:
# Persist the target portion, then display it as the cell output.
target_dataset = results["target_dataset"]
%store target_dataset
target_dataset

In [None]:
# Persist the S3 bucket name and key prefix for use outside this kernel session.
%store s3_bucket
%store prefix