In [1]:
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

import pandas as pd
import numpy as np
import boto3
import sagemaker

import os

import matplotlib
import matplotlib.pyplot as plt

## 0. Data Processing

In [2]:
# Load and Merge data
station = 'tch'

dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')
df_traffic = pd.read_csv('data/traffic_{}_all.csv'.format(station),
                 parse_dates={'Date': ['date']}, 
                 date_parser=dateparse)

df_traffic['datetime'] = df_traffic['Date'] + df_traffic['time'].astype('timedelta64[h]')
df_traffic = df_traffic[['datetime','time','people']]

df_weather = pd.read_csv('data/weather_{}_all.csv'.format(station))
df_weather['datetime'] = pd.to_datetime(df_weather['datetime'])

df = df_weather.merge(df_traffic,on='datetime')
df.set_index('datetime',inplace=True)#.asfreq('1H') #generate missing hours
#df.fillna(0,inplace=True) #fill missing hours with 0
display(df.head())
display(df.corr())

Unnamed: 0_level_0,air_presr,temp_℃,humidity,wind_spd_m_s,precipitation＿mm_t,time,people
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01 01:00:00,1020.4,13.8,86,1.6,0.0,1,8
2020-01-01 06:00:00,1020.1,13.9,92,2.3,0.0,6,1357
2020-01-01 07:00:00,1020.7,14.1,93,2.4,0.0,7,1421
2020-01-01 08:00:00,1021.3,14.4,92,1.6,0.0,8,1785
2020-01-01 09:00:00,1021.7,15.0,89,1.6,0.0,9,2841


Unnamed: 0,air_presr,temp_℃,humidity,wind_spd_m_s,precipitation＿mm_t,time,people
air_presr,1.0,0.545295,0.664944,0.20862,-0.01563,0.016148,-0.0024
temp_℃,0.545295,1.0,0.10683,0.016241,-0.024059,0.059052,0.065682
humidity,0.664944,0.10683,1.0,0.049035,0.092039,-0.052849,-0.089997
wind_spd_m_s,0.20862,0.016241,0.049035,1.0,-0.014389,0.113611,0.128371
precipitation＿mm_t,-0.01563,-0.024059,0.092039,-0.014389,1.0,0.006337,0.020481
time,0.016148,0.059052,-0.052849,0.113611,0.006337,1.0,0.492923
people,-0.0024,0.065682,-0.089997,0.128371,0.020481,0.492923,1.0


In [4]:
df_train = df.loc['2020-01-01':'2020-11-30']
df_test = df.loc['2020-12-01':'2020-12-31']


df_train.to_csv('{}_data/train_{}.csv'.format(station, station), index=False)
df_test.to_csv('{}_data/test_{}.csv'.format(station, station), index=False)

In [4]:
train_data = pd.read_csv("tms_data/train_tms.csv", skiprows=1, header=None, names=None)
train_data.iloc[:,:-1]

Unnamed: 0,0,1,2,3,4,5
0,1024.9,14.8,82,3.6,0.0,1
1,1024.4,14.9,87,4.1,0.0,6
2,1025.2,14.9,90,3.1,0.0,7
3,1025.6,15.4,89,3.5,0.0,8
4,1026.0,16.4,82,4.5,0.0,9
...,...,...,...,...,...,...
7028,1020.9,18.8,92,3.7,0.0,19
7029,1021.3,19.0,89,3.7,0.0,20
7030,1021.4,19.0,89,3.1,0.0,21
7031,1021.5,18.3,95,4.2,0.5,22


In [5]:
scaler = MinMaxScaler()

# store them in this dataframe
train_x=pd.DataFrame(scaler.fit_transform(df_train.astype(float)))
train_x

Unnamed: 0,0,1,2,3,4,5,6
0,0.997664,0.224359,0.763158,0.413793,0.0000,0.043478,0.000387
1,0.997177,0.227564,0.828947,0.471264,0.0000,0.260870,0.109371
2,0.997956,0.227564,0.868421,0.356322,0.0000,0.304348,0.158553
3,0.998345,0.243590,0.855263,0.402299,0.0000,0.347826,0.232941
4,0.998735,0.275641,0.763158,0.517241,0.0000,0.391304,0.246980
...,...,...,...,...,...,...,...
7028,0.993770,0.352564,0.894737,0.425287,0.0000,0.826087,0.403368
7029,0.994159,0.358974,0.855263,0.425287,0.0000,0.869565,0.278636
7030,0.994257,0.358974,0.855263,0.356322,0.0000,0.913043,0.268973
7031,0.994354,0.336538,0.934211,0.482759,0.0125,0.956522,0.262068


In [6]:
#train df
df_train_X = df.loc['2020-01-01':'2020-11-30', :'time']
df_train_y = df.loc['2020-01-01':'2020-11-30', 'people']
# Normalize and split data
scaler = MinMaxScaler()

# store them in this dataframe
df_scaled=pd.DataFrame(scaler.fit_transform(df_train_X.astype(float)))

# get same features and State-County indices
df_scaled.columns=df_train_X.columns
df_scaled.index=df_train_X.index




df_scaled.to_csv('{}_data/train_{}_X.csv'.format(station, station), index=False)
df_train_y.to_csv('{}_data/train_{}_y.csv'.format(station, station),index=False)


#test df
df_test_X = df.loc['2020-12-01':'2020-12-31', :'time']
df_test_y = df.loc['2020-12-01':'2020-12-31', 'people']
# Normalize and split data
scaler = MinMaxScaler()

# store them in this dataframe
df_scaled=pd.DataFrame(scaler.fit_transform(df_test_X.astype(float)))

# get same features and State-County indices
df_scaled.columns=df_test_X.columns
df_scaled.index=df_test_X.index

df_scaled.to_csv('{}_data/test_{}_X.csv'.format(station, station),index=False)
df_test_y.to_csv('{}_data/test_{}_y.csv'.format(station, station),index=False)

#df_train_y = df.iloc[:, -1].set_index('datetime')
# convert df to np array
#train_X_np = df_scaled.values.astype('float32')
#train_y_np  = df_train_y.values.reshape(df_train_y.shape[0],1)


## 1. Upload Data to S3

In [7]:
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# create an S3 bucket
bucket = sagemaker_session.default_bucket()

In [8]:
# should be the name of directory you created to save your features data
data_dir = '{}_data'.format(station)

# directories to save train/test data
#train_key = os.path.join(data_dir, '{}_train.csv')
#test_key = os.path.join(data_dir, '{}_test.csv')



# set prefix, a descriptive name for a directory  
prefix = 'capStoneProject_{}_data'.format(station)

# upload all data to S3
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print(input_data)

s3://sagemaker-us-east-1-716934411671/capStoneProject_tms_data


In [9]:
!pygmentize source_sklearn/train.py

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m print_function

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mpreprocessing[39;49;00m [34mimport[39;49;00m MinMaxScaler

[37m# sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. [39;49;00m
[37m#from sklearn.externals import joblib[39;49;00m
[37m# Import joblib package directly[39;49;00m
[34mimport[39;49;00m [04m[36mjoblib[39;49;00m

[37m## TODO: Import any additional libraries you need to define a model[39;49;00m
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36msvm[39;49;00m [34mimport[39;49;00m SVR
[37m#from sklearn import linear_model[39;49;00m

[37m# Provided model load function[39;49;00m
[34mdef[39;49;

## 2. Train a Estimator

In [10]:
#pd.read_csv(os.path.join("tms_data", "train_tms_y.csv"), skiprows = 1,header = None).iloc[:,0]
#pd.read_csv(os.path.join("tms_data", "train_tms_X.csv"), skiprows = 1,header = None)

In [11]:
from sagemaker.sklearn.estimator import SKLearn

# output path
output_path = 's3://{}/{}'.format(bucket, prefix)

# your import and estimator code, here
sklearn_estimator = SKLearn(entry_point='train.py',
                            source_dir='source_sklearn',
                            role=role,
                            framework_version='0.23-1',
                            instance_count=1,
                            instance_type='ml.m4.xlarge',
                            output_path=output_path,
                            sagemaker_session=sagemaker_session)

In [12]:
%%time

# Train your estimator on S3 training data
sklearn_estimator.fit({'train': os.path.join(input_data, 'train_tms.csv'),
                       'test': os.path.join(input_data, 'test_tms.csv')})

2021-03-20 06:54:30 Starting - Starting the training job...
2021-03-20 06:54:33 Starting - Launching requested ML instancesProfilerReport-1616223270: InProgress
......
2021-03-20 06:56:00 Starting - Preparing the instances for training.........
2021-03-20 06:57:26 Downloading - Downloading input data......
2021-03-20 06:58:29 Training - Training image download completed. Training in progress..[34m2021-03-20 06:58:30,180 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-03-20 06:58:30,183 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-20 06:58:30,193 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-03-20 06:59:31,021 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-20 06:59:31,034 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-20 06:59:31,04

## 3. Deploy the trained model 

In [13]:
# deploy your model to create a predictor
predictor = sklearn_estimator.deploy(instance_type='ml.m4.xlarge',
                                     initial_instance_count=1)

-----------------!

## Evaluating the Model

In [14]:
# read in test data, assuming it is stored locally
test_data = pd.read_csv(os.path.join('{}_data'.format(station), "test_tms.csv"), skiprows=1, header=None, names=None)



# labels are in the first column
test_y = test_data.iloc[:,-1]
test_x = test_data.iloc[:,:-1]
print(test_x.shape, test_y.shape)
scaler = MinMaxScaler()
# store them in this dataframe
test_x=pd.DataFrame(scaler.fit_transform(test_x.astype(float)))

(651, 6) (651,)


In [15]:
# First: generate predicted, class labels
test_y_preds = predictor.predict(test_x)

In [16]:
df_pred

NameError: name 'df_pred' is not defined

In [None]:
output_file_name = 'SVR_tms_rbg_c100_g10e-1_e10e-1'

# df_pred = pd.DataFrame(test_y_preds)
df_y = pd.DataFrame(test_y)
df_pred = pd.DataFrame(test_y_preds)

df_output = pd.merge(df_pred, df_y, left_index=True, right_index=True)
df_output = df_output.rename(columns={0: "pred", 6: "actual"})
df_output.to_csv("output/{}.csv".format(output_file_name),index=False)


fig, (ax1) = plt.subplots(figsize=(20, 6))


ax1.plot(df_pred[0])
ax1.plot(df_y[6])
ax1.set_title('{}: DEC 2020'.format('Taipei Main Station'), fontsize=20)


plt.savefig("output/{}.png".format(output_file_name))
plt.show()
#fill_holidays(df_plot1, ax1_weekends, ax1)

In [81]:
predictor.delete_endpoint()