# SageMaker Notebook Instance to Forecast Price

In [None]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri

import numpy as np
import io
import pandas as pd
from sklearn.model_selection import train_test_split

# Data Extraction and Preprocessing

- The code splits the 'time' column into 'date' and 'hour' columns and further splits the 'hour' column to exclude timezone information. 


- The 'time' column is then dropped since it is no longer needed. 


- Any rows with missing values in the variable `'total load actual'` are then dropped to ensure that the same amount of information is used to train this model.


- Any rows with missing values in the target variable <b>`'price actual'`</b> are then dropped. 


- Finally, the code creates two objects, X and y, which contain the independent and dependent variables, respectively, for use in a predictive mod

In [None]:
# Loading Datasets from S3 Bucket
df_energy = pd.read_csv("s3://bucket-aws-group-project-load-datasets-from-local/energy_dataset.csv")
df_weather = pd.read_csv("s3://bucket-aws-group-project-load-datasets-from-local/weather_features.csv")

# Merging datasets
df = pd.merge(df_energy, df_weather, left_on='time', right_on='dt_iso', how='right')

# Dropping non-relevant columns
# The columns dropped have been removed because they either:
# - contained all 0s instances
# - contained all NAs instances
# - caused target leackage because they contained the already forecasted value by the Transmission Service Operator (TSO)
df.drop(columns=['dt_iso', 'generation fossil coal-derived gas',
                 'generation fossil oil shale', 'generation fossil peat', 
                 'generation geothermal', 'generation marine', 'generation wind offshore', 
                 'generation hydro pumped storage aggregated',
                 'forecast wind offshore eday ahead', 'weather_description', 
                 'weather_id', 'weather_icon', 'forecast solar day ahead', 
                 'forecast wind onshore day ahead', 'total load forecast'], axis=1, inplace=True)


# Split the 'time' column into 'date' and 'hour' columns
df[['date', 'hour']] = df['time'].str.split(' ', expand=True)

# Split the 'hour' column further to exclude the timezone information
df['hour'] = df['hour'].str.split('+', expand=True)[0]

# Now we can drop the 'time' column, since we won't need it anymore to merge the tables
df.drop('time', axis=1, inplace=True)

# Drop rows with NAs in 'total load actual' to preserve same amount of data for this model.
df = df.dropna(subset=['total load actual'])

# Drop NAs in the target variable
df = df.dropna(subset=['price actual'])

# Create X matrix and y array
y = df['price actual']
X = df.drop('price actual', axis = 1)

In [None]:
# Create one-hot encoded columns for city_name
X = pd.get_dummies(X, columns=['city_name'])

In [None]:
# Create one-hot encoded columns for weather_main
X = pd.get_dummies(X, columns=['weather_main'])

In [None]:
### Adjust the date column
X['date'] = pd.to_datetime(X['date'])
# Double-validate the correct format of the date
X['date'] = X['date'].dt.strftime('%Y-%m-%d')

# Extract the year, month, and day components from the 'date' column
X['year'] = X['date'].apply(lambda x: int(x[:4]))
X['month'] = X['date'].apply(lambda x: int(x[5:7]))
X['day'] = X['date'].apply(lambda x: int(x[8:10]))
# Drop original column
X.drop('date', axis=1, inplace=True)

### Adjust the time column
from datetime import datetime
X['hour'] = X['hour'].apply(lambda x: datetime.strptime(x, '%H:%M:%S').time())
# We need to use apply since it's a Series
X['hour'] = X['hour'].apply(lambda x: x.hour)

# Add target variable as first column
X = pd.concat([y, X], axis=1)

# Split the data into training and testing
from sklearn.model_selection import train_test_split
train, test = train_test_split(X, test_size=0.2, random_state=2023)

In [None]:
#X.columns

# Uploading Channels with S3

In [None]:
s3 = boto3.resource('s3')

def upload_to_s3(df, bucket, filename):
    
    placeho# Connecting so S3
s3 = boto3.resource('s3')

#The function first creates an empty string buffer object using io.StringIO(), which will be used to hold the contents of the DataFrame in CSV format. 
#The to_csv() method is called on the DataFrame, with the header and index arguments set to False to exclude these from the CSV output. 
#The resulting CSV data is then written to the string buffer.
#Next, an S3 Object instance is created using the boto3 library, with the bucket and filename arguments passed to the constructor. 
#The put() method is called on the Object instance with the contents of the string buffer passed as the Body argument. 

def upload_to_s3(df, bucket, filename):
    
    placeholder = io.StringIO()
    df.to_csv(placeholder, header=False, index=False)
    object = s3.Object(bucket, filename)
    object.put(Body=placeholder.getvalue())
    
###This uploads the file to the S3 bucket with the specified filename.lder = io.StringIO()
    df.to_csv(placeholder, header=False, index=False)
    object = s3.Object(bucket, filename)
    object.put(Body=placeholder.getvalue())

In [8]:
upload_to_s3(train, 'bucket-for-sagemaker-from-notebook', 'sagemaker-data-train.csv')
upload_to_s3(test, 'bucket-for-sagemaker-from-notebook', 'sagemaker-data-test.csv')

# Instanciating the Model

In [9]:
# This line retrieves the execution role associated with the current SageMaker notebook instance. 
# This role is used to grant permissions to the SageMaker training job to access other AWS services like S3.
role = sagemaker.get_execution_role()
# This line gets the AWS region name where the notebook instance is running. 
# This information is used to set the region for the SageMaker training job.
region_name = boto3.Session().region_name
#container = get_image_uri(region_name, 'xgboost', '0.90-1')  # Old version

# This line retrieves the Amazon ECR container location for the XGBoost algorithm image. 
# It uses the sagemaker.image_uris.retrieve() method to get the image URI, passing in the region, the name of the algorithm, and the version number.
container = sagemaker.image_uris.retrieve('xgboost', region_name, version='0.90-1')
# This line sets the S3 output location for the trained model artifacts. 
output_location = 's3://bucket-for-sagemaker-from-notebook/'

# This line defines a dictionary of hyperparameters to be passed to the XGBoost algorithm for training. 
# In this case, it sets the number of rounds to 200 and the objective to 'reg:squarederror', which is a regression objective that minimizes the mean squared error.
hyperparams = {
    'num_round': '200',
    'objective': 'reg:squarederror'
}


# This line creates a SageMaker estimator object, which encapsulates information about the training job to be launched. 
# It specifies the Docker container for the XGBoost algorithm image, the execution role, the number of instances to use, the instance type, the S3 output path for the trained model, the hyperparameters to use for training, and the SageMaker session object. 
# The session object is used to create the training job in the specified AWS region.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session()
)

# Setting Up Input Channels

In [10]:
# This line creates an S3 input channel for the training data. 
# It uses the sagemaker.session.s3_input() method to create an input object, passing in the S3 location of the training data and the content type of the data (in this case, CSV). 
# The resulting train_channel object represents the input data for the training job.
train_channel = sagemaker.session.s3_input(
    's3://bucket-for-sagemaker-from-notebook/sagemaker-data-train.csv',
    content_type='text/csv'
)

# This line creates a separate S3 input channel for the validation data. 
# It follows the same pattern as the previous line, but specifies the location of the validation data and a separate val_channel object.
val_channel = sagemaker.session.s3_input(
    's3://bucket-for-sagemaker-from-notebook/sagemaker-data-test.csv',
    content_type='text/csv'
)

# This line creates a dictionary object called channels_for_training that maps channel names to input channels. 
# In this case, the dictionary has two entries: 'train', which maps to the train_channel object representing the training data, and 'validation', which maps to the val_channel object representing the validation data.
channels_for_training = {
    'train': train_channel,
    'validation': val_channel
}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


# Fitting the model

In [11]:
estimator.fit(inputs=channels_for_training, logs=False)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-13-14-14-45-478



2023-03-13 14:14:45 Starting - Starting the training job.......
2023-03-13 14:15:25 Starting - Preparing the instances for training................
2023-03-13 14:16:49 Downloading - Downloading input data.....
2023-03-13 14:17:19 Training - Downloading the training image.....
2023-03-13 14:17:50 Training - Training image download completed. Training in progress.....
2023-03-13 14:18:16 Uploading - Uploading generated training model..
2023-03-13 14:18:32 Completed - Training job completed


In [12]:
#estimator._current_job_name

# Getting the Metrics of the Model

In [13]:
metrics = sagemaker.analytics.TrainingJobAnalytics(
    estimator._current_job_name,
    metric_names=['train:rmse', 'validation:rmse']
)

In [None]:
#metrics

# Deploying the model

In [14]:
# Deploy model
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=sagemaker.serializers.CSVSerializer())


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-03-13-14-18-33-729
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-03-13-14-18-33-729
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-03-13-14-18-33-729


--------!

# Running and Storing Predictions

<div class="alert alert-block alert-info">The purpose of this function is to <b>generate new data points</b> for testing and evaluating a predictive model trained on the original dataset. By randomly generating values for the different features, the function simulates real-world variability in the data and allows the model to be tested on a range of different input values.</div>


This function generates a dictionary object with randomly generated values for different features related to energy and weather data.

The function creates a dictionary object called new_data with 58 keys, each of which corresponds to a different feature. The values of these features are randomly generated using numpy's `np.random.randint()` function, which generates integers within specified ranges.

The function also sets the values of two features, `'city_name'` and `'weather_main'`, based on randomly generated indices. The corresponding element in the new_data dictionary is then set to 1, depending on the values of these indices.

In [None]:
import random
import pandas as pd
import numpy as np

import time

def get_new_data():
    
    
    
    new_data = {
    'generation biomass': np.random.randint(0, 600),
    'generation fossil brown coal/lignite': np.random.randint(0, 1000),
    'generation fossil gas': np.random.randint(0, 13000),
    'generation fossil hard coal': np.random.randint(0, 8000),
    'generation fossil oil': np.random.randint(0, 500),
    'generation hydro pumped storage consumption': np.random.randint(0, 4000),
    'generation hydro run-of-river and poundage': np.random.randint(0, 2000),
    'generation hydro water reservoir': np.random.randint(0, 9000),
    'generation nuclear': np.random.randint(2000, 7000),
    'generation other': np.random.randint(0, 100),
    'generation other renewable': np.random.randint(0, 100),
    'generation solar': np.random.randint(0, 6000),
    'generation waste': np.random.randint(0, 350),
    'generation wind onshore': np.random.randint(0, 18000),
    'price day ahead': np.random.randint(2, 100),
    'temp': np.random.randint(250, 275),
    'temp_min': np.random.randint(150, 250),
    'temp_max': np.random.randint(275, 350),
    'pressure': np.random.randint(0, 5000),
    'humidity': np.random.randint(10, 100),
    'wind_speed': np.random.randint(0, 30),
    'wind_deg': np.random.randint(0, 360),
    'rain_1h': np.random.randint(0, 8),
    'rain_3h': 0,
    'snow_3h': 0,
    'clouds_all': np.random.randint(0, 90),
    'year': np.random.randint(2016, 2018),
    'month': np.random.randint(1, 12), 
    'day': np.random.randint(1, 28),
    'hour': np.random.randint(0, 23),
    'city_name_ Barcelona': 0,
    'city_name_Bilbao': 0,
    'city_name_Madrid': 0,
    'city_name_Seville': 0,
    'city_name_Valencia': 0,
    'weather_main_clear': 0,
    'weather_main_clouds': 0,
    'weather_main_drizzle': 0,
    'weather_main_dust': 0,
    'weather_main_fog': 0,
    'weather_main_haze': 0,
    'weather_main_mist': 0,
    'weather_main_rain': 0,
    'weather_main_smoke': 0,
    'weather_main_snow': 0,
    'weather_main_squall': 0,
    'weather_main_thunderstorm': 0, 
    'total load actual': np.random.randint(18000, 40000)}
    
    # Generate random indices for city_name and weather_main
    city_index = np.random.randint(0, 5)
    weather_index = np.random.randint(0, 11)
    # Set the corresponding element in the new_data dictionary to 1
    if city_index == 0:
        new_data['city_name_ Barcelona'] = 1
    elif city_index == 1:
        new_data['city_name_Bilbao'] = 1
    elif city_index == 2:
        new_data['city_name_Madrid'] = 1
    elif city_index == 3:
        new_data['city_name_Seville'] = 1
    else:
        new_data['city_name_Valencia'] = 1

    if weather_index == 0:
        new_data['weather_main_clear'] = 1
    elif weather_index == 1:
        new_data['weather_main_clouds'] = 1
    elif weather_index == 2:
        new_data['weather_main_drizzle'] = 1
    elif weather_index == 3:
        new_data['weather_main_dust'] = 1
    elif weather_index == 4:
        new_data['weather_main_fog'] = 1
    elif weather_index == 5:
        new_data['weather_main_haze'] = 1
    elif weather_index == 6:
        new_data['weather_main_mist'] = 1
    elif weather_index == 7:
        new_data['weather_main_rain'] = 1
    elif weather_index == 8:
        new_data['weather_main_smoke'] = 1
    elif weather_index == 9:
        new_data['weather_main_snow'] = 1
    else:
        new_data['weather_main_thunderstorm'] = 1

    return new_data

In [None]:
# create a new row of data as a pandas dataframe
X_new = pd.DataFrame([get_new_data()])
predictor.predict(X_new.iloc[0])
import json
# Convert the predictions to JSON format
predictions = predictor.predict(X_new.iloc[0]).decode('utf-8')
predictions_json = json.dumps(predictions)
predictions_final = float(predictions_json.strip('" '))

In [None]:
predictions_final

In [None]:
X_new['predicted_value'] = predictions_final

In [None]:
#X_new

In [None]:
# Initialize Dataframe to store predictions
results_df = pd.DataFrame()

# Run Loop
for i in range(10000):
    # Generate new row of Independent Variables for this iteration
    X_new = pd.DataFrame([get_new_data()])
    
    ### Get the prediction for this data
    # Convert the predictions to JSON format. Decode from binary.
    predictions = predictor.predict(X_new.iloc[0]).decode('utf-8')
    # Store JSON file
    predictions_json = json.dumps(predictions)
    # Store final prediction as floating, after removing string characters in excess.
    predictions_final = float(predictions_json.strip('" '))
    
    # Add the prediction to the new data DataFrame
    X_new['predicted_value'] = predictions_final
    
    # Append the new data to the results DataFrame
    results_df = results_df.append(X_new, ignore_index=True)


In [3]:
#results_df

# Uploading Predictions to S3

In [None]:
# define the S3 bucket and key to upload to
bucket_name = 'buket-containing-predictions-for-lambda'
key = 'predictions_10000_price.csv'

# write the DataFrame to a local CSV file
csv_buffer = results_df.to_csv(index=False)

# upload the CSV file to S3
s3 = boto3.resource('s3')
s3.Bucket(bucket_name).put_object(Key=key, Body=csv_buffer)