In [None]:
### AUTHOR: RAJ MEHTA
### Visit my profile : https://rajmehta.info/
### Github: https://github.com/Raj-Mehta2012
### LinkedIn: https://www.linkedin.com/in/raj-kamlesh-mehta/

In [21]:
import io
import boto3
import os
import pandas as pd
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve
from sagemaker import get_execution_role

# Locate the processed CSV in S3 and load it into a DataFrame.
#
# Failures are raised instead of calling exit(): inside a notebook exit()
# tears down the kernel, whereas an exception stops "Run All" at this cell
# and shows the underlying traceback.
s3_bucket = "processed-meta-prices"
s3_client = boto3.client("s3")

# The "Contents" key is absent from the response when nothing is listed,
# so fall back to an empty list rather than indexing it directly.
listing = s3_client.list_objects_v2(Bucket=s3_bucket)
bucket_objects = listing.get("Contents", [])

# Do not assume the first object is the data file: the bucket may also hold
# non-CSV objects (e.g. job marker files), so select by extension.
csv_keys = [obj["Key"] for obj in bucket_objects if obj["Key"].lower().endswith(".csv")]
if not csv_keys:
    raise FileNotFoundError(f"No .csv objects found in S3 bucket {s3_bucket!r}")
if len(csv_keys) > 1:
    print(f"Found {len(csv_keys)} CSV objects; using the first one listed.")

file_key = csv_keys[0]

# Download the object and parse it in memory (no temporary file needed)
response = s3_client.get_object(Bucket=s3_bucket, Key=file_key)
data = response["Body"].read()

df = pd.read_csv(io.BytesIO(data))
print(f"Loaded data from: {file_key}")

Loaded data from: META_processed_data/part-00000-ea8b0691-99f2-47f5-8810-6951e1fa4342-c000.csv


In [22]:
# Prepare the dataset: the label for each row is the NEXT row's 'Adj Close'.
# NOTE(review): this assumes the rows of `df` are already in date order — confirm.
#
# The label is inserted as the FIRST column because SageMaker's built-in
# XGBoost reads the target from column 0 of a headerless CSV. Appending it as
# the last column would make the job train on whatever column happens to be first.
#
# `df` itself is left untouched so that re-running this cell always yields the
# same result (an in-place shift + dropna would trim one more row on every run).
dataset = df.drop(columns=['Target'], errors='ignore')
dataset.insert(0, 'Target', dataset['Adj Close'].shift(-1))
dataset = dataset.dropna()  # removes the final row (no next-day price) and any incomplete rows

# Split data into train and test sets (75-25 split), preserving row order
train_size = int(len(dataset) * 0.75)
train_data = dataset.iloc[:train_size]
test_data = dataset.iloc[train_size:]

# Save split data locally: no header and no index, label in the first column
train_data.to_csv('train.csv', header=False, index=False)
test_data.to_csv('test.csv', header=False, index=False)

In [23]:
# Upload split data to the SageMaker session's default bucket
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'stock-data'

# S3 keys always use '/' as the separator, so build them as plain strings;
# os.path.join would produce backslashes when run on Windows.
train_path = f'{prefix}/train/train.csv'
test_path = f'{prefix}/test/test.csv'

# Reuse a single bucket handle for both uploads
upload_bucket = boto3.resource('s3').Bucket(bucket)
for local_file, s3_key in (('train.csv', train_path), ('test.csv', test_path)):
    upload_bucket.Object(s3_key).upload_file(local_file)

# Set up SageMaker estimator for XGBoost.
# The container image is resolved for the same region as the session that owns
# the bucket, rather than from a separately constructed boto3 session.
role = get_execution_role()
container = retrieve('xgboost', session.boto_region_name, '1.3-1')

xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path=f's3://{bucket}/{prefix}/output',
                                    sagemaker_session=session)

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-2-149023223962
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [24]:
# Training configuration for the XGBoost estimator, gathered in one mapping
# so the values can be reviewed and adjusted in a single place.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'verbosity': 1,  # used in place of the older `silent` option
    'objective': 'reg:squarederror',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [25]:
# Point the training job at the two CSV files uploaded to S3
train_uri = f's3://{bucket}/{train_path}'
test_uri = f's3://{bucket}/{test_path}'

s3_input_train = TrainingInput(train_uri, content_type='csv')
s3_input_test = TrainingInput(test_uri, content_type='csv')

In [26]:
# Launch the training job; the test split is passed as the 'validation' channel
data_channels = {
    'train': s3_input_train,
    'validation': s3_input_test,
}
xgb.fit(data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-05-03-14-38-09-835


2024-05-03 14:38:10 Starting - Starting the training job...
2024-05-03 14:38:24 Starting - Preparing the instances for training...
2024-05-03 14:38:58 Downloading - Downloading input data...
2024-05-03 14:39:33 Downloading - Downloading the training image......
2024-05-03 14:40:34 Training - Training image download completed. Training in progress.
2024-05-03 14:40:34 Uploading - Uploading generated training model[34m[2024-05-03 14:40:29.238 ip-10-0-207-200.us-east-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-05-03 14:40:29.268 ip-10-0-207-200.us-east-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-05-03:14:40:29:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-05-03:14:40:29:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-05-03:14:40:29:INFO] No GPUs detected (normal if no gpus ins

In [28]:
import sagemaker
from sagemaker.serializers import CSVSerializer

# Deploy the trained estimator to a single-instance real-time endpoint.
# Request payloads sent through the returned predictor are serialized with
# CSVSerializer.
deploy_options = {
    'initial_instance_count': 1,
    'instance_type': 'ml.t2.medium',
    'serializer': CSVSerializer(),
}
xgb_predictor = xgb.deploy(**deploy_options)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-05-03-14-42-35-929
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-05-03-14-42-35-929
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-05-03-14-42-35-929


------!

In [29]:
# Display the name of the endpoint the predictor is bound to
xgb_predictor.endpoint_name

'sagemaker-xgboost-2024-05-03-14-42-35-929'

In [None]:
# NOTE(review): this cell has no recorded execution, and `model_name` may not be
# an attribute of the predictor object — confirm it does not raise AttributeError.
xgb_predictor.model_name

In [None]:
# Clean up: shutting down any created resources to prevent extra costs.
#
# Use the predictor's own helpers instead of passing attributes to the Session:
# the predictor does not carry a `model_data` attribute, and Session.delete_model
# expects a model *name* rather than an artifact location.
#
# Order matters: delete_model() discovers the model name(s) through the
# endpoint's configuration, so it must run before the endpoint and its
# configuration are removed.
xgb_predictor.delete_model()
xgb_predictor.delete_endpoint(delete_endpoint_config=True)