In [None]:
from sagemaker import get_execution_role
import boto3

# S3 client used by later cells to list the bucket and upload the combined CSV.
conn = boto3.client(service_name='s3')
# Execution role of this SageMaker environment; handed to the estimator later on.
role = get_execution_role()

In [None]:
# Bucket that holds the raw CSVs and receives every artifact written by this notebook.
bucket = "assignment-bucket-of-woes"
# Empty key prefix: `url` therefore ends in "/" and is used as a base for S3 URIs below.
file = ""
url = "s3://" + bucket + "/" + file

In [None]:
# Collect the metadata dict of every object in the bucket.
# A single list_objects call returns at most 1000 keys and omits "Contents" entirely
# for an empty bucket, so paginate and default each page to an empty list.
contents = [
    obj
    for page in conn.get_paginator("list_objects_v2").paginate(Bucket=bucket)
    for obj in page.get("Contents", [])
]

In [None]:
# Keys that this notebook itself uploads to the bucket in later cells. They must not be
# read back as raw input when the notebook is re-run, otherwise the combined frame would
# contain its own previous output. Keep the prefix in sync with `sub_folder_name`.
generated_keys = {
    "combined_data.csv",
    "milkyway/train/train.csv",
    "milkyway/validation/validation.csv",
    "milkyway/test/test.csv",
}

# Full s3:// URI of every raw CSV object. `endswith(".csv")` requires a real extension,
# so an extension-less key that is literally named "csv" is no longer picked up.
csv_files = [
    url + obj["Key"]
    for obj in contents
    if obj["Key"].endswith(".csv") and obj["Key"] not in generated_keys
]
csv_files

In [None]:
import pandas as pd
# Load each listed CSV and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in csv_files],
    ignore_index=True,
)

In [None]:
# Persist the combined frame locally. The index is only the positional counter created by
# ignore_index=True, so it is left out instead of being written as an unnamed extra column
# (consistent with the train/validation/test exports further down).
df.to_csv("combined_data.csv", index=False)

In [None]:
# Push the local combined CSV to the bucket root under the same name.
conn.upload_file(
    Filename='combined_data.csv',
    Bucket=bucket,
    Key='combined_data.csv',
)

In [None]:
# One default boto3 session supplies both the region and the SageMaker client.
boto_session = boto3.Session()
region_name = boto_session.region_name
sagemaker_client = boto_session.client("sagemaker")
# Key prefix under which the train/validation/test splits and model output are stored.
sub_folder_name = 'milkyway'

In [None]:
# One-hot encode the categorical columns of the combined frame.
# pandas >= 2.0 produces boolean dummy columns by default, which would be written to the
# training CSVs as the strings True/False; force integer 0/1 columns so the exported
# files stay purely numeric on every pandas version.
model_data = pd.get_dummies(df, dtype=int)

In [None]:
# Preview the first five encoded rows.
model_data.head(n=5)

In [None]:
import numpy as np
# Shuffle ALL rows before splitting: `sample()` without arguments returns a single random
# row, which left the splits almost empty. `frac=1` keeps every row, and a fixed seed makes
# the split reproducible across Restart & Run All.
SPLIT_SEED = 1729
shuffled_data = model_data.sample(frac=1, random_state=SPLIT_SEED)

# 60% train / 20% validation / 20% test, cut by position on the shuffled frame.
train_end = int(0.6 * len(shuffled_data))
validation_end = int(0.8 * len(shuffled_data))
train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

In [None]:
# Re-order each split so the label column 'Count' comes first, followed by all features.
# The frames are written without a header below, and SageMaker's built-in XGBoost reads the
# first CSV column as the label. `axis=1` is required: without it pd.concat stacks the
# label Series underneath the feature rows instead of placing it beside them as a column.
train_df = pd.concat([train_data['Count'], train_data.drop('Count', axis=1)], axis=1)
validation_df = pd.concat([validation_data['Count'], validation_data.drop('Count', axis=1)], axis=1)
test_df = pd.concat([test_data['Count'], test_data.drop('Count', axis=1)], axis=1)

In [None]:
# Write each split to a local CSV without index or header row.
for split_frame, csv_name in (
    (train_df, "train.csv"),
    (validation_df, "validation.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:
import os
# Upload every split to s3://<bucket>/<sub_folder_name>/<split>/<split>.csv.
# S3 keys always use "/" as separator, so the key is built with an f-string rather than
# os.path.join (which would insert the local OS separator). The bucket resource is
# created once and reused for all three uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for split_name in ("train", "validation", "test"):
    local_csv = f"{split_name}.csv"
    s3_bucket.Object(f"{sub_folder_name}/{split_name}/{local_csv}").upload_file(local_csv)

In [None]:
from sagemaker.inputs import TrainingInput

In [None]:
# Training channel: points at the S3 prefix that holds train.csv.
train_s3_uri = f"{url}{sub_folder_name}/train"
training_input = TrainingInput(s3_data=train_s3_uri, content_type="csv")

In [None]:
# Validation channel: points at the S3 prefix that holds validation.csv.
validation_s3_uri = f"{url}{sub_folder_name}/validation"
validation_input = TrainingInput(s3_data=validation_s3_uri, content_type="csv")

In [None]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner
)

In [None]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve

# SageMaker SDK session with default settings.
sagemaker_session = sagemaker.Session()

# Look up the XGBoost container image URI for the current region.
container = retrieve(framework="xgboost", region=region_name, version="latest")

In [None]:
# Estimator that runs the XGBoost container on a single ml.m4.xlarge instance and writes
# model artifacts under <sub_folder_name>/output in the bucket.
# The session created earlier is named `sagemaker_session`; the previous reference to
# `sess` was never defined and raised NameError.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    base_job_name="xgboost-random-search",
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"{url}{sub_folder_name}/output",
    sagemaker_session=sagemaker_session,
)

In [None]:
# Hyperparameters held fixed for every training job launched by the tuners.
# NOTE(review): rate_drop and tweedie_variance_power look specific to the dart booster and
# the tweedie objective respectively — confirm they are intended with binary:logistic.
static_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": 10,
    "rate_drop": 0.3,
    "tweedie_variance_power": 1.4,
}
xgb.set_hyperparameters(**static_hyperparameters)

# Metric the tuners optimise.
objective_metric_name = "validation:auc"

In [None]:
# Search space for the first tuner: both regularisation terms are sampled from
# [0.01, 10] on a logarithmic scale.
hyperparameter_ranges = {
    param_name: ContinuousParameter(0.01, 10, scaling_type="Logarithmic")
    for param_name in ("alpha", "lambda")
}

In [None]:
# Random-search tuner over the log-scaled ranges: 5 jobs in total, all 5 run in parallel.
tuner_log = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Random",
    max_jobs=5,
    max_parallel_jobs=5,
)

In [None]:
from time import strftime, gmtime
# Timestamped (UTC) job name so that repeated runs do not collide.
log_search_job_name = "xgb-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
log_search_channels = {"train": training_input, "validation": validation_input}

# Launch the log-scaled random search.
tuner_log.fit(
    log_search_channels,
    include_cls_metadata=False,
    job_name=log_search_job_name,
)

In [None]:
# Ask SageMaker for the current state of the log-scaled tuning job and display its status.
log_job_description = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name
)
log_job_description["HyperParameterTuningJobStatus"]

In [None]:
# Search space for the second tuner: same [0.01, 10] bounds, sampled on a linear scale.
hyperparameter_ranges_linear = {
    param_name: ContinuousParameter(0.01, 10, scaling_type="Linear")
    for param_name in ("alpha", "lambda")
}

In [None]:
# Random-search tuner over the linearly scaled ranges: 5 jobs in total, all 5 in parallel.
tuner_linear = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges_linear,
    strategy="Random",
    max_jobs=5,
    max_parallel_jobs=5,
)

In [None]:
# Launch the linearly scaled random search on the same channels as the first tuner.
# The channel objects are `training_input` / `validation_input`; the previously used names
# `s3_input_train` / `s3_input_validation` are not defined in this notebook.
tuner_linear.fit(
    {"train": training_input, "validation": validation_input},
    include_cls_metadata=False,
    job_name="xgb-linsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime()),
)

In [None]:
# Ask SageMaker for the current state of the linear tuning job and display its status.
linear_job_description = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_linear.latest_tuning_job.job_name
)
linear_job_description["HyperParameterTuningJobStatus"]

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Names of the two tuning jobs launched above.
log_job_name = tuner_log.latest_tuning_job.job_name
linear_job_name = tuner_linear.latest_tuning_job.job_name

# Both jobs must have finished before their results can be compared.
tuning_client = boto3.client("sagemaker")
status_log = tuning_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=log_job_name
)["HyperParameterTuningJobStatus"]
status_linear = tuning_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=linear_job_name
)["HyperParameterTuningJobStatus"]
assert status_log == "Completed", "First must be completed, was {}".format(status_log)
assert status_linear == "Completed", "Second must be completed, was {}".format(status_linear)

# Per-training-job results of each tuner, tagged with the scaling type that produced them.
df_log = sagemaker.HyperparameterTuningJobAnalytics(log_job_name).dataframe()
df_linear = sagemaker.HyperparameterTuningJobAnalytics(linear_job_name).dataframe()
df_log["scaling"] = "log"
df_linear["scaling"] = "linear"

# Note: this rebinds `df`, which held the combined raw data in the earlier cells.
df = pd.concat([df_log, df_linear], ignore_index=True)

In [None]:
# Deploy the linear tuner's model to a single-instance real-time endpoint.
predictor = tuner_linear.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

In [None]:
# Tear the endpoint down again so it stops incurring cost.
# Uses `sagemaker_session`, the session created earlier; `sess` was never defined.
sagemaker_session.delete_endpoint(endpoint_name=predictor.endpoint_name)