In [1]:
import pandas as pd
import numpy as np
import boto3
import sagemaker

# Download the dataset
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
df = pd.read_csv(data_url, header=None)

# Assign column names
columns = [f'feature_{i}' for i in range(1, 55)] + ['target']
df.columns = columns

# Adjust target to be 0-indexed
df['target'] = df['target'] - 1

# Ensure target is the first column and reorder columns
df = df[['target'] + [col for col in df.columns if col != 'target']]

# Verify data after adjusting target values and reordering columns
print("Data after adjusting target values and reordering columns:")
print(df.head())

# Save the entire dataset with the target column as the first column for verification
df.to_csv("data_with_target_first.csv", index=False, header=False)

# Load and verify the saved data
saved_data = pd.read_csv("data_with_target_first.csv", header=None)
print("Saved data (first few rows):")
print(saved_data.head())


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Data after adjusting target values and reordering columns:
   target  feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0       4       2596         51          3        258          0        510   
1       4       2590         56          2        212         -6        390   
2       1       2804        139          9        268         65       3180   
3       1       2785        155         18        242        118       3090   
4       4       2595         45          2        153         -1        391   

   feature_7  feature_8  feature_9  ...  feature_45  feature_46  feature_47  \
0        221        232        148  ...           0           0           0   
1        220        235        151  ...           0           0           0   
2        234     

In [2]:
# Split the dataset into training, validation, and batch inference sets
np.random.seed(42)
rand_split = np.random.rand(len(df))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
batch_list = rand_split >= 0.9

data_train = df[train_list]
data_val = df[val_list]
data_batch = df[batch_list].drop(["target"], axis=1)

# Verify the training, validation, and batch data
print("Training data (first few rows):")
print(data_train.head())

print("Validation data (first few rows):")
print(data_val.head())

print("Batch data (first few rows):")
print(data_batch.head())

# Save the datasets locally with the target column as the first column
data_train.to_csv("train.csv", index=False, header=False)
data_val.to_csv("validation.csv", index=False, header=False)
data_batch.to_csv("batch.csv", index=False, header=False)

# Load and verify saved training data
saved_train_data = pd.read_csv("train.csv", header=None)
print("Saved training data (first few rows):")
print(saved_train_data.head())

# Load and verify saved validation data
saved_val_data = pd.read_csv("validation.csv", header=None)
print("Saved validation data (first few rows):")
print(saved_val_data.head())

# Load and verify saved batch data
saved_batch_data = pd.read_csv("batch.csv", header=None)
print("Saved batch data (first few rows):")
print(saved_batch_data.head())


Training data (first few rows):
   target  feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0       4       2596         51          3        258          0        510   
2       1       2804        139          9        268         65       3180   
3       1       2785        155         18        242        118       3090   
4       4       2595         45          2        153         -1        391   
5       1       2579        132          6        300        -15         67   

   feature_7  feature_8  feature_9  ...  feature_45  feature_46  feature_47  \
0        221        232        148  ...           0           0           0   
2        234        238        135  ...           0           0           0   
3        238        238        122  ...           0           0           0   
4        220        234        150  ...           0           0           0   
5        230        237        140  ...           0           0           0   

   feature_48  fea

In [4]:
import os
import boto3
import sagemaker

role = sagemaker.get_execution_role()
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = "sagemaker/covertype"

s3_resource = boto3.Session().resource("s3")

# Upload training data
train_file = "train.csv"
with open(train_file, "rb") as data:
    s3_resource.Bucket(bucket).upload_fileobj(data, os.path.join(prefix, "train", train_file))

# Upload validation data
validation_file = "validation.csv"
with open(validation_file, "rb") as data:
    s3_resource.Bucket(bucket).upload_fileobj(data, os.path.join(prefix, "validation", validation_file))

# Upload batch data
batch_file = "batch.csv"
with open(batch_file, "rb") as data:
    s3_resource.Bucket(bucket).upload_fileobj(data, os.path.join(prefix, "batch", batch_file))


In [8]:
from time import gmtime, strftime
from sagemaker.image_uris import retrieve

job_name = "xgb-covertype-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

image = retrieve(framework="xgboost", region=boto3.Session().region_name, version="latest")

output_location = f"s3://{bucket}/{prefix}/output/{job_name}"
print("Training artifacts will be uploaded to:", output_location)

create_training_params = {
    "AlgorithmSpecification": {"TrainingImage": image, "TrainingInputMode": "File"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": output_location},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.4xlarge", "VolumeSizeInGB": 50},
    "TrainingJobName": job_name,
    "HyperParameters": {
        "objective": "multi:softmax",
        "num_class": "7",
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.8",
        "silent": "0",
        "num_round": "100",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"s3://{bucket}/{prefix}/train",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
            "ContentType": "text/csv",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"s3://{bucket}/{prefix}/validation",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
            "ContentType": "text/csv",
        },
    ],
}

sagemaker_client = boto3.client("sagemaker")
sagemaker_client.create_training_job(**create_training_params)
status = sagemaker_client.describe_training_job(TrainingJobName=job_name)["TrainingJobStatus"]
print(status)

try:
    sagemaker_client.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=job_name)
finally:
    status = sagemaker_client.describe_training_job(TrainingJobName=job_name)["TrainingJobStatus"]
    print("Training job ended with status:", status)
    if status == "Failed":
        message = sagemaker_client.describe_training_job(TrainingJobName=job_name)["FailureReason"]
        print("Training failed with the following error:", message)
        raise Exception("Training job failed")


Training job xgb-covertype-2024-05-31-12-44-18
Training artifacts will be uploaded to: s3://sagemaker-us-west-2-339713066436/sagemaker/covertype/output/xgb-covertype-2024-05-31-12-44-18
InProgress
Training job ended with status: Completed


In [9]:
model_name = job_name
print(model_name)

info = sagemaker_client.describe_training_job(TrainingJobName=job_name)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]

primary_container = {"Image": image, "ModelDataUrl": model_data}

create_model_response = sagemaker_client.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])


xgb-covertype-2024-05-31-12-44-18
arn:aws:sagemaker:us-west-2:339713066436:model/xgb-covertype-2024-05-31-12-44-18


In [10]:
batch_job_name = "Batch-Transform-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
input_location = f"s3://{bucket}/{prefix}/batch/{batch_file}"
output_location = f"s3://{bucket}/{prefix}/output/{batch_job_name}"

request = {
    "TransformJobName": batch_job_name,
    "ModelName": model_name,
    "TransformOutput": {
        "S3OutputPath": output_location,
        "Accept": "text/csv",
        "AssembleWith": "Line",
    },
    "TransformInput": {
        "DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": input_location}},
        "ContentType": "text/csv",
        "SplitType": "Line",
        "CompressionType": "None",
    },
    "TransformResources": {"InstanceType": "ml.m4.xlarge", "InstanceCount": 1},
}

sagemaker_client.create_transform_job(**request)
print("Created Transform job with name:", batch_job_name)

# Wait until the job finishes
try:
    sagemaker_client.get_waiter("transform_job_completed_or_stopped").wait(TransformJobName=batch_job_name)
finally:
    response = sagemaker_client.describe_transform_job(TransformJobName=batch_job_name)
    status = response["TransformJobStatus"]
    print("Transform job ended with status:", status)
    if status == "Failed":
        message = response["FailureReason"]
        print("Transform failed with the following error:", message)
        raise Exception("Transform job failed")


Created Transform job with name: Batch-Transform-2024-05-31-12-52-11
Transform job ended with status: Completed


In [11]:
import re

def get_csv_output_from_s3(s3uri, batch_file):
    file_name = f"{batch_file}.out"
    match = re.match("s3://([^/]+)/(.*)", f"{s3uri}/{file_name}")
    output_bucket, output_prefix = match.group(1), match.group(2)
    s3 = boto3.client('s3')
    s3.download_file(output_bucket, output_prefix, file_name)
    return pd.read_csv(file_name, sep=",", header=None)

output_df = get_csv_output_from_s3(output_location, batch_file)
output_df.head()


Unnamed: 0,0
0,4.0
1,1.0
2,4.0
3,4.0
4,1.0
5,4.0
6,1.0
7,0.0


In [12]:
# Load the validation data
validation_data = pd.read_csv("validation.csv", header=None)
# Extract the actual labels
actual_labels = validation_data.iloc[:, 0].values


In [14]:
output_df

Unnamed: 0,0
0,4.0
1,1.0
2,4.0
3,4.0
4,1.0
...,...
58019,2.0
58020,2.0
58021,2.0
58022,2.0


In [17]:
validation_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,4,2605,49,4,234,7,573,222,230,144,...,0,0,0,0,0,0,0,0,0,0
1,1,2742,134,22,150,69,3215,248,224,92,...,0,0,0,0,0,0,0,0,0,0
2,1,2900,45,19,242,20,5199,221,195,100,...,0,0,0,0,0,0,0,0,0,0
3,4,2510,79,14,192,19,891,237,215,106,...,0,0,0,0,0,0,0,0,0,0
4,1,2705,90,8,134,22,2023,232,228,129,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58150,2,2395,167,23,85,20,85,232,241,127,...,0,0,0,0,0,0,0,0,0,0
58151,2,2455,181,34,277,58,240,210,238,133,...,0,0,0,0,0,0,0,0,0,0
58152,2,2440,173,26,216,44,234,226,242,132,...,0,0,0,0,0,0,0,0,0,0
58153,2,2405,159,22,90,19,120,237,238,119,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Flatten the predictions
predictions = output_df[0].values


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(actual_labels[:len(predictions)], predictions)

# Calculate precision, recall, and F1-score for each class
precision = precision_score(actual_labels[:len(predictions)], predictions, average='weighted')
recall = recall_score(actual_labels[:len(predictions)], predictions, average='weighted')
f1 = f1_score(actual_labels[:len(predictions)], predictions, average='weighted')

# Generate the confusion matrix
conf_matrix = confusion_matrix(actual_labels[:len(predictions)], predictions)

# Print the results
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model Precision: {precision * 100:.2f}%")
print(f"Model Recall: {recall * 100:.2f}%")
print(f"Model F1-Score: {f1 * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)


Model Accuracy: 48.49%
Model Precision: 47.80%
Model Recall: 48.49%
Model F1-Score: 48.11%
Confusion Matrix:
[[ 9439  9483   708    27   148   322   840]
 [ 8938 17079  1316    27   180   482   599]
 [  613  1252  1042    58    19   325    90]
 [   16    52    86    81     4    38     8]
 [  281   445   129    19    33    50    28]
 [  353   678   404    27    15   191    63]
 [  934   655   115    12     9    38   273]]
