In [1]:
import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'boto3'

In [None]:
# Read the source CSV out of S3 through a temporary presigned URL.
bucket_name = "ukhouseholding"
file_key = "subset_80.csv"

s3 = boto3.client("s3")
url = s3.generate_presigned_url(
    ClientMethod="get_object",
    ExpiresIn=3600,  # seconds, i.e. the link stays valid for one hour
    Params={"Bucket": bucket_name, "Key": file_key},
)

data = pd.read_csv(url)
# Lower-case every column header so later cells can use lower-case names.
data.columns = [column_name.lower() for column_name in data.columns]
data

In [None]:
# Discard the listed columns, then remove every row that still contains
# a missing value.
data = data.drop(
    columns=[
        'transaction unique identifier',
        'property type',
        'old/new',
        'duration',
        'town/city',
        'district',
        'county',
        'ppdcategory type',
        'record status - monthly file only',
    ]
).dropna()
data

NameError: name 'data' is not defined

In [None]:
# Convert the transfer-date column to pandas datetimes.
data['date of transfer'] = data['date of transfer'].pipe(pd.to_datetime)

NameError: name 'pd' is not defined

In [None]:
# The regression target must be numeric.
data["price"] = data["price"].astype(float)

# Split the data into features (X) and target (y)
X = data.drop(columns=["price"])  # Features
y = data["price"]                 # Target

# SageMaker's built-in XGBoost reads the label from the FIRST CSV column,
# so the target is placed ahead of the features.
train_data = pd.concat([y, X], axis=1)

# NOTE(review): built-in XGBoost only accepts numeric CSV values. Any text or
# datetime feature columns still present in X (e.g. 'date of transfer') would
# need to be encoded or dropped before training - confirm against the data.

# Save the training data to a local CSV file.
# header=False: the built-in algorithm requires CSV input WITHOUT a header row;
# with a header the column names would be parsed as a data record.
train_data.to_csv("train_data.csv", index=False, header=False)

# Upload the file to s3://<bucket_name>/train/train_data.csv
s3_client = boto3.client("s3")
s3_prefix = "train/train_data.csv"
s3_client.upload_file("train_data.csv", bucket_name, s3_prefix)

We create a new SageMaker session using the `Session` class. We then call `get_execution_role()` to retrieve the Identity and Access Management (IAM) role that SageMaker assumes to perform tasks on our behalf.

In [None]:
# One SageMaker session is shared by every SDK call in this notebook.
session = sagemaker.Session()
# Pass the existing session so get_execution_role() does not build a second
# default Session just to look up the IAM role.
role = get_execution_role(sagemaker_session=session)

We then define the name of the bucket in which our training dataset is stored.

In [None]:
# S3 bucket that holds the uploaded training CSV; the estimator's output path
# below is built from it as well.
bucket_name = "ukhouseholding"  # Replace with your bucket name

We then retrieve the Amazon SageMaker container image URI for the XGBoost algorithm, which ensures that the training job builds an XGBoost model. The advantages of this model are its high performance, parallel processing, and built-in cross-validation.

In [None]:
# Resolve the XGBoost 1.5-1 container image URI for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=session.boto_region_name,
    version="1.5-1",
)

We configure the machine learning training job on Amazon Sagemaker. 

Using the `image_uri` variable, we configure the docker image URI as that of the xgboost. 

Using the `role` variable, we give the training job the IAM role it can assume to perform tasks on our behalf. 

Using the `instance_count` variable, we request 1 ML compute instance to use for training. 

Using the `instance_type` variable, we define the type of ML compute instance to use for training.

Using the `output_path` variable, we define the output path as `s3://ukhouseholding/ukhouseholding-xgboost`. 

Using the `sagemaker_session` variable, we define the session that manages interactions with the Sagemaker service. 

Lastly we define the name for the training job using the `base_job_name` variable.

In [None]:
# Estimator describing the XGBoost training job: which image to run, under
# which role, on what hardware, and where in S3 the results are written.
xgboost = Estimator(
    base_job_name="ukhouseholding-xgboost-job",
    image_uri=container,
    role=role,
    sagemaker_session=session,
    # A single GPU-backed training instance.
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    # Training output lands under this S3 prefix.
    output_path=f"s3://{bucket_name}/ukhouseholding-xgboost",
)

We set the parameters for the XGBoost model. 

Using the `objective` variable, we specify the learning task and the corresponding objective. 

Using the `num_round` variable, we specify the number of boosting rounds.

Using the `max_depth` variable, we specify the maximum depth of a tree, controlling the complexity of the model.

Using the `eta` variable, we specify the learning rate, which scales the contribution of each tree.

Using the `gamma` variable, we specify the minimum loss reduction required to make a further split on a leaf node of a tree.

Using the `subsample` variable, we specify the fraction of the training data to be randomly sampled for each tree.

Lastly we specify the fraction of features to be randomly sampled for each tree using the `colsample_bytree` variable.

In [None]:
# Hyperparameters for the built-in XGBoost training job.
xgboost.set_hyperparameters(
    objective="reg:squarederror",  # Regression with squared-error loss (price is continuous)
    num_round=100,            # Number of boosting rounds
    max_depth=5,              # Maximum depth of each tree
    eta=0.2,                  # Learning rate
    gamma=4,                  # Minimum loss reduction required to split a leaf
    subsample=0.8,            # Fraction of training rows sampled per tree
    colsample_bytree=0.8,     # Fraction of columns sampled per tree
    # The estimator requests a GPU instance (ml.p3.2xlarge); XGBoost only uses
    # the GPU when the tree method is gpu_hist. Remove this line if the
    # instance type is switched to a CPU instance.
    tree_method="gpu_hist",
)

In [None]:
# Training channel: the CSV uploaded earlier to s3://<bucket_name>/<s3_prefix>.
train_input = TrainingInput(
    s3_data=f"s3://{bucket_name}/{s3_prefix}",  # S3 path to training data
    content_type="text/csv",    # Data format
)

# Launch exactly one training job. Calling fit() twice would start (and bill)
# a second, identical job.
try:
    xgboost.fit({"train": train_input})
except Exception as e:
    print(f"Error: {e}")
    # Re-raise so the notebook stops here: the following cells read
    # xgboost.latest_training_job and need a job that actually ran.
    raise

In [None]:
# Name of the most recent training job started through the estimator.
job_name = xgboost.latest_training_job.name

# Fetch that job's description from the SageMaker API and print it.
sm_client = boto3.client("sagemaker")
response = sm_client.describe_training_job(
    TrainingJobName=job_name,
)
print(response)

In [None]:
# Take the artifact location from the estimator rather than hard-coding the
# timestamped name of one particular past job: every training run gets a new
# job name, so a fixed URI goes stale as soon as the notebook is re-run.
model_artifact_s3_uri = xgboost.model_data

# Split "s3://<bucket>/<key>" into its bucket and object key.
bucket_name, key = model_artifact_s3_uri.replace("s3://", "", 1).split("/", 1)

# Initialize S3 client
s3 = boto3.client("s3")

# Download the model artifact locally
local_file_path = "model.tar.gz"  # Specify the desired local file name
s3.download_file(bucket_name, key, local_file_path)

print(f"Model saved locally as {local_file_path}")