In [None]:
import sagemaker
from sagemaker.sklearn import SKLearn
from sagemaker.model import Model
from sagemaker.experiments.run import Run
from sagemaker.experiments.experiment import Experiment
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
import pandas as pd
from sagemaker.model_monitor import DataCaptureConfig
import boto3

# IAM identity handed to every SageMaker job and model below via `role=`.
# NOTE(review): this ARN contains `:user/`, i.e. it names an IAM *user*, while
# it is passed wherever the SDK takes a `role` argument. SageMaker presumably
# needs an IAM *role* ARN (`:role/...`) here - confirm and replace.
role = 'arn:aws:iam::905418312993:user/mlops-exp-models-acc'
# Single SageMaker session; its client is used later to describe the training job.
sagemaker_session = sagemaker.Session()
# S3 bucket that holds the input data and receives every artifact this notebook writes.
bucket = 'test-default-region-1'
# Region is taken from the session so the container image lookup matches it.
region = sagemaker_session.boto_region_name

In [None]:
# Resolve the scikit-learn container image once; the same image is reused for
# inference further down.
training_image_uri = sagemaker.image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1',
    py_version='py3',
    instance_type='ml.m5.large',
)
inference_image_uri = training_image_uri

# Training entry point, resolved relative to the notebook's working directory.
script_path = 'train.py'

# Define the SKLearn estimator.
# NOTE(review): no `metric_definitions` are passed here, yet the next step
# reads 'training_mse' / 'training_r2' from the finished job - confirm those
# metrics are actually published by the training job.
sklearn_estimator = SKLearn(
    entry_point=script_path,
    framework_version='0.23-1',
    py_version='py3',
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
    output_path=f's3://{bucket}/output/',
)

# Fit the estimator on the processed data, exposed to the script as the
# 'train' channel.
sklearn_estimator.fit({'train': f's3://{bucket}/data/processed/'})

# Retrieve the metrics from the training job
job_description = sagemaker_session.sagemaker_client.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)
# Read the key defensively so a response without it reaches the descriptive
# error below instead of failing with a bare KeyError on the key name.
training_metrics = job_description.get('FinalMetricDataList', [])


def _final_metric_value(metrics, metric_name):
    """Return the ``Value`` of the entry in *metrics* named *metric_name*.

    Args:
        metrics: Iterable of dicts carrying ``MetricName`` and ``Value`` keys.
        metric_name: Name of the metric to look up.

    Raises:
        KeyError: If no entry has that name; the message lists the names
            that were reported, to make a misconfigured job easy to spot.
    """
    for metric in metrics:
        if metric['MetricName'] == metric_name:
            return metric['Value']
    reported = sorted(metric['MetricName'] for metric in metrics)
    raise KeyError(
        f"Training job did not report a final metric named {metric_name!r}; "
        f"reported metrics: {reported}"
    )


# Extract specific metrics
training_mse = _final_metric_value(training_metrics, 'training_mse')
training_r2 = _final_metric_value(training_metrics, 'training_r2')

# Logging Metrics
# The experiment is created on the first run and loaded on later runs, so
# re-executing this cell does not fail once the experiment exists. The
# existing `sagemaker_session` is reused rather than building a second
# Session, and it is passed as `sagemaker_session=` (the keyword this
# Experiment class accepts; `sagemaker_boto_client=` is not one of them).
experiment_name = 'linear-regression-experiment'
try:
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description='Training a simple linear regression model',
        sagemaker_session=sagemaker_session,
    )
except sagemaker_session.sagemaker_client.exceptions.ClientError as error:
    error_details = error.response.get('Error', {})
    # NOTE(review): a name clash is expected to surface as a
    # ValidationException whose message contains "already exists" - confirm
    # against the service; any other error is re-raised unchanged.
    already_exists = (
        error_details.get('Code') == 'ValidationException'
        and 'already exists' in error_details.get('Message', '')
    )
    if not already_exists:
        raise
    experiment = Experiment.load(
        experiment_name=experiment_name,
        sagemaker_session=sagemaker_session,
    )

# Record the final training metrics on a new run of that experiment.
with Run(
    experiment_name=experiment.experiment_name,
    sagemaker_session=sagemaker_session,
) as run:
    run.log_metric('training_mse', training_mse)
    run.log_metric('training_r2', training_r2)

# Model Evaluation
# The evaluation script runs as a processing job in the same container image
# that was used for training.
evaluation_script_path = 'evaluate.py'
script_processor = ScriptProcessor(
    role=role,
    image_uri=training_image_uri,
    command=['python3'],
    instance_count=1,
    instance_type='ml.m5.large',
)

# Inputs: the held-out test set and the trained model artifact.
# Output: whatever the script writes under /opt/ml/processing/output is
# uploaded to the bucket's evaluation/ prefix.
script_processor.run(
    code=evaluation_script_path,
    inputs=[
        ProcessingInput(
            source=f's3://{bucket}/data/processed/test.csv',
            destination='/opt/ml/processing/input',
        ),
        ProcessingInput(
            source=sklearn_estimator.model_data,
            destination='/opt/ml/processing/model',
        ),
    ],
    outputs=[
        ProcessingOutput(
            source='/opt/ml/processing/output',
            destination=f's3://{bucket}/evaluation',
        ),
    ],
)

# Metrics of the freshly trained model, read from the first row of the
# evaluation job's CSV output.
evaluation_result = pd.read_csv(f's3://{bucket}/evaluation/evaluation.csv')
new_mse, new_r2 = (
    evaluation_result[column].values[0] for column in ('mse', 'r2')
)

# Metrics of the previously accepted model live under metrics/ in the bucket
# and are staged through a local temp file.
s3_client = boto3.client('s3')
metrics_file = 'previous_model_metrics.csv'
metrics_path = f'/tmp/{metrics_file}'

# NOTE(review): any failure here (missing object, permissions, malformed CSV)
# is reported and then treated as "no previous model", which lets the new
# model win the comparison unconditionally - confirm that is intended.
try:
    s3_client.download_file(bucket, f'metrics/{metrics_file}', metrics_path)
    previous_metrics = pd.read_csv(metrics_path)
    previous_mse, previous_r2 = (
        previous_metrics[column].values[0] for column in ('mse', 'r2')
    )
except Exception as e:
    print(f"Error retrieving previous metrics: {e}")
    # Worst possible baseline on both metrics, so the new model is used.
    previous_mse = float('inf')
    previous_r2 = -float('inf')

# Compare metrics and decide whether to deploy the new model.
# The new model is accepted only when it is strictly better on BOTH metrics
# (lower MSE and higher R2). Deployment is gated on this decision, so a worse
# model is never deployed.
new_model_is_better = new_mse < previous_mse and new_r2 > previous_r2

if new_model_is_better:
    print("New model is better!")

    # Serving the Best Model
    # `predictor_cls` is required for `deploy()` to return a predictor;
    # without it `deploy()` returns None and `predictor.predict` would fail.
    # The scikit-learn predictor matches the scikit-learn inference image.
    best_model = Model(
        model_data=sklearn_estimator.model_data,
        role=role,
        image_uri=inference_image_uri,
        predictor_cls=sagemaker.sklearn.SKLearnPredictor,
    )

    # Enable data capture for monitoring: every request/response pair is
    # sampled and written under data-capture/ in the bucket.
    data_capture_config = DataCaptureConfig(
        enable_capture=True,
        sampling_percentage=100,
        destination_s3_uri=f's3://{bucket}/data-capture'
    )

    predictor = best_model.deploy(
        initial_instance_count=1,
        instance_type='ml.t2.medium',
        data_capture_config=data_capture_config,
    )

    print("Model deployed and ready for predictions.")

    # Save new metrics to S3 only after the deployment succeeded, so the
    # stored baseline always describes a model that was actually deployed.
    new_metrics = pd.DataFrame({'mse': [new_mse], 'r2': [new_r2]})
    new_metrics.to_csv(metrics_path, index=False)
    s3_client.upload_file(metrics_path, bucket, f'metrics/{metrics_file}')

    # Save predictions to S3 or other persistent storage
    predictions = predictor.predict('some test data')  # Define your test data
    output_predictions = pd.DataFrame(predictions, columns=['predictions'])
    output_predictions.to_csv(f's3://{bucket}/predictions/output.csv', index=False)

    print(predictions)
else:
    # Nothing is deployed and the stored baseline metrics stay untouched;
    # `best_model`, `predictor` and `predictions` are not defined in this case.
    print("Previous model is better")
