## This notebook demonstrates how to use Amazon Personalize to train a recommendation model

In [None]:
import boto3
import json
import numpy as np
import pandas as pd
import time

### Initiate the Personalize APIs

In [None]:
# Client used for the resource-management calls in this notebook
# (dataset groups, schemas, datasets, solutions, campaigns).
personalize_client = boto3.client(service_name='personalize')
# Client used to request recommendations from a deployed campaign.
personalize_runtime = boto3.client(service_name='personalize-runtime')

In [None]:
# Name of the S3 bucket that will hold the training data.
# Replace with the name of your S3 bucket.
bucket = 'transcribe-bucket-1635732683230702135'
# Local file name of the interactions CSV; also used as its S3 object key.
filename = 'movie-lens-100k.csv'

### Download movie lens dataset

In [None]:
!wget -N http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip
data = pd.read_csv('./ml-100k/u.data', sep='\t', names=['USER_ID', 'ITEM_ID', 'RATING', 'TIMESTAMP'])

### Select the columns from the MovieLens dataset and upload the file to S3

In [None]:
# Keep only the columns that match the schema defined further down (RATING is dropped).
data = data.loc[:, ['USER_ID', 'ITEM_ID', 'TIMESTAMP']]
data.to_csv(filename, index=False)

# Upload the CSV to the bucket, using the local file name as the object key.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object(filename).upload_file(filename)

### Create an empty dataset group to hold the dataset. We will wait for it to complete before we continue. 

In [None]:
# Create the dataset group that will hold the interactions dataset.
create_dataset_group_response = personalize_client.create_dataset_group(
    name = "personalize-lab"
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

# `status` is reused by several cells; reset it so a stale value from another
# cell can never be mistaken for this resource's state.
status = None

# Poll once a minute, for at most 3 hours, until a terminal state is reached.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize_client.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))

    if status == "ACTIVE" or status == "CREATE FAILED":
        break

    time.sleep(60)

# Fail loudly here (creation failed or the wait timed out) instead of letting
# the following cells run against a dataset group that is not usable.
if status != "ACTIVE":
    failure_reason = describe_dataset_group_response["datasetGroup"].get("failureReason")
    raise RuntimeError(
        "DatasetGroup did not become ACTIVE (status: {}, reason: {})".format(status, failure_reason)
    )

### Personalize requires a schema definition to match the structure of the training data so it knows what to expect from the dataset. 

In [None]:
# One (name, type) pair per column of the interactions CSV, in column order.
interaction_fields = [
    ("USER_ID", "string"),
    ("ITEM_ID", "string"),
    ("TIMESTAMP", "long"),
]

# Schema document describing the structure of the training data.
schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in interaction_fields
    ],
    "version": "1.0",
}

# The schema is sent to the API as a JSON string.
schema_json = json.dumps(schema)
create_schema_response = personalize_client.create_schema(
    name="personalize-demo-schema",
    schema=schema_json,
)

schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

### We only have an item-user interaction dataset for the model training

In [None]:
# Create the interactions dataset inside the dataset group, tied to the schema.
dataset_type = "INTERACTIONS"
create_dataset_request = dict(
    name="item-user-interactions",
    datasetType=dataset_type,
    datasetGroupArn=dataset_group_arn,
    schemaArn=schema_arn,
)
create_dataset_response = personalize_client.create_dataset(**create_dataset_request)

dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

### Set up permissions to allow Personalize to access the S3 bucket where the training data is stored

In [None]:
s3 = boto3.client("s3")

# The policy covers the bucket itself and every object inside it.
bucket_arn = "arn:aws:s3:::{}".format(bucket)

# Single statement letting the Personalize service principal list the bucket and read objects.
personalize_access_statement = {
    "Sid": "PersonalizeS3BucketAccessPolicy",
    "Effect": "Allow",
    "Principal": {"Service": "personalize.amazonaws.com"},
    "Action": ["s3:GetObject", "s3:ListBucket"],
    "Resource": [bucket_arn, bucket_arn + "/*"],
}

policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [personalize_access_statement],
}

# The policy document is sent as a JSON string.
s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy))

### We also need to create an IAM role in your account for the Personalize service to assume. This role gives Personalize access to the S3 buckets in your account.

In [None]:
iam = boto3.client("iam")

role_name = "PersonalizeRoleDemo"
# Trust policy: allows the Personalize service principal to assume this role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

try:
    create_role_response = iam.create_role(
        RoleName = role_name,
        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The role is left over from an earlier run of this notebook. Reuse it:
    # get_role returns the same ["Role"]["Arn"] structure that is read later on.
    # NOTE(review): the existing role's trust policy is not re-checked here.
    create_role_response = iam.get_role(RoleName = role_name)



### Add S3 access permission to the role we just created.

In [None]:
# AWS-managed policy attached to the role so it can reach S3.
# NOTE(review): full S3 access is broad for a role that only needs to read the
# training file; consider a narrower policy outside of a demo.
policy_arn_s3 = 'arn:aws:iam::aws:policy/AmazonS3FullAccess'
iam.attach_role_policy(RoleName=role_name, PolicyArn=policy_arn_s3)

# wait for a minute to allow IAM role policy attachment to propagate
propagation_wait_seconds = 60
time.sleep(propagation_wait_seconds)

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

### Now, we are ready to import the data into Personalize

In [None]:
# Start importing the CSV from S3 into the interactions dataset, using the
# IAM role created earlier for access to the bucket.
create_dataset_import_job_response = personalize_client.create_dataset_import_job(
    jobName = "personalize-demo-import1",
    datasetArn = dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket, filename)
    },
    roleArn = role_arn
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

# `status` is reused by several cells; reset it so a stale value from another
# cell can never be mistaken for this job's state.
status = None

# Poll once a minute, for at most 2 hours, until a terminal state is reached.
max_time = time.time() + 2*60*60 # 2 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize_client.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))

    if status == "ACTIVE" or status == "CREATE FAILED":
        break

    time.sleep(60)

# Fail loudly here (import failed or the wait timed out) instead of letting
# training start on a dataset that was never imported.
if status != "ACTIVE":
    failure_reason = describe_dataset_import_job_response["datasetImportJob"].get("failureReason")
    raise RuntimeError(
        "DatasetImportJob did not become ACTIVE (status: {}, reason: {})".format(status, failure_reason)
    )


### Train a Personalize model (AKA Create Solution)

In [None]:
# Recipe used to train the solution.
recipe_arn = "arn:aws:personalize:::recipe/aws-user-personalization"

# Define the solution for the dataset group with the chosen recipe.
create_solution_request = dict(
    name="personalize-soln",
    datasetGroupArn=dataset_group_arn,
    recipeArn=recipe_arn,
)
create_solution_response = personalize_client.create_solution(**create_solution_request)

solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

In [None]:
# Create a version of the solution and wait for it below.
create_solution_version_response = personalize_client.create_solution_version(
    solutionArn = solution_arn
)

solution_version_arn = create_solution_version_response['solutionVersionArn']
print(json.dumps(create_solution_version_response, indent=2))

# `status` is reused by several cells; reset it so a stale value from another
# cell can never be mistaken for this solution version's state.
status = None

# Poll once a minute, for at most 2 hours, until a terminal state is reached.
max_time = time.time() + 2*60*60 # 2 hours
while time.time() < max_time:
    describe_solution_version_response = personalize_client.describe_solution_version(
        solutionVersionArn = solution_version_arn
    )
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))

    if status == "ACTIVE" or status == "CREATE FAILED":
        break

    time.sleep(60)

# Fail loudly here (creation failed or the wait timed out) instead of letting
# the metrics and campaign cells run against an unusable solution version.
if status != "ACTIVE":
    failure_reason = describe_solution_version_response["solutionVersion"].get("failureReason")
    raise RuntimeError(
        "SolutionVersion did not become ACTIVE (status: {}, reason: {})".format(status, failure_reason)
    )

In [None]:
### Evaluate the model performance

In [None]:
# Retrieve the metrics for the solution version and pretty-print the raw response.
get_solution_metrics_response = personalize_client.get_solution_metrics(
    solutionVersionArn=solution_version_arn,
)
metrics_json = json.dumps(get_solution_metrics_response, indent=2)
print(metrics_json)

### Create Campaign (an API endpoint) to serve prediction.

In [None]:
# Deploy the solution version as a campaign, with an exploration weight of 0.5.
create_campaign_response = personalize_client.create_campaign(
    name = "personalize-camp",
    solutionVersionArn = solution_version_arn,
    minProvisionedTPS = 1,
    campaignConfig = {
        "itemExplorationConfig": {
            "explorationWeight": "0.5"
        }
    }
)

campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

# `status` is reused by several cells; reset it so a stale value from another
# cell can never be mistaken for this campaign's state.
status = None

# Poll once a minute, for at most 2 hours, until a terminal state is reached.
max_time = time.time() + 2*60*60 # 2 hours
while time.time() < max_time:
    describe_campaign_response = personalize_client.describe_campaign(
        campaignArn = campaign_arn
    )
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))

    if status == "ACTIVE" or status == "CREATE FAILED":
        break

    time.sleep(60)

# Fail loudly here (creation failed or the wait timed out) instead of letting
# the recommendation request hit a campaign that is not serving.
if status != "ACTIVE":
    failure_reason = describe_campaign_response["campaign"].get("failureReason")
    raise RuntimeError(
        "Campaign did not become ACTIVE (status: {}, reason: {})".format(status, failure_reason)
    )

### Invoke endpoint to serve prediction

In [None]:
# Pick one interaction row at random and use its user (the third column is ignored).
sampled_interaction = data.sample().values[0]
user_id, item_id, _ = sampled_interaction
print("USER: {}".format(user_id))

In [None]:
# Load the item catalogue: only the first two '|'-separated columns are read,
# labelled ITEM_ID and TITLE, with ITEM_ID as the index.
items = pd.read_csv(
    './ml-100k/u.item',
    sep='|',
    usecols=[0, 1],
    encoding='latin-1',
    names=['ITEM_ID', 'TITLE'],
    index_col='ITEM_ID',
)

def get_movie_title(movie_id, catalog=None):
    """Return the title of the movie with the given ITEM_ID.

    Args:
        movie_id: Item id, as an int or a numeric string.
        catalog: Optional DataFrame indexed by integer ITEM_ID with a
            'TITLE' column. Defaults to the notebook-level ``items`` frame.

    Returns:
        The 'TITLE' value stored for that item id.

    Raises:
        ValueError: if ``movie_id`` cannot be converted to an int.
        KeyError: if the id is not present in the catalogue index.
    """
    if catalog is None:
        catalog = items
    # Look the id up by index label rather than by row position: the frame is
    # indexed by ITEM_ID, so this stays correct even if the ids are not a
    # contiguous 1..N sequence in file order.
    return catalog.loc[int(movie_id), 'TITLE']

In [None]:
# Ask the campaign for recommendations for the sampled user (the id is sent as a string).
get_recommendations_response = personalize_runtime.get_recommendations(
    campaignArn=campaign_arn,
    userId=str(user_id),
)

# Update DF rendering
pd.set_option('display.max_rows', 30)

print("Recommendations for user: ", user_id)

item_list = get_recommendations_response['itemList']

# Translate each recommended item id into its movie title.
recommendation_list = [get_movie_title(entry['itemId']) for entry in item_list]

recommendations_df = pd.DataFrame(recommendation_list, columns=['OriginalRecs'])
recommendations_df

### Clean up Resources 

Note that each action below needs to wait for the preceding action to complete before it can run. If you get an error, just wait for the preceding action to complete and run the cell again.

In [None]:
# Remove the campaign first; the solution it serves is deleted in the next cell.
response = personalize_client.delete_campaign(campaignArn=campaign_arn)

In [None]:
# Remove the solution created for this demo.
response = personalize_client.delete_solution(solutionArn=solution_arn)

In [None]:
# Remove the interactions dataset.
response = personalize_client.delete_dataset(datasetArn=dataset_arn)

In [None]:
# Remove the dataset group that held the dataset.
response = personalize_client.delete_dataset_group(datasetGroupArn=dataset_group_arn)

In [None]:
# Remove the interactions schema.
response = personalize_client.delete_schema(schemaArn=schema_arn)

In [None]:
# Detach the S3 policy from the demo role; the role itself is deleted in the next cell.
response = iam.detach_role_policy(PolicyArn=policy_arn_s3, RoleName=role_name)

In [None]:
# Delete the IAM role created for Personalize.
response = iam.delete_role(RoleName=role_name)