## Prepare product data for the BlazingText algorithm

In [3]:
import pandas as pd
import boto3
import csv

# Location of the product catalogue in S3 and the local copy to work from
s3_bucket = 'sagemaker-ml-product-recommendation-02'
product_file_key = 'product_data.csv'
local_file = 'product_data.csv'

# Download the catalogue from S3 into the working directory
s3_client = boto3.client('s3')
s3_client.download_file(s3_bucket, product_file_key, local_file)

# Load the catalogue, then normalize the Category column (lower-case, trimmed,
# spaces replaced by underscores) so categories can be embedded in labels
product_data = pd.read_csv(local_file)
product_data["Category"] = (
    product_data["Category"]
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
)


# prepare text data for BlazingText model training.
def prepare_data(row):
    """Build one supervised training line ("__label__<label> <text>") for a product row.

    The label is the row's Category and Product Name joined with underscores;
    the text is the Product Name, Description, Color and Size values that are
    present, separated by single spaces.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'Category', 'Product Name', 'Description', 'Color' and 'Size'.

    Returns:
        str: A single-line training example.
    """
    # Build the label from whitespace-separated tokens so that tabs, newlines or
    # repeated spaces cannot end up inside the label, and so that a missing
    # Category / Product Name is skipped instead of raising on `.strip()`.
    label_tokens = []
    for col in ('Category', 'Product Name'):
        value = row[col]
        if pd.notna(value):
            label_tokens.extend(str(value).split())
    label = "__label__" + ("_".join(label_tokens) or "unknown")

    # Collapse internal whitespace (including newlines) so the example stays on one line.
    text_parts = [
        " ".join(str(row[col]).split())
        for col in ['Product Name', 'Description', 'Color', 'Size']
        if pd.notna(row[col])
    ]
    text = " ".join(part for part in text_parts if part)
    return f"{label} {text}"


# Build one BlazingText training line per product
product_data['blazingtext'] = product_data.apply(prepare_data, axis=1)

training_file = 'blazingtext_supervised_training_v2.txt'

# Write the examples as plain text instead of going through the CSV writer:
# with sep=' ', QUOTE_NONE and escapechar=' ' the writer prefixes every
# delimiter (i.e. every space) in a line with the escape character (another
# space), so the text on disk no longer matches the prepared lines.
with open(training_file, 'w', encoding='utf-8') as training_fh:
    for line in product_data['blazingtext']:
        # Collapse whitespace so each example occupies exactly one line
        training_fh.write(" ".join(str(line).split()) + "\n")

# upload to s3
s3_key = "training/blazingtext_supervised_training_v2.txt"
s3_client.upload_file(training_file, s3_bucket, s3_key)

print(f"file uploaded to s3://{s3_bucket}/{s3_key}")

file uploaded to s3://sagemaker-ml-product-recommendation-02/training/blazingtext_supervised_training_v2.txt


## Train the BlazingText model

In [4]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve
from sagemaker.inputs import TrainingInput

# Region plus the S3 locations the training job reads from and writes to
region_name = 'us-west-2'
s3_train_data = f's3://{s3_bucket}/training/blazingtext_supervised_training_v2.txt'
s3_output_location = f's3://{s3_bucket}/output'

role = get_execution_role()

# Look up the BlazingText container image URI for the chosen region
container_uri = retrieve(framework='blazingtext', region=region_name, version='latest')
print(f'Using Sagemaker BlazingText containe: {container_uri} ({region_name})')

# Estimator describing the training infrastructure and the output location
bt_model = sagemaker.estimator.Estimator(
    image_uri=container_uri,
    role=role,
    instance_count=1,
    instance_type='ml.c4.2xlarge',
    volume_size=5,
    max_run=3600,
    input_mode='File',
    output_path=s3_output_location,
)

# Hyperparameters for supervised (text classification) mode
hyperparameters = {
    'mode': 'supervised',
    'epochs': 15,
    'min_count': 2,
    'learning_rate': 0.05,
    'word_ngrams': 2,
    'vector_dim': 100,
}
bt_model.set_hyperparameters(**hyperparameters)

# The training file is passed as plain text on the "train" channel
train_data = TrainingInput(s3_train_data, content_type='text/plain')
data_channels = {'train': train_data}

# Launch the training job and stream its logs
bt_model.fit(inputs=data_channels, logs=True)

Using Sagemaker BlazingText containe: 433757028032.dkr.ecr.us-west-2.amazonaws.com/blazingtext:1 (us-west-2)


2025-02-26 09:29:06 Starting - Starting the training job...
..25-02-26 09:29:39 Downloading - Downloading input data.
..25-02-26 09:30:09 Training - Training image download completed. Training in progress..
2025-02-26 09:30:35 Uploading - Uploading generated training model[34mArguments: train[0m
  self.stdout = io.open(c2pread, 'rb', bufsize)[0m
[34m[02/26/2025 09:30:22 INFO 139786765338432] nvidia-smi took: 0.025221824645996094 secs to identify 0 gpus[0m
[34m[02/26/2025 09:30:22 INFO 139786765338432] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[02/26/2025 09:30:22 INFO 139786765338432] Processing /opt/ml/input/data/train/blazingtext_supervised_training_v2.txt . File size: 0.019308090209960938 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  87[0m
[34m##### Alpha: -0.0001  Progress: 100.15%  Million Words/sec: 0.37 #####[0m
[34m##### Alpha: 0.0000  Progress: 100.00%  Million Words/s

## deploy the model

In [6]:
# Host the trained model on a single-instance real-time endpoint;
# `predictor` is the handle the cells below use to query it.
predictor = bt_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-----!

In [15]:

import json

def get_recommendations(user_input):
    """Return up to five product names whose category matches the model's prediction.

    Sends ``user_input`` to the deployed endpoint (module-level ``predictor``),
    extracts the predicted labels and looks up products in the module-level
    ``product_data`` frame whose Category equals one of those labels.

    Args:
        user_input (str): Free-text query.

    Returns:
        list: At most five values from the "Product Name" column.
    """
    input_text = {"instances": [user_input]}
    response = predictor.predict(json.dumps(input_text), initial_args={"ContentType": "application/json"})
    predictions = json.loads(response)

    # Each prediction carries a *list* under "label" (e.g. ['__label__Women']),
    # so flatten the lists and strip the "__label__" prefix before matching.
    recommended_labels = [
        label.replace("__label__", "").replace("_", " ").strip().lower()
        for pred in predictions
        for label in pred["label"]
    ]

    # Compare on a normalized copy of Category (same normalization as the labels)
    # without modifying the shared frame.
    categories = product_data["Category"].str.replace("_", " ").str.strip().str.lower()
    recommended_products = product_data.loc[categories.isin(recommended_labels), "Product Name"].tolist()
    return recommended_products[:5]


In [16]:
# Try the recommender on a sample shopper query
query = "I need kid shoes for running in red color"
recommendations = get_recommendations(query)
print(f"Query:{query}\nRecommended Products: ", recommendations)

Query:I need kid shoes for running in red color
Recommended Products:  []


In [18]:
# Send a minimal request straight to the endpoint to inspect the raw response format
test_query = {"instances": ["kids shoes"]}
response = predictor.predict(
    json.dumps(test_query),
    initial_args={"ContentType": "application/json"},
)
print("Test Query Output:", json.loads(response))


Test Query Output: [{'label': ['__label__Women'], 'prob': [0.2501992881298065]}]


In [5]:
# use
import json

def get_recommendations(user_input, predictor, threshold=0.02):
    """Return up to five product names whose category matches a confident prediction.

    Args:
        user_input (str): Free-text query sent to the endpoint.
        predictor: Object whose ``predict(payload, initial_args=...)`` returns
            the endpoint's JSON response.
        threshold (float): Minimum probability a predicted label must have to
            be used for matching.

    Returns:
        list: At most five values from the "Product Name" column of the
        module-level ``product_data`` frame.
    """
    input_text = {"instances": [user_input]}
    response = predictor.predict(json.dumps(input_text), initial_args={"ContentType": "application/json"})
    predictions = json.loads(response)

    print("Raw Model Predictions:", predictions)  # Debugging Output

    # Keep labels at or above the threshold, normalized the same way as the categories below
    recommended_labels = []
    for pred in predictions:
        for label, prob in zip(pred["label"], pred["prob"]):
            if prob >= threshold:
                clean_label = label.replace("__label__", "").lower().strip().replace("_", " ")
                recommended_labels.append(clean_label)

    # Normalize category names on a local copy; writing the result back into
    # product_data would silently change the shared frame for every other cell.
    normalized_category = product_data["Category"].str.lower().str.strip().str.replace("_", " ")

    # Filter products based on predicted categories
    is_recommended = normalized_category.isin(recommended_labels)
    recommended_products = product_data.loc[is_recommended, "Product Name"].tolist()

    return recommended_products[:5]  # Return top 5 recommendations


In [1]:
# use

# Ask the recommender for products matching a sample query
query = "I need a stylish leather jacket for winter"
recommended_products = get_recommendations(
    user_input=query,
    predictor=predictor,
    threshold=0.02,
)

print(f"\nQuery: {query}")
print("Recommended Products:", recommended_products)


NameError: name 'get_recommendations' is not defined

In [69]:
import json

def get_recommendations(user_input, predictor, threshold=0.02):
    """Return up to five product names matching the model's predicted labels.

    A predicted label such as "__label__men_Leather_Jacket" is read as a
    category ("men") followed by product words ("leather", "jacket"). Products
    are taken only from that category and must contain at least one of the
    product words; a label that is just a category matches every product in it.

    Args:
        user_input (str): Free-text query sent to the endpoint.
        predictor: Object whose ``predict(payload, initial_args=...)`` returns
            the endpoint's JSON response.
        threshold (float): Minimum probability a predicted label must have to
            be considered.

    Returns:
        list: At most five lower-cased product names, in catalogue order,
        without duplicates.
    """
    input_text = {"instances": [user_input]}
    response = predictor.predict(json.dumps(input_text), initial_args={"ContentType": "application/json"})
    predictions = json.loads(response)

    print("Raw Model Predictions: ", predictions)
    print()

    recommended_labels = []
    for pred in predictions:
        for label, prob in zip(pred["label"], pred["prob"]):
            if prob >= threshold:
                clean_label = label.replace("__label__", "").replace("_", " ").strip().lower()
                recommended_labels.append(clean_label)

    print("Extracted Labels:", recommended_labels)

    # Normalize on local copies (same normalization as the labels) instead of
    # adding helper columns to the shared product_data frame.
    catalog = product_data[["Category", "Product Name"]].dropna()
    categories = catalog["Category"].astype(str).str.replace("_", " ", regex=False).str.strip().str.lower()
    products = catalog["Product Name"].astype(str).str.strip().str.lower()

    # Longest category first, so "home decor" wins over a shorter category "home".
    known_categories = sorted(set(categories), key=len, reverse=True)

    matched_products = []
    for label in recommended_labels:
        # The category is the leading part of the label; this also handles multi-word categories.
        category = next(
            (cat for cat in known_categories if label == cat or label.startswith(cat + " ")),
            None,
        )
        if category is None:
            continue

        # Only the words after the category describe the product; matching on the
        # category word itself would let "men" hit unrelated names like "ornament".
        product_terms = label[len(category):].split()

        for product in products[categories == category]:
            if not product_terms or any(term in product for term in product_terms):
                matched_products.append(product)

    # De-duplicate while keeping catalogue order so the top five are deterministic.
    matched_products = list(dict.fromkeys(matched_products))

    if not matched_products:
        print("No matching products found in dataset!")

    return matched_products[:5]


# Sample query: outerwear
query = "I need a stylish leather jacket for winter"
recommended_products = get_recommendations(
    user_input=query,
    predictor=predictor,
    threshold=0.02,
)

print(f"\nQuery: {query}")
print("Recommended Products:", recommended_products)


Raw Model Predictions:  [{'label': ['__label__men_Leather_Jacket'], 'prob': [0.02501424215734005]}]

Extracted Labels: ['men leather jacket']

Query: I need a stylish leather jacket for winter
Recommended Products: ['leather jacket']


In [70]:
# Sample query: basic apparel
query = "I need a basic cotton t-shirt"
recommended_products = get_recommendations(
    user_input=query,
    predictor=predictor,
    threshold=0.02,
)

print(f"\nQuery: {query}")
print("Recommended Products:", recommended_products)

Raw Model Predictions:  [{'label': ['__label__women_T-Shirt'], 'prob': [0.025014212355017662]}]

Extracted Labels: ['women t-shirt']

Query: I need a basic cotton t-shirt
Recommended Products: ['t-shirt']


In [72]:
# Sample query: kids' footwear
query = "I need a sneakers for my kid"
recommended_products = get_recommendations(
    user_input=query,
    predictor=predictor,
    threshold=0.02,
)

print(f"\nQuery: {query}")
print("Recommended Products:", recommended_products)

Raw Model Predictions:  [{'label': ['__label__kids_Sneakers'], 'prob': [0.025015996769070625]}]

Extracted Labels: ['kids sneakers']

Query: I need a sneakers for my kid
Recommended Products: ['sneakers']
