In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. Setup and Environment Checks
- This section covers the necessary installations and checks the GPU environment to ensure hardware acceleration is available, which is crucial for deep learning models like BERT.

In [1]:
# --- Environment Setup ---

# Install the sentence-transformers library from Hugging Face
# This library provides an easy way to use state-of-the-art sentence embedding models.
!pip install sentence-transformers

# --- GPU Verification ---

import torch

# Check if a CUDA-enabled GPU is available for computation
# Returns True if a GPU is detected and configured correctly.
is_gpu_available = torch.cuda.is_available()
print(f"GPU available: {is_gpu_available}")

if is_gpu_available:
    # Print the name of the detected GPU device
    print(f"GPU Device Name: {torch.cuda.get_device_name(0)}")

    # Display detailed GPU status, including memory usage and driver version
    !nvidia-smi

GPU available: True
GPU Device Name: Tesla T4
Wed Aug  6 03:16:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8             10W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
  

# 2. Data Loading and Preprocessing
- Here, we load the pre-processed dataset containing review text and perform a key preprocessing step: grouping all reviews for the same product into a single document. This creates a consolidated text block for each item, which is ideal for generating a representative embedding.

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# --- Load Data ---

# Load the dataset which includes sentiment scores calculated previously.
# Ensure the absolute path to your file is correct.
file_path = '/content/drive/MyDrive/Amazon_Recommender/data/processed/03_df_with_sentiment_2.csv'
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    display(df.head())
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")

# --- Preprocessing ---

# Drop rows where 'reviewText' is missing to avoid errors during embedding.
df.dropna(subset=['reviewText'], inplace=True)
df.reset_index(drop=True, inplace=True)
print("\nShape after dropping nulls:", df.shape)


# Group reviews by product ID ('asin') and concatenate all review texts into one document per product.
# This creates a comprehensive description for each item based on user feedback.
print("\nGrouping reviews by product (asin)...")
df_grouped = df.groupby('asin')['reviewText'].apply(lambda reviews: ' '.join(str(r) for r in reviews)).reset_index()
print("Grouping complete.")
display(df_grouped.head())

# Extract the consolidated review texts into a list for the embedding model.
texts = df_grouped['reviewText'].tolist()

Dataset loaded successfully.


Unnamed: 0,reviewerID,asin,reviewText,overall,sentiment_score
0,AAP7PPBU72QFM,151004714,This is the best novel I have read in 2 or 3 y...,5.0,0.9601
1,A2E168DTVGE6SV,151004714,"Pages and pages of introspection, in the style...",3.0,0.8382
2,A1ER5AYS3FQ9O3,151004714,This is the kind of novel to read when you hav...,5.0,0.9642
3,A1T17LMQABMBN5,151004714,What gorgeous language! What an incredible wri...,5.0,0.9737
4,A3QHJ0FXK33OBE,151004714,I was taken in by reviews that compared this b...,3.0,0.985



Shape after dropping nulls: (6737950, 5)

Grouping reviews by product (asin)...
Grouping complete.


Unnamed: 0,asin,reviewText
0,101635370,I figured out how to use it. It's okay for li...
1,151004714,This is the best novel I have read in 2 or 3 y...
2,380709473,I read this probably 50 years ago in my youth ...
3,446697192,"Fresh from Connecticut, Taylor Henning lands a..."
4,511189877,"This remote, for whatever reason, was chosen b..."


# 3. Model Loading and BERT Embedding Generation
- This is the core part of the notebook. We load a pre-trained all-MiniLM-L6-v2 model from Hugging Face. This model is highly efficient and effective at creating meaningful sentence and paragraph embeddings. The text is then processed in batches to generate a 384-dimensional vector for each product.

In [3]:
from huggingface_hub import login
from tqdm.notebook import tqdm

# --- Load Pre-trained BERT Model ---

# Log in to Hugging Face Hub if required (e.g., for private models).
# hf_token = "YOUR_HUGGING_FACE_TOKEN"
# login(hf_token)

# Load the 'all-MiniLM-L6-v2' model.
# This is a popular and efficient model for generating high-quality sentence embeddings.
print("\nLoading SentenceTransformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")

# --- Generate Embeddings in Batches ---

# Define a batch size to manage memory usage, especially on GPUs.
BATCH_SIZE = 10000
embeddings_list = []

print(f"\nStarting embedding generation with batch size {BATCH_SIZE}...")
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Generating Embeddings"):
    # Get a batch of texts
    batch_texts = texts[i : (i + BATCH_SIZE)]
    print(f"Processing batch: {i} to {i + len(batch_texts)}")

    # Encode the batch of texts. The model will handle tokenization and generate embeddings.
    # 'device='cuda'' ensures the computation runs on the GPU for speed.
    batch_embeddings = model.encode(
        batch_texts,
        device='cuda',
        batch_size=128,  # Inner batch size for the model's forward pass
        show_progress_bar=True
    )
    embeddings_list.append(batch_embeddings)

# Vertically stack the batch embeddings into a single NumPy array.
all_embeddings = np.vstack(embeddings_list)
print(f"\nEmbeddings generation completed. Final shape: {all_embeddings.shape}")


Loading SentenceTransformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded.

Starting embedding generation with batch size 10000...


Generating Embeddings:   0%|          | 0/17 [00:00<?, ?it/s]

Processing batch: 0 to 10000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Processing batch: 10000 to 20000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 20000 to 30000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 30000 to 40000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 40000 to 50000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 50000 to 60000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 60000 to 70000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 70000 to 80000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 80000 to 90000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 90000 to 100000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 100000 to 110000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 110000 to 120000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 120000 to 130000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 130000 to 140000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 140000 to 150000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 150000 to 160000


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batch: 160000 to 160052


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embeddings generation completed. Final shape: (160052, 384)


# 4. Saving Embeddings and Product IDs
- After the computationally intensive embedding process is complete, we save the resulting vectors and the corresponding product ID (asin) list. This allows us to load them directly in the future without re-running the model.

In [4]:
# --- Save Artifacts for Later Use ---

# Define absolute paths for saving the model outputs.
embeddings_save_path = '/content/drive/MyDrive/Amazon_Recommender/models/07_bert_embeddings.npy'
asin_save_path = '/content/drive/MyDrive/Amazon_Recommender/data/processed/07_bert_asin.csv'

# Save the embeddings array to a .npy file for efficient loading.
np.save(embeddings_save_path, all_embeddings)
print(f"Embeddings saved to: {embeddings_save_path}")

# Save the DataFrame containing the product ASINs.
# This ensures the order of ASINs matches the rows in the embeddings array.
df_grouped[['asin']].to_csv(asin_save_path, index=False)
print(f"ASIN list saved to: {asin_save_path}")

Embeddings saved to: /content/drive/MyDrive/Amazon_Recommender/models/07_bert_embeddings.npy
ASIN list saved to: /content/drive/MyDrive/Amazon_Recommender/data/processed/07_bert_asin.csv


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# --- Load Saved Artifacts ---

# Load the previously generated embeddings and the corresponding product ASINs.
print("\nLoading embeddings and ASIN list for recommendation...")
bert_embeddings = np.load('/content/drive/MyDrive/Amazon_Recommender/models/07_bert_embeddings.npy')
asin_df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/processed/07_bert_asin.csv')
asin_list = asin_df['asin'].tolist()
print("Artifacts loaded.")

# --- Recommendation Function ---

def recommend_similar_products_by_bert(asin, top_n=5):
    """
    Recommends products similar to the given ASIN based on BERT embedding cosine similarity.

    Args:
        asin (str): The product ID (ASIN) to find recommendations for.
        top_n (int): The number of similar products to return.

    Returns:
        list: A list of tuples, each containing (similar_asin, similarity_score).
              Returns an error string if the ASIN is not found.
    """
    # Find the index of the input ASIN in our list.
    try:
        idx = asin_list.index(asin)
    except ValueError:
        return f"ASIN {asin} not found in the dataset."

    # Get the embedding vector for the input ASIN and reshape it for similarity calculation.
    query_vector = bert_embeddings[idx].reshape(1, -1)

    # Calculate the cosine similarity between the query vector and all other embeddings.
    similarity_scores = cosine_similarity(query_vector, bert_embeddings)[0]

    # Get the indices of the most similar items by sorting the scores in descending order.
    # We exclude the first item [::-1][1:top_n+1] because it will be the query item itself.
    similar_indices = similarity_scores.argsort()[::-1][1:top_n + 1]

    # Create a list of recommended ASINs and their similarity scores.
    recommendations = [(asin_list[i], similarity_scores[i]) for i in similar_indices]

    return recommendations

# --- Example Usage ---

# Get top 5 recommendations for a sample product ASIN.
example_asin = 'B00004T8R2'
recommendations = recommend_similar_products_by_bert(example_asin)
print(f"\nTop 5 recommendations for '{example_asin}':")
print(recommendations)


Loading embeddings and ASIN list for recommendation...
Artifacts loaded.

Top 5 recommendations for 'B00004T8R2':
[('B0039286A2', np.float32(0.82333374)), ('B0007N55NM', np.float32(0.818453)), ('B005FDOG66', np.float32(0.810917)), ('B00AJHCJQC', np.float32(0.8088211)), ('B004MMEI8C', np.float32(0.8063112))]


# 5. Semantic Recommender Function
- This section implements the recommendation logic. Using the pre-computed BERT embeddings, we can find products with similar review semantics by calculating the cosine similarity between their vectors.

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# --- Load Saved Artifacts ---

# Load the previously generated embeddings and the corresponding product ASINs.
print("\nLoading embeddings and ASIN list for recommendation...")
bert_embeddings = np.load('/content/drive/MyDrive/Amazon_Recommender/models/07_bert_embeddings.npy')
asin_df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/processed/07_bert_asin.csv')
asin_list = asin_df['asin'].tolist()
print("Artifacts loaded.")

# --- Recommendation Function ---

def recommend_similar_products_by_bert(asin, top_n=5):
    """
    Recommends products similar to the given ASIN based on BERT embedding cosine similarity.

    Args:
        asin (str): The product ID (ASIN) to find recommendations for.
        top_n (int): The number of similar products to return.

    Returns:
        list: A list of tuples, each containing (similar_asin, similarity_score).
              Returns an error string if the ASIN is not found.
    """
    # Find the index of the input ASIN in our list.
    try:
        idx = asin_list.index(asin)
    except ValueError:
        return f"ASIN {asin} not found in the dataset."

    # Get the embedding vector for the input ASIN and reshape it for similarity calculation.
    query_vector = bert_embeddings[idx].reshape(1, -1)

    # Calculate the cosine similarity between the query vector and all other embeddings.
    similarity_scores = cosine_similarity(query_vector, bert_embeddings)[0]

    # Get the indices of the most similar items by sorting the scores in descending order.
    # We exclude the first item [::-1][1:top_n+1] because it will be the query item itself.
    similar_indices = similarity_scores.argsort()[::-1][1:top_n + 1]

    # Create a list of recommended ASINs and their similarity scores.
    recommendations = [(asin_list[i], similarity_scores[i]) for i in similar_indices]

    return recommendations

# --- Example Usage ---

# Get top 5 recommendations for a sample product ASIN.
example_asin = 'B00004T8R2'
recommendations = recommend_similar_products_by_bert(example_asin)
print(f"\nTop 5 recommendations for '{example_asin}':")
print(recommendations)


Loading embeddings and ASIN list for recommendation...
Artifacts loaded.

Top 5 recommendations for 'B00004T8R2':
[('B0039286A2', np.float32(0.82333374)), ('B0007N55NM', np.float32(0.818453)), ('B005FDOG66', np.float32(0.810917)), ('B00AJHCJQC', np.float32(0.8088211)), ('B004MMEI8C', np.float32(0.8063112))]
