# In [None]:
############################################# final code #########################

import csv
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import random
import matplotlib.patches as mpatches
from heapq import heappush, heappop

# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
# later releases removed the old spelling, so try the new keyword first and
# fall back to the old one for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
csv_file = "amazon.csv"

# Read the CSV data and create a dictionary of products keyed by product_id.
# Rows that repeat a product_id overwrite the earlier entry.
products_dict = {}
# newline='' lets the csv module do its own line-ending handling, which keeps
# quoted fields containing embedded newlines intact.
with open(csv_file, mode='r', encoding='utf-8', newline='') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        # Extract and clean the rating_count ("24,269" -> 24269); empty or
        # unparsable values fall back to 0.
        try:
            rating_count = int(row['rating_count'].replace(',', '')) if row['rating_count'] else 0
        except ValueError:
            rating_count = 0

        # Extract and clean the rating; unparsable values fall back to 0.0.
        # TypeError covers short rows, where DictReader fills the missing
        # field with None instead of a string.
        try:
            rating = float(row['rating'])
        except (TypeError, ValueError):
            rating = 0.0

        # Create a dictionary for each product
        product_details = {
            'product_id': row['product_id'],
            'product_name': row['product_name'],
            'category': row['category'],
            'rating': rating,
            'rating_count': rating_count,
            'about_product': row['about_product']
        }
        # Use product_id as the key for the products dictionary
        products_dict[row['product_id']] = product_details

# Print available product IDs
print("Available product IDs:", list(products_dict)[:10])  # Print first 10 product IDs for checking

# Construct the grid: the smallest square that can hold every product.
num_products = len(products_dict)
if num_products == 0:
    # Fail early with a clear message instead of deep inside the encoder.
    raise ValueError("No products were loaded; cannot build the product grid")
grid_size = int(np.ceil(np.sqrt(num_products)))
grid = np.empty((grid_size, grid_size), dtype=object)  # object cells start as None

# Calculate row indices using hybrid filtering
categories = np.array([p['category'] for p in products_dict.values()]).reshape(-1, 1)
category_encoded = encoder.fit_transform(categories)

# Use np.log1p for rating_count to handle skewness
features = np.array([[p['rating'], np.log1p(p['rating_count'])] for p in products_dict.values()])
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)
# Only the scaled rating (column 0) drives the row coordinate.
normalized_row_indices = normalized_features[:, 0]

# Combine the category encoding with the normalized features
combined_features = np.concatenate((category_encoded, normalized_features), axis=1)

# Debug print to check the feature values before and after normalization
print("Combined Features (before scaling):\n", features[:10])  # Print first 10 for checking
print("Normalized Features (after scaling):\n", normalized_features[:10])  # Print first 10 for checking

# Calculate column indices using Word2Vec embeddings for better semantic similarity
corpus = [p['about_product'].split() for p in products_dict.values()]
word2vec_model = Word2Vec(sentences=corpus, vector_size=50, window=5, min_count=1, workers=4)
# Read the embedding width back from the model so the zero-vector fallback for
# empty descriptions can never drift from the vector_size used above.
embedding_dim = word2vec_model.vector_size
product_vectors = np.array([
    np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
            or [np.zeros(embedding_dim)], axis=0)
    for words in corpus
])

# NOTE: this refits the same scaler object on the embeddings.
column_indices = scaler.fit_transform(product_vectors)

# Normalize indices to fit within grid size.
# Only the first embedding dimension drives the column coordinate.
normalized_col_indices = column_indices[:, 0]
row_indices = (normalized_row_indices * (grid_size - 1)).astype(int)
column_indices = (normalized_col_indices * (grid_size - 1)).astype(int)

# Randomise the *order* in which products claim cells (this only affects who
# wins a collision). row_indices/column_indices are in products_dict order, so
# shuffle positions and look each product up by its original position --
# shuffling the ID list alone would pair every product with the coordinates
# computed for a different product.
ordered_ids = list(products_dict.keys())
placement_order = list(range(num_products))
random.shuffle(placement_order)
product_ids = [ordered_ids[position] for position in placement_order]

# Place products in the grid based on normalized indices
for idx, product_id in zip(placement_order, product_ids):
    row = row_indices[idx]
    col = column_indices[idx]
    # Cell already taken: scan right, wrapping to the next row. The grid has
    # at least num_products cells, so a free cell always exists.
    while grid[row, col] is not None:
        col = (col + 1) % grid_size
        if col == 0:
            row = (row + 1) % grid_size
    grid[row, col] = product_id
    # print(f"Product ID: {product_id}, Normalized Row: {normalized_row_indices[idx]:.2f}, Normalized Column: {normalized_col_indices[idx]:.2f}")

def a_star_search_recommendations(start, destination, grid, products_dict, max_recommendations=10):
    """Find a path of products between two grid cells using A* search.

    The search moves between 4-connected, occupied cells of ``grid``. The cost
    of a step is ``calculate_similarity`` of the two products involved, and the
    heuristic is the Manhattan distance to the destination minus the
    similarity to the destination product.

    NOTE(review): ``calculate_similarity`` is not defined in this block and
    must be available at module level. Termination relies on step costs never
    being negative -- confirm that against its implementation.

    Args:
        start: Product ID at which the path begins.
        destination: Product ID at which the path ends.
        grid: 2-D NumPy object array whose cells hold product IDs or ``None``.
        products_dict: Mapping from product ID to product details; the values
            are passed to ``calculate_similarity``.
        max_recommendations: Upper bound on the number of product IDs
            returned. ``None`` disables the limit. The returned path is never
            truncated.

    Returns:
        A ``(recommendations, path)`` tuple. ``path`` is the list of
        ``(row, col)`` cells from start to destination (empty when the
        destination cannot be reached) and ``recommendations`` holds the
        product IDs of the first ``max_recommendations`` cells of that path.

    Raises:
        ValueError: If ``start`` or ``destination`` is not present in ``grid``.
    """
    # Use the shape of the grid that was passed in rather than a module global.
    n_rows, n_cols = grid.shape

    def heuristic(pos_a, pos_b):
        # Grid (Manhattan) distance between the two cells...
        grid_distance = abs(pos_a[0] - pos_b[0]) + abs(pos_a[1] - pos_b[1])
        # ...discounted by how similar the two products are.
        product_similarity = calculate_similarity(products_dict[grid[pos_a]], products_dict[grid[pos_b]])
        return grid_distance - product_similarity

    def get_neighbors(row, col):
        # 4-connected neighbours that lie inside the grid.
        neighbors = []
        if row > 0:
            neighbors.append((row - 1, col))
        if row < n_rows - 1:
            neighbors.append((row + 1, col))
        if col > 0:
            neighbors.append((row, col - 1))
        if col < n_cols - 1:
            neighbors.append((row, col + 1))
        return neighbors

    # Locate both endpoints in a single pass over the grid.
    start_position = None
    destination_position = None
    for position, product_id in np.ndenumerate(grid):
        if product_id is None:
            continue
        if product_id == start:
            start_position = position
        if product_id == destination:
            destination_position = position
        if start_position is not None and destination_position is not None:
            break

    if start_position is None:
        raise ValueError(f"Start product {start!r} is not present in the grid")
    if destination_position is None:
        raise ValueError(f"Destination product {destination!r} is not present in the grid")

    unreached = float('inf')
    g_score = {start_position: 0}  # cells absent from the dict are unreached
    came_from = {}
    open_set = []
    heappush(open_set, (0, start_position))

    path = []
    while open_set:
        _, current = heappop(open_set)
        if current == destination_position:
            # Reconstruct the path by walking the predecessor links backwards.
            while current in came_from:
                path.append(current)
                current = came_from[current]
            path.append(start_position)
            path.reverse()
            break

        current_product = products_dict[grid[current]]
        for neighbor in get_neighbors(current[0], current[1]):
            neighbor_product = grid[neighbor]
            if not neighbor_product:
                continue  # empty cells cannot be traversed
            tentative_g_score = g_score[current] + calculate_similarity(current_product, products_dict[neighbor_product])
            if tentative_g_score < g_score.get(neighbor, unreached):
                came_from[neighbor] = current
                g_score[neighbor] = tentative_g_score
                heappush(open_set, (tentative_g_score + heuristic(neighbor, destination_position), neighbor))

    recommendations = [grid[position] for position in path]
    if max_recommendations is not None:
        recommendations = recommendations[:max(0, max_recommendations)]
    return recommendations, path

# Choose valid start and destination product IDs from the printed list
start_product_id = "B09CQGV36R"  # Use an available product ID from the list
destination_product_id = "B00E5WECYE"  # Use another available product ID from the list

recommendations, path = a_star_search_recommendations(start_product_id, destination_product_id, grid, products_dict, max_recommendations=10)
print("Recommended products:", recommendations)

# Visualization with A* path
plt.figure(figsize=(20, 20))  # Adjusted figure size

# Define colors for categories. Sorting makes the category -> colour mapping
# (and therefore the legend) identical from run to run; iterating a bare set
# would not guarantee that.
categories_unique = sorted({p['category'] for p in products_dict.values()})
category_colors = {category: plt.cm.tab20(i / len(categories_unique)) for i, category in enumerate(categories_unique)}

# Plot the grid cells with category colors. Collect every occupied cell first
# and draw them with a single scatter call instead of one artist per product.
cell_cols, cell_rows, cell_colors = [], [], []
for (row, col), product_id in np.ndenumerate(grid):
    if product_id is not None:
        cell_cols.append(col)
        cell_rows.append(row)
        cell_colors.append(category_colors[products_dict[product_id]['category']])
if cell_cols:
    plt.scatter(cell_cols, cell_rows, color=cell_colors, marker='s')

# Highlight the recommended products. ``path`` already holds the grid cell of
# each recommendation in the same order, so there is no need to search the
# grid again for every product ID.
for (i, j), product_id in zip(path, recommendations):
    product = products_dict[product_id]
    color = category_colors[product['category']]
    plt.scatter(j, i, color=color, marker='o', s=100)

# Highlight the A* path
path_x = [col for row, col in path]
path_y = [row for row, col in path]
plt.plot(path_x, path_y, 'ro-', linewidth=2)  # Highlight the path with red lines

# Create a legend with category colors
handles = [mpatches.Patch(color=color, label=category) for category, color in category_colors.items()]
plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

plt.xlabel('Embedding-based Column Index')
plt.ylabel('Hybrid Filtering-based Row Index')
plt.axis('off')  # Turn off the axis (this also hides the axis labels set above)
plt.show()

# In [None]:
# Print a numbered, human-readable summary of every recommended product,
# with a blank line after each entry.
print("Recommended Products:")
for idx, product_id in enumerate(recommendations, start=1):
    product = products_dict[product_id]
    print(
        f"{idx}. Product ID: {product_id}",
        f"   Product Name: {product['product_name']}",
        f"   Category: {product['category']}",
        f"   Rating: {product['rating']}",
        f"   Rating Count: {product['rating_count']}",
        f"   Description: {product['about_product']}",
        "",
        sep="\n",
    )