In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re

In [2]:
def extract_2d_array(tensor_str):
    
    tensor_str = tensor_str.replace('tensor(','').replace('grad_fn=<TanhBackward0>)', '').replace(' ', '').replace('\n', '')
    tensor_str = tensor_str[:len(tensor_str)-1]
    numbers = eval(tensor_str)
    
    array = np.array(numbers).reshape((-1, 768))
    
    return array

def extract_2d_array_list(data_str):

    data_str = data_str.replace('\n', ' ')

    # Replace one or more spaces with a single comma
    data_str = re.sub(r'\s+', ',', data_str)

    # Ensure there are no extra commas at the end
    data_str = '[' + data_str.strip(',') + ']'

    # Evaluate the string safely
    numbers = ast.literal_eval(data_str)

    array = np.array(numbers).reshape((-1, 768))  
    return array


In [3]:
def mean_of_2d_arrays(arrays):
    stacked_arrays = np.vstack(arrays)
    return np.mean(stacked_arrays, axis=0).reshape(-1, 768)

def findMeanEmbeddings(df):

    # Group by the specified column and calculate the mean of 2D arrays
    result = df.groupby('sub_category')['embeddings'].apply(lambda x: mean_of_2d_arrays(x))

    # Convert the result to a DataFrame
    result_df = pd.DataFrame(result)

    # Reset the index if necessary
    result_df.reset_index(inplace=True)
    return result_df

In [8]:
def constructAverageDataframe(path):
    files = glob.glob(path)
    dfs = []
    for file in files:
        df = pd.read_csv(file)
        df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1", 'Unnamed: 0.2'], errors="ignore")
        dfs.append(df)
    
    df = pd.concat(dfs, ignore_index=True)
    df_unique = df.drop_duplicates(subset='name', keep='first')

    df_unique['embeddings'] = df_unique['embeddings'].apply(extract_2d_array)
    print(df_unique.shape)

    result_df = findMeanEmbeddings(df_unique)
    result_df.to_csv("average_embeddings.csv", index=False)

In [9]:
path = "./dataset_truncated.csv"
constructAverageDataframe(path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['embeddings'] = df_unique['embeddings'].apply(extract_2d_array)


(59017, 3)
                                sub_category  \
0                           Air Conditioners   
1                             All Appliances   
2               All Car & Motorbike Products   
3                            All Electronics   
4                     All Exercise & Fitness   
5                All Grocery & Gourmet Foods   
6                         All Home & Kitchen   
7                           All Pet Supplies   
8             All Sports, Fitness & Outdoors   
9                             Amazon Fashion   
10                        Household Supplies   
11                           Indoor Lighting   
12                                 Innerwear   
13                             Kids' Watches   
14                          Lab & Scientific   
15                      Lingerie & Nightwear   
16  Musical Instruments & Professional Audio   
17                                    Shirts   
18                               Snack Foods   
19                           

In [6]:
def rankAvgDfBySimilarity(df, user_embedding):

    df['embeddings'] = df['embeddings'].apply(lambda x: x.flatten())
    embeddings_matrix = np.vstack(df['embeddings'].values)
    print(embeddings_matrix.shape)

    df['similarities'] = cosine_similarity(embeddings_matrix, user_embedding).flatten()
    return df.sort_values(by='similarities', ascending=False).head(5)

def getRandomEmbeddingVector(length):
    vector = np.random.rand(length)
    vector = 2 * vector - 1
    return vector.reshape(1, -1)

df = pd.read_csv("average_embeddings.csv")
df['embeddings'] = df['embeddings'].apply(extract_2d_array_list)

df = rankAvgDfBySimilarity(df,getRandomEmbeddingVector(768))
df

(24, 768)


Unnamed: 0,sub_category,embeddings,similarities
17,Shirts,"[-0.752649218, -0.472511213, -0.807391897, 0.7...",-0.003847
12,Innerwear,"[-0.73928861, -0.47255215, -0.80987534, 0.7168...",-0.006061
19,Sports Shoes,"[-0.794324868, -0.535850741, -0.850778818, 0.7...",-0.009555
15,Lingerie & Nightwear,"[-0.75945913, -0.52629979, -0.87041634, 0.7896...",-0.011632
21,Sunglasses,"[-0.79422574, -0.53368691, -0.81616387, 0.7570...",-0.011945


In [7]:
categories_list = df['sub_category']
print(categories_list)
hidden_df = pd.read_csv("dataset_truncated.csv")

final_df = []

for category in categories_list:
    print(category)
    df_filtered = hidden_df[hidden_df['sub_category'] == category]
    df_sampled = df_filtered.sample(n=2)
    final_df.append(df_sampled)

recommendations = pd.concat(final_df, ignore_index=True)

print(recommendations)

17                  Shirts
12               Innerwear
19            Sports Shoes
15    Lingerie & Nightwear
21              Sunglasses
Name: sub_category, dtype: object
Shirts
Innerwear
Sports Shoes
Lingerie & Nightwear
Sunglasses
                                                name     main_category  \
0  Dennis Lingo Men's Floral Printed Slim Fit Cot...    men's clothing   
1    IndoPrimo Men's Regular Fit Cotton Casual Shirt    men's clothing   
2   KUNDAN Gold Men Vest (Pack of 5 Baniyan for Men)    men's clothing   
3  BASIICS by La Intimo Men’s Cotton Spandex Hot ...    men's clothing   
4      Longwalk Men's Sports Shoes Running Gym Shoes       men's shoes   
5  XTEP Men's Soft Textile Upper IP Outsole Sport...       men's shoes   
6  Clovia Women's Lace Lightly Padded Wire Free F...  women's clothing   
7  Pride Apparel Ladies Panties Cotton Solid Colo...  women's clothing   
8  IndyDukaan Fashion Vintage Style Long Eyeglass...       accessories   
9  2 Pack Heart Shaped Diffra