In [83]:
import pandas as pd
import numpy as np
import glob
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re

In [84]:
def extract_2d_array(tensor_str):
    
    tensor_str = tensor_str.replace('tensor(','').replace('grad_fn=<TanhBackward0>)', '').replace(' ', '').replace('\n', '')
    tensor_str = tensor_str[:len(tensor_str)-1]
    numbers = eval(tensor_str)
    
    array = np.array(numbers).reshape((-1, 768))  
    # print(array)
    
    return array

def extract_2d_array_list(data_str):

    data_str = data_str.replace('\n', ' ')

    # Replace one or more spaces with a single comma
    data_str = re.sub(r'\s+', ',', data_str)

    # Ensure there are no extra commas at the end
    data_str = '[' + data_str.strip(',') + ']'

    # Evaluate the string safely
    data_list = ast.literal_eval(data_str)

    # Evaluate the string safely
    numbers = ast.literal_eval(data_str)

    array = np.array(numbers).reshape((-1, 768))  
    return array


In [85]:
def mean_of_2d_arrays(arrays):
    stacked_arrays = np.vstack(arrays)
    return np.mean(stacked_arrays, axis=0).reshape(-1, 768)

def findMeanEmbeddings(df):

    # Group by the specified column and calculate the mean of 2D arrays
    result = df.groupby('sub_category')['embeddings'].apply(lambda x: mean_of_2d_arrays(x))

    # Convert the result to a DataFrame
    result_df = pd.DataFrame(result)

    # Reset the index if necessary
    result_df.reset_index(inplace=True)
    return result_df

In [86]:
def constructAverageDataframe(path):
    files = glob.glob(path)
    df = pd.DataFrame()
    for file in files:
        df = pd.concat([df, pd.read_csv(file)], ignore_index=True)

    df.drop(columns=["Unnamed: 0","Unnamed: 0.1"], inplace=True)

    df['embeddings'] = df['embeddings'].apply(extract_2d_array)
    result_df = findMeanEmbeddings(df)
    result_df.to_csv("average_embeddings.csv", index=False)

In [87]:
# path = "./final_csvs/*/*.csv"
# constructAverageDataframe(path)
df = pd.read_csv("average_embeddings.csv")
df['embeddings'] = df['embeddings'].apply(extract_2d_array_list)

In [97]:
def rankAvgDfBySimilarity(df, user_embedding):

    df['embeddings'] = df['embeddings'].apply(lambda x: x.flatten())
    embeddings_matrix = np.vstack(df['embeddings'].values)
    print(embeddings_matrix.shape)

    df['similarities'] = cosine_similarity(embeddings_matrix, user_embedding).flatten()
    return df.sort_values(by='similarities', ascending=False).head(10)


In [100]:
def getRandomEmbeddingVector(length):
    vector = np.random.rand(length)
    vector = 2 * vector - 1
    return vector.reshape(1, -1)

df = rankAvgDfBySimilarity(df,getRandomEmbeddingVector(768))
df

(10, 768)


Unnamed: 0,sub_category,embeddings,similarities
17,Shirts,"[-0.765575706, -0.458050662, -0.75404854, 0.72...",0.00945
12,Innerwear,"[-0.741735079, -0.46590045, -0.796648398, 0.71...",0.005191
19,Sports Shoes,"[-0.796010255, -0.533936577, -0.849693317, 0.7...",0.003216
15,Lingerie & Nightwear,"[-0.76052792, -0.52577224, -0.86706468, 0.7902...",0.002561
21,Sunglasses,"[-0.79210288, -0.53491784, -0.81867417, 0.7583...",-0.001317
8,"All Sports, Fitness & Outdoors","[-0.731673475, -0.486389727, -0.878391126, 0.7...",-0.00233
14,Lab & Scientific,"[-0.79466522, -0.43120857, -0.86454234, 0.7453...",-0.004333
23,Watches,"[-0.83866561, -0.5528242, -0.91195698, 0.83299...",-0.004782
13,Kids' Watches,"[-0.80525833, -0.54394737, -0.92720296, 0.7868...",-0.005169
0,Air Conditioners,"[-0.67466764, -0.32023762, -0.88830097, 0.6970...",-0.006376


In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Define two vectors
# vector1 = [1, 2, 3]
# vector2 = [4, 5, 6]

# # Reshape the vectors to be 2D arrays
# vector1 = np.array(vector1).reshape(1, -1)
# vector2 = np.array(vector2).reshape(1, -1)

# # Calculate the cosine similarity
# similarity = cosine_similarity(vector1, vector2)

# print(similarity)