In [None]:
## Run once cell

%load_ext autoreload
%autoreload 2

import os
os.chdir('..')

In [None]:
import numpy as np
import pandas as pd

# Directory (relative to the current working directory) that holds the pickled inputs.
path_to_data = "data/final/"
master_dataframe_path = path_to_data + "master_dataframe.pkl"
master_edges_path = path_to_data + "master_edges.pkl"

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(master_dataframe_path)
edges = pd.read_pickle(master_edges_path)

# Cell output: the dataframe's shape and the number of edges.
(df.shape, len(edges))

In [None]:
# Select the rows whose 'level' equals 2 and stack their 'scalar_rep' entries
# into a single NumPy array.
is_level2 = df['level'] == 2
lvl2_vectors = np.array(df.loc[is_level2, 'scalar_rep'].values.tolist())

# Cell output: shape of the stacked array.
lvl2_vectors.shape

In [None]:
import os

from pinecone import Pinecone, ServerlessSpec

# SECURITY: the API key must never be written into the notebook. It is read
# from the PINECONE_API_KEY environment variable instead. Any key that was
# previously committed in this file should be treated as leaked and rotated.
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it in the environment before running this notebook."
    )

# Client object used by the following cells to create and open the index.
pc = Pinecone(api_key=_pinecone_api_key)

In [None]:
# Create the serverless index only when it does not exist yet. An
# unconditional create_index call fails on every run after the first because
# the index name is already taken.
if "proteins-lvl2" not in pc.list_indexes().names():
    pc.create_index(
        name="proteins-lvl2",
        # Must equal the length of every vector upserted later
        # (presumably lvl2_vectors.shape[1] — TODO confirm).
        dimension=46,
        # Similarity metric used by the index for queries.
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# One upsert record per vector. The id "vec<i>" is the vector's position in
# lvl2_vectors, and "values" is the corresponding row.
lvl2_upsert = [
    {"id": f"vec{position}", "values": row}
    for position, row in enumerate(lvl2_vectors)
]

len(lvl2_upsert)

def split_vectors(vector_list, max_size=1000):
    """Split a list into consecutive chunks of at most ``max_size`` items.

    Args:
        vector_list: The sequence to split; it must support ``len`` and slicing.
        max_size: Maximum number of items per chunk. Must be a positive integer.

    Returns:
        A list of slices of ``vector_list`` in their original order. Every chunk
        holds ``max_size`` items except possibly the last one. An empty input
        yields an empty list.

    Raises:
        ValueError: If ``max_size`` is less than 1.
    """
    if max_size < 1:
        # A negative step makes range() empty, which would silently drop every
        # item; a zero step fails with an unrelated-looking range() error.
        raise ValueError(f"max_size must be a positive integer, got {max_size!r}")
    return [
        vector_list[start:start + max_size]
        for start in range(0, len(vector_list), max_size)
    ]

# Break the upsert records into batches holding no more than 1000 records each.
lvl2_upsert_parts = split_vectors(lvl2_upsert, max_size=1000)


# # Splitting the lvl2_upsert list into 68 roughly even parts
# split_size = len(lvl2_upsert) // 68
# lvl2_upsert_parts = [lvl2_upsert[i:i + split_size] for i in range(0, len(lvl2_upsert), split_size)]
# # Ensure that all elements are included in the parts
# if len(lvl2_upsert_parts) > 68:
#     lvl2_upsert_parts[67].extend(lvl2_upsert_parts.pop())

# len(lvl2_upsert_parts)



In [None]:


# Handle to the "proteins-lvl2" index; the upsert and query cells below go through it.
index = pc.Index("proteins-lvl2")



In [None]:
%%time

for part in lvl2_upsert_parts:
    print(len(part))
    index.upsert(
        vectors=part,
        namespace="ns1"
    )



In [None]:
# Quick look at the first few level-2 vectors.
lvl2_vectors[:5]

In [None]:

query_results_all = []  # NOTE(review): never read by any visible cell — confirm before removing

# Smoke-test the index by querying it with the first level-2 vector only.
if len(lvl2_vectors) == 0:
    raise ValueError("lvl2_vectors is empty; there is nothing to query with")

# lvl2_vectors is a NumPy array, so its components are converted to plain
# Python floats before being sent in the query.
vector = [float(component) for component in lvl2_vectors[0]]
query_results1 = index.query(
    namespace="ns1",
    vector=vector,
    top_k=10,
    include_values=True
)

print(query_results1)

In [None]:
from tqdm import tqdm

def get_all_scores(vectors, top_k=10):
    """Query the index with every vector and collect ``1 - score`` for each match.

    Each vector is sent as a query to namespace "ns1" of the module-level
    ``index``. For every returned match the value ``1 - match.score`` is
    recorded. With the cosine metric configured when the index is created in
    this notebook, that is presumably a cosine distance, so smaller means more
    similar.

    Args:
        vectors: Iterable of numeric vectors; each component is cast to ``float``.
        top_k: Number of nearest matches requested per query vector.

    Returns:
        A flat list of ``1 - score`` values: the results for the first vector,
        then the second, and so on. There are up to ``top_k`` entries per vector.
    """
    query_results = []
    for vector in tqdm(vectors):
        query_vector = [float(component) for component in vector]
        response = index.query(
            namespace="ns1",
            vector=query_vector,
            top_k=top_k,
            # Only match.score is read below, so the stored vector values are
            # not requested.
            include_values=False
        )
        query_results.extend(1 - match.score for match in response.matches)

    return query_results


In [None]:
%%time

all_scores = get_all_scores(lvl2_vectors[:1000])
len(all_scores)

In [None]:
# Min-max normalize the scores into [0, 1]. The minimum and the span are
# computed once up front instead of once per element.
if all_scores:
    _score_min = min(all_scores)
    _score_span = max(all_scores) - _score_min
    all_scores_normalized = [
        # When every score is identical the span is zero; map everything to
        # 0.0 rather than dividing by zero.
        (score - _score_min) / _score_span if _score_span else 0.0
        for score in all_scores
    ]
else:
    # Nothing to normalize.
    all_scores_normalized = []


In [None]:
import matplotlib.pyplot as plt

# Draw the histogram of the normalized scores on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(all_scores_normalized, bins=1000, color='blue', alpha=0.7)
# Restrict the visible x-range to a narrow window around zero.
ax.set_xlim(-0.001, 0.001)
ax.set_title('Histogram of All Scores')
ax.set_xlabel('Scores')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()
