In [1]:
import pandas as pd

df_drug = pd.read_csv('dataset/train.csv',sep=',')

df_drug.drop(['drug_approved_by_UIC','number_of_times_prescribed','base_score','effectiveness_rating','patient_id'],axis=1)

len(df_drug['name_of_drug'].unique())

len(df_drug['use_case_for_drug'].unique())

# Combine the columns into a single text column called 'combined_text'
df_drug['combined_text'] = df_drug.apply(
    lambda x: f"Drug: {x['name_of_drug']} | Use Case: {x['use_case_for_drug']} | Review: {x['review_by_patient']}",
    axis=1
)


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import faiss

# Load the UAE-Large-V1 model using SentenceTransformer
model_name = "WhereIsAI/UAE-Large-V1"
model = SentenceTransformer(model_name)

In [42]:
df_drug = pd.DataFrame({
    'combined_text': ["Drug A treats disease X", "Drug B prevents condition Y", "Drug C helps with Z"]
})

embedded_data = df_drug['combined_text'].apply(lambda x: model.encode(x))

print('Embedded Data: ', embedded_data)

embedding_matrix = np.array(embedded_data.tolist())

embedding_dim = embedding_matrix.shape[1]
print(f"Embedding matrix shape: {embedding_matrix.shape}")

index = faiss.IndexFlatL2(embedding_dim)

index.add(embedding_matrix)

similarity_matrix = model.similarity(embedding_matrix, embedding_matrix)
print("Pairwise similarity matrix:")
print(similarity_matrix)


Embedded Data:  0    [0.114834286, -0.3201546, 0.6377671, 0.9517594...
1    [-0.4264032, -0.14732005, 0.33011806, 0.543894...
2    [0.14592683, -9.149313e-06, 1.2848724, 0.09866...
Name: combined_text, dtype: object
Embedding matrix shape: (3, 1024)
Pairwise similarity matrix:
tensor([[1.0000, 0.7564, 0.7508],
        [0.7564, 1.0000, 0.7078],
        [0.7508, 0.7078, 1.0000]])


In [43]:
import sqlite3
import numpy as np

# Connect to SQLite database (or create it)
conn = sqlite3.connect("embedded_data.db")
cursor = conn.cursor()

# Create a table for storing embedded data
cursor.execute("""
CREATE TABLE IF NOT EXISTS embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    combined_text TEXT,
    embedding BLOB
)
""")

# Insert embedded data into the table
for i, (text, embedding) in enumerate(zip(embedded_data, embedding_matrix)):
    # Convert NumPy embedding array to bytes
    embedding_bytes = embedding.tobytes()
    # Insert into the database
    cursor.execute("INSERT INTO embeddings (combined_text, embedding) VALUES (?, ?)", (text, embedding_bytes))

# Commit changes and close connection
conn.commit()
conn.close()

print("Embedded data successfully stored in the database.")


Embedded data successfully stored in the database.


In [44]:
# Connect to the SQLite database
conn = sqlite3.connect("embedded_data.db")
cursor = conn.cursor()

# Retrieve all rows from the embeddings table
cursor.execute("SELECT combined_text, embedding FROM embeddings")
rows = cursor.fetchall()

# Convert the binary embeddings back into NumPy arrays
retrieved_texts = []
retrieved_embeddings = []

for row in rows:
    text = row[0]
    embedding = np.frombuffer(row[1], dtype=np.float32)  # Convert bytes to NumPy array
    retrieved_texts.append(text)
    retrieved_embeddings.append(embedding)

# Convert to NumPy matrix
retrieved_embedding_matrix = np.array(retrieved_embeddings)

print("Retrieved Texts: ", retrieved_texts)
print("Retrieved Embedding Matrix Shape: ", retrieved_embedding_matrix.shape)

conn.close()


Retrieved Texts:  [b'=.\xeb=N\xeb\xa3\xbe\xb4D#?\x82\xa6s?\x8a#2\xbfu^G\xbf(\xbb\xea;\x94\xff\xad\xbd\xf8\xa0,\xbc-\x07J?/\x90b>NK\x15\xbf\x00)\x01\xba\x9es\xaa>\xf4\xf1\x97=\x94\xce\x11?\'`\xf2<X\x96r\xbf&\xb6\xca\xbd\xe2\x1b]?\x1a\x05\x82<\xe8\x1d8?\xe8\xe7=\xbfp\xcb\x0e\xbf2i\xbb>W\x8b\x11\xbeZ\xc1\xf9\xbe`\x14\x96\xbe\x9fB\x91?\xef\xbeq?\xa1!\x1f\xbf\xd3\x1b\x12\xbeN?\x1c?\xc6\r\x91\xbe\xc5\x1a\xb3\xbe\x1c\x8f\xe7\xbe\xeeL\xf3>\xd6\x86\x9c\xbf\xde~6\xbd\xd9\xf1\xc8\xbe\xea\xae\x02?\xf0[\x87\xbe\xba\xe2\xc2\xbdQ\xa2\x06\xbf\x80@\x08\xbfJ\xe7\xcb\xbe\xae\x00\\\xbf\xe2\xfe{\xbe$\xdbh?^\x8a\xb3=\xb0B\x80\xbe\xb9J\xf9=y\xb0\x9a>\xa9\x8d\x80>\xab\xb9\x8c\xbe&Eb=\x1b\xaf\xa9>RK\xa8=}dB\xbe K\n?\'\xc4M\xbeQ:L\xbfpGV;\xcb\x8e\xbb\xbf?2\xac?\xbb*U?\x9f\xe0\x80\xbf\xce\x08>\xbf\xac\xb3\xe7>\x8dH\x0f?f%\xe8<C\xf3\xb0>|jG=d\xa5\xeb\xbd\xc4\xea\xeb\xbd4.\xdb\xbe\x8e\r,?\xae\xe9\xfa\xbd\xa7\x8f\x0c?\xbaSR?#R\x9d>\xf8\x0f\xb5>\x91\x1f\xe7\xbd\x95\xc8\xf3>\x1acR\xbf\xf5\xb9\x91?\xf8\x00\xaa>\xc5\xf