In [2]:
import pandas as pd
import numpy as np
import math
import torch
from torch import Tensor
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import psycopg2
from nltk.tokenize import word_tokenize
from pgvector.psycopg2 import register_vector


In [3]:


df = pd.read_csv('/Users/shirinwadood/Desktop/projects/SkinBot/RAG/preprocessing_data/preproc_articles.csv')


## 1. Chunk the articles

In [4]:

def chunk_document(df, max_tokens, token_overlap):
    """
    Splits a document into chunks each containing at most max_tokens tokens.
    """
    chunk_list = []

    for i in range(len(df.index)):
        text = df['articles'][i]
        tokens = word_tokenize(text)
        num_tokens = len(tokens)
        start = 0
        while start < num_tokens: 
            end = min(start + max_tokens, num_tokens)
            current_chunk = tokens[start:end]
            chunk_list.append((' '.join(current_chunk)))  # Store document index along with the chunk
            start += max_tokens - token_overlap # Increment start by at least max_tokens - token_overlap

    return chunk_list

chunks_list = chunk_document(df, max_tokens=300, token_overlap=40)


## 2. Get the embeddings of the chunks

In [5]:
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
model = AutoModel.from_pretrained("thenlper/gte-small")

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def get_embeddings(input_texts: list[str]) -> str:
    """
    Get embeddings for a list of input texts.

    Args:
        input_texts (List[str]): List of input texts.

    Returns:
        str: JSON string representing the embeddings.
    """
    # Tokenize the input texts
    batch_dict = tokenizer(input_texts, max_length=120, padding=True, truncation=True, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**batch_dict)

    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)

    embeddings_list = embeddings.numpy().tolist()

    return embeddings_list



In [6]:
embedded_df = pd.DataFrame(columns=['Chunk', 'Embedding'])

# Iterate through the chunked articles and get their embeddings
for chunk in chunks_list:
    embeddings = get_embeddings([chunk])
    embedded_df.loc[len(embedded_df)] = {'Chunk': chunk, 'Embedding': embeddings[0]}


print(embedded_df.head())

                                               Chunk  \
0  can i keep my skin looking young and healthy w...   
1  them . outcome are often based on a subjective...   
2  ’ re pregnant or breastfeeding.check the produ...   
3  forget to apply sunscreen to your chest and ha...   
4  not healing well . if you notice any change , ...   

                                           Embedding  
0  [-0.03198978677392006, 0.007969328202307224, 0...  
1  [-0.05294744297862053, 0.011631992645561695, 0...  
2  [-0.03757018595933914, -0.028628941625356674, ...  
3  [-0.029223192483186722, -0.008009245619177818,...  
4  [-0.04224391281604767, -0.030831916257739067, ...  


In [7]:

embedded_df.to_csv('embedded_articles.csv', index=False)

embedded_df.head()

Unnamed: 0,Chunk,Embedding
0,can i keep my skin looking young and healthy w...,"[-0.03198978677392006, 0.007969328202307224, 0..."
1,them . outcome are often based on a subjective...,"[-0.05294744297862053, 0.011631992645561695, 0..."
2,’ re pregnant or breastfeeding.check the produ...,"[-0.03757018595933914, -0.028628941625356674, ..."
3,forget to apply sunscreen to your chest and ha...,"[-0.029223192483186722, -0.008009245619177818,..."
4,"not healing well . if you notice any change , ...","[-0.04224391281604767, -0.030831916257739067, ..."


In [8]:

print('Connecting to PostgreSQL...')
conn = psycopg2.connect("host=localhost dbname=ragdb user=myusername password=mypassword")
    
cur = conn.cursor()

print('Successfully connected to PostgreSQL.')

cur.execute("CREATE EXTENSION IF NOT EXISTS vector");
conn.commit()


Connecting to PostgreSQL...
Successfully connected to PostgreSQL.


## 3. Setup the Database to save the embeddings

In [9]:
register_vector(conn)


In [11]:

table_create_command = """
CREATE TABLE IF NOT EXISTS ragdb (
    id bigserial primary key, 
    Chunk text,
    Embedding vector(384)
);
"""

# Open a cursor
with conn.cursor() as cur:
    # Execute the table creation command
    cur.execute(table_create_command)

conn.commit()

data_list = [(row['Chunk'], np.array(row['Embedding'])) for index, row in embedded_df.iterrows()]
sql_query = 'INSERT INTO ragdb (Chunk, Embedding) VALUES (%s, %s)'


# Open a cursor again
with conn.cursor() as cur:
    try:
        # Executing the SQL query with the data
        cur.executemany(sql_query, data_list)
        
        # Committing the changes to the database
        conn.commit()

    except Exception as e:
        # Handle exceptions (print, log, etc.)
        print(f"An error occurred: {e}")




In [12]:
with conn.cursor() as cur:

    cur.execute("SELECT COUNT(*) as cnt FROM ragdb;")
    num_records = cur.fetchone()[0]
    print("Number of vector records in table: ", num_records,"\n")

Number of vector records in table:  17 



In [13]:
with conn.cursor() as cur:

    # print the first record in the table, for sanity-checking
    cur.execute("SELECT * FROM ragdb LIMIT 1;")
    records = cur.fetchall()
    print("First record in table: ", records)


First record in table:  [(204, 'can i keep my skin looking young and healthy without paying for expensive procedure ? a : after doing a deep dive of the medical literature , my advice is to follow a simple routine : live well every day with tip and guidance on food , fitness and mental health , delivered to your inbox every thursday.arrowrightin the morning : use a face cleanser , then apply a moisturizer and a broad-spectrum sunscreen.at night : cleanse your face again , then apply a retinoid and your moisturizer.the brand doesn ’ t necessarily matter . chose what work for your budget and skin type — such a sensitive , dry or oily.skip to end of carouselstart the year freshkatty huertas/the poststart with practical tip and smart solution for your health , technology , travel , food , money , home and more . easy win , good habit , better living . elevate your daily life with expertise from the washington post.find it all here.end of carouselbe skeptical of social medium and adslook , 

## 4. Create an index on the data for faster retrieval


In [14]:

#calculate the index parameters according to best practices
num_lists = num_records / 1000
if num_lists < 10:
   num_lists = 10
if num_records > 1000000:
   num_lists = math.sqrt(num_records)
with conn.cursor() as cur:
   try:


      #use the cosine distance measure, which is what we'll later use for querying
      cur.execute(f'CREATE INDEX ON ragdb USING ivfflat (Embedding vector_cosine_ops) WITH (lists = {num_lists});')

      conn.commit()
   except Exception as e:
        conn.rollback()
        print(f"An error occurred: {e}")
      


## 5. Retreive the relevant context from PostgreSQL

In [15]:
def get_top3_similar_docs(query):
    batch_dict = tokenizer(query, max_length=512, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

    # (Optionally) normalize embeddings
    query_embeddings = F.normalize(embeddings, p=2, dim=1)
    query_embeddings = embeddings.numpy().tolist()
    query_embeddings = np.array(embeddings).flatten()

    # Register pgvector extension
    cur = conn.cursor()
    # cur.execute("SELECT content FROM embeddings ORDER BY embedding <=> %s LIMIT 3", (embeddings,))
    cur.execute("SELECT Chunk, 1- (Embedding <=> %s) AS cosine_similarity FROM ragdb ORDER BY cosine_similarity DESC LIMIT 1", (query_embeddings,))
    top3_docs = cur.fetchall()
    return top3_docs


In [16]:
related_topic = get_top3_similar_docs('when should i use suncreen"')
related_topic

[('. by now , you should know the line well : apply sunscreen every day , even when it ’ s gray or cold , even when you ’ re covered up . when you are exposed , reapply every two hour . make sure your sunscreen protects against both uva and uvb ray . if skin cancer and sun damage aren ’ t enough to convince you , uv exposure is also thenumber onecause of wrinkle , uneven skin tone , loss of firmness and aging signs.exfoliationhere ’ s a product you might not need or want to apply every day . if you have dry skin , including winter-air-induced dry skin , you may exfoliate more than usual , but you should still keep it to once or twice a week – max . exfoliation can be used after cleanser but before moisturizer , a it help to remove flaky skin by increasing skin cell turnover . the benefit are real – removing dead skin and buildup for smoother skin and clearer pore – but most dermatologist will recommend chemical exfoliants over scrub to prevent damage to the protective barrier of your s

## 6. Generate the answer using LLM

In [34]:
tokenizer = AutoTokenizer.from_pretrained("llmware/bling-stable-lm-3b-4e1t-v0", trust_remote_code = True) 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
import torch
import transformers
from transformers import AutoModelForCausalLM  
import time


start_time = time.time()

# Load the tokenizer and model
llm_model = "llmware/bling-stable-lm-3b-4e1t-v0"

model = AutoModelForCausalLM.from_pretrained(llm_model, torch_dtype=torch.bfloat16,trust_remote_code = True)

# If there's a GPU available...
if torch.cuda.is_available():    

    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

entries = {}
entries['context'] = related_topic[0][0]
entries['query'] = "when should i use suncreen"

# prepare prompt packaging used in fine-tuning process
new_prompt = "<human>: " + entries["context"] + "\n" + entries["query"] + "\n" + "<bot>:"

inputs = tokenizer(new_prompt, return_tensors="pt")  
start_of_output = len(inputs.input_ids[0])
max_new_tokens = 100
# config = transformers.GenerationConfig() 
# config.eos_token_id = tokenizer.eos_token_id
outputs = model.generate(
        inputs.input_ids.to(device),
        attention_mask=inputs.attention_mask.to(device),
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.1,
        max_new_tokens=max_new_tokens,
        )

output_only = tokenizer.decode(outputs[0][start_of_output:],skip_special_tokens=True)
end_time = time.time()
elapsed_time = end_time - start_time

output_only

No GPU available, using the CPU instead.


' Every day, even when it is cloudy or cold, even when you are covered up.'

In [36]:
output_df = pd.DataFrame(columns=['Model', 'Output', 'Max_Tokens', 'time'])


In [39]:

output_df.loc[len(output_df)] = {'Model': llm_model, 'Output': output_only, 'Max_Tokens': max_new_tokens,'time': elapsed_time }


In [40]:
output_df

Unnamed: 0,Model,Output,Max_Tokens,time
0,llmware/bling-stable-lm-3b-4e1t-v0,"Every day, even when it is cloudy or cold, ev...",100,927.570463
1,llmware/bling-stable-lm-3b-4e1t-v0,"Every day, even when it is cloudy, even when ...",200,1403.333939
