In [13]:
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = logging.FileHandler('Logs.log')
console_handler = logging.StreamHandler()
file_handler.setLevel(logging.DEBUG)
console_handler.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(message)s - Line: %(lineno)d', datefmt='%Y-%m-%d %H:%M:%S')
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)

logger.debug("This is a debug message")
logger.info("This is an info message")
logger.warning("This is a warning message")
logger.error("This is an error message")
logger.critical("This is a critical message")

2024-08-10 11:23:50 - This is a debug message - Line: 16
2024-08-10 11:23:50 - This is an info message - Line: 17
2024-08-10 11:23:50 - This is an error message - Line: 19
2024-08-10 11:23:50 - This is a critical message - Line: 20


# Data Scraping

In [14]:
######### a new method that could work on heroku
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import by
import time
import json
from bs4 import BeautifulSoup
from crewai import Agent, Task
from langchain.tools import tool


class BrowserTools:
    @tool("Scrape website content")
    def scrape_and_summarize_website(website):
        """Useful to scrape and summarize a website content"""

        # Set up Selenium with headless Chrome
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        driver = webdriver.Chrome(options=chrome_options)
        driver.get(website)

        # Extract content with BeautifulSoup or any other method
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        content = soup.get_text()

        driver.quit()

        # Further processing and summarization as needed
        content = [content[i:i + 8000] for i in range(0, len(content), 8000)]

        summaries = []
        for chunk in content:
            agent = Agent(
                role='Principal Researcher',
                goal='Do amazing research and summaries based on the content you are working with',
                backstory="You're a Principal Researcher at a big company and you need to do research about a given topic.",
                allow_delegation=False)

            task = Task(
                agent=agent,
                description=f'Analyze and summarize the content below, make sure to include the most relevant information in the summary, return only the summary nothing else.\n\nCONTENT\n----------\n{chunk}',
                expected_output='A summarized report of the content provided.'
            )

            summary = task.execute()
            summaries.append(summary)

        return "\n\n".join(summaries)



ImportError: cannot import name 'by' from 'selenium.webdriver.common.by' (c:\Users\NajibS\.conda\envs\RAG_env\Lib\site-packages\selenium\webdriver\common\by.py)

# Transformer Network

In [15]:
# MULTIHEAD ATTENTION 
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        try:
            assert d_model % num_heads == 0
        except Exception as e:
            logger.error("dimension of the embedding model is not divisable by number of heads")
        
        self.d_models = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        # The query, key, value learnable matrices
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)

        self.FCLayer = nn.Linear(d_model, d_model)
    def split_embedding_perHead(self,x):
        # x shape is (batch_size, seq_len, d_model)
        (batch_size, seq_len, d_model) = x.shape
        # logger.info(f"multi-head; x-shape: {x.shape}")
        # let's reshape to (batch_size, seq_len, num_heads, depth)
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        logger.info(f"Multi-head; x reshaped: {x.shape} ")
        # changing the dimensions order to:(batch_size, num_heads, seq_len, depth)
        x = x.permute(0,2,1,3)
        return x
    
    def cal_attention(self,q,k,v,mask):
        qk = torch.matmul(q, k.permute(0,1,3,2))
        dk=torch.tensor(k.shape[-1], dtype=torch.float32)
        #dk is a tensor scalar!
        attention = qk/torch.sqrt(dk)
        if mask is not None:
            attention += (mask*-1e9)
        attention_weights = F.softmax(attention, dim=1)
        output = torch.matmul(attention_weights, v)
        return output, attention_weights
    
    def forward(self, v,k,q,mask):
        batch_size = q.shape[0]
        q = self.split_embedding_perHead(self.Wq(q))
        k = self.split_embedding_perHead(self.Wk(k))
        v = self.split_embedding_perHead(self.Wv(v))

        attention,atten_weights = self.cal_attention(q,k,v,mask)
        attention = attention.permute(0,2,1,3).contiguous()
        attention = attention.reshape(batch_size, -1, self.d_models)

        output = self.FCLayer(attention)
        return output


In [78]:
# THE ENCODER LAYER
class EncoderLayer(nn.Module):
    def __init__(self,d_model, num_heads,dff):
        super(EncoderLayer,self).__init__()
        self.MultiHAttention = MultiHeadAttention(d_model, num_heads)
        self.FeedForwardNN = nn.Sequential(
            nn.Linear(d_model,dff),
            nn.ReLU(),
            nn.Linear(dff,d_model)

        )
        self.layerNorm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layerNorm2 = nn.LayerNorm(d_model, eps=1e-6)
    def forward(self,x,mask):
        # logger.info("multihead encoder initated")
        attn_output = self.MultiHAttention(x,x,x,mask)
        output1 = self.layerNorm1(x+attn_output)
        output2 = self.FeedForwardNN(output1)
        output3 = self.layerNorm2(output1+ output2)
        return output3

In [100]:
# THE DECODER LAYER
class DecoderLayer(nn.Module):
    def __init__(self,d_model, num_heads, dff):
        super(DecoderLayer,self).__init__()
        self.MultiHAttention1 = MultiHeadAttention(d_model, num_heads)
        self.MultiHAttention2 = MultiHeadAttention(d_model, num_heads)
        self.FeedForwardNN = nn.Sequential(
            nn.Linear(d_model,dff),
            nn.ReLU(),
            nn.Linear(dff,d_model)

        )
        self.layerNorm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layerNorm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layerNorm3 = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        attn_output1 = self.MultiHAttention1(x,x,x,look_ahead_mask)
        attn_output1 = self.layerNorm1(x+attn_output1)
        logger.info(f"decoder input into second multihead attention layer:{attn_output1.shape}")
        attn_output2 = self.MultiHAttention2(enc_output, enc_output,attn_output1, padding_mask)
        attn_output2 = self.layerNorm2(attn_output2+attn_output1)

        Feedforward_output = self.FeedForwardNN(attn_output2)
        final_output = self.layerNorm3(attn_output2+Feedforward_output)
        return final_output

In [101]:
# THE ENCODER AND DECODER
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff):
        super(Encoder,self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        # self.embedding = nn.Embedding(input_vocab_size, d_model)
        # self.pos_encoding = self.positional_encoding(maximum_position_encoding, d_model)
        self.enc_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, dff) for _ in range(num_layers)])

    # def positional_encoding(self, position, d_model):
    #     angle_rads = self.get_angles(np.arange(position)[:,np.newaxis], np.arange(d_model)[np.newaxis,:], d_model)
    #     angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    #     angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    #     pos_encoding = angle_rads[np.newaxis, ...]
    #     return torch.tensor(pos_encoding, dtype=torch.float32)
    
    # def get_angles(self, pos, i, d_model):
    #     angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    #     return pos * angle_rates
    
    def forward(self, x):
        # the multihead attention class will require x to be of shape
        # batch, seq, d_model but for properties, there is no sequence so
        # will add an additional dimension of length 1 for seq
        x = x.unsqueeze(1) 
        logger.info(f"embedding input type: {type(x)}")
        logger.info(f"embedding input shape: {x.shape}")
        # x += self.pos_encoding[:, :seq_len, :]
        logger.info("encoder layer initiated")
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, mask=None)

        logger.info("encoder layer done")

        return x

class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = nn.Embedding(target_vocab_size, d_model) # d_model is the size of embedding vector
        self.pos_encoding = self.positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dff) for _ in range(num_layers)])

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return torch.tensor(pos_encoding, dtype=torch.float32)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(1000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        logger.info(f"decoder input shape to the embedding: {x.shape}")
        seq_len = x.size(1)
        x = self.embedding(x)
        logger.info(f"decoder input shape after embedding: {x.shape}")
        x *= torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))
        x += self.pos_encoding[:, :seq_len, :]

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output, look_ahead_mask, padding_mask)
        return x


In [102]:
# TRANSFORMER

class Transformer(nn.Module):
    def __init__(self,num_layers, enc_d_model, dec_d_model,
                enc_num_heads, dec_num_heads, enc_dff, 
                dec_dff, target_vocab_size, pe_target):
        super(Transformer, self).__init__()

        # self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size)
        self.encoder = Encoder(num_layers, enc_d_model, enc_num_heads, enc_dff)
        self.decoder = Decoder(num_layers, dec_d_model, dec_num_heads, dec_dff, target_vocab_size, pe_target)
        self.final_layer = nn.Linear(dec_d_model, target_vocab_size)

    def forward(self, properties, target, look_ahead_mask, dec_padding_mask):
        logger.info("ENCODER STARTED")
        enc_output = self.encoder(properties)
        logger.info("ENCODER COMPLETED")
        logger.info(f"encoder output dimensions:{enc_output.shape}")
        logger.info("DECODER STARTED")
        dec_output = self.decoder(target, enc_output, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output

# Data preprocessing

In [103]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class MoleculeDataset(Dataset):
    def __init__(self, properties, smiles):
        self.properties = properties
        self.smiles = smiles

    def __len__(self):
        return len(self.properties)

    def __getitem__(self, idx):
        return torch.tensor(self.properties[idx], dtype=torch.float32), torch.tensor(self.smiles[idx], dtype=torch.long)


def preprocess_data(csv_file):
    data = pd.read_csv(csv_file)
    properties = data[['polararea', 'complexity', 'heavycnt', 'hbonddonor', 'hbondacc']].values
    smiles = data['isosmiles'].values
    print("length of smiles: ", smiles.shape)
    # print(f"smiles: {smiles}")

    # print(f"properties: {properties}")
    
    # Normalize properties
    scaler = StandardScaler()
    properties = scaler.fit_transform(properties)
    
    # Convert SMILES to a list of character indices
    # only unique characters remain
    # this is for creating a vocab to use to enumerate the smiles notation
    char_to_idx = {char: idx + 1 for idx, char in enumerate(sorted(set(''.join(smiles))))}

    print(char_to_idx)
    # reversing the index to character
    idx_to_char = {idx: char for char, idx in char_to_idx.items()}
    
    max_smiles_len = max(len(s) for s in smiles)
    print("max smiles length: ", max_smiles_len)
    smiles_indices = [[char_to_idx[char] for char in smi] + [0] * (max_smiles_len - len(smi)) for smi in smiles]

    # testing the smiles indices code
    print("smiles length:",len(smiles_indices))
    for smile_i in smiles_indices:
        print("smiles example: ", smile_i)
        print("smiles example length: ", len(smile_i))
        break

    return properties, smiles_indices, char_to_idx, idx_to_char, scaler

properties, smiles_indices, char_to_idx, idx_to_char, scaler = preprocess_data('pubchem.csv')

train_props, test_props, train_smiles, test_smiles = train_test_split(properties, smiles_indices, test_size=0.2, random_state=42)

train_dataset = MoleculeDataset(train_props, train_smiles)
test_dataset = MoleculeDataset(test_props, test_smiles)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


length of smiles:  (5837,)
{'#': 1, '%': 2, '(': 3, ')': 4, '+': 5, '-': 6, '.': 7, '/': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, '=': 19, '@': 20, 'A': 21, 'B': 22, 'C': 23, 'F': 24, 'G': 25, 'H': 26, 'I': 27, 'K': 28, 'L': 29, 'M': 30, 'N': 31, 'O': 32, 'P': 33, 'R': 34, 'S': 35, 'T': 36, 'U': 37, 'V': 38, 'W': 39, 'Y': 40, 'Z': 41, '[': 42, '\\': 43, ']': 44, 'a': 45, 'b': 46, 'c': 47, 'd': 48, 'e': 49, 'f': 50, 'g': 51, 'h': 52, 'i': 53, 'l': 54, 'm': 55, 'n': 56, 'o': 57, 'p': 58, 'r': 59, 's': 60, 't': 61, 'u': 62}
max smiles length:  426
smiles length: 5837
smiles example:  [23, 23, 3, 23, 31, 4, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [104]:
def create_padding_mask(seq):
    seq_masked = torch.tensor(seq) == 0 # True if value is 0 otherwise false
    return seq_masked.unsqueeze(1).unsqueeze(2) 

def create_look_ahead_mask(size):
    # creating an upper triangle of 1s
    mask = torch.triu(torch.ones((size, size)), diagonal=1) 
    return mask.unsqueeze(0).unsqueeze(1)


In [105]:
import torch.optim as optim
import torch.nn as nn

def loss_function(real, pred):
    mask = real != 0
    loss_ = nn.CrossEntropyLoss()(pred.transpose(1, 2), real)
    
    mask = mask.float()
    loss_ *= mask

    return torch.mean(loss_)

def train_model(transformer, train_loader, num_epochs, learning_rate):
    optimizer = optim.Adam(transformer.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        transformer.train()
        total_loss = 0
        
        for idx_num, (properties, smiles) in enumerate(train_loader):
   
            print("properties: ", properties.shape)
            print("smiles", smiles.shape)
            properties = properties.to(device)
            smiles = smiles.to(device)
            # print("target: ",smiles.shape)
            # print("input:", smiles.shape)
            # print("smiles before masking: ",smiles)
            
            # print("smiles after masking", enc_padding_mask)
            print("look ahead dimension:", smiles.size(1))
            look_ahead_mask = create_look_ahead_mask(smiles.size(1))
            dec_padding_mask = create_padding_mask(smiles)
            
            optimizer.zero_grad()
            predictions = transformer(properties, smiles, look_ahead_mask, dec_padding_mask)
            loss = loss_function(smiles[:, 1:], predictions[:, :-1])
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}, Loss: {total_loss / (idx_num + 1)}')

# Initialize the model
target_vocab_size = len(char_to_idx) + 1  # +1 for padding token
num_layers = 4
enc_d_model = 128 # number of properties
dec_d_model = 128
enc_num_heads = 1
dec_num_heads = 8
enc_dff = 128 # dimension of the feed forward layer
dec_dff = 128 
pe_target = 1000 # positional encoding

transformer = Transformer(num_layers, enc_d_model, dec_d_model,
                          enc_num_heads, dec_num_heads, enc_dff, 
                          dec_dff, target_vocab_size, pe_target)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transformer = transformer.to(device)

# Train the model
train_model(transformer, train_loader, num_epochs=20, learning_rate=0.001)


  seq_masked = torch.tensor(seq) == 0 # True if value is 0 otherwise false
2024-08-12 10:00:05 - ENCODER STARTED - Line: 15
2024-08-12 10:00:05 - embedding input type: <class 'torch.Tensor'> - Line: 27
2024-08-12 10:00:05 - embedding input shape: torch.Size([32, 1, 5]) - Line: 28
2024-08-12 10:00:05 - encoder layer initiated - Line: 30
2024-08-12 10:00:05 - multihead encoder initated - Line: 15
2024-08-12 10:00:05 - multi-head; x-shape: torch.Size([32, 1, 5]) - Line: 28
2024-08-12 10:00:05 - Multi-head; x reshaped: torch.Size([32, 1, 1, 5])  - Line: 31


2024-08-12 10:00:05 - multi-head; x-shape: torch.Size([32, 1, 5]) - Line: 28
2024-08-12 10:00:05 - Multi-head; x reshaped: torch.Size([32, 1, 1, 5])  - Line: 31
2024-08-12 10:00:05 - multi-head; x-shape: torch.Size([32, 1, 5]) - Line: 28
2024-08-12 10:00:05 - Multi-head; x reshaped: torch.Size([32, 1, 1, 5])  - Line: 31
2024-08-12 10:00:05 - multihead encoder initated - Line: 15
2024-08-12 10:00:05 - multi-head; x-shape: torch.Size([32, 1, 5]) - Line: 28
2024-08-12 10:00:05 - Multi-head; x reshaped: torch.Size([32, 1, 1, 5])  - Line: 31
2024-08-12 10:00:05 - multi-head; x-shape: torch.Size([32, 1, 5]) - Line: 28
2024-08-12 10:00:05 - Multi-head; x reshaped: torch.Size([32, 1, 1, 5])  - Line: 31
2024-08-12 10:00:05 - multi-head; x-shape: torch.Size([32, 1, 5]) - Line: 28
2024-08-12 10:00:05 - Multi-head; x reshaped: torch.Size([32, 1, 1, 5])  - Line: 31
2024-08-12 10:00:05 - multihead encoder initated - Line: 15
2024-08-12 10:00:05 - multi-head; x-shape: torch.Size([32, 1, 5]) - Line: 2

properties:  torch.Size([32, 5])
smiles torch.Size([32, 426])
look ahead dimension: 426
multihead done
multihead done
multihead done
multihead done


2024-08-12 10:00:06 - decoder input into second multihead attention layer:torch.Size([32, 426, 128]) - Line: 20
2024-08-12 10:00:06 - multi-head; x-shape: torch.Size([32, 426, 128]) - Line: 28
2024-08-12 10:00:06 - Multi-head; x reshaped: torch.Size([32, 426, 8, 16])  - Line: 31


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x5 and 128x128)