In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Location of the CancerEMO "Joy" CSV; kept in one place so it is easy to repoint.
JOY_CSV_PATH = 'D:/Backup/A-collection-27-08-2024/AL_ML_data Science with python/Dataset/CancerEMO/Joy_anon.csv'
df_joy = pd.read_csv(JOY_CSV_PATH)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Hugging Face checkpoint used for both the tokenizer and the encoder.
PUBMEDBERT_CHECKPOINT = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

tokenizer = AutoTokenizer.from_pretrained(PUBMEDBERT_CHECKPOINT)
model = AutoModel.from_pretrained(PUBMEDBERT_CHECKPOINT)

# Put the encoder in evaluation mode; as the last expression of the cell the
# returned module is also displayed.
model.eval()

In [None]:
from langchain.embeddings.base import Embeddings

class PubMedBERTEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a Hugging Face encoder.

    Each text is tokenized (truncated to at most 512 tokens), run through the
    encoder with gradient tracking disabled, and represented by the mean of
    the final-layer token vectors.

    The tokenizer and encoder are captured when the instance is created.
    Other cells in this notebook rebind the notebook-level names ``tokenizer``
    and ``model`` to different objects; resolving those globals on every call
    would make retrieval silently switch tokenizer/encoder (or fail) once
    those cells have run.

    Args:
        hf_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer`` as it is bound at construction time.
        hf_model: Encoder to use. Defaults to the notebook-level ``model`` as
            it is bound at construction time.
    """

    def __init__(self, hf_tokenizer=None, hf_model=None):
        # Bind once so later rebinding of the globals cannot affect this object.
        self._tokenizer = tokenizer if hf_tokenizer is None else hf_tokenizer
        self._model = model if hf_model is None else hf_model

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per entry in ``texts``."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) for a single query text."""
        return self._embed(text)

    def _embed(self, text):
        """Encode ``text`` and average the last hidden state over dim 1."""
        inputs = self._tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )
        with torch.no_grad():
            outputs = self._model(**inputs)

        # Mean over dim=1 (presumably the token axis for a single input
        # string), then drop the remaining size-1 axes to get a flat vector.
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings.squeeze().numpy().tolist()

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Source PDF that supplies the retrieval corpus.
PDF_PATH = 'D:/oken1961.pdf'

loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

# Split the loaded documents into 100-character chunks with a 20-character
# overlap, trying the separators in the order listed.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=100,
    chunk_overlap=20,
)
chunks = splitter.split_documents(docs)

In [None]:
from langchain_community.vectorstores import FAISS

# Embed every chunk with the PubMedBERT adapter and index the vectors in FAISS.
embeddings = PubMedBERTEmbeddings()
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

In [None]:
# Number of chunks returned per query.
RETRIEVER_TOP_K = 4

retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": RETRIEVER_TOP_K},
)

In [None]:
def retrieve_context(query):
    """Return the ``page_content`` of every document the retriever yields for ``query``."""
    return [hit.page_content for hit in retriever.invoke(query)]

In [None]:
# Retrieve context for every sentence; values are cast to str before querying.
sentences = df_joy['Sentence'].astype(str).tolist()
contexts = [retrieve_context(sentence) for sentence in sentences]

In [None]:
import re
# Punctuation that is turned into a word separator. Raw string (no invalid
# escape sequences), hyphen escaped so it is a literal rather than a range,
# and '=' placed inside the class so every listed character matches on its own.
REPLACE_BY_SPACE = re.compile(r'[/(){}\[\]|,;&\-_=]')

def preprocess_text(text):
    """Clean a piece of free text and return it as a single-spaced string.

    Steps, in order:
      1. remove URLs (``http...``), ``@mentions`` and ``#hashtags``;
      2. expand negative contractions ("won't" -> "will not",
         "can't" -> "can not", any other "n't" -> " not");
      3. replace the punctuation in ``REPLACE_BY_SPACE`` with a space;
      4. split on whitespace and strip characters outside U+0020-U+007F
         from each word;
      5. drop words of two characters or fewer and the literal word "amp".

    Args:
        text: the string to clean.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\@\S+", "", text)
    text = re.sub(r"#\S+", "", text)
    # Irregular contractions must be handled before the generic "n't" rule,
    # otherwise they degrade to "wo not" / "ca not".
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = REPLACE_BY_SPACE.sub(' ', text)
    # Non-ASCII removal happens per word so the length filter sees the
    # cleaned form.
    words = (re.sub(r'[^\u0020-\u007F]+', '', word) for word in text.split())
    return ' '.join(word for word in words if len(word) > 2 and word != 'amp')

In [None]:
# Collapse each row's list of retrieved chunks into one space-separated string.
df_joy['context'] = [' '.join(chunk_texts) for chunk_texts in contexts]

In [None]:
# Display the first rows of df_joy as a quick visual check.
df_joy.head()

In [None]:
# Run preprocess_text over every retrieved context string; this overwrites
# the 'context' column in place.
df_joy['context'] = df_joy['context'].apply(preprocess_text)

In [None]:
# Build the model input as "[SENT] <sentence> [CTX] <context>".
# 'Sentence' is cast to str (the same cast used when the sentences were sent
# to the retriever) so a missing or non-string value cannot turn the whole
# concatenated text into NaN and break tokenization later.
df_joy['text'] = (
    "[SENT] " + df_joy['Sentence'].astype(str) +
    " [CTX] " + df_joy['context']
)

In [None]:
# Partition on the 'Split' column; going by the variable names, code 0 is the
# training set, 1 the validation set and 2 the test set.
train_df, val_df, test_df = (
    df_joy[df_joy['Split'] == split_code] for split_code in (0, 1, 2)
)

In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "bert-base-uncased"

# NOTE: this rebinds the notebook-level name `tokenizer`, which an earlier
# cell bound to the PubMedBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(texts):
    """Tokenize ``texts`` (anything exposing ``.tolist()``) as one batch.

    Sequences are truncated to 256 tokens and padded to the longest sequence
    in the batch; the tokenizer is asked for TensorFlow tensors.
    """
    batch = texts.tolist()
    return tokenizer(
        batch,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )

# Tokenize the 'text' column of each split, in train / val / test order.
train_enc, val_enc, test_enc = (
    tokenize(frame['text']) for frame in (train_df, val_df, test_df)
)

In [None]:
import tensorflow as tf
import keras_nlp

# keras_nlp preset shared by the preprocessor and the backbone.
BERT_PRESET = "bert_base_en"

# Scalar string input: the model takes raw text.
text_input = tf.keras.Input(shape=(), dtype=tf.string)

preprocessor = keras_nlp.models.BertPreprocessor.from_preset(BERT_PRESET)
encoder = keras_nlp.models.BertBackbone.from_preset(BERT_PRESET)

# text -> preprocessor -> backbone; the backbone output is indexed by key.
x = encoder(preprocessor(text_input))
cls = x["pooled_output"]

# Single sigmoid unit on top of the pooled output.
output = tf.keras.layers.Dense(1, activation="sigmoid")(cls)

# NOTE: this rebinds the notebook-level name `model`.
model = tf.keras.Model(text_input, output)
model.summary()


In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Checkpoint used for both the PyTorch tokenizer and encoder.
BERT_CHECKPOINT = "bert-base-uncased"

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE: this rebinds the notebook-level name `tokenizer`.
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = BertModel.from_pretrained(BERT_CHECKPOINT)
bert = bert.to(device)


In [None]:
import pandas as pd
from torch.utils.data import Dataset

class JoyDataset(Dataset):
    """Dataset pairing each row's ``"text"`` with its ``"Joy"`` label.

    Args:
        df: frame providing the ``"text"`` and ``"Joy"`` columns.
        tokenizer: callable invoked once per item with the text plus the
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keywords; its result is indexed by ``"input_ids"`` and
            ``"attention_mask"``.
        max_len: value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Materialise both columns as plain lists for positional lookup.
        self.texts = df["text"].tolist()
        self.labels = df["Joy"].tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` on demand and return its tensors and label."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # squeeze(0) drops a leading size-1 axis (presumably the batch axis
        # the tokenizer adds when asked for "pt" tensors).
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [None]:
from torch.utils.data import DataLoader

# Examples per batch for both loaders.
BATCH_SIZE = 16

# Rows with Split == 0 feed training, rows with Split == 1 feed validation.
train_df = df_joy.loc[df_joy["Split"] == 0]
val_df = df_joy.loc[df_joy["Split"] == 1]

train_ds = JoyDataset(train_df, tokenizer)
val_ds = JoyDataset(val_df, tokenizer)

# Only the training loader shuffles.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)


In [None]:
import torch.nn as nn

class BertClassifier(nn.Module):
    """Classification head on top of the notebook-level ``bert`` encoder.

    The hidden state at sequence position 0 is passed through dropout and a
    linear layer that produces ``n_classes`` logits.

    Args:
        n_classes: number of output logits (default 2).
    """

    def __init__(self, n_classes=2):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so swapping in an encoder with a different width keeps working.
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits for a batch of token ids and attention masks."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Take the vector at sequence position 0 for every item in the batch.
        cls = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls)
        return self.fc(x)


In [None]:
from torch.optim import AdamW

# Step size for AdamW.
LEARNING_RATE = 2e-5

# NOTE: this rebinds the notebook-level name `model` to the PyTorch classifier.
model = BertClassifier(n_classes=2)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

def train_epoch(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the notebook-level ``optimizer``, ``criterion`` and ``device``.
    """
    model.train()
    running_loss = 0

    for batch in loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


In [None]:
def eval_epoch(model, loader):
    """Return the fraction of examples in ``loader`` that ``model`` labels correctly.

    Runs in eval mode with gradient tracking disabled; uses the notebook-level
    ``device``.
    """
    model.eval()
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            # Predicted class = index of the largest logit per row.
            predicted = model(ids, mask).argmax(dim=1)

            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    return n_correct / n_seen


In [None]:
# Number of full passes over the training data.
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader)
    val_acc = eval_epoch(model, val_loader)
    # Report a 1-based epoch number alongside the metrics.
    print(f"Epoch {epoch+1} | Loss: {train_loss:.4f} | Val Acc: {val_acc:.4f}")
