# MR Document Querying System Using Machine Learning

Importing Necessary Libraries

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast
import fitz

Step 1: PDF Text Extraction

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text("text")`` output,
        joined with no extra separator.
    """
    # The context manager closes the document (and its file handle) even when
    # extracting a page raises; the original left the document open.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text("text") for page in doc)

Step 2: Load Dataset from PDF

In [None]:
# Location of the PDF whose lines hold "<text>|<label>" records.
pdf_path = "C:\\Users\\Shivank Bhasin\\Downloads\\1681728066.pdf" 
text_data = extract_text_from_pdf(pdf_path)
lines = text_data.split('\n')
texts = []
labels = []

for line in lines:
    # Lines without a pipe are not records (headers, blank lines, ...).
    if "|" not in line:
        continue
    # Split on the LAST pipe only, so a description that itself contains "|"
    # no longer raises "too many values to unpack".
    # NOTE(review): assumes the label is the final field - confirm with the PDF.
    text, label = (part.strip() for part in line.rsplit("|", 1))
    # Skip half-empty records so "" never becomes a sample or a class.
    if text and label:
        texts.append(text)
        labels.append(label)

Extracting text from PDF...
Dataset loaded: 1000 samples (1000 texts and 1000 labels)


Step 3: Create DataFrame and Label Encoding

In [None]:
# Two-column frame: the raw text and its (still string-valued) label.
data = pd.DataFrame({"text": texts, "label": labels})

# Swap each label string for its integer class id; the fitted encoder is kept
# so predictions can be mapped back to the original label values later.
label_encoder = LabelEncoder()
data["label"] = label_encoder.fit_transform(data["label"])

# Hold out 20% of the rows for validation, with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["text"],
    data["label"],
    random_state=42,
    test_size=0.2,
)

Step 4: Tokenizer Setup

In [None]:
# Load the tokenizer published under the "bert-base-uncased" name. Only the
# tokenizer is used from that checkpoint: its vocabulary size sets the
# classifier's embedding table and it produces input_ids / attention_mask.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Step 5: Dataset class for tokenization

In [None]:
class MRDataset(Dataset):
    """Dataset that tokenizes one (text, label) pair each time it is indexed.

    ``texts`` and ``labels`` are read positionally through ``.iloc``, so both
    must be pandas objects (e.g. Series) of the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]

        # Always pad/truncate to max_length so every sample has the same size.
        tokens = self.tokenizer(
            raw_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer output has a leading dimension of size 1; drop it.
        sample = {
            key: tokens[key].squeeze(0) for key in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(target, dtype=torch.long)
        return sample

Step 6: Dataloaders

In [None]:
# Wrap each split in the tokenizing dataset (default max_length).
train_dataset = MRDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = MRDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

Step 7: Transformer Classifier Model

In [None]:
class TransformerClassifier(nn.Module):
    """Token-embedding + Transformer-encoder text classifier.

    Args:
        num_classes: Size of the output logits vector.
        vocab_size: Number of rows in the token embedding table.
        hidden_dim: Embedding / encoder model width.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        dropout: Dropout applied to the pooled vector before the final linear
            layer (the encoder layers keep their own default dropout).
    """

    def __init__(
        self, num_classes, vocab_size, hidden_dim=768, num_layers=6, num_heads=8, dropout=0.1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        # batch_first=True: inputs are (batch, seq, dim). With the default
        # (batch_first=False) the encoder would treat dim 0 as the sequence
        # and attend ACROSS the samples of a batch.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=num_heads, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer, num_layers=num_layers
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)
        # NOTE(review): no positional encoding is added to the embeddings, so
        # token order (beyond "which token sits at index 0") is not visible to
        # the encoder - consider adding one.

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (batch, num_classes).

        Args:
            input_ids: Integer token ids, indexed as (batch, seq).
            attention_mask: Optional tensor with the same leading shape as
                ``input_ids``; positions equal to 0 are treated as padding and
                excluded from attention. ``None`` attends to every position.
        """
        embeddings = self.embedding(input_ids)

        # src_key_padding_mask expects True at positions to IGNORE, which is
        # the inverse of the tokenizer's attention_mask convention.
        padding_mask = None
        if attention_mask is not None:
            padding_mask = attention_mask == 0

        transformer_output = self.transformer_encoder(
            embeddings, src_key_padding_mask=padding_mask
        )
        # Use the representation of the first token as the sequence summary.
        pooled_output = transformer_output[:, 0, :]
        output = self.dropout(pooled_output)
        return self.fc(output)

Step 8: Model Setup

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Size the model from the tokenizer vocabulary and the fitted label set.
vocab_size = tokenizer.vocab_size
num_classes = len(label_encoder.classes_)
model = TransformerClassifier(vocab_size=vocab_size, num_classes=num_classes)
model.to(device)

Step 9: Training Function

In [None]:
def train(model, train_loader, optimizer, criterion, device, epochs=3, gradient_accumulation_steps=4):
    """Train ``model`` with gradient accumulation and print the mean loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer over the model's parameters.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.
        device: Device (string or ``torch.device``) the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        gradient_accumulation_steps: Number of micro-batches whose gradients
            are summed before each optimizer step.

    Raises:
        ValueError: If ``gradient_accumulation_steps`` is < 1 or the loader
            yields no batches.
    """
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")

    # Mixed precision is only requested on CUDA; on CPU both the scaler and
    # the autocast context are disabled and behave as pass-throughs.
    use_amp = torch.device(device).type == "cuda"
    scaler = GradScaler(enabled=use_amp)

    def apply_update():
        # One real optimizer step for the gradients accumulated so far.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        pending = 0  # micro-batches accumulated since the last optimizer step

        # Clear gradients once per epoch. Doing this on every micro-batch (as
        # before) discards the accumulated gradients and defeats accumulation.
        optimizer.zero_grad()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            with autocast(enabled=use_amp):
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

            # Scale down so the summed gradient is the mean over the group.
            scaler.scale(loss / gradient_accumulation_steps).backward()

            # Log the true (undivided) loss of the micro-batch.
            total_loss += loss.item()
            num_batches += 1
            pending += 1

            if pending == gradient_accumulation_steps:
                apply_update()
                pending = 0

        # Flush a trailing partial group so the last batches are not dropped.
        if pending:
            apply_update()

        if num_batches == 0:
            raise ValueError("train_loader produced no batches")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / num_batches:.4f}")

Epoch 1/1000
3/3 ━━━━━━━━━━━━━━━━━━━━ 10s 881ms/step - loss: 2.2056
Epoch 2/1000
3/3 ━━━━━━━━━━━━━━━━━━━━ 1s 170ms/step - loss: 1.2879
Epoch 3/1000
3/3 ━━━━━━━━━━━━━━━━━━━━ 1s 195ms/step - loss: 0.7042


Step 10: Optimizer and Loss Setup

In [None]:
# AdamW over every model parameter, with a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Cross-entropy between the model's logits and the integer class ids.
criterion = nn.CrossEntropyLoss()

Step 11: Train the model

In [None]:
# Run three epochs; the accumulation factor is left at train()'s default.
train(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=3,
)


Epoch 1/1000
3/3 ━━━━━━━━━━━━━━━━━━━━ 10s 881ms/step - loss: 2.3056
Epoch 2/1000
3/3 ━━━━━━━━━━━━━━━━━━━━ 1s 170ms/step - loss: 1.2976
Epoch 3/1000
3/3 ━━━━━━━━━━━━━━━━━━━━ 1s 195ms/step - loss: 0.7542


Step 12: User Input Prediction Function

In [None]:
def predict_user_input(model, tokenizer, user_input, device, top_k=3, max_length=128, encoder=None):
    """Return the ``top_k`` most probable labels for one piece of text.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` and
            returning logits indexed as (batch, classes). It is switched to
            eval mode and left in that mode.
        tokenizer: Callable producing ``"input_ids"`` and ``"attention_mask"``
            tensors for the text.
        user_input: The text to classify.
        device: Device the encoded tensors are moved to.
        top_k: Maximum number of predictions to return; clamped to the number
            of classes the model outputs.
        max_length: Padding / truncation length handed to the tokenizer.
        encoder: Object whose ``inverse_transform`` maps class indices back to
            labels. Defaults to the module-level ``label_encoder``.

    Returns:
        A list of ``(label, probability)`` tuples, most probable first.
    """
    if encoder is None:
        # Backward-compatible default: the encoder fitted at module level.
        encoder = label_encoder

    model.eval()
    encoded = tokenizer(
        user_input,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probabilities = torch.softmax(outputs, dim=1)
        # topk raises when k exceeds the class count, so clamp it first.
        k = min(top_k, probabilities.size(1))
        top_probs, top_indices = torch.topk(probabilities, k, dim=1)

    # Decode all indices in one call instead of one call per index.
    class_names = encoder.inverse_transform(top_indices[0].tolist())
    return list(zip(class_names, top_probs[0].tolist()))

Testing Output

In [None]:
# Sample query; predict_user_input's default top_k yields three candidates.
user_input = "PUMP with PUMPING TEMPERATURE 5-40"
predictions = predict_user_input(
    model=model,
    tokenizer=tokenizer,
    user_input=user_input,
    device=device,
)
print(f"Top 3 prediction labels for user: {predictions}")

Top 3 prediction labels for user: [(498606, 0.20922586), (498645, 0.12161452), (498692, 0.1378904)]
