In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import re
from collections import Counter
import numpy as np

In [2]:
# Load the dataset from the specified local path
df = pd.read_csv(r"..\data\merged_file.csv")

In [3]:
def basic_tokenizer(text):
    if not isinstance(text, str):
        return []  
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())
    return text.split()


token_counts = Counter()
for text in df['Title']:
    tokens = basic_tokenizer(text)
    token_counts.update(tokens)

In [4]:
vocab = {word: i+1 for i, (word, _) in enumerate(token_counts.items())} 
vocab['<pad>'] = 0
vocab['<unk>'] = len(vocab)


def encode(text, vocab=vocab):
    return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(text)]

In [5]:
class VeterinaryDataset(Dataset):
    def __init__(self, df, vocab, max_len=128):
        self.df = df
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['Title']  
        input_ids = encode(text, self.vocab)[:self.max_len]
        input_ids += [0] * (self.max_len - len(input_ids))  
        return torch.tensor(input_ids, dtype=torch.long)

dataset = VeterinaryDataset(df, vocab)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

In [7]:
import torch.nn as nn
import torch.nn.functional as F

class VetGPT(nn.Module):
    def __init__(self, vocab_size, max_len=128, n_layers=4, n_heads=8, d_model=256, d_ff=512):
        super(VetGPT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, d_model))
        
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)
        ])
        
        self.output_layer = nn.Linear(d_model, vocab_size)
        
    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        for layer in self.layers:
            x = layer(x)
        return self.output_layer(x)

vocab_size = len(vocab)
model = VetGPT(vocab_size=vocab_size) 


#GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [8]:
import torch.optim as optim

num_epochs = 5
learning_rate = 5e-4

criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index in loss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        inputs = batch  # Keep on CPU
        targets = inputs.clone()  # Clone for target prediction

        optimizer.zero_grad()
        outputs = model(inputs)  # Keep outputs on CPU
        outputs = outputs.view(-1, vocab_size)
        targets = targets.view(-1)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

Epoch 1/5, Loss: 4.8480
Epoch 2/5, Loss: 2.7975
Epoch 3/5, Loss: 1.7868
Epoch 4/5, Loss: 1.1588
Epoch 5/5, Loss: 0.7480


In [9]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [10]:
def search_dataset_fuzzy(df, query, threshold=70):

    query_lower = query.lower()

    
    results = []

    for _, row in df.iterrows():
        title = row.get('Title', '')
        description = row.get('Field', '')
        
        title_score = fuzz.partial_ratio(query_lower, str(title).lower())
        description_score = fuzz.partial_ratio(query_lower, str(description).lower())
        

        if title_score >= threshold or description_score >= threshold:
            url = row.get('Title_URL', 'No URL available')
            results.append((title, url, description, max(title_score, description_score)))


    results = sorted(results, key=lambda x: x[3], reverse=True)

    if results:
        best_match = results[0]
        return f"{best_match[0]}\t{best_match[1]}\t{best_match[2]}"
    else:
        return "No relevant entries found."

query = input("Ask a Veterinary Question: ")
response = search_dataset_fuzzy(df, query)
print("Response:")
print(response)

Response:
Asthma in Cats	https://veterinarypartner.vin.com/default.aspx?pid=19239&catId=254055&id=4951536	If your cat needs to open its mouth to breathe or if its abdomen moves excessively as it breathes (and it is not purring), then it may be suffering from feline asthma. Asthma is a recurring respiratory compromise that occurs when the lung airways constrict either spontaneously or in response to stimuli that normally should not cause a reaction.
