In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load your data
data = pd.read_csv(r"D:\health\extracted_keyphrase.csv")
data.head(1)

Unnamed: 0,Gross description of the specimen_Keywords,microscopic appearance_Keywords,diagnosis_Keywords
0,"['bottle labelled age', 'consists soft tissue'...","['connective tissue stroma', 'cellular connect...",['peripheral ossifying fibroma']


In [5]:
data['diagnosis_Keywords_flat'] = data['diagnosis_Keywords'].apply(lambda x: x[0] if isinstance(x, list) and x else x)
#print(data['diagnosis_Keywords_flat'].unique())


In [6]:
# Preprocess keywords to form strings and encode labels
data['text'] = data['microscopic appearance_Keywords'].apply(lambda x: ' '.join(eval(x)))


label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['diagnosis_Keywords_flat'])
print(data['text'][2]) 
data.head(2)

granulation tissue exhibiting inflammatory cell infiltrate chronic inflammatory cell granulation tissue inflammatory cell presence granulation tissue chronic inflammatory moderate chronic inflammatory inflammatory tissue exhibiting moderate


Unnamed: 0,Gross description of the specimen_Keywords,microscopic appearance_Keywords,diagnosis_Keywords,diagnosis_Keywords_flat,text,label
0,"['bottle labelled age', 'consists soft tissue'...","['connective tissue stroma', 'cellular connect...",['peripheral ossifying fibroma'],['peripheral ossifying fibroma'],connective tissue stroma cellular connective t...,16
1,"['dentigerous cyst containing', 'labelled dent...","['tissue capsule fibrous', 'capsule composed f...",['dentigerous cyst'],['dentigerous cyst'],tissue capsule fibrous capsule composed fibrou...,3


In [7]:
# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['text'].tolist(), data['label'].tolist(), test_size=0.2, random_state=42
)

# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class MedicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = MedicalDataset(train_texts, train_labels, tokenizer)
val_dataset = MedicalDataset(val_texts, val_labels, tokenizer)



In [None]:
# Load the BERT model with a classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer for later use
model.save_pretrained("medical_diagnosis_bert_model")
tokenizer.save_pretrained("medical_diagnosis_bert_model")


In [8]:
import joblib
joblib.dump(label_encoder, "label_encoder.pkl")


['label_encoder.pkl']

In [1]:
import joblib
import pandas as pd

# Load your dataset
data = pd.read_csv("pakka.csv")

# Extract unique microscopic appearance keywords
microscopic_keywords = set()
for entry in data['microscopic appearance_Keywords']:
    if isinstance(entry, str):
        microscopic_keywords.update(eval(entry))  # Convert string to list and collect keywords

# Save the keywords to a file
joblib.dump(sorted(microscopic_keywords), "microscopic_keywords.pkl")


['microscopic_keywords.pkl']

In [3]:
# import joblib
# import pandas as pd

# # Load your dataset (one-time operation during preprocessing)
# data = pd.read_csv("pakka.csv")

# # Group microscopic appearance keywords by diagnosis
# grouped_keywords = {}
# for _, row in data.iterrows():
#     diagnosis = row['diagnosis_Keywords']
#     if isinstance(row['microscopic appearance_Keywords'], str):
#         keywords = eval(row['microscopic appearance_Keywords'])
#         if diagnosis not in grouped_keywords:
#             grouped_keywords[diagnosis] = set()
#         grouped_keywords[diagnosis].update(keywords)

# # Convert sets to lists and save to a file
# grouped_keywords = {diagnosis: sorted(keywords) for diagnosis, keywords in grouped_keywords.items()}
# joblib.dump(grouped_keywords, "grouped_keywords.pkl")


['grouped_keywords.pkl']