In [None]:

import subprocess

yt_dlp_path = r"C:\Users\Lenovo\pyannote_env\Scripts\yt-dlp.exe"  # Update this path as needed
video_url = "https://www.youtube.com/watch?v=awbVzXKr5co"
output_audio = "D:/whisper_med/audio_file_patient/youtube_audio.wav"

command = [
    yt_dlp_path,  # Use full path
    "-x",
    "--audio-format", "wav",
    "-o", output_audio,
    video_url,
]

subprocess.run(command)

print(f"Audio extracted and saved as {output_audio}")


In [None]:
import subprocess
from pyannote.audio import Pipeline

# Define the path to your audio file
audio_path = "D:/whisper_med/audio_file_patient/youtube_audio.wav"
processed_audio_path = "D:/whisper_med/audio_file_patient/processed_audio.wav"

# Convert audio to mono, 16kHz (PyAnnote requires this format)
ffmpeg_command = [
    "ffmpeg", "-i", audio_path, 
    "-ar", "16000", "-ac", "1", processed_audio_path, "-y"
]

subprocess.run(ffmpeg_command, check=True)

# Load the pre-trained speaker diarization pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

# Perform speaker diarization, specifying that we expect 2 speakers
diarization = pipeline({"uri": "youtube_audio", "audio": processed_audio_path}, num_speakers=2)

# Print speaker segments
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"Speaker: {speaker}, Start: {turn.start:.2f}s, End: {turn.end:.2f}s")


In [None]:
import os
import torch
from transformers import pipeline

# Path where segmented audio files are stored
SEGMENTS_DIR = "D:/whisper_med/audio_file_patient/segments"
TRANSCRIPTIONS_DIR = "D:/whisper_med/audio_file_patient/transcriptions"

# Ensure transcription directory exists
os.makedirs(TRANSCRIPTIONS_DIR, exist_ok=True)

# Load Whisper model with GPU support if available
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = pipeline(
    "automatic-speech-recognition",
    model="D:/whisper_medical_model",  # Path to the saved Whisper model
    device=device
)

print(f"Using device: {device}")
print("Whisper model loaded successfully!")

# Process each segment in the directory
for segment_file in sorted(os.listdir(SEGMENTS_DIR)):  
    segment_path = os.path.join(SEGMENTS_DIR, segment_file)

    if segment_file.endswith(".wav"):  # Ensure it's an audio file
        print(f"Transcribing {segment_file}...")

        # Transcribe the audio
        transcription_result = whisper_model(segment_path, return_timestamps=True)
        transcription_text = transcription_result["text"]

        # Define transcription file path (same name as segment but .txt)
        transcript_filename = os.path.splitext(segment_file)[0] + ".txt"
        transcript_path = os.path.join(TRANSCRIPTIONS_DIR, transcript_filename)

        # Save transcription
        with open(transcript_path, "w", encoding="utf-8") as f:
            f.write(transcription_text)

        print(f"Saved transcription: {transcript_path}")

print("\nAll transcriptions saved in:", TRANSCRIPTIONS_DIR)


In [4]:
import pandas as pd

# Load dataset from Excel file
dataset_path = r"C:\Users\Lenovo\Desktop\fixed_doctor_patient_dialogue.xlsx"
df = pd.read_excel(dataset_path)

# Check dataset structure
print(df.head())


                                               Input   Output
0    You need to take these antibiotics twice a day.   Doctor
1  I have been feeling unwell for the past two days.  Patient
2  Let's schedule a follow-up appointment in two ...   Doctor
3  I've had a persistent dry cough for three week...  Patient
4  Avoid caffeine and alcohol until your heart rh...   Doctor


In [5]:
df.rename(columns={"YourTextColumn": "Input", "YourLabelColumn": "Output"}, inplace=True)


In [7]:
# Convert categorical labels to numerical values
label_mapping = {"Doctor": 1, "Patient": 0}
df["label"] = df["Output"].map(label_mapping)


In [8]:
from datasets import Dataset

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Verify dataset structure
print(dataset)


Dataset({
    features: ['Input', 'Output', 'label'],
    num_rows: 392
})


In [10]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import BioGptTokenizer, BioGptForSequenceClassification, Trainer, TrainingArguments

# 1. Load and preprocess the dataset from Excel
excel_file = r"C:\Users\Lenovo\Desktop\doctor_patient_dialogue.xlsx"
df = pd.read_excel(excel_file, header=None)
df.columns = ["Raw"]

# If the first row contains headers like "Input,Output", remove it
if "Input" in df.iloc[0, 0]:
    df = df.iloc[1:].reset_index(drop=True)

# Split the raw column by comma into Input and Output
df[['Input', 'Output']] = df['Raw'].str.split(',', n=1, expand=True)
df['Input'] = df['Input'].str.strip()
df['Output'] = df['Output'].str.strip()

# Convert Output to labels (e.g., 1 for Doctor, 0 for Patient)
label_mapping = {"Doctor": 1, "Patient": 0}
df['label'] = df['Output'].map(label_mapping)

# If any rows didn't match, they will be NaN; drop or handle them as needed.
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

# Drop the raw column now that we have Input and Output
df = df.drop(columns=["Raw"])

# Save the cleaned dataset (optional)
cleaned_file = r"C:\Users\Lenovo\Desktop\cleaned_doctor_patient_dialogue.xlsx"
df.to_excel(cleaned_file, index=False)
print("Cleaned dataset saved at:", cleaned_file)

# 2. Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# 3. Load BioGPT tokenizer and model for sequence classification
model_name = "microsoft/biogpt"  # or your chosen variant
tokenizer = BioGptTokenizer.from_pretrained(model_name)
model = BioGptForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 4. Tokenization function with proper padding and truncation
def tokenize_function(examples):
    return tokenizer(examples["Input"], padding=True, truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Remove original text columns to avoid nesting issues
tokenized_datasets = tokenized_datasets.remove_columns(["Input", "Output"])
# Set dataset format for PyTorch
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# 5. Split dataset into train and test sets
split_dataset = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# 6. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
)

# 7. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# 8. Fine-tune the model
trainer.train()

# 9. Save the fine-tuned model and tokenizer
save_dir = r"D:\fine_tuned_biogpt"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Fine-tuning complete. Model saved at:", save_dir)

# 10. Evaluate the model
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)


Cleaned dataset saved at: C:\Users\Lenovo\Desktop\cleaned_doctor_patient_dialogue.xlsx


Some weights of BioGptForSequenceClassification were not initialized from the model checkpoint at microsoft/biogpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/361 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.304167
2,No log,0.143412
3,No log,0.169742


Fine-tuning complete. Model saved at: D:\fine_tuned_biogpt


Evaluation Metrics: {'eval_loss': 0.169741690158844, 'eval_runtime': 35.96, 'eval_samples_per_second': 2.03, 'eval_steps_per_second': 0.278, 'epoch': 3.0}


In [11]:
# Evaluate model
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.169741690158844, 'eval_runtime': 33.8246, 'eval_samples_per_second': 2.158, 'eval_steps_per_second': 0.296, 'epoch': 3.0}


In [None]:
# import os
# import torch
# from transformers import BioGptTokenizer, BioGptForSequenceClassification

# # Define model path and transcriptions directory
# model_path = r"D:\fine_tuned_biogpt"
# transcriptions_dir = r"D:\whisper_med\audio_file_patient\transcriptions"

# # Load fine-tuned BioGPT model and tokenizer
# tokenizer = BioGptTokenizer.from_pretrained(model_path)
# model = BioGptForSequenceClassification.from_pretrained(model_path, num_labels=2)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# model.eval()  # Set model to evaluation mode

# def predict_role(text):
#     """
#     Tokenize the input text and predict the role using the fine-tuned model.
#     Returns "Doctor" if predicted label == 1, otherwise "Patient".
#     """
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
#     inputs = {key: value.to(device) for key, value in inputs.items()}
#     with torch.no_grad():
#         outputs = model(**inputs)
#     predicted_label = torch.argmax(outputs.logits, dim=1).item()
#     return "Doctor" if predicted_label == 1 else "Patient"

# # Process each transcribed text file
# for filename in sorted(os.listdir(transcriptions_dir)):
#     if filename.endswith(".txt"):
#         file_path = os.path.join(transcriptions_dir, filename)
#         with open(file_path, "r", encoding="utf-8") as f:
#             text = f.read().strip()
#         # Preprocess the text if needed (here we simply strip extra whitespace)
#         text = text.replace("\n", " ").strip()
#         role = predict_role(text)
#         print(f"File: {filename}")
#         print(f"Text: {text}")
#         print(f"Predicted Role: {role}\n")


File: segment_0.txt
Text: May I come in, Doctor? Yes, come in. Take your seat.
Predicted Role: Patient

File: segment_1.txt
Text: Thank you, doctor. Um, what's your name?
Predicted Role: Doctor

File: segment_10.txt
Text: Hmm, any other symptoms?
Predicted Role: Doctor

File: segment_11.txt
Text: Yeah, I also had bouts of vomiting last night and today morning as well.
Predicted Role: Patient

File: segment_12.txt
Text: Do you have a headache? No.
Predicted Role: Patient

File: segment_13.txt
Text: Did you have this kind of a stomach ache before?
Predicted Role: Doctor

File: segment_14.txt
Text: Yes, doctor. I had it once before.
Predicted Role: Patient

File: segment_15.txt
Text: How many days ago?
Predicted Role: Doctor

File: segment_16.txt
Text: almost three months ago. But at that time the pain stopped after I took an antacid.
Predicted Role: Patient

File: segment_17.txt
Text: Hmm. Um, please lie on that bed. I have to check. Okay, doctor.
Predicted Role: Patient

File: segment_1

In [None]:
# import os
# import torch
# import re
# from transformers import BioGptTokenizer, BioGptForSequenceClassification

# # Define model path and transcriptions directory
# model_path = r"D:\fine_tuned_biogpt"  # Fine-tuned model saved in D: drive
# transcriptions_dir = r"D:\whisper_med\audio_file_patient\transcriptions"

# # Load fine-tuned BioGPT model and tokenizer
# tokenizer = BioGptTokenizer.from_pretrained(model_path)
# model = BioGptForSequenceClassification.from_pretrained(model_path, num_labels=2)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# model.eval()  # Set model to evaluation mode

# def predict_role(text):
#     """
#     Tokenize the input text and predict the role using the fine-tuned model.
#     Returns "Doctor" if predicted label == 1, else "Patient".
#     """
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
#     inputs = {key: value.to(device) for key, value in inputs.items()}
#     with torch.no_grad():
#         outputs = model(**inputs)
#     predicted_label = torch.argmax(outputs.logits, dim=1).item()
#     return "Doctor" if predicted_label == 1 else "Patient"

# def split_sentences(text):
#     """
#     Splits text into sentences based on '.' and '?' delimiters.
#     Keeps the delimiters with each sentence.
#     """
#     # Split using regex to retain punctuation as part of sentences.
#     sentences = re.split(r'(?<=[.?])\s+', text)
#     sentences = [s.strip() for s in sentences if s.strip()]  # Remove empty strings
#     return sentences

# # Process each transcribed text file in the directory
# for filename in sorted(os.listdir(transcriptions_dir)):
#     if filename.endswith(".txt"):
#         file_path = os.path.join(transcriptions_dir, filename)
#         with open(file_path, "r", encoding="utf-8") as f:
#             full_text = f.read().strip()
        
#         # Split the full text into sentences based on '.' and '?'
#         sentences = split_sentences(full_text)
        
#         print(f"\nFile: {filename}")
#         print(f"Full Text: {full_text}\n")
        
#         # Predict and print role for each sentence
#         for sentence in sentences:
#             # Optionally ignore very short sentences if needed
#             if len(sentence.split()) < 3:
#                 continue
#             role = predict_role(sentence)
#             print(f"Sentence: {sentence}")
#             print(f"Predicted Role: {role}\n")



File: segment_0.txt
Full Text: May I come in, Doctor? Yes, come in. Take your seat.

Sentence: May I come in, Doctor?
Predicted Role: Doctor

Sentence: Yes, come in.
Predicted Role: Doctor

Sentence: Take your seat.
Predicted Role: Doctor


File: segment_1.txt
Full Text: Thank you, doctor. Um, what's your name?

Sentence: Thank you, doctor.
Predicted Role: Doctor

Sentence: Um, what's your name?
Predicted Role: Doctor


File: segment_10.txt
Full Text: Hmm, any other symptoms?

Sentence: Hmm, any other symptoms?
Predicted Role: Doctor


File: segment_11.txt
Full Text: Yeah, I also had bouts of vomiting last night and today morning as well.

Sentence: Yeah, I also had bouts of vomiting last night and today morning as well.
Predicted Role: Patient


File: segment_12.txt
Full Text: Do you have a headache? No.

Sentence: Do you have a headache?
Predicted Role: Doctor


File: segment_13.txt
Full Text: Did you have this kind of a stomach ache before?

Sentence: Did you have this kind of a st

In [15]:
# import os
# import re
# import torch
# import pandas as pd
# from transformers import BioGptTokenizer, BioGptForSequenceClassification

# # Define directories and file paths
# model_path = r"D:\fine_tuned_biogpt"  # Fine-tuned model directory
# transcriptions_dir = r"D:\whisper_med\audio_file_patient\transcriptions"
# adaptive_data_path = r"D:\whisper_med\adaptive_training_data.csv"  # Adaptive training data file

# # Load fine-tuned BioGPT model and tokenizer
# tokenizer = BioGptTokenizer.from_pretrained(model_path)
# model = BioGptForSequenceClassification.from_pretrained(model_path, num_labels=2)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# model.eval()

# def predict_role(text):
#     """
#     Tokenize the input text and predict the role using the fine-tuned model.
#     Returns "Doctor" if predicted label == 1, else "Patient".
#     """
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
#     inputs = {key: value.to(device) for key, value in inputs.items()}
#     with torch.no_grad():
#         outputs = model(**inputs)
#     predicted_label = torch.argmax(outputs.logits, dim=1).item()
#     return "Doctor" if predicted_label == 1 else "Patient"

# def split_sentences(text):
#     """
#     Splits text into sentences based on '.' and '?' delimiters.
#     Keeps the delimiters attached.
#     """
#     sentences = re.split(r'(?<=[.?])\s+', text)
#     return [s.strip() for s in sentences if s.strip()]

# # Ensure adaptive training file exists; if not, create it with headers
# if not os.path.exists(adaptive_data_path):
#     pd.DataFrame(columns=["Input", "Output"]).to_csv(adaptive_data_path, index=False)

# # Process each transcription file
# for filename in sorted(os.listdir(transcriptions_dir)):
#     if filename.endswith(".txt"):
#         file_path = os.path.join(transcriptions_dir, filename)
#         with open(file_path, "r", encoding="utf-8") as f:
#             full_text = f.read().strip()
        
#         # Split the transcription into fine-grained sentences
#         sentences = split_sentences(full_text)
        
#         print(f"\nProcessing File: {filename}")
#         print(f"Full Text: {full_text}\n")
        
#         for sentence in sentences:
#             # Skip very short sentences that might be noise
#             if len(sentence.split()) < 3:
#                 continue
            
#             # Predict role using our fine-tuned model
#             predicted_role = predict_role(sentence)
            
#             # If the sentence contains 'doctor' (case-insensitive) and the prediction is "Doctor",
#             # override the prediction to "Patient"
#             if re.search(r'\bdoctor\b', sentence, flags=re.IGNORECASE) and predicted_role == "Doctor":
#                 corrected_role = "Patient"
#                 print(f"Sentence: {sentence}")
#                 print(f"Model Predicted Role: {predicted_role}  --> Overridden to: {corrected_role}")
                
#                 # Log the corrected example into the adaptive training CSV file
#                 new_entry = pd.DataFrame({"Input": [sentence], "Output": [corrected_role]})
#                 new_entry.to_csv(adaptive_data_path, mode='a', header=False, index=False)
#             else:
#                 corrected_role = predicted_role
#                 print(f"Sentence: {sentence}")
#                 print(f"Predicted Role: {corrected_role}")
        
#         print("-" * 50)

# print(f"\nAdaptive training data saved at: {adaptive_data_path}")

import os
import re
import torch
from transformers import BioGptTokenizer, BioGptForSequenceClassification

# Define model path and transcriptions directory
model_path = r"D:\fine_tuned_biogpt"  # Fine-tuned model saved in D:
transcriptions_dir = r"D:\whisper_med\audio_file_patient\transcriptions"
adaptive_data_path = r"D:\whisper_med\adaptive_training_data.csv"  # For adaptive logging (if needed)

# Load fine-tuned BioGPT model and tokenizer
tokenizer = BioGptTokenizer.from_pretrained(model_path)
model = BioGptForSequenceClassification.from_pretrained(model_path, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

def predict_role(text):
    """
    Tokenize the input text and predict the role using the fine-tuned model.
    Returns "Doctor" if predicted label == 1, else "Patient".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return "Doctor" if predicted_label == 1 else "Patient"

def split_sentences(text):
    """
    Splits text into sentences based on '.' and '?' delimiters.
    Keeps the delimiters attached.
    """
    # Use regex to split on period or question mark followed by space or end-of-string.
    sentences = re.split(r'(?<=[.?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]

def apply_generalized_rules(sentence, predicted_role):
    """
    Apply domain-specific rules to override the predicted role based on sentence content.
    
    Rule 1: If the sentence contains directive/prescription language, override to "Doctor".
            (e.g., phrases like "writing down", "prescribe", "take these medicines", "prescribed")
    
    Rule 2: If the sentence contains polite address (e.g., "thank you, doctor") without directive language,
            override to "Patient".
    
    If none match, return the original prediction.
    """
    sentence_lower = sentence.lower()

    # Directive keywords (if any of these appear, consider it directive from a doctor)
    directive_keywords = ["writing down", "prescribe", "prescribing", "take these", "take them", "medicine", "medicines", "prescribed"]

    # Polite address keyword (often patients address doctors politely)
    polite_address_pattern = r'\bthank you[,]*\s*doctor\b'

    # Check for directive language
    if any(keyword in sentence_lower for keyword in directive_keywords):
        return "Doctor"
    
    # Check for polite address
    if re.search(polite_address_pattern, sentence_lower):
        return "Patient"
    
    return predicted_role

# Process each transcription file in the directory
for filename in sorted(os.listdir(transcriptions_dir)):
    if filename.endswith(".txt"):
        file_path = os.path.join(transcriptions_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            full_text = f.read().strip()
        
        # Split the full text into sentences based on '.' and '?'
        sentences = split_sentences(full_text)
        
        print(f"\nFile: {filename}")
        print(f"Full Text: {full_text}\n")
        
        for sentence in sentences:
            # Optionally ignore very short sentences
            if len(sentence.split()) < 3:
                continue
            
            model_pred = predict_role(sentence)
            # Apply our generalized rules
            final_pred = apply_generalized_rules(sentence, model_pred)
            print(f"Sentence: {sentence}")
            print(f"Model Predicted Role: {model_pred} --> Final Role: {final_pred}\n")




File: segment_0.txt
Full Text: May I come in, Doctor? Yes, come in. Take your seat.

Sentence: May I come in, Doctor?
Model Predicted Role: Doctor --> Final Role: Doctor

Sentence: Yes, come in.
Model Predicted Role: Doctor --> Final Role: Doctor

Sentence: Take your seat.
Model Predicted Role: Doctor --> Final Role: Doctor


File: segment_1.txt
Full Text: Thank you, doctor. Um, what's your name?

Sentence: Thank you, doctor.
Model Predicted Role: Doctor --> Final Role: Patient

Sentence: Um, what's your name?
Model Predicted Role: Doctor --> Final Role: Doctor


File: segment_10.txt
Full Text: Hmm, any other symptoms?

Sentence: Hmm, any other symptoms?
Model Predicted Role: Doctor --> Final Role: Doctor


File: segment_11.txt
Full Text: Yeah, I also had bouts of vomiting last night and today morning as well.

Sentence: Yeah, I also had bouts of vomiting last night and today morning as well.
Model Predicted Role: Patient --> Final Role: Patient


File: segment_12.txt
Full Text: Do yo

In [5]:
import os
import re
import torch
import pandas as pd
from transformers import BioGptTokenizer, BioGptForSequenceClassification

# === Directory & Model Setup ===
model_path = r"D:\fine_tuned_biogpt"  # Fine-tuned model directory on D:
transcriptions_dir = r"D:\whisper_med\audio_file_patient\transcriptions"
adaptive_data_path = r"D:\whisper_med\adaptive_training_data.csv"  # Adaptive training data file
processed_dir = r"D:\whisper_med\processed_transcriptions"  # Base folder for saving CSVs

# Ensure the adaptive training file exists with proper headers
if not os.path.exists(adaptive_data_path):
    pd.DataFrame(columns=["Input", "Output"]).to_csv(adaptive_data_path, index=False)

# Load the fine-tuned BioGPT model and tokenizer
tokenizer = BioGptTokenizer.from_pretrained(model_path)
model = BioGptForSequenceClassification.from_pretrained(model_path, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# === Prediction Function ===
def predict_role(text):
    """
    Tokenize the input text and predict the role using the fine-tuned model.
    Returns "Doctor" if predicted label == 1, else "Patient".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return "Doctor" if predicted_label == 1 else "Patient"

# === Sentence Splitting Function ===
def split_sentences(text):
    """
    Splits text into sentences based on '.' and '?' delimiters.
    Keeps the punctuation attached.
    """
    sentences = re.split(r'(?<=[.?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]

# # === Updated Rule-Based Correction Function ===
# def apply_rule_based_corrections(text, predicted_role):
#     """
#     Applies the updated rule-based corrections to the model's prediction.
#     Rules (applied in order):
    
#     1. Polite Address Rule (Highest Priority):
#        - If the sentence contains polite address indicators (e.g., "thank you, doctor", "may i come in, doctor", "hi doctor")
#          then override the prediction to Patient.
         
#     2. Directive Language vs. Request Context:
#        - If the sentence contains directive keywords (e.g., "writing down", "prescribe", "take these", "medicine", "medicines", "prescribed"):
#          • If it does NOT contain any request indicators (e.g., "could you", "would you", "can you", "please", "won't you", "should i", "is it okay if i", "will you", "can i"),
#            override the prediction to Doctor.
#          • If it DOES contain request phrases, override to Patient.
         
#     3. Age Information Rule:
#        - If the sentence mentions age (e.g., "I'm 29", "I am 29", "my age is 29"), override the prediction to Patient.
       
#     4. Unlabeled Default:
#        - If the model's prediction is empty or "Unlabeled", default to Patient.
    
#     If none of these rules apply, return the model's prediction.
#     """
#     # If no prediction, default to "Unlabeled"
#     if not predicted_role or predicted_role.strip() == "":
#         predicted_role = "Unlabeled"
    
#     text_lower = text.lower()

#     # Rule 1: Polite Address Rule – override to Patient
#     polite_patterns = [r'\bthank you[,]*\s*doctor\b', r'\bmay i come\b', r'\bhi doctor\b']
#     for pattern in polite_patterns:
#         if re.search(pattern, text_lower):
#             if predicted_role != "Patient":
#                 return "Patient", True
#             return "Patient", False

#     # Rule 2: Directive Language vs. Request Context
#     directive_keywords = ["writing down", "prescribe", "prescribing", "take these", "take them", "medicine", "medicines", "prescribed"]
#     request_phrases = ["could you", "would you", "can you", "please", "won't you", "should i", "is it okay if i", "will you", "can i"]
#     has_directive = any(keyword in text_lower for keyword in directive_keywords)
#     has_request = any(phrase in text_lower for phrase in request_phrases)

#     if has_directive:
#         if has_request:
#             if predicted_role != "Patient":
#                 return "Patient", True
#             return "Patient", False
#         else:
#             if predicted_role != "Doctor":
#                 return "Doctor", True
#             return "Doctor", False

#     # Rule 3: Age Information Rule – override to Patient
#     age_pattern = r"(?:i'?m|i am|my age is)\s*\d{1,3}"
#     if re.search(age_pattern, text_lower):
#         if predicted_role != "Patient":
#             return "Patient", True
#         return "Patient", False

#     # Rule 4: If model prediction is Unlabeled, default to Patient.
#     if predicted_role == "Unlabeled":
#         return "Patient", True

#     # If no rule applies, return the model's prediction.
#     return predicted_role, False

# === Logging Function for Reinforcement Learning ===
def log_correction(sentence, corrected_label):
    """
    Append the corrected example (sentence and corrected label) to the adaptive training CSV file.
    """
    new_entry = pd.DataFrame({"Input": [sentence], "Output": [corrected_label]})
    new_entry.to_csv(adaptive_data_path, mode='a', header=False, index=False)

# === Function to Save Results into CSV Files ===
def save_results(filename, sentence, role):
    """
    Saves processed sentences into their respective CSV files (doctor.csv or patient.csv)
    in a folder named after the original transcription file (without extension) under:
    D:\whisper_med\processed_transcriptions
    """
    base_dir = r"D:\whisper_med\processed_transcriptions"
    folder_name = os.path.splitext(filename)[0]
    folder_path = os.path.join(base_dir, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, f"{role.lower()}.csv")
    new_entry = pd.DataFrame({"Sentence": [sentence]})
    new_entry.to_csv(file_path, mode="a", header=not os.path.exists(file_path), index=False)

# === Context-Aware Rule-Based Correction Function ===
def apply_rule_based_corrections(text, predicted_role, prev_text=None, prev_role=None):
    """
    Applies rule-based corrections with dynamic context awareness.
    
    - `prev_text`: The previous sentence (for context-based decisions).
    - `prev_role`: The role assigned to the previous sentence.
    """
    text_lower = text.lower()

    # Rule 1: If "doctor" is present in the sentence, label as "Patient"
    if "doctor" in text_lower:
        return "Patient", True  # Override applied

    # Rule 2: Directive Language (Doctors giving instructions)
    directive_keywords = ["writing down", "prescribe", "prescribing", "take these", "take them", "medicine", "medicines", "prescribed"]
    request_phrases = ["could you", "would you", "can you", "please", "won't you", "should I", "is it okay if I", "will you", "can I"]
    has_directive = any(keyword in text_lower for keyword in directive_keywords)
    has_request = any(phrase in text_lower for phrase in request_phrases)

    if has_directive:
        return ("Patient", True) if has_request else ("Doctor", True)

    # Rule 3: Age Information (Sentences mentioning age → Patient)
    age_pattern = r"(?:i'?m|i am|my age is)\s*\d{1,3}"
    if re.search(age_pattern, text_lower):
        return "Patient", True

    # Rule 4: Context Window (If previous sentence was from a doctor, continue as Doctor)
    if prev_text and prev_role == "Doctor":
        return "Doctor", True

    # Rule 5: Default to Model’s Prediction (if no rules apply)
    return predicted_role, False


# === Modified Main Processing Loop with Context Awareness ===
for filename in sorted(os.listdir(transcriptions_dir)):
    if filename.endswith(".txt"):
        file_path = os.path.join(transcriptions_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            full_text = f.read().strip()
        
        sentences = split_sentences(full_text)

        print(f"\nProcessing File: {filename}")
        print(f"Full Text: {full_text}\n")

        prev_sentence = None
        prev_role = None

        for sentence in sentences:
            if len(sentence.split()) < 1:
                continue  # Skip empty sentences

            model_pred = predict_role(sentence)
            final_role, overridden = apply_rule_based_corrections(sentence, model_pred, prev_sentence, prev_role)

            print(f"Sentence: {sentence}")
            print(f"Model Predicted Role: {model_pred}  --> Final Role: {final_role}\n")

            if overridden:
                log_correction(sentence, final_role)

            save_results(filename, sentence, final_role)

            # Update context for next iteration
            prev_sentence = sentence
            prev_role = final_role
print(f"\nAdaptive training data saved at: {adaptive_data_path}")



Processing File: name.txt
Full Text: Simran Parveen.

Sentence: Simran Parveen.
Model Predicted Role: Doctor  --> Final Role: Doctor


Processing File: segment_0.txt
Full Text: May I come in, Doctor? Yes, come in. Take your seat.

Sentence: May I come in, Doctor?
Model Predicted Role: Doctor  --> Final Role: Patient

Sentence: Yes, come in.
Model Predicted Role: Doctor  --> Final Role: Doctor

Sentence: Take your seat.
Model Predicted Role: Doctor  --> Final Role: Doctor


Processing File: segment_1.txt
Full Text: Thank you, doctor. Um, what's your name?

Sentence: Thank you, doctor.
Model Predicted Role: Doctor  --> Final Role: Patient

Sentence: Um, what's your name?
Model Predicted Role: Doctor  --> Final Role: Doctor


Processing File: segment_10.txt
Full Text: Hmm, any other symptoms?

Sentence: Hmm, any other symptoms?
Model Predicted Role: Doctor  --> Final Role: Doctor


Processing File: segment_11.txt
Full Text: Yeah, I also had bouts of vomiting last night and today morning 

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Define the model path (the directory where the model is downloaded)
model_path = r"C:\Users\Lenovo\.cache\huggingface\hub\models--Clinical-AI-Apollo--Medical-NER"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Move model to GPU if available
device = 0 if torch.cuda.is_available() else -1

# Create a NER pipeline using the loaded model and tokenizer
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

# Example: Process a transcribed text segment
sample_text = (
    "Patient says: I'm 29 and I've been experiencing headaches for the last few days. "
    "Doctor prescribed 500mg of paracetamol to be taken twice a day."
)

# Run the NER pipeline on the sample text
entities = ner_pipeline(sample_text)

# Print out the extracted entities
print("Extracted Medical Entities:")
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']:.2f}")


In [10]:
import os
import pandas as pd
from transformers import pipeline

# --- Setup: Define the base processed transcription directory and model path ---

processed_dir = r"D:\whisper_med\processed_transcriptions"  # Folder containing segment sub-folders
med_ner_model_path = "Helios9/BIOMed_NER"  # Model path for Clinical AI Apollo Med NER

# --- Load the Med NER Pipeline ---
ner_pipe = pipeline(
    task="token-classification",
    model=med_ner_model_path,
    tokenizer=med_ner_model_path,
    aggregation_strategy="average"
)

# --- Process all CSV files in the segments folder ---
# This will iterate over each sub-folder (e.g., segment_0, segment_1, etc.)
for segment_folder in sorted(os.listdir(processed_dir)):
    folder_path = os.path.join(processed_dir, segment_folder)
    if os.path.isdir(folder_path):
        print(f"\nProcessing folder: {segment_folder}")
        # Process each CSV file in the sub-folder
        for file in sorted(os.listdir(folder_path)):
            if file.endswith(".csv"):
                csv_path = os.path.join(folder_path, file)
                print(f"\nProcessing file: {file}")
                df = pd.read_csv(csv_path)
                
                # Check if there is a 'Sentence' column
                if "Sentence" not in df.columns:
                    print(f"Column 'Sentence' not found in {file}. Skipping...")
                    continue
                
                # Process each sentence using the NER pipeline
                for idx, row in df.iterrows():
                    sentence = row["Sentence"]
                    ner_output = ner_pipe(sentence)
                    print(f"Sentence: {sentence}")
                    print("NER Output:", ner_output)
                    print("-" * 40)


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processing folder: name

Processing file: patient.csv




Sentence: Simran Parveen.
NER Output: [{'entity_group': 'Age', 'score': 0.06272781, 'word': 'Simran', 'start': 0, 'end': 6}]
----------------------------------------

Processing folder: segment_0

Processing file: doctor.csv




Sentence: Yes, come in.
NER Output: []
----------------------------------------
Sentence: Take your seat.
NER Output: []
----------------------------------------

Processing file: patient.csv




Sentence: May I come in, Doctor?
NER Output: []
----------------------------------------

Processing folder: segment_1

Processing file: doctor.csv




Sentence: Um, what's your name?
NER Output: []
----------------------------------------

Processing file: patient.csv




Sentence: Thank you, doctor.
NER Output: []
----------------------------------------

Processing folder: segment_10

Processing file: doctor.csv




Sentence: Hmm, any other symptoms?
NER Output: []
----------------------------------------

Processing folder: segment_11

Processing file: patient.csv




Sentence: Yeah, I also had bouts of vomiting last night and today morning as well.
NER Output: [{'entity_group': 'Sign_symptom', 'score': 0.20863366, 'word': 'vomiting', 'start': 25, 'end': 34}, {'entity_group': 'Date', 'score': 0.20515616, 'word': 'night', 'start': 39, 'end': 45}, {'entity_group': 'Date', 'score': 0.14541103, 'word': 'morning', 'start': 55, 'end': 63}]
----------------------------------------

Processing folder: segment_12

Processing file: doctor.csv




Sentence: Do you have a headache?
NER Output: []
----------------------------------------
Sentence: No.
NER Output: []
----------------------------------------

Processing folder: segment_13

Processing file: doctor.csv




Sentence: Did you have this kind of a stomach ache before?
NER Output: [{'entity_group': 'Disease_disorder', 'score': 0.14776263, 'word': 'stomach', 'start': 27, 'end': 35}]
----------------------------------------

Processing folder: segment_14

Processing file: patient.csv




Sentence: Yes, doctor.
NER Output: []
----------------------------------------
Sentence: I had it once before.
NER Output: []
----------------------------------------

Processing folder: segment_15

Processing file: doctor.csv




Sentence: How many days ago?
NER Output: []
----------------------------------------

Processing folder: segment_16

Processing file: patient.csv




Sentence: almost three months ago.
NER Output: [{'entity_group': 'Duration', 'score': 0.15342881, 'word': 'three', 'start': 6, 'end': 12}, {'entity_group': 'Duration', 'score': 0.10232926, 'word': 'months', 'start': 12, 'end': 19}]
----------------------------------------
Sentence: But at that time the pain stopped after I took an antacid.
NER Output: []
----------------------------------------

Processing folder: segment_17

Processing file: doctor.csv




Sentence: Hmm.
NER Output: []
----------------------------------------
Sentence: Um, please lie on that bed.
NER Output: []
----------------------------------------
Sentence: I have to check.
NER Output: []
----------------------------------------

Processing file: patient.csv




Sentence: Okay, doctor.
NER Output: []
----------------------------------------

Processing folder: segment_18

Processing file: doctor.csv




Sentence: Does it hurt here?
NER Output: []
----------------------------------------
Sentence: Okay, you can get it now.
NER Output: []
----------------------------------------

Processing file: patient.csv




Sentence: Yes, doctor, it hurts a lot.
NER Output: []
----------------------------------------

Processing folder: segment_19

Processing file: patient.csv




Sentence: Is it something serious, doctor?
NER Output: []
----------------------------------------

Processing folder: segment_20

Processing file: doctor.csv




Sentence: I can't say now, I'm writing down some tests.
NER Output: []
----------------------------------------
Sentence: Try to do this by today.
NER Output: []
----------------------------------------

Processing folder: segment_21

Processing file: doctor.csv




Sentence: But what about now?
NER Output: []
----------------------------------------
Sentence: I can't even work properly because of the pain.
NER Output: []
----------------------------------------
Sentence: Hmm, I understand.
NER Output: []
----------------------------------------
Sentence: I'm giving you an injection for temporary relief.
NER Output: []
----------------------------------------

Processing folder: segment_22

Processing file: doctor.csv




Sentence: Injection?
NER Output: []
----------------------------------------
Sentence: Don't you have any medicine?
NER Output: [{'entity_group': 'History', 'score': 0.27738333, 'word': 'medicine?', 'start': 18, 'end': 28}]
----------------------------------------

Processing folder: segment_23

Processing file: doctor.csv




Sentence: Why?
NER Output: []
----------------------------------------
Sentence: Are you scared of injections?
NER Output: []
----------------------------------------
Sentence: It's not like that.
NER Output: []
----------------------------------------
Sentence: I mean, it would be better if you could give me some medicine.
NER Output: []
----------------------------------------

Processing folder: segment_24

Processing file: doctor.csv




Sentence: Nothing will happen.
NER Output: []
----------------------------------------
Sentence: You won't even feel it.
NER Output: []
----------------------------------------
Sentence: Look at that side.
NER Output: []
----------------------------------------

Processing folder: segment_25

Processing file: patient.csv




Sentence: Please, doctor, be careful.
NER Output: []
----------------------------------------

Processing folder: segment_26

Processing file: doctor.csv




Sentence: You can open your eyes now.
NER Output: []
----------------------------------------
Sentence: It's already done.
NER Output: []
----------------------------------------

Processing folder: segment_27

Processing file: patient.csv




Sentence: Oh, it's done.
NER Output: []
----------------------------------------
Sentence: Thank you so much.
NER Output: []
----------------------------------------
Sentence: I did not feel anything at all.
NER Output: []
----------------------------------------

Processing folder: segment_28

Processing file: doctor.csv




Sentence: After receiving the test reports, bring them to me as soon as possible.
NER Output: []
----------------------------------------

Processing folder: segment_29

Processing file: doctor.csv




Sentence: There's nothing to fear, right?
NER Output: []
----------------------------------------
Sentence: Don't be so scared beforehand.
NER Output: []
----------------------------------------
Sentence: Let's see the repose first.
NER Output: []
----------------------------------------

Processing folder: segment_3

Processing file: doctor.csv




Sentence: Thank you.
NER Output: []
----------------------------------------

Processing folder: segment_30

Processing file: doctor.csv




Sentence: Hmm, I'm prescribing this medicine.
NER Output: []
----------------------------------------
Sentence: It's just for today.
NER Output: [{'entity_group': 'Detailed_description', 'score': 0.11374801, 'word': 'just', 'start': 4, 'end': 9}]
----------------------------------------
Sentence: Take it after your dinner.
NER Output: [{'entity_group': 'Detailed_description', 'score': 0.09271472, 'word': 'after', 'start': 7, 'end': 13}]
----------------------------------------

Processing file: patient.csv




Sentence: Won't you give me any medicines doctor?
NER Output: []
----------------------------------------

Processing folder: segment_31

Processing file: doctor.csv




Sentence: Um, where shall I submit the fees?
NER Output: []
----------------------------------------
Sentence: Please submit that in the cash counter.
NER Output: []
----------------------------------------

Processing file: patient.csv




Sentence: Okay, doctor.
NER Output: []
----------------------------------------

Processing folder: segment_32

Processing file: patient.csv




Sentence: Thank you, Doctor.
NER Output: []
----------------------------------------

Processing folder: segment_33

Processing file: doctor.csv




Sentence: Welcome.
NER Output: []
----------------------------------------

Processing folder: segment_34

Processing file: doctor.csv




Sentence: Oh yes, come in please.
NER Output: []
----------------------------------------
Sentence: Here are the reports of the test that you gave.
NER Output: []
----------------------------------------
Sentence: Oh yeah, let me check them.
NER Output: []
----------------------------------------

Processing file: patient.csv




Sentence: May I come in, doctor?
NER Output: []
----------------------------------------

Processing folder: segment_35

Processing file: patient.csv




Sentence: I'm sorry.
NER Output: []
----------------------------------------

Processing folder: segment_36

Processing file: doctor.csv




Sentence: Hmm, it's not that serious, nothing to worry about.
NER Output: []
----------------------------------------
Sentence: It was just food poisoning.
NER Output: [{'entity_group': 'Disease_disorder', 'score': 0.3556828, 'word': 'food', 'start': 11, 'end': 16}]
----------------------------------------
Sentence: And if you face this problem again, come back immediately.
NER Output: []
----------------------------------------
Sentence: You're welcome.
NER Output: []
----------------------------------------

Processing file: patient.csv




Sentence: I'm writing down a medicines, please take them for one week after dinner.
NER Output: [{'entity_group': 'Duration', 'score': 0.31108415, 'word': 'one', 'start': 50, 'end': 54}, {'entity_group': 'Duration', 'score': 0.46910623, 'word': 'week', 'start': 54, 'end': 59}, {'entity_group': 'Duration', 'score': 0.13263787, 'word': 'after', 'start': 59, 'end': 65}]
----------------------------------------
Sentence: Oh, okay doctor.
NER Output: []
----------------------------------------
Sentence: Sure, doctor, thank you.
NER Output: []
----------------------------------------

Processing folder: segment_4

Processing file: doctor.csv




Sentence: Hmm.
NER Output: []
----------------------------------------
Sentence: And how old are you?
NER Output: []
----------------------------------------

Processing folder: segment_5

Processing file: patient.csv




Sentence: I'm 29.
NER Output: []
----------------------------------------

Processing folder: segment_6

Processing file: doctor.csv




Sentence: Oops!
NER Output: []
----------------------------------------

Processing folder: segment_7

Processing file: doctor.csv




Sentence: Okay.
NER Output: []
----------------------------------------

Processing folder: segment_8

Processing file: doctor.csv




Sentence: Now tell me, what are the problems that you're facing?
NER Output: []
----------------------------------------

Processing folder: segment_9

Processing file: patient.csv
Sentence: Since yesterday night I've been having severe stomach ache.
NER Output: [{'entity_group': 'Date', 'score': 0.31920394, 'word': 'yesterdaynight', 'start': 5, 'end': 21}, {'entity_group': 'Severity', 'score': 0.49358055, 'word': 'severe', 'start': 38, 'end': 45}, {'entity_group': 'Sign_symptom', 'score': 0.23591548, 'word': 'stomach', 'start': 45, 'end': 53}]
----------------------------------------




Sentence: I took an antithecid last night but the pain was still the same.
NER Output: [{'entity_group': 'Medication', 'score': 0.6232004, 'word': 'antithecid', 'start': 9, 'end': 20}, {'entity_group': 'Date', 'score': 0.19697075, 'word': 'night', 'start': 25, 'end': 31}]
----------------------------------------


In [11]:
# from transformers import pipeline

# # Load the model
# model_path = "Helios9/BIOMed_NER"
# pipe = pipeline(
#     task="token-classification",
#     model=model_path,
#     tokenizer=model_path,
#     aggregation_strategy="max"
# )

# # Test the pipeline
# text = ("A 48-year-old female presented with vaginal bleeding and abnormal Pap smears. "
#         "Upon diagnosis of invasive non-keratinizing SCC of the cervix, she underwent a radical "
#         "hysterectomy with salpingo-oophorectomy which demonstrated positive spread to the pelvic "
#         "lymph nodes and the parametrium.")
# result = pipe(text)
# print(result)

from Bio_Epidemiology_NER.bio_recognizer import ner_prediction

# returns the predicted class along with the probability of the actual EnvBert model
doc = """
	CASE: A woman with age of 65 years facing a headache, fever and shivering in legs
      for last two months and this occurs nearly 2 to 3 times in a week
      """

# returns a dataframe output
print(ner_prediction(corpus=doc, compute='gpu')) #pass compute='gpu' if using gpu

from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Device set to use cpu
  final_df = final_df.append(disease_df) # adding the disease_df to existing
  master_df = master_df.append(final_df)


           entity_group                                              value  \
0                   Sex                                              woman   
1                   Age                                    age of 65 years   
2          Sign_symptom                                              fever   
3          Sign_symptom                                          shivering   
4  Biological_structure                                               legs   
5              Duration                                         two months   
6             Frequency  nearly 2 to 3 times facing shivering in in wom...   

      score  
0  0.999641  
1  0.986176  
2  0.999828  
3  0.999826  
4  0.999890  
5  0.996540  
6  0.924235  


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

In [12]:
import os
import pandas as pd
from transformers import pipeline

# Define directory where segmented CSVs are stored
base_dir = r"D:\whisper_med\processed_transcriptions"

def merge_csvs(role):
    merged_data = []
    
    # Iterate over all segment folders
    for folder in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, folder)
        if os.path.isdir(folder_path):  # Ensure it's a directory
            for file in os.listdir(folder_path):
                if file.endswith(f"{role}.csv"):  # Match only the role-specific files
                    file_path = os.path.join(folder_path, file)
                    df = pd.read_csv(file_path, header=None)  # Read without assuming headers
                    merged_data.append(df)
    
    if merged_data:
        merged_df = pd.concat(merged_data, ignore_index=True)
        merged_file = os.path.join(base_dir, f"{role}_merged.csv")
        merged_df.to_csv(merged_file, index=False, header=False)  # Save without headers
        return merged_file
    else:
        print(f"No files found for role: {role}")
        return None

# Merge CSVs for doctors and patients
doctor_csv = merge_csvs("doctor")
patient_csv = merge_csvs("patient")

# Load Helios9/BIOMed_NER model
model_path = "Helios9/BIOMed_NER"
ner_pipeline = pipeline(task="token-classification", model=model_path, tokenizer=model_path, aggregation_strategy="max")

def run_ner_on_csv(csv_file):
    if csv_file is None:
        return
    
    df = pd.read_csv(csv_file, header=None)
    full_text = " ".join(df[0].astype(str))  # Merge all sentences into one text
    result = ner_pipeline(full_text)
    
    print(f"NER results for {csv_file}:")
    for entity in result:
        print(entity)

# Run NER on merged doctor and patient CSVs
run_ner_on_csv(doctor_csv)
run_ner_on_csv(patient_csv)


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


NER results for D:\whisper_med\processed_transcriptions\doctor_merged.csv:
{'entity_group': 'Sign_symptom', 'score': 0.50863206, 'word': 'stomachache', 'start': 177, 'end': 190}
{'entity_group': 'Duration', 'score': 0.34032366, 'word': 'days', 'start': 216, 'end': 221}
{'entity_group': 'Therapeutic_procedure', 'score': 0.4825295, 'word': 'injection', 'start': 533, 'end': 543}
{'entity_group': 'Disease_disorder', 'score': 0.6709282, 'word': 'food', 'start': 1448, 'end': 1453}
NER results for D:\whisper_med\processed_transcriptions\patient_merged.csv:
{'entity_group': 'Sign_symptom', 'score': 0.88265896, 'word': 'vomiting', 'start': 119, 'end': 128}
{'entity_group': 'Duration', 'score': 0.14955811, 'word': 'night', 'start': 133, 'end': 139}
{'entity_group': 'Date', 'score': 0.19101742, 'word': 'todaymorning', 'start': 143, 'end': 157}
{'entity_group': 'Date', 'score': 0.48064843, 'word': 'threemonths', 'start': 226, 'end': 239}
{'entity_group': 'Sign_symptom', 'score': 0.66078514, 'word'



In [15]:
import os
import pandas as pd
from transformers import pipeline

# Define directory where segmented CSVs are stored
base_dir = r"D:\whisper_med\processed_transcriptions"

# Function to merge CSVs for a given role
def merge_csvs(role):
    merged_data = []
    
    for folder in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, folder)
        if os.path.isdir(folder_path):
            for file in os.listdir(folder_path):
                if file.endswith(f"{role}.csv"):
                    file_path = os.path.join(folder_path, file)
                    df = pd.read_csv(file_path, header=None)
                    merged_data.append(df)
    
    if merged_data:
        merged_df = pd.concat(merged_data, ignore_index=True)
        merged_file = os.path.join(base_dir, f"{role}_merged.csv")
        merged_df.to_csv(merged_file, index=False, header=False)
        return merged_file
    return None

# Merge CSVs for doctors and patients
doctor_csv = merge_csvs("doctor")
patient_csv = merge_csvs("patient")

# Load Helios9/BIOMed_NER model
model_path = "Helios9/BIOMed_NER"
ner_pipeline = pipeline(task="token-classification", model=model_path, tokenizer=model_path, aggregation_strategy="average")

# Function to extract and group NER entities
def extract_and_group_entities(csv_file, output_file):
    if csv_file is None:
        return
    
    df = pd.read_csv(csv_file, header=None)
    full_text = " ".join(df[0].astype(str))
    ner_results = ner_pipeline(full_text)
    
    # Group entities
    grouped_entities = {"Sign/Symptoms": [], "Predicted Diseases": [], "Duration": [], "Medication": []}
    
    for entity in ner_results:
        entity_text = entity['word']
        entity_group = entity['entity_group'].lower()
        
        if "symptom" in entity_group:
            grouped_entities["Sign/Symptoms"].append(entity_text)
        elif "disease" in entity_group:
            grouped_entities["Predicted Diseases"].append(entity_text)
        elif "duration" in entity_group:
            grouped_entities["Duration"].append(entity_text)
        elif "medication" in entity_group:
            grouped_entities["Medication"].append(entity_text)
    
    # Save results to CSV
    grouped_df = pd.DataFrame([grouped_entities])
    grouped_df.to_csv(output_file, index=False)
    print(f"Saved NER results to {output_file}")

# Run NER and save structured results
extract_and_group_entities(doctor_csv, os.path.join(base_dir, "doctor_ner_results.csv"))
extract_and_group_entities(patient_csv, os.path.join(base_dir, "patient_ner_results.csv"))


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Saved NER results to D:\whisper_med\processed_transcriptions\doctor_ner_results.csv
Saved NER results to D:\whisper_med\processed_transcriptions\patient_ner_results.csv


