In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import pickle

class CustomModel(torch.nn.Module):
    """Sequence classifier: an encoder followed by dropout and one linear layer.

    The representation of the first token (position 0 of ``last_hidden_state``)
    is used as the pooled sentence vector.
    """

    def __init__(self, base_model, num_labels):
        """
        Parameters:
        base_model: Encoder exposing ``config.hidden_size`` and returning an
            object with a ``last_hidden_state`` attribute when called
        num_labels (int): Number of output classes
        """
        super().__init__()
        hidden_size = base_model.config.hidden_size
        self.base_model = base_model
        self.dropout = torch.nn.Dropout(p=0.3)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the encoder and the classification head.

        Parameters:
        input_ids: Token ids forwarded to the encoder
        attention_mask: Attention mask forwarded to the encoder
        labels: Optional class indices; when given, the cross-entropy loss is computed

        Returns:
        dict: {"loss": loss or None, "logits": classifier output}
        """
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Slice out the first token's hidden state for every item in the batch.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))

        if labels is None:
            return {"loss": None, "logits": logits}
        return {"loss": self.loss_fn(logits, labels), "logits": logits}

# Function to load models
def load_complete_model(model_path):
    """
    Rebuild a CustomModel from a saved model directory.

    The directory must contain encoder files loadable by
    ``AutoModel.from_pretrained`` and a ``classifier_state.pt`` file holding a
    dict with the keys ``'num_labels'`` and ``'classifier_state'``.

    Parameters:
    model_path (str): Directory with the encoder files and classifier_state.pt

    Returns:
    CustomModel: Model with the classifier weights restored, switched to eval mode
    """
    base_model = AutoModel.from_pretrained(model_path)
    # weights_only=True limits unpickling to tensors and plain containers, so a
    # tampered checkpoint cannot execute code on load; it also silences the
    # FutureWarning torch emits when the argument is left unset.
    classifier_state = torch.load(
        f"{model_path}/classifier_state.pt",
        map_location=torch.device('cpu'),
        weights_only=True,
    )
    model = CustomModel(base_model, classifier_state['num_labels'])
    model.classifier.load_state_dict(classifier_state['classifier_state'])
    # Inference only: eval() disables the dropout layer.
    model.eval()
    return model

# Load models and tokenizers
# Each directory is read three times below: for its tokenizer, for the model
# (via load_complete_model) and for its category_mapping.pkl.
model_paths = [
    "/kaggle/input/medicall/other/default/1/BioBert",
    "/kaggle/input/medicall/other/default/1/arabic_text_classifier_final",
    "/kaggle/input/medicall/other/default/1/mBert"
]

# Parallel lists: entry i of each list belongs to model_paths[i].
models = []
tokenizers = []
category_mappings = []

for path in model_paths:
    # Load tokenizer
    tokenizers.append(AutoTokenizer.from_pretrained(path))
    
    # Load model
    model = load_complete_model(path)
    models.append(model)
    
    # Load category mapping
    # NOTE: pickle.load can execute arbitrary code; only use mapping files
    # from a trusted source.
    with open(f"{path}/category_mapping.pkl", "rb") as f:
        category_mapping = pickle.load(f)
    category_mappings.append(category_mapping)

# Verify all mappings are the same (they should be)
# NOTE(review): a mismatch only produces a warning. If the mappings really
# differ, class indices are not aligned across models and combining their
# outputs would not be meaningful.
is_same_mapping = all(category_mappings[0] == mapping for mapping in category_mappings)
if not is_same_mapping:
    print("Warning: Category mappings are different across models. Using the first model's mapping.")

# Use the first model's mapping
category_mapping = category_mappings[0]
# Inverse lookup (mapping value -> mapping key), used to turn a predicted
# index back into a category name.
category_mapping_reverse = {v: k for k, v in category_mapping.items()}

# Function to get predictions from all models
def get_ensemble_prediction(text, voting='soft'):
    """
    Classify one text by combining the outputs of every loaded model.

    Parameters:
    text (str): Text to classify
    voting (str): 'hard' lets each model vote for its top class; any other
        value (default 'soft') averages the per-model softmax probabilities

    Returns:
    tuple: (predicted category name, list with one logits array per model)
    """
    all_logits = []

    # Run every model on the same text, each with its own tokenizer.
    for model, tokenizer in zip(models, tokenizers):
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        # Batch size is 1, so keep only the first row of the logits.
        all_logits.append(outputs["logits"].cpu().numpy()[0])

    if voting == 'hard':
        # One vote per model; np.argmax returns the first maximum, so ties go
        # to the lowest class index.
        predictions = [np.argmax(logits) for logits in all_logits]
        vote_counts = np.bincount(predictions, minlength=len(category_mapping))
        ensemble_prediction = np.argmax(vote_counts)
    else:
        probs = []
        for logits in all_logits:
            # Subtracting the max before exponentiating leaves the softmax
            # unchanged mathematically but prevents overflow on large logits.
            exp_shifted = np.exp(logits - np.max(logits))
            probs.append(exp_shifted / np.sum(exp_shifted))
        avg_probs = np.mean(probs, axis=0)
        ensemble_prediction = np.argmax(avg_probs)

    # Convert the NumPy integer to a plain int before the dictionary lookup.
    predicted_category = category_mapping_reverse[int(ensemble_prediction)]

    return predicted_category, all_logits

# Function to get individual model predictions (for comparison)
def get_individual_predictions(text):
    """
    Predict the category of one text with each model separately.

    Parameters:
    text (str): Text to classify

    Returns:
    list: (model name, predicted category) tuples in model order, where the
        model name is the last path component of the matching model_paths entry
    """
    per_model = []
    for position, pair in enumerate(zip(models, tokenizers)):
        net, tok = pair
        encoded = tok(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            scores = net(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])["logits"]

        label_id = torch.argmax(scores, dim=-1).item()
        category = category_mapping_reverse[label_id]

        short_name = model_paths[position].rsplit('/', 1)[-1]
        per_model.append((short_name, category))

    return per_model

# Example usage
def predict_with_ensemble(text, verbose=True, voting='soft'):
    """
    Classify one text with the ensemble, optionally printing a breakdown.

    Parameters:
    text (str): Text to classify
    verbose (bool): When True, print the input, each model's own prediction
        and the ensemble result
    voting (str): Passed through to get_ensemble_prediction ('soft' averages
        probabilities, 'hard' is a majority vote)

    Returns:
    str: Predicted category
    """
    if not verbose:
        # Quiet path: only the ensemble result is needed.
        category, _logits = get_ensemble_prediction(text, voting)
        return category

    print(f"Input text: {text}")
    print("\nIndividual model predictions:")
    for name, label in get_individual_predictions(text):
        print(f"- {name}: {label}")

    category, _logits = get_ensemble_prediction(text, voting)
    print(f"\nEnsemble prediction ({voting} voting): {category}")
    return category

# Example tests
# Short sample inputs used as a smoke test of the loaded ensemble.
test_texts = [
    "انيميا حاده",
    "التهاب في المعده",
    "جفاف العين",
    "نزله برد",
    "كسر في المفصل",
    "ضيق في الصمام",
    "عقم",
    "تاخر الدوره الشهريه"
]

print("Testing ensemble predictions...\n")
for text in test_texts:
    # Separator line between samples; predict_with_ensemble prints the
    # per-model and ensemble predictions itself (verbose defaults to True).
    print("-" * 80)
    predict_with_ensemble(text, voting='soft')
    print()

# Save the ensemble model
def save_ensemble_model(models, tokenizers, category_mapping, save_path, voting='soft'):
    """
    Write the ensemble to a directory and zip that directory.

    Layout under save_path: one ``model_<i>`` sub-directory per model (encoder,
    tokenizer and ``classifier_state.pt``), plus ``category_mapping.pkl`` and
    ``ensemble_config.pkl``. The archive is written next to the directory as
    ``<save_path>.zip``.

    Parameters:
    models (list): Models exposing ``base_model`` and ``classifier`` attributes
    tokenizers (list): Tokenizers, paired with models by position
    category_mapping (dict): Mapping stored alongside the models
    save_path (str): Output directory (created if missing)
    voting (str): Default voting method recorded in the ensemble config

    Returns:
    None
    """
    import os
    import shutil

    os.makedirs(save_path, exist_ok=True)

    for i, (model, tokenizer) in enumerate(zip(models, tokenizers)):
        model_dir = os.path.join(save_path, f"model_{i}")
        os.makedirs(model_dir, exist_ok=True)

        # Encoder and tokenizer share one directory.
        model.base_model.save_pretrained(model_dir)
        tokenizer.save_pretrained(model_dir)

        # The classification head is not part of the encoder, so it is stored
        # separately together with the number of classes it outputs.
        classifier_state = {
            'classifier_state': model.classifier.state_dict(),
            'num_labels': model.classifier.out_features
        }
        torch.save(classifier_state, os.path.join(model_dir, "classifier_state.pt"))

    with open(os.path.join(save_path, "category_mapping.pkl"), "wb") as f:
        pickle.dump(category_mapping, f)

    config = {
        'model_count': len(models),
        'voting': voting
    }
    with open(os.path.join(save_path, "ensemble_config.pkl"), "wb") as f:
        pickle.dump(config, f)

    # normpath drops a trailing separator; without it make_archive would
    # create ".zip" inside the directory that is being archived.
    archive_base = os.path.normpath(save_path)
    shutil.make_archive(archive_base, 'zip', save_path)

    print(f"Ensemble model saved to {archive_base}.zip")

# Save the ensemble model
# Writes the "arabic_medical_ensemble" directory and a zip archive of it,
# both relative to the current working directory.
save_ensemble_model(models, tokenizers, category_mapping, "arabic_medical_ensemble")

  classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))


Testing ensemble predictions...

--------------------------------------------------------------------------------
Input text: انيميا حاده

Individual model predictions:
- BioBert: امراض الدم
- arabic_text_classifier_final: امراض الدم
- mBert: امراض الدم

Ensemble prediction (soft voting): امراض الدم

--------------------------------------------------------------------------------
Input text: التهاب في المعده

Individual model predictions:
- BioBert: امراض الجهاز الهضمي
- arabic_text_classifier_final: امراض الجهاز الهضمي
- mBert: امراض الجهاز الهضمي

Ensemble prediction (soft voting): امراض الجهاز الهضمي

--------------------------------------------------------------------------------
Input text: جفاف العين

Individual model predictions:
- BioBert: امراض العيون
- arabic_text_classifier_final: امراض العيون
- mBert: امراض العيون

Ensemble prediction (soft voting): امراض العيون

--------------------------------------------------------------------------------
Input text: نزله برد

Individua

In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import pickle
import random
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

class CustomModel(torch.nn.Module):
    """Sequence classifier: an encoder followed by dropout and one linear layer.

    The representation of the first token (position 0 of ``last_hidden_state``)
    is used as the pooled sentence vector.
    """

    def __init__(self, base_model, num_labels):
        """
        Parameters:
        base_model: Encoder exposing ``config.hidden_size`` and returning an
            object with a ``last_hidden_state`` attribute when called
        num_labels (int): Number of output classes
        """
        super().__init__()
        hidden_size = base_model.config.hidden_size
        self.base_model = base_model
        self.dropout = torch.nn.Dropout(p=0.3)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the encoder and the classification head.

        Parameters:
        input_ids: Token ids forwarded to the encoder
        attention_mask: Attention mask forwarded to the encoder
        labels: Optional class indices; when given, the cross-entropy loss is computed

        Returns:
        dict: {"loss": loss or None, "logits": classifier output}
        """
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Slice out the first token's hidden state for every item in the batch.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))

        if labels is None:
            return {"loss": None, "logits": logits}
        return {"loss": self.loss_fn(logits, labels), "logits": logits}

# Function to load models
def load_complete_model(model_path):
    """
    Rebuild a CustomModel from a saved model directory.

    The directory must contain encoder files loadable by
    ``AutoModel.from_pretrained`` and a ``classifier_state.pt`` file holding a
    dict with the keys ``'num_labels'`` and ``'classifier_state'``.

    Parameters:
    model_path (str): Directory with the encoder files and classifier_state.pt

    Returns:
    CustomModel: Model with the classifier weights restored, switched to eval mode
    """
    base_model = AutoModel.from_pretrained(model_path)
    # weights_only=True limits unpickling to tensors and plain containers, so a
    # tampered checkpoint cannot execute code on load; it also silences the
    # FutureWarning torch emits when the argument is left unset.
    classifier_state = torch.load(
        f"{model_path}/classifier_state.pt",
        map_location=torch.device('cpu'),
        weights_only=True,
    )
    model = CustomModel(base_model, classifier_state['num_labels'])
    model.classifier.load_state_dict(classifier_state['classifier_state'])
    # Inference only: eval() disables the dropout layer.
    model.eval()
    return model

# Load models and tokenizers
# Each directory is read three times below: for its tokenizer, for the model
# (via load_complete_model) and for its category_mapping.pkl.
model_paths = [
    "/kaggle/input/medicall/other/default/1/BioBert",
    "/kaggle/input/medicall/other/default/1/arabic_text_classifier_final",
    "/kaggle/input/medicall/other/default/1/mBert"
]

# Parallel lists: entry i of each list belongs to model_paths[i].
models = []
tokenizers = []
category_mappings = []

for path in model_paths:
    # Load tokenizer
    tokenizers.append(AutoTokenizer.from_pretrained(path))
    
    # Load model
    model = load_complete_model(path)
    models.append(model)
    
    # Load category mapping
    # NOTE: pickle.load can execute arbitrary code; only use mapping files
    # from a trusted source.
    with open(f"{path}/category_mapping.pkl", "rb") as f:
        category_mapping = pickle.load(f)
    category_mappings.append(category_mapping)

# Verify all mappings are the same (they should be)
# NOTE(review): a mismatch only produces a warning. If the mappings really
# differ, class indices are not aligned across models and combining their
# outputs would not be meaningful.
is_same_mapping = all(category_mappings[0] == mapping for mapping in category_mappings)
if not is_same_mapping:
    print("Warning: Category mappings are different across models. Using the first model's mapping.")

# Use the first model's mapping
category_mapping = category_mappings[0]
# Inverse lookup (mapping value -> mapping key), used to turn a predicted
# index back into a category name.
category_mapping_reverse = {v: k for k, v in category_mapping.items()}

# Function to get predictions from all models
def get_ensemble_prediction(text, voting='soft'):
    """
    Classify one text by combining the outputs of every loaded model.

    Parameters:
    text (str): Text to classify
    voting (str): 'hard' lets each model vote for its top class; any other
        value (default 'soft') averages the per-model softmax probabilities

    Returns:
    tuple: (predicted category name, list with one logits array per model)
    """
    all_logits = []

    # Run every model on the same text, each with its own tokenizer.
    for model, tokenizer in zip(models, tokenizers):
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        # Batch size is 1, so keep only the first row of the logits.
        all_logits.append(outputs["logits"].cpu().numpy()[0])

    if voting == 'hard':
        # One vote per model; np.argmax returns the first maximum, so ties go
        # to the lowest class index.
        predictions = [np.argmax(logits) for logits in all_logits]
        vote_counts = np.bincount(predictions, minlength=len(category_mapping))
        ensemble_prediction = np.argmax(vote_counts)
    else:
        probs = []
        for logits in all_logits:
            # Subtracting the max before exponentiating leaves the softmax
            # unchanged mathematically but prevents overflow on large logits.
            exp_shifted = np.exp(logits - np.max(logits))
            probs.append(exp_shifted / np.sum(exp_shifted))
        avg_probs = np.mean(probs, axis=0)
        ensemble_prediction = np.argmax(avg_probs)

    # Convert the NumPy integer to a plain int before the dictionary lookup.
    predicted_category = category_mapping_reverse[int(ensemble_prediction)]

    return predicted_category, all_logits

# Function to get individual model predictions (for comparison)
def get_individual_predictions(text):
    """
    Predict the category of one text with each model separately.

    Parameters:
    text (str): Text to classify

    Returns:
    list: (model name, predicted category) tuples in model order, where the
        model name is the last path component of the matching model_paths entry
    """
    results = []
    for i, (model, tokenizer) in enumerate(zip(models, tokenizers)):
        # Tokenize the input text
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        # Make prediction (a stray character after the colon on this line
        # previously made the whole cell a syntax error)
        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        # Get logits and predicted class
        logits = outputs["logits"]
        predicted_label = torch.argmax(logits, dim=-1).item()
        predicted_category = category_mapping_reverse[predicted_label]

        model_name = model_paths[i].split('/')[-1]
        results.append((model_name, predicted_category))

    return results

# Make predictions with ensemble model
def predict_with_ensemble(text, verbose=True, voting='soft'):
    """
    Classify one text with the ensemble, optionally printing a breakdown.

    Parameters:
    text (str): Text to classify
    verbose (bool): When True, print the input, each model's own prediction
        and the ensemble result
    voting (str): Passed through to get_ensemble_prediction ('soft' averages
        probabilities, 'hard' is a majority vote)

    Returns:
    str: Predicted category
    """
    if not verbose:
        # Quiet path: only the ensemble result is needed.
        category, _logits = get_ensemble_prediction(text, voting)
        return category

    print(f"Input text: {text}")
    print("\nIndividual model predictions:")
    for name, label in get_individual_predictions(text):
        print(f"- {name}: {label}")

    category, _logits = get_ensemble_prediction(text, voting)
    print(f"\nEnsemble prediction ({voting} voting): {category}")
    return category

# Function to load and prepare test data from Kaggle dataset
def load_test_data(test_data_path):
    """
    Read a test set from an .xlsx or .csv file and print a short summary.

    Parameters:
    test_data_path (str): Path to the test data file

    Returns:
    pandas.DataFrame: Test data, exactly as read from the file

    Raises:
    ValueError: If the path ends with neither .xlsx nor .csv
    """
    # Suffix -> reader, checked in this order.
    readers = (('.xlsx', pd.read_excel), ('.csv', pd.read_csv))
    reader = next(
        (read for suffix, read in readers if test_data_path.endswith(suffix)),
        None,
    )
    if reader is None:
        raise ValueError("Unsupported file format. Please use .xlsx or .csv")

    frame = reader(test_data_path)

    # Row count and column names, printed so the caller can check the schema.
    print(f"Loaded test data with {len(frame)} rows")
    print(f"Columns in test data: {frame.columns.tolist()}")

    return frame

# Evaluate ensemble model on test data from dataset
def evaluate_ensemble_on_test_data(test_data_path, text_column='q_body', 
                                  category_column='category', voting='soft',
                                  shuffle=True, sample_size=None):
    """
    Evaluate the ensemble model on a test dataset

    Rows whose category is not a key of category_mapping are dropped, and rows
    with a missing or empty text are skipped. Per-sample results are written
    to "ensemble_evaluation_results.csv" in the working directory.

    Parameters:
    test_data_path (str): Path to the test data file
    text_column (str): Column name containing the text to classify
    category_column (str): Column name containing the category labels
    voting (str): 'soft' to average probabilities, 'hard' for majority voting
    shuffle (bool): Whether to shuffle the test data (fixed random_state=42)
    sample_size (int, optional): Number of samples to use (None for all)

    Returns:
    dict: accuracy (percent), true_labels, predicted_labels,
        classification_report, confusion_matrix and results_df. When no row
        could be evaluated, accuracy is NaN, the report is an empty string,
        the matrix is empty and no CSV is written.
    """
    # Load test data
    test_df = load_test_data(test_data_path)

    # Keep only rows whose label the models know about
    valid_categories = list(category_mapping.keys())
    original_count = len(test_df)
    test_df = test_df[test_df[category_column].isin(valid_categories)]
    filtered_count = len(test_df)

    if filtered_count < original_count:
        print(f"Filtered out {original_count - filtered_count} rows with invalid categories")
        print(f"Valid categories: {valid_categories}")
        print(f"Remaining test data size: {filtered_count}")

    # Shuffle if requested
    if shuffle:
        test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)
        print("Test data has been shuffled")

    # Take a sample if specified
    if sample_size is not None and sample_size < len(test_df):
        test_df = test_df.iloc[:sample_size]
        print(f"Using a sample of {sample_size} rows from test data")

    true_labels = []
    predicted_labels = []
    all_results = []

    total = len(test_df)
    print(f"\nProcessing {total} test samples...")
    correct = 0

    # `position` counts rows in processing order. `idx` is the DataFrame index
    # label, which is not contiguous when rows were filtered without
    # shuffling, so it is used only to identify the row in the output.
    for position, (idx, row) in enumerate(test_df.iterrows()):
        if position % 50 == 0:
            print(f"Processing sample {position}/{total}")

        text = row[text_column]
        true_category = row[category_column]

        # Skip if text is missing or NaN
        if pd.isna(text) or text == "":
            print(f"Skipping empty text at index {idx}")
            continue

        pred_category = predict_with_ensemble(text, verbose=False, voting=voting)

        is_correct = pred_category == true_category
        if is_correct:
            correct += 1

        # Convert categories to indices for metrics calculation
        true_labels.append(category_mapping[true_category])
        predicted_labels.append(category_mapping[pred_category])

        all_results.append({
            "idx": idx,
            "text": text,
            "true_category": true_category,
            "predicted_category": pred_category,
            "correct": is_correct
        })

        # Print details for the first 5 rows and for every incorrect prediction
        if position < 5 or not is_correct:
            status = "✓" if is_correct else "✗"
            print(f"\nSample {idx}: {status}")
            print(f"Text: {text}")
            print(f"True: {true_category}")
            print(f"Pred: {pred_category}")

    results_df = pd.DataFrame(all_results)

    # Nothing evaluated: return an empty result instead of dividing by zero.
    if not true_labels:
        print("\nNo samples were evaluated; skipping metrics.")
        return {
            "accuracy": float("nan"),
            "true_labels": true_labels,
            "predicted_labels": predicted_labels,
            "classification_report": "",
            "confusion_matrix": np.zeros((0, 0), dtype=int),
            "results_df": results_df
        }

    # Calculate accuracy
    accuracy = correct / len(true_labels) * 100
    print(f"\nOverall accuracy: {accuracy:.2f}% ({correct}/{len(true_labels)})")

    # Report only the classes that actually occur, and derive their names from
    # the mapping by index. Passing every mapping key as target_names fails
    # when a class is absent and relies on the dict's insertion order.
    label_ids = sorted(set(true_labels) | set(predicted_labels))
    id_to_name = {v: k for k, v in category_mapping.items()}
    target_names = [id_to_name[label_id] for label_id in label_ids]

    print("\nClassification Report:")
    report = classification_report(true_labels, predicted_labels,
                                   labels=label_ids, target_names=target_names)
    print(report)

    print("\nConfusion Matrix:")
    cm = confusion_matrix(true_labels, predicted_labels, labels=label_ids)
    print(cm)

    # Save results to CSV
    results_file = "ensemble_evaluation_results.csv"
    results_df.to_csv(results_file, index=False)
    print(f"\nDetailed evaluation results saved to {results_file}")

    # Return metrics as dictionary
    return {
        "accuracy": accuracy,
        "true_labels": true_labels,
        "predicted_labels": predicted_labels,
        "classification_report": report,
        "confusion_matrix": cm,
        "results_df": results_df
    }

# Save the ensemble model
def save_ensemble_model(models, tokenizers, category_mapping, save_path, voting='soft'):
    """
    Write the ensemble to a directory and zip that directory.

    Layout under save_path: one ``model_<i>`` sub-directory per model (encoder,
    tokenizer and ``classifier_state.pt``), plus ``category_mapping.pkl`` and
    ``ensemble_config.pkl``. The archive is written next to the directory as
    ``<save_path>.zip``.

    Parameters:
    models (list): Models exposing ``base_model`` and ``classifier`` attributes
    tokenizers (list): Tokenizers, paired with models by position
    category_mapping (dict): Mapping stored alongside the models
    save_path (str): Output directory (created if missing)
    voting (str): Default voting method recorded in the ensemble config

    Returns:
    None
    """
    import os
    import shutil

    os.makedirs(save_path, exist_ok=True)

    for i, (model, tokenizer) in enumerate(zip(models, tokenizers)):
        model_dir = os.path.join(save_path, f"model_{i}")
        os.makedirs(model_dir, exist_ok=True)

        # Encoder and tokenizer share one directory.
        model.base_model.save_pretrained(model_dir)
        tokenizer.save_pretrained(model_dir)

        # The classification head is not part of the encoder, so it is stored
        # separately together with the number of classes it outputs.
        classifier_state = {
            'classifier_state': model.classifier.state_dict(),
            'num_labels': model.classifier.out_features
        }
        torch.save(classifier_state, os.path.join(model_dir, "classifier_state.pt"))

    with open(os.path.join(save_path, "category_mapping.pkl"), "wb") as f:
        pickle.dump(category_mapping, f)

    config = {
        'model_count': len(models),
        'voting': voting
    }
    with open(os.path.join(save_path, "ensemble_config.pkl"), "wb") as f:
        pickle.dump(config, f)

    # normpath drops a trailing separator; without it make_archive would
    # create ".zip" inside the directory that is being archived.
    archive_base = os.path.normpath(save_path)
    shutil.make_archive(archive_base, 'zip', save_path)

    print(f"Ensemble model saved to {archive_base}.zip")

# Run the evaluation with existing test data
if __name__ == "__main__":
    # Seeds Python's `random` module. The shuffle inside
    # evaluate_ensemble_on_test_data uses its own fixed random_state.
    random.seed(42)

    # Choose voting method ('soft' or 'hard'); used for both the quick
    # examples and the full evaluation below.
    voting_method = 'soft'

    # Quick test with a few examples
    examples = [
        "صداع شديد مع دوخة",
        "الم في المفاصل",
        "التهاب اللثة"
    ]

    print("Testing with individual examples:")
    for text in examples:
        # Pass the selected voting method explicitly so the examples follow
        # the same setting as the evaluation instead of the function default.
        prediction = predict_with_ensemble(text, verbose=True, voting=voting_method)
        print(f"Final prediction: {prediction}\n")

    # Evaluate on your test dataset
    # Specify the path to your test data file
    test_data_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Test.xlsx"  # Update this path to your actual test data

    print("\n" + "="*80)
    print("EVALUATING MODEL ON EXISTING TEST DATASET")
    print("="*80)

    # You can adjust these parameters as needed
    metrics = evaluate_ensemble_on_test_data(
        test_data_path=test_data_path,
        text_column='q_body',         # Update with your actual text column name
        category_column='category',   # Update with your actual category column name
        voting=voting_method,
        shuffle=True,
        sample_size=None              # Set to a number like 100 to test with a smaller sample
    )

    # Save the ensemble model
    save_ensemble_model(models, tokenizers, category_mapping, "arabic_medical_ensemble")

  classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))


Testing with individual examples:
Input text: صداع شديد مع دوخة

Individual model predictions:
- BioBert: انف اذن وحنجرة
- arabic_text_classifier_final: انف اذن وحنجرة
- mBert: امراض الدم

Ensemble prediction (soft voting): انف اذن وحنجرة
Final prediction: انف اذن وحنجرة

Input text: الم في المفاصل

Individual model predictions:
- BioBert: امراض العضلات والعظام و المفاصل
- arabic_text_classifier_final: امراض العضلات والعظام و المفاصل
- mBert: امراض العضلات والعظام و المفاصل

Ensemble prediction (soft voting): امراض العضلات والعظام و المفاصل
Final prediction: امراض العضلات والعظام و المفاصل

Input text: التهاب اللثة

Individual model predictions:
- BioBert: انف اذن وحنجرة
- arabic_text_classifier_final: طب الاسنان
- mBert: امراض الدم

Ensemble prediction (soft voting): امراض الدم
Final prediction: امراض الدم


EVALUATING MODEL ON EXISTING TEST DATASET
Loaded test data with 68302 rows
Columns in test data: ['q_body', 'a_body', 'category', 'q_body_count', 'a_body_count']
Filtered out 2851

KeyboardInterrupt: 

In [3]:
import torch
import numpy as np
import pandas as pd
import random
import pickle
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 🔹 Define valid medical categories
# load_and_filter_test_data keeps only rows whose label is in this list.
# NOTE(review): the 2nd and 8th entries are spelled differently from the
# category names the models print elsewhere in this file (extra space after
# the conjunction; hamza forms and a comma). Confirm which spelling the
# dataset and category_mapping use, because kept labels are later looked up
# in category_mapping.
valid_categories = [
    "امراض نسائية",
    "امراض العضلات و العظام و المفاصل",
    "امراض الجهاز الهضمي",
    "الامراض الجنسية",
    "طب الاسنان",
    "امراض القلب و الشرايين",
    "امراض العيون",
    "أنف, أذن وحنجرة",
    "جراحة تجميل",
    "امراض الدم"
]

# 🔹 Load and filter test dataset
def load_and_filter_test_data(test_data_path, text_column, category_column):
    """
    Read a test set and keep only complete rows with a valid category.

    Parameters:
    - test_data_path (str): Path to an .xlsx or .csv file.
    - text_column (str): Column containing the text input.
    - category_column (str): Column containing the actual category labels.

    Returns:
    - pd.DataFrame: Rows with both columns present and a label listed in
      valid_categories.

    Raises:
    - ValueError: If the path ends with neither .xlsx nor .csv.
    """
    is_excel = test_data_path.endswith('.xlsx')
    is_csv = test_data_path.endswith('.csv')
    if not (is_excel or is_csv):
        raise ValueError("Unsupported file format. Please use .xlsx or .csv")

    raw = pd.read_excel(test_data_path) if is_excel else pd.read_csv(test_data_path)

    # Rows missing either the text or the label cannot be evaluated.
    complete = raw.dropna(subset=[text_column, category_column])

    keep_mask = complete[category_column].isin(valid_categories)
    kept = complete[keep_mask]

    # The "Loaded" figure is the row count after dropping incomplete rows.
    print(f"Loaded {len(complete)} samples → Filtered to {len(kept)} valid samples.")
    return kept

# 🔹 Define the Custom Model class
class CustomModel(torch.nn.Module):
    """Sequence classifier: an encoder followed by dropout and one linear layer.

    The representation of the first token (position 0 of ``last_hidden_state``)
    is used as the pooled sentence vector.
    """

    def __init__(self, base_model, num_labels):
        """
        Parameters:
        base_model: Encoder exposing ``config.hidden_size`` and returning an
            object with a ``last_hidden_state`` attribute when called
        num_labels (int): Number of output classes
        """
        super().__init__()
        hidden_size = base_model.config.hidden_size
        self.base_model = base_model
        self.dropout = torch.nn.Dropout(p=0.3)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the encoder and the classification head.

        Parameters:
        input_ids: Token ids forwarded to the encoder
        attention_mask: Attention mask forwarded to the encoder
        labels: Optional class indices; when given, the cross-entropy loss is computed

        Returns:
        dict: {"loss": loss or None, "logits": classifier output}
        """
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Slice out the first token's hidden state for every item in the batch.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))

        if labels is None:
            return {"loss": None, "logits": logits}
        return {"loss": self.loss_fn(logits, labels), "logits": logits}

# 🔹 Function to load models
def load_complete_model(model_path):
    """
    Rebuild a CustomModel from a saved model directory.

    The directory must contain encoder files loadable by
    ``AutoModel.from_pretrained`` and a ``classifier_state.pt`` file holding a
    dict with the keys ``'num_labels'`` and ``'classifier_state'``.

    Parameters:
    - model_path (str): Directory with the encoder files and classifier_state.pt.

    Returns:
    - CustomModel: Model with the classifier weights restored, in eval mode.
    """
    base_model = AutoModel.from_pretrained(model_path)
    # weights_only=True limits unpickling to tensors and plain containers, so a
    # tampered checkpoint cannot execute code on load; it also silences the
    # FutureWarning torch emits when the argument is left unset.
    classifier_state = torch.load(
        f"{model_path}/classifier_state.pt",
        map_location=torch.device('cpu'),
        weights_only=True,
    )
    model = CustomModel(base_model, classifier_state['num_labels'])
    model.classifier.load_state_dict(classifier_state['classifier_state'])
    # Inference only: eval() disables the dropout layer.
    model.eval()
    return model

# 🔹 Load models and tokenizers
# Each directory supplies a tokenizer, a model (via load_complete_model) and
# a category_mapping.pkl file.
model_paths = [
    "/kaggle/input/medicall/other/default/1/BioBert",
    "/kaggle/input/medicall/other/default/1/arabic_text_classifier_final",
    "/kaggle/input/medicall/other/default/1/mBert"
]

# Parallel lists: entry i of each list belongs to model_paths[i].
models, tokenizers, category_mappings = [], [], []

for path in model_paths:
    tokenizers.append(AutoTokenizer.from_pretrained(path))
    model = load_complete_model(path)
    models.append(model)

    # NOTE: pickle.load can execute arbitrary code; only use mapping files
    # from a trusted source.
    with open(f"{path}/category_mapping.pkl", "rb") as f:
        category_mapping = pickle.load(f)
    category_mappings.append(category_mapping)

# Ensure category mappings are the same
# NOTE(review): a mismatch only produces a warning. If the mappings really
# differ, class indices are not aligned across models and combining their
# outputs would not be meaningful.
if not all(category_mappings[0] == mapping for mapping in category_mappings):
    print("Warning: Category mappings are different across models. Using the first model's mapping.")

category_mapping = category_mappings[0]
# Inverse lookup (mapping value -> mapping key), used to turn a predicted
# index back into a category name.
category_mapping_reverse = {v: k for k, v in category_mapping.items()}

# 🔹 Function to get ensemble predictions
def get_ensemble_prediction(text, voting='soft'):
    """
    Classify one text by combining the outputs of every loaded model.

    Parameters:
    - text (str): Text to classify.
    - voting (str): 'hard' lets each model vote for its top class; any other
      value (default 'soft') averages the per-model softmax probabilities.

    Returns:
    - str: Predicted category name.
    """
    all_logits = []

    for model, tokenizer in zip(models, tokenizers):
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        # Batch size is 1, so keep only the first row of the logits.
        all_logits.append(outputs["logits"].cpu().numpy()[0])

    if voting == 'hard':
        # One vote per model; np.argmax returns the first maximum, so ties go
        # to the lowest class index.
        predictions = [np.argmax(logits) for logits in all_logits]
        vote_counts = np.bincount(predictions, minlength=len(category_mapping))
        ensemble_prediction = np.argmax(vote_counts)
    else:
        probs = []
        for logits in all_logits:
            # Subtracting the max before exponentiating leaves the softmax
            # unchanged mathematically but prevents overflow on large logits.
            exp_shifted = np.exp(logits - np.max(logits))
            probs.append(exp_shifted / np.sum(exp_shifted))
        avg_probs = np.mean(probs, axis=0)
        ensemble_prediction = np.argmax(avg_probs)

    # Convert the NumPy integer to a plain int before the dictionary lookup.
    return category_mapping_reverse[int(ensemble_prediction)]

# 🔹 Evaluate the ensemble model on the filtered dataset
def evaluate_ensemble_on_test_data(test_data_path, text_column, category_column, voting='soft'):
    """
    Evaluate the ensemble on the filtered test set and plot a confusion matrix.

    Rows are loaded with load_and_filter_test_data. Rows whose label is not a
    key of category_mapping are skipped and counted, because they cannot be
    converted to a class index. Per-sample results are written to
    'filtered_ensemble_test_results.csv' in the working directory.

    Parameters:
    - test_data_path (str): Path to the test dataset.
    - text_column (str): Column containing the text input.
    - category_column (str): Column containing the actual category labels.
    - voting (str): Voting method passed to get_ensemble_prediction.

    Returns:
    - dict: accuracy (percent), classification_report, confusion_matrix and
      results_df. When no row could be evaluated, accuracy is NaN, the report
      is an empty string, the matrix is empty and nothing is plotted or saved.
    """
    test_df = load_and_filter_test_data(test_data_path, text_column, category_column)
    true_labels, predicted_labels, all_results = [], [], []
    unknown_label_rows = 0

    for _, row in test_df.iterrows():
        text, true_category = row[text_column], row[category_column]

        # valid_categories and category_mapping are maintained separately, so
        # a label can pass the filter and still be unknown to the models.
        if true_category not in category_mapping:
            unknown_label_rows += 1
            continue

        predicted_category = get_ensemble_prediction(text, voting)

        true_labels.append(category_mapping[true_category])
        predicted_labels.append(category_mapping[predicted_category])

        all_results.append({
            "Text": text,
            "True Category": true_category,
            "Predicted Category": predicted_category,
            "Correct": true_category == predicted_category
        })

    if unknown_label_rows:
        print(f"Skipped {unknown_label_rows} rows whose category is missing from category_mapping.")

    results_df = pd.DataFrame(all_results)

    # Nothing evaluated: return an empty result instead of failing in sklearn.
    if not true_labels:
        print("\nNo samples were evaluated; skipping metrics.")
        return {
            "accuracy": float("nan"),
            "classification_report": "",
            "confusion_matrix": np.zeros((0, 0), dtype=int),
            "results_df": results_df
        }

    accuracy = np.mean([t == p for t, p in zip(true_labels, predicted_labels)]) * 100

    # Report only the classes that actually occur, and derive their names from
    # the mapping by index. Passing every mapping key as target_names fails
    # when a class is absent and relies on the dict's insertion order.
    label_ids = sorted(set(true_labels) | set(predicted_labels))
    id_to_name = {v: k for k, v in category_mapping.items()}
    target_names = [id_to_name[label_id] for label_id in label_ids]

    print("\nClassification Report:")
    report = classification_report(true_labels, predicted_labels, labels=label_ids, target_names=target_names)
    print(report)

    # Same label order as the report, so the tick labels match rows/columns.
    cm = confusion_matrix(true_labels, predicted_labels, labels=label_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=target_names, yticklabels=target_names)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    results_df.to_csv("filtered_ensemble_test_results.csv", index=False)
    print(f"\nFiltered evaluation results saved to 'filtered_ensemble_test_results.csv'")

    return {
        "accuracy": accuracy,
        "classification_report": report,
        "confusion_matrix": cm,
        "results_df": results_df
    }

# 🔹 Run evaluation with the filtered dataset
# Evaluates with soft voting; the returned metrics dict is not kept here.
if __name__ == "__main__":
    test_data_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Test.xlsx"
    evaluate_ensemble_on_test_data(
        test_data_path=test_data_path,
        text_column="q_body",  
        category_column="category",
        voting="soft"
    )


  classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))


Loaded 68302 samples → Filtered to 32165 valid samples.


KeyboardInterrupt: 

In [5]:
import torch
import numpy as np
import pandas as pd
import random
import pickle
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 🔹 Define valid medical categories
# load_and_process_data keeps only rows whose "category" value is in this list.
valid_categories = [
    "امراض نسائية",
    "امراض العضلات والعظام و المفاصل",
    "امراض الجهاز الهضمي",
    "الامراض الجنسية",
    "طب الاسنان",
    "امراض القلب و الشرايين",
    "امراض العيون",
    "انف اذن وحنجرة",
    "جراحة تجميل",
    "امراض الدم"
]

# 🔹 Load dataset, filter valid categories, shuffle, and split
def load_and_process_data(train_path, test_path):
    """
    Merge the train and test spreadsheets, keep valid categories, and re-split 80/20.

    The test part of the new split is also written to "shuffled_test_data.csv"
    and the class distribution of both parts is printed.

    Parameters:
    - train_path: Path of the training Excel file.
    - test_path: Path of the test Excel file.

    Returns:
    - tuple of pd.DataFrame: (train_df, test_df) of the stratified split.
    """
    # Read both spreadsheets (train first, then test) and stack them
    frames = [pd.read_excel(train_path), pd.read_excel(test_path)]
    merged = pd.concat(frames, ignore_index=True)

    # Only the question text and its label are needed
    merged = merged[['q_body', 'category']]

    # Drop every row whose label is not one of the valid categories
    keep_mask = merged["category"].isin(valid_categories)
    merged = merged[keep_mask]

    # Seeded shuffle, then renumber the rows from zero
    merged = merged.sample(frac=1, random_state=42).reset_index(drop=True)

    # Stratified 80/20 split on the category column
    train_df, test_df = train_test_split(
        merged,
        test_size=0.2,
        random_state=42,
        stratify=merged['category'],
    )

    # Persist the held-out part so later steps can reload it
    test_df.to_csv("shuffled_test_data.csv", index=False)

    # Report how many samples each class has in both parts
    report_sections = (
        ("\n🔹 Training Dataset Class Distribution:", train_df),
        ("\n🔹 Test Dataset Class Distribution:", test_df),
    )
    for heading, frame in report_sections:
        print(heading)
        print(frame["category"].value_counts())

    return train_df, test_df

# 🔹 Load shuffled test data for ensemble model
def load_test_data_for_ensemble():
    """
    Read back the test split stored in "shuffled_test_data.csv".

    Returns:
    - pd.DataFrame: Contents of the saved CSV file
    """
    shuffled = pd.read_csv("shuffled_test_data.csv")  # file is expected in the working directory
    sample_count = len(shuffled)
    print(f"\n✅ Loaded shuffled test dataset with {sample_count} samples.")
    return shuffled

# 🔹 Define the Custom Model class
class CustomModel(torch.nn.Module):
    """Encoder followed by dropout (p=0.3) and a linear classification head.

    `base_model` must expose `config.hidden_size` and return an object with a
    `last_hidden_state` attribute when called with `input_ids` and
    `attention_mask`.
    """

    def __init__(self, base_model, num_labels):
        super().__init__()
        hidden_dim = base_model.config.hidden_size
        self.base_model = base_model
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(hidden_dim, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return {"loss", "logits"}; "loss" is None unless `labels` is given."""
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the sentence representation
        # (presumably the [CLS] slot for BERT-style encoders — confirm per model)
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))

        # Cross-entropy is only computed when targets are supplied
        loss = self.loss_fn(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}

# 🔹 Function to load models
def load_complete_model(model_path):
    """
    Rebuild a fine-tuned classifier from a saved model directory.

    The encoder is loaded with AutoModel.from_pretrained(model_path); the
    classification head is restored from "<model_path>/classifier_state.pt",
    a dict holding 'num_labels' and 'classifier_state'.

    Parameters:
    - model_path (str): Directory containing the encoder files and
      classifier_state.pt.

    Returns:
    - CustomModel: Model on CPU, switched to evaluation mode.
    """
    base_model = AutoModel.from_pretrained(model_path)

    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint cannot run arbitrary code, and it silences the
    # torch.load warning echoed in the cell output.
    classifier_state = torch.load(
        f"{model_path}/classifier_state.pt",
        map_location=torch.device('cpu'),
        weights_only=True,
    )

    model = CustomModel(base_model, classifier_state['num_labels'])
    model.classifier.load_state_dict(classifier_state['classifier_state'])
    model.eval()  # disable dropout for inference
    return model

# 🔹 Load models and tokenizers
# Directories of the three fine-tuned ensemble members
model_paths = [
    "/kaggle/input/medicall/other/default/1/BioBert",
    "/kaggle/input/medicall/other/default/1/arabic_text_classifier_final",
    "/kaggle/input/medicall/other/default/1/mBert"
]

# Parallel lists: entry i of each list belongs to model_paths[i]
models = []
tokenizers = []
category_mappings = []

for path in model_paths:
    # Tokenizer first, then the model, then its label mapping
    tokenizers.append(AutoTokenizer.from_pretrained(path))

    model = load_complete_model(path)
    models.append(model)

    with open(f"{path}/category_mapping.pkl", "rb") as f:
        category_mappings.append(pickle.load(f))

# Warn (but continue) when any model was saved with a different label mapping
if any(category_mappings[0] != mapping for mapping in category_mappings[1:]):
    print("Warning: Category mappings are different across models. Using the first model's mapping.")

# The first model's mapping is used for all; build the inverse (value -> key) lookup
category_mapping = category_mappings[0]
category_mapping_reverse = {label_id: name for name, label_id in category_mapping.items()}

# 🔹 Function to get ensemble predictions
def get_ensemble_prediction(text, voting='soft'):
    """
    Classify one text with every loaded model and combine the results.

    Each (model, tokenizer) pair from the module-level `models` and
    `tokenizers` lists scores the text, which is padded/truncated to 128 tokens.

    Parameters:
    - text (str): Input text to classify.
    - voting (str): 'hard' takes a majority vote over each model's top class
      (ties go to the lowest class index); any other value averages the
      models' softmax probabilities (soft voting).

    Returns:
    - The category name looked up in `category_mapping_reverse`.
    """
    all_logits = []

    for model, tokenizer in zip(models, tokenizers):
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        # Single text per call, so keep row 0 of the logits
        all_logits.append(outputs["logits"].cpu().numpy()[0])

    if voting == 'hard':
        predictions = [np.argmax(logits) for logits in all_logits]
        vote_counts = np.bincount(predictions, minlength=len(category_mapping))
        ensemble_prediction = np.argmax(vote_counts)
    else:
        probs = []
        for logits in all_logits:
            # Subtract the max before exponentiating: exp() of a large logit
            # overflows to inf, inf/inf is NaN, and argmax would then return
            # the first NaN position instead of the most probable class.
            exp_shifted = np.exp(logits - np.max(logits))
            probs.append(exp_shifted / np.sum(exp_shifted))
        avg_probs = np.mean(probs, axis=0)
        ensemble_prediction = np.argmax(avg_probs)

    return category_mapping_reverse[int(ensemble_prediction)]

# 🔹 Apply ensemble model on shuffled test data with structured progress printing
def apply_ensemble_on_test_data(print_interval=10):
    """
    Run the ensemble over the saved shuffled test set, log progress, and save the predictions.

    Every row of "shuffled_test_data.csv" is classified with
    get_ensemble_prediction (default soft voting). Text, true category,
    predicted category and a correctness flag are collected and written to
    "ensemble_test_predictions.csv".

    Parameters:
    - print_interval (int): Number of samples after which progress is printed.
      The last sample is always printed.
    """
    # Load shuffled test data
    test_df = load_test_data_for_ensemble()

    # Store predictions
    results = []
    total_samples = len(test_df)

    print(f"\n🔹 Running ensemble predictions on shuffled test dataset ({total_samples} samples)...")

    # Count rows by position rather than by index label, so the progress
    # arithmetic stays correct even if the frame's index is not 0..n-1.
    for position, (_, row) in enumerate(test_df.iterrows(), start=1):
        text, true_category = row["q_body"], row["category"]
        predicted_category = get_ensemble_prediction(text)

        results.append({
            "Text": text,
            "True Category": true_category,
            "Predicted Category": predicted_category,
            "Correct": true_category == predicted_category
        })

        # Print structured output every `print_interval` samples
        if position % print_interval == 0 or position == total_samples:
            print("\n" + "="*80)
            print(f"Processing sample {position}/{total_samples}")
            print("="*80)
            print(f"Input text: {text}\n")

            # Show what each member predicts on its own, not the ensemble
            # result repeated under every model name.
            print("Individual model predictions:")
            for model_name, model_prediction in _predict_with_each_model(text):
                print(f"- {model_name}: {model_prediction}")

            print("\nEnsemble prediction (soft voting):", predicted_category)
            print("Final prediction:", predicted_category)
            print("="*80)

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save predictions
    results_df.to_csv("ensemble_test_predictions.csv", index=False)
    print("\n✅ Ensemble test predictions saved to 'ensemble_test_predictions.csv'.")

    # Print sample results
    print("\n🔹 Sample Predictions:")
    print(results_df.head())


def _predict_with_each_model(text):
    """Return one (model directory name, predicted category) pair per ensemble member."""
    per_model = []
    for model_path, model, tokenizer in zip(model_paths, models, tokenizers):
        # Same tokenization settings as get_ensemble_prediction
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        label_id = int(np.argmax(outputs["logits"].cpu().numpy()[0]))
        per_model.append((model_path.split('/')[-1], category_mapping_reverse[label_id]))
    return per_model




# 🔹 Run full process
# Script entry point: rebuild the train/test split, then run the ensemble on the test part.
if __name__ == "__main__":
    # Kaggle input paths of the (unbalanced) MAQA train and test spreadsheets
    train_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Train.xlsx"
    test_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Test.xlsx"

    # 🔹 Step 1: Load & Process Data
    # Also writes "shuffled_test_data.csv", which step 2 reads back,
    # so this step must run first.
    train_df, test_df = load_and_process_data(train_path, test_path)

    # 🔹 Step 2: Apply Ensemble on Shuffled Test Data
    # Uses the default print_interval; predictions are saved to a CSV file.
    apply_ensemble_on_test_data()


  classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))



🔹 Training Dataset Class Distribution:
category
امراض نسائية                       56128
انف اذن وحنجرة                     15646
امراض العضلات والعظام و المفاصل    14848
امراض العيون                       14639
امراض القلب و الشرايين             12757
امراض الجهاز الهضمي                12708
الامراض الجنسية                     8874
طب الاسنان                          8806
جراحة تجميل                         7875
امراض الدم                          6862
Name: count, dtype: int64

🔹 Test Dataset Class Distribution:
category
امراض نسائية                       14032
انف اذن وحنجرة                      3912
امراض العضلات والعظام و المفاصل     3712
امراض العيون                        3660
امراض القلب و الشرايين              3189
امراض الجهاز الهضمي                 3177
الامراض الجنسية                     2219
طب الاسنان                          2201
جراحة تجميل                         1969
امراض الدم                          1715
Name: count, dtype: int64

✅ Loaded shuffled test dataset wi

KeyboardInterrupt: 

In [6]:
import torch
import numpy as np
import pandas as pd
import random
import pickle
from transformers import AutoTokenizer, AutoModel

# 🔹 Define valid medical categories
# Category labels (Arabic, exactly as they appear in the `category` column)
# that are kept when filtering the dataset; rows with any other label are dropped.
valid_categories = [
    "امراض نسائية",  # gynecological diseases
    "امراض العضلات والعظام و المفاصل",  # muscle, bone and joint diseases
    "امراض الجهاز الهضمي",  # digestive system diseases
    "الامراض الجنسية",  # sexual diseases
    "طب الاسنان",  # dentistry
    "امراض القلب و الشرايين",  # heart and artery diseases
    "امراض العيون",  # eye diseases
    "انف اذن وحنجرة",  # ear, nose and throat
    "جراحة تجميل",  # plastic surgery
    "امراض الدم"  # blood diseases
]

# 🔹 Load only the test dataset and shuffle it
def load_test_data(test_path):
    """
    Read the test spreadsheet, keep rows with a valid category, and shuffle them.

    The shuffled frame is also written to "shuffled_test_data.csv" and its
    class distribution is printed. Training data is not involved.

    Parameters:
    - test_path: Path of the test Excel file.

    Returns:
    - pd.DataFrame: Shuffled test dataset.
    """
    raw_df = pd.read_excel(test_path)

    # Keep the question text and its label only
    subset = raw_df[['q_body', 'category']]

    # Discard rows whose label is outside the valid categories
    is_valid = subset["category"].isin(valid_categories)
    subset = subset[is_valid]

    # Seeded shuffle, then renumber the rows from zero
    test_df = subset.sample(frac=1, random_state=42).reset_index(drop=True)

    # Persist for the evaluation step
    test_df.to_csv("shuffled_test_data.csv", index=False)

    # Report the per-class sample counts
    print("\n🔹 Test Dataset Class Distribution (Shuffled):")
    print(test_df["category"].value_counts())

    return test_df

# 🔹 Load shuffled test data for ensemble model
def load_test_data_for_ensemble():
    """
    Reload the shuffled test set from "shuffled_test_data.csv" for evaluation.

    Returns:
    - pd.DataFrame: Contents of the saved CSV file
    """
    loaded_df = pd.read_csv("shuffled_test_data.csv")
    row_count = len(loaded_df)
    print(f"\n✅ Loaded shuffled test dataset with {row_count} samples.")
    return loaded_df

# 🔹 Function to apply ensemble model on test data
def apply_ensemble_on_test_data(print_interval=10):
    """
    Walk the shuffled test set, record a placeholder prediction per row, and save the results.

    No model is called here: every row receives the fixed string
    "dummy_prediction". The records are written to
    "ensemble_test_predictions.csv".

    Parameters:
    - print_interval (int): Number of samples after which progress is printed.
      The last sample is always printed.
    """
    test_df = load_test_data_for_ensemble()
    total_samples = len(test_df)
    separator = "=" * 80
    collected = []

    print(f"\n🔹 Running ensemble predictions on shuffled test dataset ({total_samples} samples)...")

    for idx, row in test_df.iterrows():
        text = row["q_body"]
        true_category = row["category"]
        predicted_category = "dummy_prediction"  # Replace with actual model prediction function

        record = {
            "Text": text,
            "True Category": true_category,
            "Predicted Category": predicted_category,
            "Correct": true_category == predicted_category,
        }
        collected.append(record)

        # Only report on every `print_interval`-th sample and on the final one
        at_interval = (idx + 1) % print_interval == 0
        at_end = idx == total_samples - 1
        if not (at_interval or at_end):
            continue

        print("\n" + separator)
        print(f"Processing sample {idx + 1}/{total_samples}")
        print(separator)
        print(f"Input text: {text}\n")
        print("\nEnsemble prediction:", predicted_category)
        print("Final prediction:", predicted_category)
        print(separator)

    # Tabulate and persist everything that was collected
    results_df = pd.DataFrame(collected)
    results_df.to_csv("ensemble_test_predictions.csv", index=False)
    print("\n✅ Ensemble test predictions saved to 'ensemble_test_predictions.csv'.")

# 🔹 Run full process
# Script entry point: prepare the shuffled test set, then iterate over it.
if __name__ == "__main__":
    # Kaggle input path of the (unbalanced) MAQA test spreadsheet
    test_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Test.xlsx"

    # ✅ Step 1: Load & Process Test Data (ONLY TEST DATA)
    # Also writes "shuffled_test_data.csv", which step 2 reads back,
    # so this step must run first.
    test_df = load_test_data(test_path)

    # ✅ Step 2: Apply Ensemble on Shuffled Test Data
    # In this cell the function records the placeholder "dummy_prediction"
    # for every row instead of calling a model.
    apply_ensemble_on_test_data()



🔹 Test Dataset Class Distribution (Shuffled):
category
امراض نسائية                       14032
انف اذن وحنجرة                      3912
امراض العضلات والعظام و المفاصل     3712
امراض العيون                        3660
امراض القلب و الشرايين              3190
امراض الجهاز الهضمي                 3177
الامراض الجنسية                     2219
طب الاسنان                          2202
جراحة تجميل                         1969
امراض الدم                          1716
Name: count, dtype: int64

✅ Loaded shuffled test dataset with 39789 samples.

🔹 Running ensemble predictions on shuffled test dataset (39789 samples)...

Processing sample 10/39789
Input text: مرحبتين بالنسبه لطفل صغير عمره 4 سنين حدث له كسر في اليد اليمني كم ستستغرق مده اجبيس حتي تعود العظام كما كانت وشكرا جزيلا


Ensemble prediction: dummy_prediction
Final prediction: dummy_prediction

Processing sample 20/39789
Input text: 
ما هو سبب تزايد الخطوط الحمراء في بياض العين مع العلم أنّني أرتدي نظارات طبية امام جهاز الكمبيوتر ؟ 



In [None]:
import torch
import numpy as np
import pandas as pd
import random
import pickle
from transformers import AutoTokenizer, AutoModel

# 🔹 Define valid medical categories
# Category labels (Arabic, exactly as they appear in the `category` column)
# that are kept when filtering the dataset; rows with any other label are dropped.
valid_categories = [
    "امراض نسائية",  # gynecological diseases
    "امراض العضلات والعظام و المفاصل",  # muscle, bone and joint diseases
    "امراض الجهاز الهضمي",  # digestive system diseases
    "الامراض الجنسية",  # sexual diseases
    "طب الاسنان",  # dentistry
    "امراض القلب و الشرايين",  # heart and artery diseases
    "امراض العيون",  # eye diseases
    "انف اذن وحنجرة",  # ear, nose and throat
    "جراحة تجميل",  # plastic surgery
    "امراض الدم"  # blood diseases
]

# 🔹 Load and shuffle test dataset
def load_and_process_test_data(test_path):
    """
    Read the test spreadsheet, keep rows with a valid category, and shuffle them.

    The shuffled frame is also written to "shuffled_test_data.csv" and its
    class distribution is printed.

    Parameters:
    - test_path: Path of the test Excel file.

    Returns:
    - pd.DataFrame: Shuffled test dataset.
    """
    # Read -> select the two columns -> filter by category -> seeded shuffle -> renumber
    test_df = (
        pd.read_excel(test_path)[['q_body', 'category']]
        .loc[lambda frame: frame["category"].isin(valid_categories)]
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # Persist for the evaluation step
    test_df.to_csv("shuffled_test_data.csv", index=False)

    # Report the per-class sample counts
    class_counts = test_df["category"].value_counts()
    print("\n🔹 Test Dataset Class Distribution (Shuffled):")
    print(class_counts)

    return test_df

# 🔹 Load shuffled test data for ensemble model
def load_test_data_for_ensemble():
    """
    Fetch the shuffled test set that was saved as "shuffled_test_data.csv".

    Returns:
    - pd.DataFrame: Contents of the saved CSV file
    """
    csv_frame = pd.read_csv("shuffled_test_data.csv")
    n_samples = len(csv_frame)
    print(f"\n✅ Loaded shuffled test dataset with {n_samples} samples.")
    return csv_frame

# 🔹 Define the Custom Model class
class CustomModel(torch.nn.Module):
    """Classification wrapper: encoder -> dropout (p=0.3) -> linear head.

    `base_model` must expose `config.hidden_size` and return an object with a
    `last_hidden_state` attribute when called with `input_ids` and
    `attention_mask`.
    """

    def __init__(self, base_model, num_labels):
        super().__init__()
        encoder_width = base_model.config.hidden_size
        self.base_model = base_model
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(encoder_width, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return {"loss", "logits"}; "loss" stays None when no labels are passed."""
        hidden_states = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        # Use the hidden state at sequence position 0 as the pooled representation
        pooled = self.dropout(hidden_states[:, 0, :])
        logits = self.classifier(pooled)

        result = {"loss": None, "logits": logits}
        if labels is not None:
            result["loss"] = self.loss_fn(logits, labels)
        return result

# 🔹 Function to load models
def load_complete_model(model_path):
    """
    Rebuild a fine-tuned classifier from a saved model directory.

    The encoder is loaded with AutoModel.from_pretrained(model_path); the
    classification head is restored from "<model_path>/classifier_state.pt",
    a dict holding 'num_labels' and 'classifier_state'.

    Parameters:
    - model_path (str): Directory containing the encoder files and
      classifier_state.pt.

    Returns:
    - CustomModel: Model on CPU, switched to evaluation mode.
    """
    base_model = AutoModel.from_pretrained(model_path)

    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint cannot run arbitrary code, and it silences the
    # torch.load warning echoed in the cell output.
    classifier_state = torch.load(
        f"{model_path}/classifier_state.pt",
        map_location=torch.device('cpu'),
        weights_only=True,
    )

    model = CustomModel(base_model, classifier_state['num_labels'])
    model.classifier.load_state_dict(classifier_state['classifier_state'])
    model.eval()  # disable dropout for inference
    return model

# 🔹 Load models and tokenizers
# Directories of the three fine-tuned ensemble members
model_paths = [
    "/kaggle/input/medicall/other/default/1/BioBert",
    "/kaggle/input/medicall/other/default/1/arabic_text_classifier_final",
    "/kaggle/input/medicall/other/default/1/mBert"
]

# Parallel lists: entry i of each list belongs to model_paths[i]
models = []
tokenizers = []
category_mappings = []

for path in model_paths:
    # Tokenizer first, then the model, then its label mapping
    tokenizers.append(AutoTokenizer.from_pretrained(path))

    model = load_complete_model(path)
    models.append(model)

    with open(f"{path}/category_mapping.pkl", "rb") as f:
        category_mappings.append(pickle.load(f))

# Warn (but continue) when any model was saved with a different label mapping
if any(category_mappings[0] != mapping for mapping in category_mappings[1:]):
    print("Warning: Category mappings are different across models. Using the first model's mapping.")

# The first model's mapping is used for all; build the inverse (value -> key) lookup
category_mapping = category_mappings[0]
category_mapping_reverse = {label_id: name for name, label_id in category_mapping.items()}

# 🔹 Function to get ensemble predictions
def get_ensemble_prediction(text, voting='soft'):
    """
    Classify one text with every loaded model and combine the results.

    Each (model, tokenizer) pair from the module-level `models` and
    `tokenizers` lists scores the text, which is padded/truncated to 128 tokens.

    Parameters:
    - text (str): Input text to classify.
    - voting (str): 'hard' takes a majority vote over each model's top class
      (ties go to the lowest class index); any other value averages the
      models' softmax probabilities (soft voting).

    Returns:
    - The category name looked up in `category_mapping_reverse`.
    """
    all_logits = []

    for model, tokenizer in zip(models, tokenizers):
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        # Single text per call, so keep row 0 of the logits
        all_logits.append(outputs["logits"].cpu().numpy()[0])

    if voting == 'hard':
        predictions = [np.argmax(logits) for logits in all_logits]
        vote_counts = np.bincount(predictions, minlength=len(category_mapping))
        ensemble_prediction = np.argmax(vote_counts)
    else:
        probs = []
        for logits in all_logits:
            # Subtract the max before exponentiating: exp() of a large logit
            # overflows to inf, inf/inf is NaN, and argmax would then return
            # the first NaN position instead of the most probable class.
            exp_shifted = np.exp(logits - np.max(logits))
            probs.append(exp_shifted / np.sum(exp_shifted))
        avg_probs = np.mean(probs, axis=0)
        ensemble_prediction = np.argmax(avg_probs)

    return category_mapping_reverse[int(ensemble_prediction)]

# 🔹 Apply ensemble model on shuffled test data
def apply_ensemble_on_test_data(print_interval=500):
    """
    Run the ensemble over the saved shuffled test set, log progress, and save the predictions.

    Every row of "shuffled_test_data.csv" is classified with
    get_ensemble_prediction (default soft voting). Text, true category,
    predicted category and a correctness flag are collected and written to
    "ensemble_test_predictions.csv".

    Parameters:
    - print_interval (int): Number of samples after which progress is printed.
      The last sample is always printed.
    """
    # Load shuffled test data
    test_df = load_test_data_for_ensemble()

    # Store predictions
    results = []
    total_samples = len(test_df)

    print(f"\n🔹 Running ensemble predictions on shuffled test dataset ({total_samples} samples)...")

    # Count rows by position rather than by index label, so the progress
    # arithmetic stays correct even if the frame's index is not 0..n-1.
    for position, (_, row) in enumerate(test_df.iterrows(), start=1):
        text, true_category = row["q_body"], row["category"]
        predicted_category = get_ensemble_prediction(text)

        results.append({
            "Text": text,
            "True Category": true_category,
            "Predicted Category": predicted_category,
            "Correct": true_category == predicted_category
        })

        # Print structured output every `print_interval` samples
        if position % print_interval == 0 or position == total_samples:
            print("\n" + "="*80)
            print(f"Processing sample {position}/{total_samples}")
            print("="*80)
            print(f"Input text: {text}\n")

            # Show what each member predicts on its own, not the ensemble
            # result repeated under every model name.
            print("Individual model predictions:")
            for model_name, model_prediction in _predict_with_each_model(text):
                print(f"- {model_name}: {model_prediction}")

            print("\nEnsemble prediction (soft voting):", predicted_category)
            print("Final prediction:", predicted_category)
            print("="*80)

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save predictions
    results_df.to_csv("ensemble_test_predictions.csv", index=False)
    print("\n✅ Ensemble test predictions saved to 'ensemble_test_predictions.csv'.")

    # Print sample results
    print("\n🔹 Sample Predictions:")
    print(results_df.head())


def _predict_with_each_model(text):
    """Return one (model directory name, predicted category) pair per ensemble member."""
    per_model = []
    for model_path, model, tokenizer in zip(model_paths, models, tokenizers):
        # Same tokenization settings as get_ensemble_prediction
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        label_id = int(np.argmax(outputs["logits"].cpu().numpy()[0]))
        per_model.append((model_path.split('/')[-1], category_mapping_reverse[label_id]))
    return per_model

# 🔹 Run full process
# Script entry point: prepare the shuffled test set, then run the ensemble on it.
if __name__ == "__main__":
    # Kaggle input path of the (unbalanced) MAQA test spreadsheet
    test_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Test.xlsx"

    # ✅ Step 1: Load & Process Test Data (ONLY TEST DATA)
    # Also writes "shuffled_test_data.csv", which step 2 reads back,
    # so this step must run first.
    test_df = load_and_process_test_data(test_path)

    # ✅ Step 2: Apply Ensemble on Shuffled Test Data
    # Uses the default print_interval; predictions are saved to a CSV file.
    apply_ensemble_on_test_data()


  classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))



🔹 Test Dataset Class Distribution (Shuffled):
category
امراض نسائية                       14032
انف اذن وحنجرة                      3912
امراض العضلات والعظام و المفاصل     3712
امراض العيون                        3660
امراض القلب و الشرايين              3190
امراض الجهاز الهضمي                 3177
الامراض الجنسية                     2219
طب الاسنان                          2202
جراحة تجميل                         1969
امراض الدم                          1716
Name: count, dtype: int64

✅ Loaded shuffled test dataset with 39789 samples.

🔹 Running ensemble predictions on shuffled test dataset (39789 samples)...

Processing sample 10/39789
Input text: مرحبتين بالنسبه لطفل صغير عمره 4 سنين حدث له كسر في اليد اليمني كم ستستغرق مده اجبيس حتي تعود العظام كما كانت وشكرا جزيلا

Individual model predictions:
- BioBert: امراض العضلات والعظام و المفاصل
- arabic_text_classifier_final: امراض العضلات والعظام و المفاصل
- mBert: امراض العضلات والعظام و المفاصل

Ensemble prediction (soft voting): امر