<a href="https://colab.research.google.com/github/Rasha-Abd-El-Khalik/Tashkhees/blob/main/medicaltrain_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# URL path segments (one per medical specialty) appended to `base_url`.
categories = [
    "امراض-الغدد-الصماء",
    "امراض-الجهاز-التنفسي",
    "جراحة-العظام",
    "جراحة-عامة",
    "مرض-السكري"
]

# Map URL paths to readable Arabic names
category_names = {
    "امراض-الغدد-الصماء": "الغدد الصماء",
    "امراض-الجهاز-التنفسي": "أمراض الجهاز التنفسي",
    "جراحة-العظام": "جراحة العظام",
    "جراحة-عامة": "جراحة عامة",
    "مرض-السكري": "مرض السكري"
}

base_url = "https://altibbi.com/اسئلة-طبية/"
max_questions = 5000      # per-category cap on scraped questions
REQUEST_TIMEOUT = 30      # seconds before a single request is abandoned
REQUEST_DELAY = 0.5       # seconds to pause between page requests
MAX_RETRIES = 3           # attempts per page before giving up on the category
all_data = []


def fetch_page(session, url):
    """Download one listing page and return its HTML.

    Returns "" when the server answers 404 (treated as "past the last page")
    and None when every one of the MAX_RETRIES attempts failed, so the caller
    can tell a genuine end of listing apart from a network/server problem.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            if response.status_code == 404:
                return ""
            response.raise_for_status()
            return response.text
        except requests.RequestException as exc:
            print(f"Request failed ({attempt}/{MAX_RETRIES}) for {url}: {exc}")
            if attempt < MAX_RETRIES:
                # Simple exponential back-off before the next attempt.
                time.sleep(REQUEST_DELAY * 2 ** attempt)
    return None


def parse_questions(html, label):
    """Extract one [label, question, answer] row per question card in `html`.

    A missing question or answer element is recorded as "N/A".
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = []
    for article in soup.find_all("article", class_="new-question-item"):
        question_tag = article.find("h2", class_="question-text")
        answer_tag = article.find("div", itemprop="text")
        rows.append([
            label,
            question_tag.get_text(strip=True) if question_tag else "N/A",
            answer_tag.get_text(strip=True) if answer_tag else "N/A",
        ])
    return rows


# One Session for the whole run so the underlying connection is reused.
with requests.Session() as session:
    for path in categories:
        label = category_names[path]
        print(f"Starting category: {label}")
        page = 1
        data = []

        while len(data) < max_questions:
            url = f"{base_url}{path}?page={page}"
            html = fetch_page(session, url)
            if html is None:
                # Do not report this as "no more questions": the listing may
                # continue, we just could not download it.
                print(f"Giving up on page {page} after {MAX_RETRIES} failed attempts")
                break

            rows = parse_questions(html, label)
            if not rows:
                print(f"No more questions found on page {page}")
                break

            # Keep only as many rows as are still needed to reach the cap.
            data.extend(rows[:max_questions - len(data)])
            page += 1
            time.sleep(REQUEST_DELAY)

        all_data.extend(data)

# Save CSV with separate columns
with open("medicaldata.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Column names
    writer.writerow(["Category", "Question", "Answer"])
    writer.writerows(all_data)

print("Scraping finished! CSV saved as 'medicaldata.csv'")


Starting category: الغدد الصماء
Starting category: أمراض الجهاز التنفسي
Starting category: جراحة العظام
No more questions found on page 602
Starting category: جراحة عامة
Starting category: مرض السكري
Scraping finished! CSV saved as 'medicaldata.csv'


In [None]:
import pandas as pd
df=pd.read_csv("medicaldata.csv")
df.head()

Unnamed: 0,Category,Question,Answer
0,الغدد الصماء,هل بإمكاني تناول مكملات غذائية عبارة عن كبسولا...,سلامتك،من المهم جداً أن نكون حذرين بشأن تناول ...
1,الغدد الصماء,تحليل tsh طالع 0.1 هى كدا الغده نشطه ولا خمول ...,"نشاط بسيط يفضل إجراء فحص لل FT4, FT3"
2,الغدد الصماء,عملت فحص للغدة الدرقيه وهذه النتائج \r\nFT4 0....,وظيفة الغدة سليمة مع ارتفاع المضادات ما يسمى ه...
3,الغدد الصماء,هل يناسب استخدام كبسولات جليسينات المغنيسيوم م...,أهلاً بك،سأقدم لك معلومات مفصلة لمساعدتك بشأن ...
4,الغدد الصماء,حامل في الشهر السادس اعاني من قصور درقي اتناول...,أتمنى لكِ السلامة، وأتمنى أن تكوني بخير،يعتبر ...


In [None]:
category_counts = df['Category'].value_counts()

print(category_counts)

Category
الغدد الصماء            5000
أمراض الجهاز التنفسي    5000
جراحة عامة              5000
مرض السكري              5000
جراحة العظام            4207
Name: count, dtype: int64


In [None]:
df.shape

(24207, 3)

In [None]:
df= df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
df.head()

Unnamed: 0,Category,Question,Answer
0,مرض السكري,عندما اتناول الطعام واشبع اشعر انني احتاج الى ...,لانه في الاصل لا يجب ان تشبع
1,جراحة عامة,ظهر قبل سنة ونصف \r\nلم يكن هناك اي الآم لكن م...,انا في مصر لاادرى التكلفه بلاردن لاحرج من التو...
2,مرض السكري,اعانى في زيادة الوزن بشكل ملحوظ ذهبت للطبيب اع...,لا مانع لذلك
3,جراحة عامة,أنا روحت لدكتور أمراض شرجية وكشف عليا قال كان ...,لا ادري ان كان تفجير الخراج من فتحة كافي لتنظي...
4,مرض السكري,السلام عليكم ٠٠٠في حال ارتفاع السكر في الدم ع...,هذا يحتاج الى نظام متكامل غذائي وعلاجي تحت إشر...


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the shuffled data as the final test set.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Persist both halves so the split can be reused outside this notebook.
train_df.to_csv("medical_train.csv", index=False, encoding="utf-8")
test_df.to_csv("medical_test.csv", index=False, encoding="utf-8")

print("✅ Data split done: 'medical_train.csv' (80%) and 'medical_test.csv' (20%) saved.")

✅ Data split done: 'train.csv' (80%) and 'test.csv' (20%) saved.


In [None]:
df=train_df

In [None]:
df.shape

(19365, 3)

In [None]:
category_counts = df['Category'].value_counts()

print(category_counts)

Category
مرض السكري              4036
أمراض الجهاز التنفسي    3997
جراحة عامة              3983
الغدد الصماء            3952
جراحة العظام            3397
Name: count, dtype: int64


In [None]:
# Check nulls and datatypes
nulls_info = pd.DataFrame({
    "Null_Count": df.isnull().sum(),
    "DataType": df.dtypes
})

print(nulls_info)


          Null_Count DataType
Category           0   object
Question           0   object
Answer             0   object


In [None]:
duplicates_count = df.duplicated().sum()
print("Total duplicate rows:", duplicates_count)

Total duplicate rows: 0


In [None]:
# df = df.drop_duplicates()

In [None]:
# duplicates_count = df.duplicated().sum()
# print("Total duplicate rows:", duplicates_count)

In [None]:
df = df.drop(columns=["Answer"])

In [None]:
df.columns

Index(['Category', 'Question'], dtype='object')

In [None]:
df.shape

(19365, 2)

In [None]:

import re

def extract_non_arabic(text):
    """
    Return, in original order, every character of `text` that lies outside
    the U+0600-U+06FF range; anything that is not a `str` yields "".
    """
    if isinstance(text, str):
        outside_arabic = re.compile(r'[^\u0600-\u06FF]')
        return ''.join(match.group() for match in outside_arabic.finditer(text))
    return ""

all_text = ''.join(df['Question'].astype(str).tolist())
non_arabic_chars = set(extract_non_arabic(all_text))

print(non_arabic_chars)
print(f"\nTotal unique non-Arabic characters found: {len(non_arabic_chars)}")


{' ', '²', '«', 'ﻤ', 'ﻝ', '≥', 'ﻋ', 'E', 'ﺧ', '"', 'd', 'a', 'ﻱ', 'ﺩ', 'o', '>', 'N', 'ﺝ', 'R', '<', 'ﺻ', 'ﻓ', '–', '…', 'D', 'ﻘ', 'ﻔ', 'ﻖ', 'ﺐ', 'ﻭ', '-', 'g', 'ﻀ', 'ﻌ', 'ﻂ', 'h', 'ﺖ', '#', '°', 'ﺎ', 'Z', 't', 'f', 'ﺃ', 'ﻨ', '•', 'C', '{', 'c', 'ﻡ', 'ﻷ', 'ﻏ', 'ﻵ', 'ﯙ', 'ﻛ', '✨', 'ﻻ', 'O', 'ﻴ', '|', 'x', 'ﻇ', 'ﻬ', '÷', 'I', '[', '2', 'ﺫ', '&', 'w', '*', 'ﺓ', 'b', '\u200f', 'ﺍ', '●', '.', 'T', 'ﺘ', 'ﺑ', '+', '3', 'v', 'A', '♡', 'q', '}', 'µ', 'r', '\n', '\t', 'ﻧ', '8', 'ﻙ', '@', 'ﺪ', 'ﺔ', 'G', 'ﻜ', '⬇', '̐', 'ﺟ', 'ﻢ', 'μ', '\u200b', 'ﺳ', 'ﺮ', 'S', ':', 'è', ')', 'ﺽ', 'ﻳ', 'V', '\r', '7', 'ﺬ', 'ﺭ', 'ﻫ', 'i', 'm', 'ﻥ', 'ﺒ', 'j', 'ﻲ', '~', 'K', '\\', 'ﺼ', 'L', 'à', 'J', 'ﻗ', 'ﻹ', 'ﻣ', 'ﺸ', 'ﺿ', '⬆', '❤', 'ﻟ', 'Q', '1', '￼', 'ﺜ', '0', '^', 'ﺺ', 'ų', '%', '/', 'ﺤ', '☺', 'e', 'ﺣ', 'p', 'M', 'ﻯ', 'ﺠ', '4', 'ﻦ', 'ﻠ', 'ﺄ', '️', 'U', ',', 'P', 'Y', '_', '9', 'ﻞ', 'ﺷ', '\u200d', 'l', '\u200e', 'ﻼ', 'u', 'z', 'ï', '\xa0', '》', '(', 'B', 'ﺕ', 's', '5', 'ﺋ', 'ﻊ', 'W', 'ﻃ', 'y', '，', 'ﻎ', 'H', 'ﻪ', ';

In [None]:
import unicodedata

df['Question'] = df['Question'].apply(lambda text: unicodedata.normalize('NFKC', text))


In [None]:
all_text = ''.join(df['Question'].astype(str).tolist())
non_arabic_chars = set(extract_non_arabic(all_text))

print(non_arabic_chars)
print(f"\nTotal unique non-Arabic characters found: {len(non_arabic_chars)}")


{' ', '«', '≥', 'E', '"', 'd', 'a', 'o', '>', 'N', 'R', '<', '–', 'D', '-', 'g', 'h', '#', '°', 'Z', 't', 'f', '•', 'C', '{', 'c', '✨', 'O', '|', 'x', '÷', 'I', '[', '2', '&', 'w', '*', 'b', '\u200f', '●', '.', 'T', '+', '3', 'v', 'A', '♡', 'q', '}', 'r', '\n', '\t', '8', '@', 'G', '⬇', '̐', 'μ', '\u200b', 'S', ':', 'è', ')', 'V', '\r', '7', 'i', 'm', 'j', '~', 'K', '\\', 'L', 'à', 'J', '⬆', '❤', 'Q', '1', '￼', '0', '^', 'ų', '%', '/', '☺', 'e', 'p', 'M', '4', '️', 'U', ',', 'P', 'Y', '_', '9', '\u200d', 'l', '\u200e', 'u', 'z', 'ï', '》', '(', 'B', 's', '5', 'W', 'y', 'H', ';', 'é', '̷', '❗', '=', "'", 'k', 'n', '!', '6', '×', 'X', ']', '?', 'F', '̨'}

Total unique non-Arabic characters found: 127


In [None]:
# pip install camel-tools


In [None]:
from camel_tools.utils.normalize import normalize_alef_maksura_ar, normalize_alef_ar, normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
import re

# Built once here instead of on every row processed by the `.apply` below.
_WESTERN_TO_ARABIC_DIGITS = str.maketrans('0123456789', '٠١٢٣٤٥٦٧٨٩')
_DISALLOWED_CHARS_RE = re.compile(r'[^\u0600-\u06FF\s.,;:!?()\'"-]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def camel_clean(text):
    """Normalize one question string for modelling.

    Steps, in order:
      1. Coerce to `str` and run CAMeL Tools' `dediac_ar`, `normalize_alef_ar`,
         `normalize_alef_maksura_ar` and `normalize_teh_marbuta_ar`.
      2. Map Western digits 0-9 to Arabic-Indic digits.
      3. Drop every character that is not in U+0600-U+06FF, whitespace, or one
         of ``. , ; : ! ? ( ) ' " -`` (this removes all Latin letters).
      4. Collapse whitespace runs to a single space and strip the ends.

    Returns the cleaned string.
    """
    text = dediac_ar(str(text))
    text = normalize_alef_ar(text)
    text = normalize_alef_maksura_ar(text)
    text = normalize_teh_marbuta_ar(text)
    text = text.translate(_WESTERN_TO_ARABIC_DIGITS)
    text = _DISALLOWED_CHARS_RE.sub('', text)
    text = _WHITESPACE_RUN_RE.sub(' ', text)
    # No `.lower()` here: only caseless characters survive the filter above,
    # so lower-casing could never change the result.
    return text.strip()

df['Question'] = df['Question'].apply(camel_clean)


In [None]:
all_text = ''.join(df['Question'].astype(str).tolist())
non_arabic_chars = set(extract_non_arabic(all_text))

print(non_arabic_chars)
print(f"\nTotal unique non-Arabic characters found: {len(non_arabic_chars)}")

{'(', ';', ' ', '!', ':', ')', '"', '?', "'", '.', '-', ','}

Total unique non-Arabic characters found: 12


In [None]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

# MARBERT model
model_name = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def _length_summary(prefix, lengths):
    """Return max / min / 50th / 75th percentile of `lengths`, keyed by `prefix`.

    All four values are None when `lengths` is empty, instead of raising.
    """
    if not lengths:
        return {
            f"{prefix}_max_len": None,
            f"{prefix}_min_len": None,
            f"{prefix}_50_percentile": None,
            f"{prefix}_75_percentile": None,
        }
    return {
        f"{prefix}_max_len": max(lengths),
        f"{prefix}_min_len": min(lengths),
        f"{prefix}_50_percentile": float(np.percentile(lengths, 50)),
        f"{prefix}_75_percentile": float(np.percentile(lengths, 75)),
    }


def add_token_lengths(df, text_column='Question', tok=None):
    """Add a per-row token count column and summarize token lengths.

    Args:
        df: DataFrame holding the texts. It is modified in place: a column
            named ``text_column + '_token_len'`` is added (0 for non-string
            cells).
        text_column: Name of the column containing the texts.
        tok: Object exposing ``tokenize(str) -> list``. Defaults to the
            notebook-level ``tokenizer``.

    Returns:
        ``(df, analysis)`` where ``analysis`` maps ``sentence_*`` keys to
        statistics over whole-text token counts and ``word_*`` keys to
        statistics over the token counts of individual whitespace-separated
        words. Statistics are None when there is nothing to summarize.
    """
    tok = tokenizer if tok is None else tok

    sentence_lens = []
    word_lens = []

    for text in df[text_column]:
        if not isinstance(text, str):
            sentence_lens.append(0)
            continue

        sentence_lens.append(len(tok.tokenize(text)))
        # `str.split()` never yields empty strings, so no extra filtering.
        word_lens.extend(len(tok.tokenize(word)) for word in text.split())

    df[text_column + '_token_len'] = sentence_lens

    analysis = {}
    analysis.update(_length_summary("sentence", sentence_lens))
    analysis.update(_length_summary("word", word_lens))

    return df, analysis

df, stats = add_token_lengths(df, 'Question')
print(stats)


{'sentence_max_len': 78, 'sentence_min_len': 2, 'sentence_50_percentile': 31.0, 'sentence_75_percentile': 37.0, 'word_max_len': 76, 'word_min_len': 1, 'word_50_percentile': 1.0, 'word_75_percentile': 1.0}


In [None]:
max_len = 45

# Special tokens the tokenizer adds around a single sequence. They count
# against the length budget when the text is encoded with `max_length`, so
# they are included here to report how many questions really fit untruncated.
n_special = tokenizer.num_special_tokens_to_add(pair=False)

# Reuse the token counts already stored in `Question_token_len` instead of
# tokenizing every question a second time. Non-string cells never count.
is_text = df['Question'].map(lambda q: isinstance(q, str))
fits_budget = is_text & (df['Question_token_len'] + n_special <= max_len)

num_sentences_under_max = int(fits_budget.sum())
percent_under_max = num_sentences_under_max / len(df) * 100

print(
    f"Percentage of sentences <= {max_len} tokens "
    f"(including {n_special} special tokens): {percent_under_max:.2f}%"
)

Percentage of sentences <= 45 tokens: 97.89%


In [None]:
df.shape

(19365, 3)

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Category'])


NUM_LABELS = df['Label'].nunique()
print("Labels:", label_encoder.classes_)
print("Number of classes:", NUM_LABELS)


Labels: ['أمراض الجهاز التنفسي' 'الغدد الصماء' 'جراحة العظام' 'جراحة عامة'
 'مرض السكري']
Number of classes: 5


In [None]:
df["Label"]

Unnamed: 0,Label
18177,0
9213,3
18727,4
10500,0
6379,0
...,...
21575,4
5390,3
860,2
15795,1


In [None]:
# pred_labels = label_encoder.inverse_transform(predicted_ids)

In [None]:
df.columns

Index(['Category', 'Question', 'Question_token_len', 'Label'], dtype='object')

In [None]:
df = df.rename(columns={'Label': 'labels'})

In [None]:
df['labels'] = df['labels'].astype('int64')


In [None]:
# !pip install transformers datasets peft accelerate bitsandbytes


In [None]:
# !pip install --upgrade transformers
# !pip install --upgrade peft


In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model
import pandas as pd


In [None]:

dataset = Dataset.from_pandas(df)

split_datasets = dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = split_datasets['train']
val_dataset = split_datasets['test']


In [None]:
from transformers import AutoTokenizer

model_name = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 45

def tokenize_function(examples):
    """Encode the 'Question' texts of a batch with the notebook tokenizer.

    Every text is padded and truncated to exactly MAX_LEN positions.

    NOTE(review): MAX_LEN presumably includes the tokenizer's special tokens,
    while the earlier length analysis counted content tokens only — confirm
    that the chosen budget still covers the intended share of questions.
    """
    return tokenizer(
        examples['Question'],
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/16460 [00:00<?, ? examples/s]

Map:   0%|          | 0/2905 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

# Load the pretrained checkpoint with a classification head sized for NUM_LABELS classes.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

# LoRA setup
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],  # module names chosen to match MARBERT's architecture
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)


# Wrap the base model so the LoRA configuration above is applied to it.
model = get_peft_model(model, lora_config)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

# Evaluation and checkpointing both run once per epoch, matching the
# `load_best_model_at_end=True` setting below.
training_args = TrainingArguments(
    output_dir="./marbert_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10,
    report_to="none",
    load_best_model_at_end=True
)

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        ``{"accuracy": <fraction of rows whose prediction equals the label>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
# # إعادة تسمية العمود
# train_dataset = train_dataset.rename_column("Label", "labels")
# val_dataset = val_dataset.rename_column("Label", "labels")

# # ضبط format للـ PyTorch
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [None]:
print(train_dataset.column_names)
print(val_dataset.column_names)


['Category', 'Question', 'Question_token_len', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']
['Category', 'Question', 'Question_token_len', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
# Keep only the model inputs and the label column; drop every other column.
# NOTE(review): `trainer` was constructed in an earlier cell from the previous
# `train_dataset` / `val_dataset` objects, so rebinding these names here does
# not appear to change what the trainer uses — move this cell above the
# Trainer construction if the pruning is meant to apply to training.
columns_to_keep = ['input_ids', 'attention_mask', 'labels', 'token_type_ids']
train_dataset = train_dataset.remove_columns([c for c in train_dataset.column_names if c not in columns_to_keep])
val_dataset = val_dataset.remove_columns([c for c in val_dataset.column_names if c not in columns_to_keep])


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6899,0.612959,0.823064
2,0.5676,0.481051,0.854733
3,0.6148,0.444976,0.860241
4,0.36,0.432191,0.861962
5,0.401,0.42935,0.86506


TrainOutput(global_step=10290, training_loss=0.6046035401327617, metrics={'train_runtime': 591.5444, 'train_samples_per_second': 139.127, 'train_steps_per_second': 17.395, 'total_flos': 1909877039874000.0, 'train_loss': 0.6046035401327617, 'epoch': 5.0})

In [None]:
from transformers import AutoTokenizer


model.save_pretrained("./marbert_finetuned_lora")

tokenizer.save_pretrained("./marbert_finetuned_lora")


('./marbert_finetuned_lora/tokenizer_config.json',
 './marbert_finetuned_lora/special_tokens_map.json',
 './marbert_finetuned_lora/vocab.txt',
 './marbert_finetuned_lora/added_tokens.json',
 './marbert_finetuned_lora/tokenizer.json')

In [None]:
# from google.colab import files
# import shutil

# # Zip the folder
# shutil.make_archive("marbert_finetuned_lora", 'zip', "marbert_finetuned_lora")

# # Download
# files.download("marbert_finetuned_lora.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# import shutil

# # Delete folder and its contents
# shutil.rmtree("marbert_finetuned_lora")
