# 1- Run the whisper medium model to do the transcribing, from English audio to text. Using well-defined data, to show the computing sources requirements

In [3]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import torch
torch.cuda.reset_peak_memory_stats()


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

#model_id = "openai/whisper-medium"
#model_id = "/home/lm2445/Arabic_models/models--openai--whisper-medium/snapshots/abdf7c39ab9d0397620ccaea8974cc764cd0953e"  
model_id = "lm2445/for_transribing"


model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True   # <-- Add this line, all the input longer than 30 sec
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])
peak_memory = torch.cuda.max_memory_allocated()
print(f"Peak GPU memory usage: {peak_memory / 1024**2:.2f} MB")

Device set to use cuda:0
Generating validation split: 100%|██████████| 1/1 [00:00<00:00, 112.68 examples/s]
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


 Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up guards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkett Foster's landscapes smile at one much in the same way that Mr. Carcar used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back before he says, like a champ pooler in a Turkish bath. Next man.
Peak GPU memory usage: 1625.11 MB


# 2- using local Arabic audio file as input - show how the model work for Arabic audio -> Arabic text

In [9]:
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

torch.cuda.reset_peak_memory_stats()

file_path = "/home/lm2445/project_pi_sjf37/lm2445/Arabic/V8.wav"
waveform, sr = torchaudio.load(file_path)

if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0)

sample = {"array": waveform.squeeze().numpy(), "sampling_rate": sr}

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "lm2445/for_transribing"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
).to(device)

processor = AutoProcessor.from_pretrained(model_id)

# ------------------------------------------------------
# FIXED: Only remove forced tokens from CONFIG, not tokenizer
# ------------------------------------------------------
model.config.forced_decoder_ids = None
model.config.suppress_tokens = None

# ------------------------------------------------------
# Pipeline
# ------------------------------------------------------
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
    return_timestamps=False,
    generate_kwargs={
        "language": "ar",
        "task": "transcribe",
        "forced_decoder_ids": None,
        "suppress_tokens": None,
    },
)

# Only keep 30 seconds for testing
sample["array"] = sample["array"][: int(30 * sample["sampling_rate"])]

result = pipe(sample)
print(result["text"])

peak_memory = torch.cuda.max_memory_allocated()
print(f"Peak GPU memory usage: {peak_memory/1024**2:.2f} MB")


Device set to use cuda:0


 مونا ازايك يا مونا؟ مونا زاكي يا حبيبتي زاكي يا مونا اشكرك جدا يا مونا شكري يا حبيبتي انا فخور بيك جدا يا مونا
Peak GPU memory usage: 3082.37 MB


# 3-Try deidentify via Camelbert
## the Camelbert can detect the location of the important words and show their locations as ouput
## PER: person; ORG: organization; LOC: location; MISC: Miscellaneous

## 3.1 two examples

In [3]:
# try deidentify 
# https://huggingface.co/CAMeL-Lab/bert-base-arabic-camelbert-mix-ner
from transformers import pipeline
import torch
torch.cuda.reset_peak_memory_stats()
#local_model = "/home/lm2445/Arabic_models/models--CAMeL-Lab--bert-base-arabic-camelbert-mix-ner/snapshots/40b8059a6f3bfbb49d64038e131f49b93cc37417"
ner = pipeline('ner', model='lm2445/for_deidentify')
# ner = pipeline(
#     "ner",
#     model=local_model,
#     tokenizer=local_model,
#     device=0 if torch.cuda.is_available() else -1,
# )
results = ner("إمارة أبوظبي هي إحدى إمارات دولة الإمارات العربية المتحدة السبع")
print (results)
peak_memory = torch.cuda.max_memory_allocated()
print(f"Peak GPU memory usage: {peak_memory / 1024**2:.2f} MB")

Some weights of the model checkpoint at lm2445/for_deidentify were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-LOC', 'score': np.float32(0.9928443), 'index': 2, 'word': 'أبوظبي', 'start': 6, 'end': 12}, {'entity': 'B-LOC', 'score': np.float32(0.95888805), 'index': 8, 'word': 'الإمارات', 'start': 33, 'end': 41}, {'entity': 'I-LOC', 'score': np.float32(0.9397786), 'index': 9, 'word': 'العربية', 'start': 42, 'end': 49}, {'entity': 'I-LOC', 'score': np.float32(0.96159947), 'index': 10, 'word': 'المتحدة', 'start': 50, 'end': 57}]
Peak GPU memory usage: 1883.08 MB


In [4]:
# try deidentify 
# https://huggingface.co/CAMeL-Lab/bert-base-arabic-camelbert-mix-ner
from transformers import pipeline
import torch
torch.cuda.reset_peak_memory_stats()
#local_model = "/home/lm2445/Arabic_models/models--CAMeL-Lab--bert-base-arabic-camelbert-mix-ner/snapshots/40b8059a6f3bfbb49d64038e131f49b93cc37417"
ner = pipeline('ner', model='lm2445/for_deidentify')
# ner = pipeline(
#     "ner",
#     model=local_model,
#     tokenizer=local_model,
#     device=0 if torch.cuda.is_available() else -1,
# )
results = ner(" اه قل الو انت اعترفين الو الو مونا ازايك يا مونا مونا زاكي كل سنة انتو طيبين يا حبيبتي زاكي يا مونا احبك جدا وكل سنة حبيبك طيب اشكرك جدا يا مونا و الف الف مبروك ع الموثلسل الف الف الف مبروك شكري حبيبتي انا انا فخور بيكي جدا يا مونا فخور حبيبتك بالتحديد لما تبقى تبقى فقربية بحاجة شرف")
print (results)
peak_memory = torch.cuda.max_memory_allocated()
print(f"Peak GPU memory usage: {peak_memory / 1024**2:.2f} MB")

Some weights of the model checkpoint at lm2445/for_deidentify were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-PERS', 'score': np.float32(0.54886025), 'index': 9, 'word': 'مونا', 'start': 31, 'end': 35}, {'entity': 'B-PERS', 'score': np.float32(0.71680826), 'index': 13, 'word': 'مونا', 'start': 45, 'end': 49}, {'entity': 'B-PERS', 'score': np.float32(0.8773864), 'index': 14, 'word': 'مونا', 'start': 50, 'end': 54}, {'entity': 'I-PERS', 'score': np.float32(0.85757035), 'index': 15, 'word': 'زا', 'start': 55, 'end': 57}, {'entity': 'I-PERS', 'score': np.float32(0.8754845), 'index': 16, 'word': '##كي', 'start': 57, 'end': 59}, {'entity': 'B-PERS', 'score': np.float32(0.91140044), 'index': 24, 'word': 'زا', 'start': 88, 'end': 90}, {'entity': 'B-PERS', 'score': np.float32(0.56098473), 'index': 25, 'word': '##كي', 'start': 90, 'end': 92}, {'entity': 'B-PERS', 'score': np.float32(0.64975023), 'index': 27, 'word': 'مونا', 'start': 96, 'end': 100}, {'entity': 'B-PERS', 'score': np.float32(0.64129), 'index': 38, 'word': 'مونا', 'start': 141, 'end': 145}, {'entity': 'B-PERS', 'score': np.f

## 3.2 Whole Deidentify Process: text-> text

In [6]:
from transformers import pipeline
import torch
import re

# -----------------------
# Load NER model
# -----------------------
torch.cuda.reset_peak_memory_stats()
#ner = pipeline("ner", model="CAMeL-Lab/bert-base-arabic-camelbert-mix-ner", grouped_entities=True)
#local_model = "/home/lm2445/Arabic_models/models--CAMeL-Lab--bert-base-arabic-camelbert-mix-ner/snapshots/40b8059a6f3bfbb49d64038e131f49b93cc37417"
#ner = pipeline('ner', model='lm2445/for_deidentify')
ner = pipeline("ner",
               model="lm2445/for_deidentify",
               grouped_entities=True)

# ner = pipeline(
#     "ner",
#     model=local_model,
#     tokenizer=local_model,
#     device=0 if torch.cuda.is_available() else -1,
#     grouped_entities=True,
# )

# -----------------------
# Input sentence
# -----------------------
#text = "اه قل الو انت اعترفين الو الو مونا ازايك يا مونا مونا زاكي كل سنة انتو طيبين يا حبيبتي زاكي يا مونا احبك جدا وكل سنة حبيبك طيب اشكرك جدا يا مونا و الف الف مبروك ع الموثلسل الف الف الف مبروك شكري حبيبتي انا انا فخور بيكي جدا يا مونا فخور حبيبتك بالتحديد لما تبقى تبقى فقربية بحاجة شرف"
text = "سافرت الدكتورة ليلى حمدان مع زميلها الدكتور عمر الكيلاني إلى مدينة جنيف لزيارة مقر منظمة الصحة العالمية، حيث عُقِد اجتماع رسمي مع ممثلين من منظمة الأمم المتحدة لإطلاق مشروع بحثي جديد حول الذكاء الاصطناعي في الرعاية الصحية. وبعد الاجتماع، توجّه الوفد إلى جامعة أكسفورد في لندن لمناقشة تعاون علمي مع معهد البيانات الطبية. وأكدت ليلى حمدان أن نتائج الدراسة ستُنشر بالتعاون مع جامعة القاهرة ومركز الابتكار الرقمي، بينما أشار عمر الكيلاني إلى أهمية تطوير أدوات تعتمد على تحليل اللغة الطبيعية والتعلم العميق لتطبيقها في أنظمة المستشفيات في الشرق الأوسط."
# -----------------------
# Run NER
# -----------------------
results = ner(text)

# -----------------------
# Build replacements
# -----------------------
deidentified_text = text

for entity in results:
    ent_text = entity['word']
    ent_label = entity['entity_group']

    # CAMeL labels: PER, ORG, LOC, MISC
    placeholder = f"<{ent_label}>"

    # Replace entity text in the sentence (use regex to avoid partial overlaps)
    pattern = re.escape(ent_text)
    deidentified_text = re.sub(pattern, placeholder, deidentified_text)

# -----------------------
# Print results
# -----------------------
print("=== Original Text ===")
print(text)
print("\n=== NER Output ===")
print(results)
print("\n=== De-identified Text ===")
print(deidentified_text)

# -----------------------
# GPU Memory
# -----------------------
peak = torch.cuda.max_memory_allocated()
print(f"\nPeak GPU memory usage: {peak / 1024**2:.2f} MB")


Some weights of the model checkpoint at lm2445/for_deidentify were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


=== Original Text ===
سافرت الدكتورة ليلى حمدان مع زميلها الدكتور عمر الكيلاني إلى مدينة جنيف لزيارة مقر منظمة الصحة العالمية، حيث عُقِد اجتماع رسمي مع ممثلين من منظمة الأمم المتحدة لإطلاق مشروع بحثي جديد حول الذكاء الاصطناعي في الرعاية الصحية. وبعد الاجتماع، توجّه الوفد إلى جامعة أكسفورد في لندن لمناقشة تعاون علمي مع معهد البيانات الطبية. وأكدت ليلى حمدان أن نتائج الدراسة ستُنشر بالتعاون مع جامعة القاهرة ومركز الابتكار الرقمي، بينما أشار عمر الكيلاني إلى أهمية تطوير أدوات تعتمد على تحليل اللغة الطبيعية والتعلم العميق لتطبيقها في أنظمة المستشفيات في الشرق الأوسط.

=== NER Output ===
[{'entity_group': 'PERS', 'score': np.float32(0.96569043), 'word': 'ليلى حمدان', 'start': 15, 'end': 25}, {'entity_group': 'PERS', 'score': np.float32(0.9855659), 'word': 'عمر الكيلاني', 'start': 44, 'end': 56}, {'entity_group': 'LOC', 'score': np.float32(0.99765825), 'word': 'جنيف', 'start': 67, 'end': 71}, {'entity_group': 'ORG', 'score': np.float32(0.99653834), 'word': 'منظمة الصحة العالمية', 'start': 83