In [2]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path, output_txt_path="pq_corpus.txt"):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write(text)

# Example usage
extract_text_from_pdf("pq_textbook.pdf")


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import pipeline


# Load the tokenizer and base model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

# Load the plain text dataset
datasets = load_dataset('text', data_files={'train': 'pq_corpus.txt'})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["text"])

# Set up data collator for MLM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./distilbert-pq-mlm",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

# Run training
trainer.train()

model_save_path = "./distilbert-pq-mlm"

# Save the domain-adapted model
trainer.save_model("./distilbert-pq-mlm")
tokenizer.save_pretrained(model_save_path)

# Verify saved files
print(f"Model and tokenizer saved to: {model_save_path}")

  from .autonotebook import tqdm as notebook_tqdm





Generating train split: 26838 examples [00:00, 797084.97 examples/s]
Map: 100%|██████████| 26838/26838 [00:02<00:00, 11370.21 examples/s]
 10%|▉         | 500/5034 [30:14<5:32:29,  4.40s/it]

{'loss': 3.1743, 'grad_norm': 22.645418167114258, 'learning_rate': 4.503377036154152e-05, 'epoch': 0.3}


 20%|█▉        | 1000/5034 [59:01<3:56:10,  3.51s/it]

{'loss': 2.8826, 'grad_norm': 18.324779510498047, 'learning_rate': 4.0067540723083036e-05, 'epoch': 0.6}


 30%|██▉       | 1500/5034 [1:27:46<3:25:50,  3.49s/it]

{'loss': 2.7576, 'grad_norm': 14.610793113708496, 'learning_rate': 3.5101311084624553e-05, 'epoch': 0.89}


 40%|███▉      | 2000/5034 [1:56:04<2:50:12,  3.37s/it]

{'loss': 2.6183, 'grad_norm': 30.54942512512207, 'learning_rate': 3.0135081446166074e-05, 'epoch': 1.19}


 50%|████▉     | 2500/5034 [2:23:42<2:09:51,  3.07s/it]

{'loss': 2.578, 'grad_norm': 17.067272186279297, 'learning_rate': 2.5168851807707588e-05, 'epoch': 1.49}


 60%|█████▉    | 3000/5034 [2:49:40<1:52:01,  3.30s/it]

{'loss': 2.5107, 'grad_norm': 13.001250267028809, 'learning_rate': 2.0202622169249108e-05, 'epoch': 1.79}


 70%|██████▉   | 3500/5034 [3:16:11<1:23:24,  3.26s/it]

{'loss': 2.4417, 'grad_norm': 19.580732345581055, 'learning_rate': 1.5236392530790625e-05, 'epoch': 2.09}


 79%|███████▉  | 4000/5034 [3:46:53<1:18:16,  4.54s/it]

{'loss': 2.2824, 'grad_norm': 18.23748207092285, 'learning_rate': 1.0270162892332142e-05, 'epoch': 2.38}


 89%|████████▉ | 4500/5034 [4:13:29<26:22,  2.96s/it]  

{'loss': 2.2569, 'grad_norm': 16.32189178466797, 'learning_rate': 5.30393325387366e-06, 'epoch': 2.68}


 99%|█████████▉| 5000/5034 [4:38:24<01:44,  3.06s/it]

{'loss': 2.2809, 'grad_norm': 18.75294303894043, 'learning_rate': 3.3770361541517685e-07, 'epoch': 2.98}


100%|██████████| 5034/5034 [4:40:05<00:00,  3.34s/it]


{'train_runtime': 16805.8714, 'train_samples_per_second': 4.791, 'train_steps_per_second': 0.3, 'train_loss': 2.577739136627282, 'epoch': 3.0}


281 mins 40.6 sec

In [9]:

# Test loading
test_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
test_model = AutoModelForMaskedLM.from_pretrained(model_save_path)

# Create fill-mask pipeline
fill_mask = pipeline("fill-mask", model=test_model, tokenizer=test_tokenizer)

# Example test sentences relevant to power quality
test_sentences = [
    "A sudden [MASK] in voltage can cause sensitive equipment to fail.",
    "The power [MASK] was caused by lightning strike.",
    "The distribution [MASK] failed due to overload.",
    "Multiple [MASK] faults occurred during the thunderstorm.",
    "The [MASK] waveform showed significant distortion.", 
    "[MASK] fell on power lines during a storm, affecting multiple feeders; estimated restoration time is 4 hours."
]

# Run predictions
print("\nTesting domain-adapted model:")
print("-" * 50)
for sentence in test_sentences:
    print(f"\nInput: {sentence}")
    results = fill_mask(sentence)
    print("Top predictions:")
    for r in results[:5]:  # Show top 5 predictions
        print(f"- {r['token_str']}: {r['score']:.4f}")

Device set to use cpu



Testing domain-adapted model:
--------------------------------------------------

Input: A sudden [MASK] in voltage can cause sensitive equipment to fail.
Top predictions:
- increase: 0.4798
- drop: 0.1674
- rise: 0.1308
- change: 0.0904
- surge: 0.0441

Input: The power [MASK] was caused by lightning strike.
Top predictions:
- problem: 0.1721
- loss: 0.1222
- quality: 0.0741
- interruption: 0.0732
- system: 0.0654

Input: The distribution [MASK] failed due to overload.
Top predictions:
- system: 0.5795
- feeder: 0.1961
- line: 0.0223
- systems: 0.0182
- lines: 0.0142

Input: Multiple [MASK] faults occurred during the thunderstorm.
Top predictions:
- ground: 0.1447
- surge: 0.0926
- lightning: 0.0635
- fault: 0.0274
- transient: 0.0264

Input: The [MASK] waveform showed significant distortion.
Top predictions:
- first: 0.1249
- resulting: 0.0728
- initial: 0.0675
- input: 0.0616
- original: 0.0550

Input: [MASK] fell on power lines during a storm, affecting multiple feeders; estimated

# more textbooks

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import pipeline
import fitz  # PyMuPDF


# Load your previously fine-tuned model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("./distilbert-pq-mlm")
model = AutoModelForMaskedLM.from_pretrained("./distilbert-pq-mlm")

def extract_text_from_pdf(pdf_path, output_txt_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write(text)

# Extract text from a new textbook
extract_text_from_pdf("t2.pdf", output_txt_path="new_corpus.txt")

# Load the text from the new textbook
datasets = load_dataset('text', data_files={'train': 'new_corpus.txt'})

# The rest of your tokenization and training code remains the same
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["text"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments for continued training
training_args = TrainingArguments(
    output_dir="./distilbert-pq-enhanced",  # New path for the enhanced model
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10000,
    save_total_limit=2,
)

# Initialize Trainer with your previously fine-tuned model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

# Continue training
trainer.train()

# Save the further enhanced model
model_save_path = "./distilbert-pq-enhanced"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

Generating train split: 16293 examples [00:00, 766522.66 examples/s]
Map: 100%|██████████| 16293/16293 [00:01<00:00, 11451.57 examples/s]
 16%|█▋        | 500/3057 [26:45<2:40:21,  3.76s/it]

{'loss': 3.2649, 'grad_norm': 23.426637649536133, 'learning_rate': 4.182204775924109e-05, 'epoch': 0.49}


 33%|███▎      | 1000/3057 [53:36<1:41:39,  2.97s/it]

{'loss': 3.1102, 'grad_norm': 19.356590270996094, 'learning_rate': 3.3644095518482174e-05, 'epoch': 0.98}


 49%|████▉     | 1500/3057 [1:19:02<1:16:48,  2.96s/it]

{'loss': 2.856, 'grad_norm': 21.036420822143555, 'learning_rate': 2.546614327772326e-05, 'epoch': 1.47}


 65%|██████▌   | 2000/3057 [1:44:19<53:04,  3.01s/it]  

{'loss': 2.7942, 'grad_norm': 18.880910873413086, 'learning_rate': 1.7288191036964345e-05, 'epoch': 1.96}


 82%|████████▏ | 2500/3057 [2:09:48<27:50,  3.00s/it]

{'loss': 2.6668, 'grad_norm': 16.75374984741211, 'learning_rate': 9.110238796205431e-06, 'epoch': 2.45}


 98%|█████████▊| 3000/3057 [2:34:38<02:47,  2.95s/it]

{'loss': 2.5409, 'grad_norm': 17.763526916503906, 'learning_rate': 9.322865554465163e-07, 'epoch': 2.94}


100%|██████████| 3057/3057 [2:37:25<00:00,  3.09s/it]


{'train_runtime': 9445.6973, 'train_samples_per_second': 5.175, 'train_steps_per_second': 0.324, 'train_loss': 2.866854726624793, 'epoch': 3.0}


('./distilbert-pq-enhanced\\tokenizer_config.json',
 './distilbert-pq-enhanced\\special_tokens_map.json',
 './distilbert-pq-enhanced\\vocab.txt',
 './distilbert-pq-enhanced\\added_tokens.json',
 './distilbert-pq-enhanced\\tokenizer.json')

In [6]:

# Test loading
test_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
test_model = AutoModelForMaskedLM.from_pretrained(model_save_path)

# Create fill-mask pipeline
fill_mask = pipeline("fill-mask", model=test_model, tokenizer=test_tokenizer)

# Example test sentences relevant to power quality
test_sentences = [
    "A sudden [MASK] in voltage can cause sensitive equipment to fail.",
    "The power [MASK] was caused by lightning strike.",
    "The distribution [MASK] failed due to overload.",
    "Multiple [MASK] faults occurred during the thunderstorm.",
    "The [MASK] waveform showed significant distortion.", 
    "[MASK] fell on power lines during a storm, affecting multiple feeders; estimated restoration time is 4 hours."
]

test_sentences2 = [
    "Customer reported flickering lights and a [MASK] smell near the panel.",
    "A [MASK] was discovered inside the transformer cabinet, likely causing the outage.",
    "SCADA alarm triggered for [MASK] voltage detected at substation 4C.",
    "Feeder F12 tripped due to an [MASK] on phase B during high winds.",
    "Maintenance crew removed a [MASK] nest from the capacitor bank enclosure.",
    "An [MASK] caused a line-to-ground fault near pole 36.",
    "The service transformer experienced [MASK] oil leakage.",
    "Dispatch noted a [MASK] arc near the insulator on structure 112.",
    "Post-storm inspection found a [MASK] hanging on the primary conductor.",
    "Customer reported [MASK] noise from the overhead lines during peak load.",
    "The [MASK] tripped multiple times overnight, suspect loose connector.",
    "Protective relay flagged a [MASK] imbalance in feeder 27A.",
    "[MASK] identified on DGA report—suggest potential internal fault.",
    "Crews isolated section B for suspected [MASK] in underground cable.",
    "Outage caused by [MASK] falling across all three phases near intersection."
]


# Run predictions
print("\nTesting domain-adapted model:")
print("-" * 50)
for sentence in test_sentences2:
    print(f"\nInput: {sentence}")
    results = fill_mask(sentence)
    print("Top predictions:")
    for r in results[:5]:  # Show top 5 predictions
        print(f"- {r['token_str']}: {r['score']:.4f}")

Device set to use cpu



Testing domain-adapted model:
--------------------------------------------------

Input: Customer reported flickering lights and a [MASK] smell near the panel.
Top predictions:
- foul: 0.2724
- metallic: 0.0471
- toxic: 0.0444
- rank: 0.0369
- burning: 0.0340

Input: A [MASK] was discovered inside the transformer cabinet, likely causing the outage.
Top predictions:
- leak: 0.3965
- fault: 0.1289
- fuse: 0.1183
- spark: 0.0247
- conductor: 0.0180

Input: SCADA alarm triggered for [MASK] voltage detected at substation 4C.
Top predictions:
- high: 0.2192
- the: 0.1270
- excessive: 0.0916
- any: 0.0582
- fault: 0.0311

Input: Feeder F12 tripped due to an [MASK] on phase B during high winds.
Top predictions:
- arc: 0.7042
- interruption: 0.0848
- fault: 0.0404
- impact: 0.0337
- accident: 0.0249

Input: Maintenance crew removed a [MASK] nest from the capacitor bank enclosure.
Top predictions:
- squirrel: 0.1528
- tree: 0.0730
- slot: 0.0652
- seal: 0.0528
- tape: 0.0252

Input: An [MASK] c