In [None]:
import gzip
import logging
import random
import sys
import traceback
from datetime import datetime
import tqdm

from transformers import AutoModelForMaskedLM, AutoTokenizer
from datasets import  load_from_disk, Dataset, load_dataset

from sentence_transformers import SentenceTransformer, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.losses import DenoisingAutoEncoderLoss
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/uazam/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Base checkpoint name shared by the tokenizer and the models created from it.
model_checkpoint = "bert-base-uncased"
# Tokenizer for the checkpoint; the windowing helpers further down read it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): `model` is rebound to a SentenceTransformer further down and no use of this
# masked-LM instance is visible before that — confirm whether this load is still needed.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the 'train' split and order it by the 'hits' column, largest value first.
ds = load_dataset('UmarAzam/wikipedia_subsets', split='train').sort('hits', reverse=True)

In [None]:
max_token_length = tokenizer.model_max_length
# `split_overlap` is the number of equal segments the model's context window is divided into.
# Consecutive training windows move forward by one segment, so neighbouring windows share
# text and the model can learn contextual associations across a moving window.
split_overlap = 2
# Number of tokens per segment. Floor division keeps the value an exact integer instead of
# round-tripping through a float.
token_stride = max_token_length // split_overlap

def generate_splits(item):
  """Attach split points to a batch of texts.

  Tokenizes ``item['text']`` with the module-level ``tokenizer`` (requesting the
  offset mapping) and, for each text, collects the second element of the offset
  pair of every ``token_stride``-th token, starting with token index 0. The
  collected values are stored under ``item['offset_list']``; ``generate_sentences``
  later uses them as slice indices into the text.

  Returns the same ``item`` object, mutated in place.
  """
  encoded = tokenizer(item['text'], return_offsets_mapping=True)
  split_points = []
  for token_offsets in encoded['offset_mapping']:
    # One value per stride step: positions 0, token_stride, 2 * token_stride, ...
    split_points.append([pair[1] for pair in token_offsets[::token_stride]])
  item['offset_list'] = split_points
  return item

def generate_sentences(item, overlap=None):
    """Cut each text of a batch into overlapping windows.

    For every text, window ``i`` is the slice of the text between split point ``i``
    and split point ``i + overlap`` of the matching ``offset_list``, so consecutive
    windows advance by one split point.

    Args:
        item: Batch dictionary with parallel lists under ``'text'`` (strings) and
            ``'offset_list'`` (lists of indices into the matching text).
        overlap: Number of split points each window spans. Defaults to the
            module-level ``split_overlap``.

    Returns:
        ``{'text': windows}`` with the windows of all texts concatenated in input
        order. A text with ``overlap`` or fewer split points contributes no windows.
    """
    if overlap is None:
        overlap = split_overlap

    sentences = []
    for text, offset_list in zip(item['text'], item['offset_list']):
        # The start index is bounded by `overlap` so that `i + overlap` always stays
        # inside `offset_list`, whatever overlap value is configured.
        sentences.extend(
            text[offset_list[i]:offset_list[i + overlap]]
            for i in range(len(offset_list) - overlap)
        )
    return {'text': sentences}

In [None]:
# ds_s = ds.select(range(100))

# test = ds_s.map(generate_splits, batched=True, remove_columns = ['text','hits'])

In [None]:
# Keep the first 2000 rows of the sorted dataset and add each text's split points.
ds_m = ds.select(range(2000)).map(generate_splits, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (14716 > 512). Running this sequence through the model will result in indexing errors


In [None]:
ds_m

Dataset({
    features: ['text', 'hits', 'offset_list'],
    num_rows: 2000
})

In [None]:
# Expand every row into its text windows; all input columns are dropped so that only the
# 'text' column returned by generate_sentences remains.
ds_sentences = ds_m.map(
    generate_sentences,
    batched=True,
    remove_columns=ds_m.column_names,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
ds_sentences

Dataset({
    features: ['text'],
    num_rows: 91516
})

In [None]:
# Send INFO-level (and above) log records to stdout, prefixed with a timestamp.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Training parameters
model_name = model_checkpoint
train_batch_size = 4
num_epochs = 1
max_seq_length = tokenizer.model_max_length

# Output directory name: checkpoint name (slashes replaced by dashes), batch size, start time.
run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
output_dir = f"./output/training_tsdae-{model_name.replace('/', '-')}-{train_batch_size}-{run_stamp}"

# 1. Define the sentence transformer model: a transformer module followed by "cls" pooling.
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Alternative: load a pre-trained SentenceTransformer model OR use mean pooling
# model = SentenceTransformer(model_name)
# model.max_seq_length = max_seq_length


2025-07-18 04:47:48 - Use pytorch device_name: cuda:0


In [None]:
# Create a dataset from the sentences.
# This is a plain assignment: `dataset` names the same object as `ds_sentences`, not a copy.
dataset = ds_sentences


def noise_transform(batch, del_ratio=0.6):
    """
    Applies noise by randomly deleting words.

    Every word is dropped independently with probability ``del_ratio``. If all words
    of a text would be dropped, one randomly chosen word of that text is used as the
    noisy text instead. A text that tokenizes to no words is passed through unchanged.

    WARNING: nltk's tokenization/detokenization is designed primarily for English.
    For other languages, especially those without clear word boundaries (e.g., Chinese),
    custom tokenization and detokenization are strongly recommended.

    Args:
        batch (Dict[str, List[str]]): A dictionary with the structure
            {column_name: [string1, string2, ...]}, where each list contains
            the batch data for the respective column. Must contain a "text" key.
        del_ratio (float): The ratio of words to delete. Defaults to 0.6.

    Returns:
        Dict[str, List[str]]: ``{"noisy": noisy_texts, "text": batch["text"]}``, where
        ``noisy_texts`` holds one noisified string per input text.

    Raises:
        AssertionError: If ``del_ratio`` is outside ``[0, 1)`` or ``batch`` is not a
            dictionary with a "text" key.
    """
    from nltk import word_tokenize
    from nltk.tokenize.treebank import TreebankWordDetokenizer

    assert 0.0 <= del_ratio < 1.0, "del_ratio must be in the range [0, 1)"
    assert isinstance(batch, dict) and "text" in batch, "batch must be a dictionary with a 'text' key."

    # One detokenizer instance is enough for the whole batch.
    detokenizer = TreebankWordDetokenizer()

    noisy_texts = []
    for text in batch["text"]:
        words = word_tokenize(text)
        if not words:
            noisy_texts.append(text)
            continue

        # A word survives when its draw lands outside the deletion band [0, del_ratio),
        # so on average a `del_ratio` share of the words is removed.
        kept_words = [word for word in words if random.random() >= del_ratio]
        # Guarantee that at least one word remains
        if not kept_words:
            noisy_texts.append(random.choice(words))
            continue

        noisy_texts.append(detokenizer.detokenize(kept_words))
    return {"noisy": noisy_texts, "text": batch["text"]}


# TSDAE requires a dataset with 2 columns: a noisified text column and a text column.
# `noise_transform` deletes some words; customize it to noisify your text some other way.
# The transform is applied on access (rather than via `map`) so the noisified text differs
# each epoch. `with_transform` returns a new dataset object, so the object `dataset` was
# bound to before this statement is not modified.
dataset = dataset.with_transform(noise_transform, columns=["text"], output_all_columns=True)
dataset = dataset.train_test_split(test_size=10000)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(train_dataset)
print(train_dataset[0])
# Illustrative shape of the two prints above (row count and text depend on the data):
# """
# Dataset({
#     features: ['text'],
#     num_rows: <number of training rows>
# })
# {
#     'noisy': 'to be the primary antiviral drug used combat influenza commonly as the bird flu.',
#     'text': 'Oseltamivir is considered to be the primary antiviral drug used to combat avian influenza, commonly known as the bird flu.',
# }
# """



Dataset({
    features: ['text'],
    num_rows: 81516
})
{'noisy': "Highway motorists enter the outskirts legal speed limit drops a short from 55 mph to mph leading some drivers who are alert to . fine the posted speed even mph is 146 . Initially used enforcement for construction zones only on books to throughout . red light program and planning put school . Some suburbs . Alsip) have cameras intersections . Some red-light speed limit enforcement cameras (radar') have now approved or are implementation of speed enforcement cameras . The Maryland legislature such program in January 2006 . In 2005, 2006, 2008 2009 the California legislature considered did pass, to implement limit enforcement cameras . legislators considering expanding their speed limit enforcement cameras successes such as 158,811 in revenue three months . 2007 study of speed on the State 101 in Scottsdale found 50 reduction in total crash frequency injuries falling% rear-end increased by As of late, cameras placed along

In [None]:
# As you can see, the noisy text is applied on the fly when the sample is accessed.

# 3. Define our training loss: https://sbert.net/docs/package_reference/sentence_transformer/losses.html#denoisingautoencoderLoss
# With tie_encoder_decoder=True the decoder is created from the encoder's own checkpoint, so
# `decoder_name_or_path` is not passed: supplying it has no effect beyond the
# "decoder_name_or_path will be invalid" warning. Warnings about newly initialized decoder
# weights are still expected; those weights are trained from scratch.
train_loss = DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)

# 4. Evaluator used during training, reported alongside the evaluation loss.
stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
dev_columns = {
    column: list(stsb_eval_dataset[column])
    for column in ("sentence1", "sentence2", "score")
}
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=dev_columns["sentence1"],
    sentences2=dev_columns["sentence2"],
    scores=dev_columns["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

# Score the model once before any training step, as a baseline.
logging.info("Evaluation before training:")
eval_result = dev_evaluator(model)
print(f"Evaluation results: {eval_result}")


# 5. Define the training arguments
# Optional training parameters:
optimization_options = dict(
    learning_rate=3e-5,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
)
# Optional tracking/debugging parameters: evaluate and save every 1000 steps, keep at most
# 2 checkpoints, log every 100 steps.
tracking_options = dict(
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
    run_name=None,  # Will be used in W&B if `wandb` is installed
)
args = SentenceTransformerTrainingArguments(
    output_dir=output_dir,  # Required parameter
    **optimization_options,
    **tracking_options,
)


2025-07-18 04:48:12 - When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

2025-07-18 04:48:14 - Evaluation before training:
2025-07-18 04:48:14 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset:
2025-07-18 04:48:52 - Cosine-Similarity:	Pearson: 0.2916	Spearman: 0.3173
Evaluation results: {'sts-dev_pearson_cosine': 0.2915711158741553, 'sts-dev_spearman_cosine': 0.3173135052190934}


In [None]:

# 6. Create the trainer & start training
# The trainer is given the model, the TSDAE loss, both dataset splits and the STS dev
# evaluator.
trainer = SentenceTransformerTrainer(
    model=model,
    loss=train_loss,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=dev_evaluator,
)
trainer.train()
# Alternative call, kept for reference:
#trainer.train(resume_from_checkpoint = True)

# 7. Build the evaluator for the STS Benchmark test split (it is invoked in a later cell).
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_columns = {
    column: list(test_dataset[column])
    for column in ("sentence1", "sentence2", "score")
}
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_columns["sentence1"],
    sentences2=test_columns["sentence2"],
    scores=test_columns["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine
1000,4.5054,4.501888,0.288322,0.322025
2000,4.118,4.148431,0.28734,0.343915
3000,4.002,3.980737,0.300894,0.365024
4000,3.9431,3.879455,0.323986,0.403542
5000,3.7712,3.802709,0.390993,0.465541
6000,3.7923,3.745499,0.444961,0.504401
7000,3.6957,3.697093,0.475574,0.537404
8000,3.6329,3.660428,0.534822,0.578038
9000,3.6317,3.626774,0.523794,0.557595
10000,3.6344,3.600181,0.526074,0.553342


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


2025-07-18 05:09:57 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset in epoch 0.049070121203199375 after 1000 steps:
2025-07-18 05:09:58 - Cosine-Similarity:	Pearson: 0.2883	Spearman: 0.3220
2025-07-18 05:09:58 - Saving model checkpoint to ./output/training_tsdae-bert-base-uncased-4-2025-07-18_04-47-46/checkpoint-1000
2025-07-18 05:09:58 - Save model to ./output/training_tsdae-bert-base-uncased-4-2025-07-18_04-47-46/checkpoint-1000
2025-07-18 05:33:03 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset in epoch 0.09814024240639875 after 2000 steps:
2025-07-18 05:33:05 - Cosine-Similarity:	Pearson: 0.2873	Spearman: 0.3439
2025-07-18 05:33:05 - Saving model checkpoint to ./output/training_tsdae-bert-base-uncased-4-2025-07-18_04-47-46/checkpoint-2000
2025-07-18 05:33:05 - Save model to ./output/training_tsdae-bert-base-uncased-4-2025-07-18_04-47-46/checkpoint-2000
2025-07-18 05:52:32 - EmbeddingSimilarityEvaluator: Evaluating the model on

In [None]:
# Run the test-split evaluator on the model and print the result of THIS evaluation
# (`eval_results`), not the earlier pre-training `eval_result`.
eval_results = test_evaluator(model)
print(f"Evaluation results: {eval_results}")

# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)

2025-07-18 11:44:39 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-test dataset:
2025-07-18 11:44:41 - Cosine-Similarity:	Pearson: 0.4154	Spearman: 0.4684
Evaluation results: {'sts-dev_pearson_cosine': 0.2915711158741553, 'sts-dev_spearman_cosine': 0.3173135052190934}
2025-07-18 11:44:41 - Save model to ./output/training_tsdae-bert-base-uncased-4-2025-07-18_04-47-46/final


In [None]:
# Upload the model under "<checkpoint name>-industrialtech"; the call is the last
# expression of the cell, so its return value is shown as the cell output.
model_name = model_checkpoint
hub_repo_id = f"{model_name}-industrialtech"
model.push_to_hub(hub_repo_id)

2025-07-18 11:51:59 - Save model to /tmp/tmp_oizrb20


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/UmarAzam/bert-base-uncased-industrialtech/commit/11a319395c1d71c892c65f1ad24b631c581c0718'