In [5]:
from transformers import MPNetForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# Load the MPNet checkpoint with a 3-way sequence-classification head.
# The classifier weights are not part of the checkpoint and are newly initialised
# (see the warning printed below), so the model must be fine-tuned before it is used for predictions.
model = MPNetForSequenceClassification.from_pretrained("sentence-transformers/all-mpnet-base-v2", num_labels=3)  # Adjust num_labels based on your task
#model = SentenceTransformer('local_models/all-mpnet-base-v2')

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import MPNetTokenizer # Tokenizer class that matches the chosen MPNet model
from datasets import load_dataset, Dataset # Used to convert the pandas DataFrame into a Hugging Face Dataset that can be mapped and trained on
import pandas as pd # Used to load the fine-tuning dataset from a local CSV file

# Load the tokenizer that splits each text into sub-word tokens and maps them to vocabulary ids
tokenizer = MPNetTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

# Load your dataset, which has 'premise', 'hypothesis' and 'label' columns.
# Here the premise is a paragraph taken from project reports over time (annual reports and other reports).
#
data = pd.read_csv("sample_data.csv")  # The CSV must provide the 'premise', 'hypothesis' and 'label' columns used below

# Convert to Hugging Face dataset format
dataset = Dataset.from_pandas(data)

In [8]:
#print(data.head(5))

In [9]:
# To be aware of any truncation of the input paragraphs, we check the token
# lengths of the 'premise' and 'hypothesis' columns and identify potential candidates
# for truncation. The input sequences inspected here are the premise and the hypothesis.
# Doing this before tokenization shows which premise/hypothesis pairs are likely to be truncated.

def check_length(example):
  """Count the tokens of every premise and hypothesis in a batch.

  Intended for ``dataset.map(..., batched=True)``: ``example`` maps each
  column name to a list of values.

  Returns:
    A dict with two lists, ``'premise_len'`` and ``'hypothesis_len'``,
    holding ``len(tokenizer.tokenize(text))`` for each text, in input order.
  """
  def _token_counts(texts):
    # One tokenizer pass per text; only the number of tokens is kept.
    return [len(tokenizer.tokenize(text)) for text in texts]

  return {
    'premise_len': _token_counts(example['premise']),
    'hypothesis_len': _token_counts(example['hypothesis']),
  }

# Compute per-example token counts for both text columns, then print each list
lengths = dataset.map(check_length, batched=True)
for length_column in ('premise_len', 'hypothesis_len'):
    print(lengths[length_column])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

[490, 512, 590, 572, 478, 556, 201, 573, 454, 611, 351, 136, 79, 735, 709, 155, 312, 724, 650, 640, 580, 461, 613, 690, 540, 546, 626, 760, 541, 593, 619, 58, 53, 391, 717, 188, 610, 879, 729, 731, 732, 712, 883, 694, 827, 821, 894, 906, 905, 839, 841, 29, 116, 839, 577, 121, 742, 762, 744, 671, 616, 304, 612, 585, 371, 608, 621, 329, 469, 695, 387, 603, 515, 499, 558, 651, 659, 608, 640, 253, 605, 579, 584, 654, 550, 531, 502, 645, 669, 309, 384, 379, 384, 780, 621, 780, 119, 225, 235, 960]
[48, 34, 46, 17, 31, 17, 14, 32, 24, 34, 34, 45, 27, 25, 18, 31, 28, 15, 17, 34, 34, 40, 70, 28, 11, 32, 22, 51, 43, 61, 70, 26, 37, 46, 37, 61, 67, 28, 69, 64, 33, 36, 34, 48, 35, 67, 34, 33, 63, 8, 54, 15, 34, 31, 77, 32, 28, 43, 13, 49, 29, 88, 48, 26, 35, 16, 32, 42, 41, 15, 28, 115, 35, 20, 41, 31, 21, 70, 28, 62, 42, 41, 40, 42, 72, 38, 29, 42, 33, 15, 30, 34, 37, 18, 33, 28, 30, 12, 70, 29]


In [10]:
# Adjust sequence length (optional): max_length sets how many tokens are kept per premise/hypothesis pair.
# Lowering it shortens the inputs but truncates MORE tokens, so only reduce it if the extra truncation is acceptable.
###########################################################################################
# Return overflowing tokens: if you want to see the tokens that are truncated and removed,
# you can set the return_overflowing_tokens=True argument in the tokenizer call.
# This way, even the tokens that are discarded due to truncation are returned and can be inspected.
#############################################################################################
# Change truncation strategy: instead of using 'longest_first' for truncation
# (which discards tokens without providing them), you can switch to a truncation
# strategy that better suits your needs, such as truncating one side only or customizing the behavior:
#    'only_first': This truncates tokens from the first sequence (the premise) only.
#    'only_second': This truncates tokens from the second sequence (the hypothesis) only.


def preprocess_function(examples):
    """Tokenize a batch of premise/hypothesis pairs into fixed-length inputs.

    Args:
        examples: batch dict with ``'premise'`` and ``'hypothesis'`` lists
            (as supplied by ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the pair encoding configured below.
    """
    tokenizer_options = dict(
        # Only the premise is shortened: the length check showed that the
        # hypotheses are short while many premises exceed the limit.
        truncation='only_first',
        # Every pair is padded up to max_length.
        padding="max_length",
        max_length=512,
        # Truncated tokens are dropped rather than returned.
        return_overflowing_tokens=False,
        # Also return the mask that flags special tokens; note that it does not
        # identify which tokens were removed by truncation.
        return_special_tokens_mask=True,
    )
    encoded = tokenizer(examples['premise'], examples['hypothesis'], **tokenizer_options)
    return encoded

In [11]:
# This part is to tokenize both the 'premise' and the 'hypothesis'
#def preprocess_function(examples):
#    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding="max_length")

# Tokenize the whole dataset in batches; the tokenizer outputs are added as new
# columns next to the original 'premise', 'hypothesis' and 'label' columns.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
# Inspect the first three tokenized samples.
# The 'overflowing_tokens' branch only prints when the tokenized rows actually
# carry that key; if the key is absent the branch is skipped.
for i in range(3):
    sample = tokenized_datasets[i]  # fetch the row once and reuse it below
    print(f"Sample {i}:")
    token_strings = tokenizer.convert_ids_to_tokens(sample['input_ids'])
    print("Tokens:", token_strings)
    if 'overflowing_tokens' in sample:
        overflow_strings = tokenizer.convert_ids_to_tokens(sample['overflowing_tokens'])
        print("Overflowing tokens:", overflow_strings)


Sample 0:
Tokens: ['<s>', 'the', 'international', 'potato', 'industry', 'information', 'in', 'order', 'to', 'educate', 'us', 'and', 'canadian', 'consumers', '.', 'both', 'sides', 'of', 'the', 'gm', '##o', 'issue', 'will', 'raise', 'and', 'spend', 'money', 'trying', 'to', 'influence', 'societal', 'acceptance', 'of', 'gm', '##o', 'food', '.', 'the', 'leading', 'anti', '-', 'gm', '##o', 'organization', 'is', 'green', '##pe', '##ace', '(', 'website', 'http', ':', '/', '/', 'www', '.', 'green', '##pe', '##ace', '.', 'org', ')', '.', 'if', 'the', 'societal', 'acceptance', 'pattern', 'for', 'gm', '##os', 'is', 'similar', 'to', 'that', 'of', 'microwave', 'oven', '##s', ',', 'gm', '##o', 'potatoes', 'will', 'eventually', 'be', 'accepted', '.', 'gm', '##o', 'acceptance', 'is', 'slowed', 'by', 'concerns', 'about', 'health', 'and', 'the', 'environment', ',', 'which', 'are', 'similar', 'forces', 'to', 'the', 'health', 'and', 'cost', 'concerns', 'that', 'slowed', 'microwave', 'oven', 'acceptance', '

In [13]:
#tokenized_datasets[0]['premise']

In [14]:
# Split the tokenized dataset into training (80%) and validation (20%) sets, without stratification.
# A fixed seed makes the shuffle, and therefore the split, identical on every run,
# so metrics stay comparable after a kernel restart.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Check the train and validation datasets to ensure they contain the labels
print(train_dataset.features)
print(eval_dataset.features)

{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'special_tokens_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'special_tokens_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output locations and checkpoint retention
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=1,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluation schedule
    eval_strategy="epoch",
    # Run on CPU; use_cpu replaces the deprecated no_cuda=True flag
    use_cpu=True,
)


In [16]:
#from transformers import DataCollatorForSequenceClassification, Trainer

#data_collator = DataCollatorForSequenceClassification(tokenizer=tokenizer, model=model)

from transformers import DataCollatorWithPadding, Trainer

# Collator that batches the tokenized examples using the tokenizer's padding rules
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer wired to the train/validation split rather than to the full tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [17]:
# NOTE(review): this call raised the AttributeError recorded below
# ('AdamW' object has no attribute 'train'). That looks like an incompatibility between
# the installed transformers and accelerate versions rather than a problem in this
# notebook's code. TODO confirm by upgrading both packages before working around it.
trainer.train()

AttributeError: 'AdamW' object has no attribute 'train'

In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` is the
            array of logits, or a tuple whose first element is the logits.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = eval_pred
    # When the model returns several outputs the predictions arrive as a tuple;
    # the logits are its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = logits.argmax(axis=-1)
    # zero_division=0 keeps the same numeric result as the default but avoids an
    # undefined-metric warning whenever a class is never predicted, which is
    # likely on a small validation split.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [None]:
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose training step does not call ``self.optimizer.train()``.

    This works around the ``'AdamW' object has no attribute 'train'`` error
    raised by the stock ``Trainer`` in this environment.
    """

    def training_step(self, model, inputs, num_items_in_batch=None):
        """Run one forward/backward pass and return the detached loss.

        Args:
            model: the model being trained.
            inputs: the batch dict for this step.
            num_items_in_batch: accepted and ignored, so the override also works
                with Trainer versions whose training loop passes this extra
                argument (without it those versions raise a TypeError).

        Returns:
            The loss tensor for this step, detached from the graph.
        """
        # The self.optimizer.train() call is deliberately left out (see class docstring).
        model.train()  # Ensure model is in training mode
        inputs = self._prepare_inputs(inputs)

        # Forward pass, inside the Trainer's own loss context so that any
        # configured autocast/mixed-precision settings still apply.
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        # With several GPUs the loss comes back as one value per device.
        if self.args.n_gpu > 1:
            loss = loss.mean()

        # Backward pass through the accelerator rather than a raw loss.backward(),
        # so gradient scaling and accumulation are handled as in the stock Trainer.
        self.accelerator.backward(loss)

        return loss.detach()

# Build the trainer from the CustomTrainer subclass (instead of the stock Trainer),
# this time with the tokenizer and the metric function attached, and start fine-tuning.
custom_trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

custom_trainer.train()


Epoch,Training Loss,Validation Loss


In [26]:
import torch
from torch.utils.data import DataLoader
from transformers import MPNetForSequenceClassification, MPNetTokenizer, AdamW
from datasets import load_dataset

# Load dataset
#dataset = load_dataset('your_dataset_name')  # Replace with your actual dataset

data = pd.read_csv("sample_data.csv")  # The CSV must provide 'premise', 'hypothesis' and 'label' columns

# Convert to Hugging Face dataset format
dataset = Dataset.from_pandas(data)

# Load tokenizer
tokenizer = MPNetTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of premise/hypothesis pairs, padded/truncated to 512 tokens.

    No ``return_tensors`` is requested here: ``Dataset.map`` stores its results
    as plain lists, so tensor conversion is done once, via ``set_format``, below.
    """
    return tokenizer(
        examples['premise'],
        examples['hypothesis'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Add the 'labels' column expected by the model
def convert_to_tensors(examples):
    """Return the model inputs for a batch, with 'label' exposed as 'labels'.

    The values are built as tensors, but ``Dataset.map`` writes them back to
    the dataset's storage, so reading the dataset afterwards still yields
    Python lists until a torch format is set (done right after the map call).
    """
    return {
        'input_ids': torch.tensor(examples['input_ids']),
        'attention_mask': torch.tensor(examples['attention_mask']),
        'labels': torch.tensor(examples['label'])  # Adjust this based on your dataset
    }

tokenized_dataset = tokenized_dataset.map(convert_to_tensors, batched=True)

# Make the dataset return torch tensors, and only the columns the model consumes.
# Without this the DataLoader collates lists of Python ints into transposed
# (seq_len, batch) structures instead of (batch, seq_len) tensors.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Prepare DataLoader
train_loader = DataLoader(tokenized_dataset, batch_size=16, shuffle=True)

# Load model.
# A sequence-classification model is required here: the loop below passes `labels`
# and reads `outputs.loss`, which a SentenceTransformer encoder does not provide.
model = MPNetForSequenceClassification.from_pretrained(
    'sentence-transformers/all-mpnet-base-v2',
    num_labels=3,  # Adjust num_labels based on your classification task
)

# Set up optimizer (torch's AdamW; the AdamW exported by transformers is deprecated)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to appropriate device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Custom training loop
model.train()  # Set the model to training mode

for epoch in range(3):  # Number of epochs
    for batch in train_loader:
        optimizer.zero_grad()  # Clear previous gradients

        # Each batch already holds (batch, ...) tensors; just move them to the device
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the model computes the loss itself because `labels` is supplied
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        print(f"Epoch {epoch}, Loss: {loss.item()}")  # Print loss for monitoring

# Save the model
model.save_pretrained('new_local_model')  # Replace with your save directory
tokenizer.save_pretrained('new_local_model_tokenizer')  # Save tokenizer too


OSError: local_model/all-mpnet-base-v2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`