In [None]:
import wandb

# Start a Weights & Biases run for this experiment
WANDB_PROJECT = "idiomatic-literal-recognizer"
WANDB_ENTITY = "juliosalim"
wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY)

In [None]:
!pip install transformers datasets torch scikit.learn huggingface-hub tensorflow optuna



In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, pipeline
from huggingface_hub import HfApi
from huggingface_hub import login
import pandas as pd


In [None]:
# Read the dataset, then preview the first rows to confirm it loaded
DATA_CSV = 'gpt-desc.csv'
df = pd.read_csv(DATA_CSV)
df.head()

Unnamed: 0,compound,sentence_type,Meaning,sentence,pic_sentence
0,elbow grease,idiomatic,Hard physical work or effort.,It took a lot of elbow grease to get the old e...,A hardworking mechanic intensely cleaning an o...
1,elbow grease,literal,Actual grease or lubricant applied to an elbow...,The mechanic applied elbow grease to the joint...,A mechanic in a workshop applying a lubricant ...
2,night owl,idiomatic,A person who stays up late at night and is mor...,"It's a constant battle for us, as he is a morn...","A cozy living room at night, with a person sit..."
3,night owl,literal,A species of owl that is active during the night.,The researchers observed a night owl perched o...,"A serene night scene in a dense forest, with a..."
4,heart of gold,idiomatic,A very kind and generous nature.,Even the somewhat seedy failed private eye has...,A modest private investigator's office with a ...


In [None]:
# Map sentence_type to numeric labels: idiomatic -> 1, literal -> 0
label_mapping = {'idiomatic': 1, 'literal': 0}
df['label'] = df['sentence_type'].map(label_mapping)

# Series.map turns every value that is missing from label_mapping into NaN.
# Fail fast here instead of passing NaN labels on to training.
unmapped_types = df.loc[df['label'].isna(), 'sentence_type'].unique()
if len(unmapped_types) > 0:
    raise ValueError(f"Unexpected sentence_type values: {list(unmapped_types)}")

# Every row is mapped at this point, so the column can be a plain integer column.
df['label'] = df['label'].astype('int64')
df.head()

Unnamed: 0,compound,sentence_type,Meaning,sentence,pic_sentence,label
0,elbow grease,idiomatic,Hard physical work or effort.,It took a lot of elbow grease to get the old e...,A hardworking mechanic intensely cleaning an o...,1
1,elbow grease,literal,Actual grease or lubricant applied to an elbow...,The mechanic applied elbow grease to the joint...,A mechanic in a workshop applying a lubricant ...,0
2,night owl,idiomatic,A person who stays up late at night and is mor...,"It's a constant battle for us, as he is a morn...","A cozy living room at night, with a person sit...",1
3,night owl,literal,A species of owl that is active during the night.,The researchers observed a night owl perched o...,"A serene night scene in a dense forest, with a...",0
4,heart of gold,idiomatic,A very kind and generous nature.,Even the somewhat seedy failed private eye has...,A modest private investigator's office with a ...,1


In [None]:
# Build the model input as "<sentence> [SEP] <compound>" for every row
df['text'] = [
    f"{sentence} [SEP] {compound}"
    for sentence, compound in zip(df['sentence'], df['compound'])
]
df['text'] = df['text'].astype(str)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
texts, labels = df['text'], df['label']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [None]:
# Convert to panda DataFranme for HugFace Dataset Compatibility
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})

#Convert to HugFace Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Sanity check: column names first, then a summary of each split
divider = "-------------------------------------"
print(
    train_dataset.column_names,
    divider,
    train_dataset,
    divider,
    test_dataset,
    sep="\n",
)

['text', 'label', '__index_level_0__']
-------------------------------------
Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 112
})
-------------------------------------
Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 28
})


In [None]:
# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"

# Human-readable class names stored in the model config, so saved or published
# checkpoints report 'literal' / 'idiomatic' instead of LABEL_0 / LABEL_1.
# The ids match the training labels (literal -> 0, idiomatic -> 1).
id2label = {0: "literal", 1: "idiomatic"}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Tokenize the dataset
def tokenize_function(examples):
  """Tokenize the 'text' field of a batch, padded to max length and truncated."""
  # NOTE(review): padding="max_length" without an explicit max_length pads to the
  # tokenizer's model maximum; consider a smaller max_length or dynamic padding.
  return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove columns that are not needed for training
train_dataset = train_dataset.remove_columns(['text'])
test_dataset = test_dataset.remove_columns(['text'])

# Convert datasets to PyTorch format
train_dataset.set_format("torch")
test_dataset.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [None]:
import transformers
import accelerate
import torch

# Report the library versions this notebook was run with
for package_label, package in (
    ("Transformers", transformers),
    ("Accelerate", accelerate),
    ("Torch", torch),
):
    print(f"{package_label} version:", package.__version__)

Transformers version: 4.45.2
Accelerate version: 1.3.0
Torch version: 2.5.1


Only for Apple silicon Macs that run into MPS issues

In [None]:
import torch

# Check for the MPS backend; when present, allocate a small tensor on it as a smoke test
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


In [None]:
# Define training arguments
# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword.
training_args = TrainingArguments(
    output_dir='./result-8',          # Output directory
    eval_strategy="epoch",          # Evaluate at the end of each epoch
    learning_rate=4.0355175011641236e-05,             # Learning rate                       #before:2e-5
    per_device_train_batch_size=16,  # Batch size for training             #before:8
    per_device_eval_batch_size=16,   # Batch size for evaluation           #before:8
    num_train_epochs=7,             # Number of epochs                    #before:5
    weight_decay=0.01,              # Weight decay
    logging_dir='./logs',           # Directory for storing logs
    logging_steps=10,
    save_strategy="epoch",          # Must match the evaluation schedule for load_best_model_at_end
    load_best_model_at_end=True     # Restore the best checkpoint after training
)



In [None]:
# Collect the Trainer inputs first, then build the Trainer from them
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

  trainer = Trainer(


Save model

In [None]:
# "model.h5" is saved in wandb.run.dir & will be uploaded at the end of training
import os

# Run this cell after trainer.train() so the fine-tuned weights are what get saved.
# Hugging Face models have no Keras-style .save(); save_pretrained() writes the
# weights and config into a directory.
model_dir = os.path.join(wandb.run.dir, "model.r8")
model.save_pretrained(model_dir)

# Upload the saved model files, keeping their paths relative to the run directory:
wandb.save(os.path.join(model_dir, "*"), base_path=wandb.run.dir)

# Save all files that currently exist containing the substring "ckpt":
wandb.save('../logs/*ckpt*')

# Save any files starting with "checkpoint" as they're written to:
wandb.save(os.path.join(wandb.run.dir, "checkpoint*"))

AttributeError: 'BertForSequenceClassification' object has no attribute 'save'

In [None]:
# Run fine-tuning and show the returned training summary
train_output = trainer.train()
train_output



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,No log,0.626445
2,0.670200,0.538798
3,0.414000,0.423649
4,0.414000,0.438608
5,0.196400,0.475707
6,0.117000,0.493484
7,0.117000,0.503847


TrainOutput(global_step=49, training_loss=0.3012101917850728, metrics={'train_runtime': 171.4246, 'train_samples_per_second': 4.573, 'train_steps_per_second': 0.286, 'total_flos': 206279067402240.0, 'train_loss': 0.3012101917850728, 'epoch': 7.0})

In [None]:
# Score the model on the test split and print the returned metrics
test_result = trainer.evaluate(test_dataset)
print(f"Test Results: {test_result}")

Test Results: {'eval_loss': 0.42364856600761414, 'eval_runtime': 0.8865, 'eval_samples_per_second': 31.584, 'eval_steps_per_second': 2.256, 'epoch': 7.0}


In [None]:
# Predicted class per test example, alongside the ground-truth labels
from sklearn.metrics import classification_report

predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(axis=-1)  # highest-scoring class per row
true_labels = test_labels

In [None]:
# Per-class precision / recall / F1 on the test split
report = classification_report(y_true=true_labels, y_pred=predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.73      0.81        15
           1       0.75      0.92      0.83        13

    accuracy                           0.82        28
   macro avg       0.83      0.83      0.82        28
weighted avg       0.84      0.82      0.82        28



Model Testing

In [None]:
# Two demo sentences using the same compound: the first literal, the second idiomatic
demo_sentences = (
    "The artisan carefully crafted a replica of Pandora's box from ancient texts",
    "The project has become a Pandora's box, revealing unexpected challenges",
)
test_inputs = [
    {"sentence": sentence, "compound": "Pandora's box"}
    for sentence in demo_sentences
]

# Join sentence and compound with '[SEP]', the same format used for training
test_texts = ["{sentence} [SEP] {compound}".format(**item) for item in test_inputs]


In [None]:
# Load the fine-tuned model and tokenizer.
# Training used load_best_model_at_end=True, so the evaluated model is the best
# checkpoint, which is not necessarily the last one (checkpoint-49). Prefer the
# checkpoint the Trainer recorded as best; fall back to the last checkpoint when
# no trainer exists in this session (e.g. after a kernel restart).
_active_trainer = globals().get("trainer")
_best_checkpoint = (
    _active_trainer.state.best_model_checkpoint if _active_trainer is not None else None
)
model_path = _best_checkpoint or 'result-8/checkpoint-49'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
import torch

# Tokenize the test inputs
inputs = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True)

# Perform inference: eval mode disables dropout, and no_grad avoids building an
# autograd graph that is never used
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=1)

# Map predicted class ids back to label names. A separate name is used so the
# training-time `label_mapping` (name -> id) is not overwritten by its inverse.
prediction_names = {0: 'literal', 1: 'idiomatic'}
predicted_labels = [prediction_names[label] for label in predictions.tolist()]

In [None]:
# Show each input next to its predicted label, with a blank line between entries
for sample, predicted in zip(test_inputs, predicted_labels):
    print(
        f"Sentence: {sample['sentence']}",
        f"Compound: {sample['compound']}",
        f"Predicted Label: {predicted}",
        "",
        sep="\n",
    )

Sentence: The artisan carefully crafted a replica of Pandora's box from ancient texts
Compound: Pandora's box
Predicted Label: literal

Sentence: The project has become a Pandora's box, revealing unexpected challenges
Compound: Pandora's box
Predicted Label: idiomatic



In [None]:
# Reload the model and tokenizer that will be published.
# Training used load_best_model_at_end=True, so the evaluated model is the best
# checkpoint, not necessarily the last one (checkpoint-49). Publish the checkpoint
# the Trainer recorded as best; fall back to the last checkpoint when no trainer
# exists in this session.
_active_trainer = globals().get("trainer")
_best_checkpoint = (
    _active_trainer.state.best_model_checkpoint if _active_trainer is not None else None
)
publish_checkpoint = _best_checkpoint or "result-8/checkpoint-49"
model = AutoModelForSequenceClassification.from_pretrained(publish_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(publish_checkpoint)
print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.


Pushing the model to Hugging Face


In [None]:
# Authenticate with the Hugging Face Hub
login()

# Upload the model, then the tokenizer, to the same Hub repository
HUB_REPO_ID = "jlsalim/bert-uncased-idiomatic-literal-recognizer"
model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jlsalim/bert-uncased-idiomatic-literal-recognizer/commit/8f71ff89fa09df24542b415b3378ebfe7ca93e05', commit_message='Upload tokenizer', commit_description='', oid='8f71ff89fa09df24542b415b3378ebfe7ca93e05', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jlsalim/bert-uncased-idiomatic-literal-recognizer', endpoint='https://huggingface.co', repo_type='model', repo_id='jlsalim/bert-uncased-idiomatic-literal-recognizer'), pr_revision=None, pr_num=None)

In [None]:
# Use the pipeline with the model published on the Hugging Face Hub
classifier = pipeline("text-classification", model ="jlsalim/bert-uncased-idiomatic-literal-recognizer", tokenizer="jlsalim/bert-uncased-idiomatic-literal-recognizer")

# Test prediction. The input must follow the training format
# "<sentence> [SEP] <compound>": the text after [SEP] is the compound being
# judged, not a label name.
print(classifier(["The project has become a Pandora's box, revealing unexpected challenges [SEP] Pandora's box"]))

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.7600845694541931}]
