In [1]:
import pandas as pd
from datasets import Dataset

# Load your dataset
df = pd.read_csv("dataset.csv")

# Convert the DataFrame into a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into training and testing sets
dataset = dataset.train_test_split(test_size=0.1)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df.head()

Unnamed: 0,Refextracted_Author_String,Harvard_Author_Format
0,"Smith J, 2020","Smith, J. (2020)"
1,"Johnson R & White K, 2019","Johnson, R. & White, K. (2019)"
2,Lee C et al 2018,"Lee, C. et al. (2018)"
3,"Green L, Brown S, Black P, 2021","Green, L., Brown, S. & Black, P. (2021)"
4,"Miller T. et al, 2022","Miller, T. et al. (2022)"


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import Trainer, TrainingArguments

# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Preprocess the data for the model
def preprocess_function(examples):
    input_encodings = tokenizer(examples['Refextracted_Author_String'], padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['Harvard_Author_Format'], padding="max_length", truncation=True)
    return {"input_ids": input_encodings.input_ids, "labels": labels.input_ids}

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments
# Configure training arguments to log more frequently
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=4,  # Batch size per device during training
    per_device_eval_batch_size=4,   # Batch size for evaluation
#     warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=50,               # Log every X updates steps.
    evaluation_strategy="steps",     # Evaluation is done (and logged) every X steps.
    eval_steps=50,                  # Evaluation and logging are done every X steps.
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Train the model
trainer.train()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████████████████████████████████████████████████████████████| 145/145 [00:00<00:00, 2301.64 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 2125.60 examples/s]
[34m[1mwandb[0m: Currently logged in as: [33mrahulmudhiraj9059[0m ([33mrahul1322[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


In [4]:
trainer.evaluate()

{'eval_loss': 0.16566085815429688,
 'eval_runtime': 13.4118,
 'eval_samples_per_second': 1.268,
 'eval_steps_per_second': 0.373,
 'epoch': 3.0}

In [5]:
def convert_author_string(author_string):
    input_ids = tokenizer.encode(author_string, return_tensors="pt")
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
convert_author_string("Lee, C. et al.,2018")



'Lee, C. et al.,2018'