# Training / Fine-tuning a Text-to-Phon model

We are going to look at model fine-tuning by taking a general purpose language model and fine-tuning it to translate text to IPA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install accelerate -U
!pip install transformers==4.29.2 -U
!pip install datasets -U
!pip install fsspec==2023.9.2

Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0

In [3]:
import transformers
import datasets
from datasets import load_dataset
import accelerate


In [14]:
!ls /content/drive/MyDrive

'6 Distributional Verb Semantics.gdoc'
 AI_by_hand
'Colab Notebooks'
'Corpus Linguistics Assignment Discussion.gdoc'
 my_models
'通过 Chrome 保存'


Upload data to be processed

In [10]:
import pandas as pd
from google.colab import files

# upload files
uploaded = files.upload()

# obtain filenames
for filename in uploaded.keys():
  print('The file "{name}" length is  {length} '.format(name=filename, length=len(uploaded[filename])))
  uploaded_filename = filename

# read xlsx files
try:
  df = pd.read_excel(uploaded_filename)
  print("\nsuccessfully read DataFrame：")
  print(df.head()) # 打印前几行数据以
except Exception as e:
  print(f"error while reading file: {e}")

Saving processed_data_age_1_2_v2.xlsx to processed_data_age_1_2_v2.xlsx
The file "processed_data_age_1_2_v2.xlsx" length is  2080853 

successfully read DataFrame：
   Number      text          child_utterance
0     272      toys              t ʰ ɔ j i j
1     273      toys              t ʰ ɔ j i j
2     274      mama                m ɑ ̃ m ɑ
3     275      mama                  m ʊ m ʌ
4     276  all gone  ʌ ɫ  WORD_BOUNDARY  ɡ ɑ


Divide data

In [11]:
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np # 导入 numpy 用于识别 NaN

# 1. 提取列并创建新的 DataFrame
try:
    # 选择需要的列
    new_df = df[['text', 'child_utterance']].copy()
    print("\n成功提取并重命名列：")
    print(new_df.head())

    # **添加：删除包含空值的行**
    # 检查 'text' 和 'child_utterance' 列中的空值 (包括 NaN, None)
    new_df.dropna(subset=['text', 'child_utterance'], inplace=True)
    # 也可以额外检查是否有一些被表示为 None 的值，虽然 dropna 通常处理 NaN 和 None
    # new_df = new_df[new_df['text'].notna() & new_df['child_utterance'].notna()]
    # 确保所有值都是字符串类型 (虽然 dropna 应该已经移除了 None/NaN)
    new_df['text'] = new_df['text'].astype(str)
    new_df['child_utterance'] = new_df['child_utterance'].astype(str)


    print(f"\n删除空值后剩余的行数: {len(new_df)}")
    print(new_df.head())


    # 2. 将 DataFrame 转换为 Hugging Face Dataset
    # 检查 DataFrame 是否为空，避免将空 DataFrame 转换为 Dataset
    if new_df.empty:
        print("\n警告：处理空值后 DataFrame 为空，无法创建 Dataset。")
        # 您可以在这里添加逻辑来跳过后续步骤或退出
    else:
        ds = Dataset.from_pandas(new_df)
        print("\n成功将 DataFrame 转换为 Hugging Face Dataset：")
        print(ds)

        # 3. 划分数据集 (例如，80% 训练，10% 验证，10% 测试)
        # 第一次划分：训练集 vs 验证集+测试集
        train_testvalid = ds.train_test_split(test_size=0.2)

        # 第二次划分：验证集 vs 测试集
        test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

        # 创建 DatasetDict 对象来存储划分后的数据集
        your_ds_split = DatasetDict({
            'train': train_testvalid['train'],
            'valid': test_valid['train'], # 将 test_valid 中的训练集作为验证集
            'test': test_valid['test']      # 将 test_valid 中的测试集作为测试集
        })

        print("\n成功划分数据集：")
        print(your_ds_split)


except KeyError as e:
    print(f"错误：DataFrame 中不存在列 {e}。请检查 XLSX 文件中的列名。")
except Exception as e:
    print(f"处理数据时发生错误: {e}")


成功提取并重命名列：
       text          child_utterance
0      toys              t ʰ ɔ j i j
1      toys              t ʰ ɔ j i j
2      mama                m ɑ ̃ m ɑ
3      mama                  m ʊ m ʌ
4  all gone  ʌ ɫ  WORD_BOUNDARY  ɡ ɑ

删除空值后剩余的行数: 67796
       text          child_utterance
0      toys              t ʰ ɔ j i j
1      toys              t ʰ ɔ j i j
2      mama                m ɑ ̃ m ɑ
3      mama                  m ʊ m ʌ
4  all gone  ʌ ɫ  WORD_BOUNDARY  ɡ ɑ

成功将 DataFrame 转换为 Hugging Face Dataset：
Dataset({
    features: ['text', 'child_utterance', '__index_level_0__'],
    num_rows: 67796
})

成功划分数据集：
DatasetDict({
    train: Dataset({
        features: ['text', 'child_utterance', '__index_level_0__'],
        num_rows: 54236
    })
    valid: Dataset({
        features: ['text', 'child_utterance', '__index_level_0__'],
        num_rows: 6780
    })
    test: Dataset({
        features: ['text', 'child_utterance', '__index_level_0__'],
        num_rows: 6780
    })
})


In [12]:
print(your_ds_split["test"][:10])

{'text': ['yummy . ', 'tangle foot . ', 'no .', 'cold .', 'apple', 'away', 'in', 'Rolly the roller . ', 'Denise .', 'dog dog'], 'child_utterance': ['ˈ ʌ m i', 'ˈ t æ ̃ n ɡ ə  WORD_BOUNDARY  ˈ f ʊ ˈ f ʊ', 'n o', 'k u ː', 'æ p ʊ', 'ə w a', 'ɪ n', 'ˈ ɔ l e ɪ  WORD_BOUNDARY  ˈ r o l ə', 'd i s', 'd ɔ t ʰ  WORD_BOUNDARY  d ɔ t ʰ'], '__index_level_0__': [47278, 61153, 41203, 6859, 26569, 12876, 34545, 66297, 3895, 21253]}


In [13]:
print(list(your_ds_split))
print(len(your_ds_split["train"]))
print(len(your_ds_split["valid"]))
print(len(your_ds_split["test"]))

['train', 'valid', 'test']
54236
6780
6780


### Fine-Tuning

To fine tune BART from scratch uncomment the next five blocks of code and run. Note though that it will take a good few hours to run for each epoch.

In [14]:
from transformers import AutoTokenizer, BartForConditionalGeneration
device="cuda"
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)



In [15]:
# 获取训练数据集中所有音标标记，去重后形成一个列表。
child_utterances = your_ds_split["train"][:]["child_utterance"]
valid_utterances = [utterance for utterance in child_utterances if isinstance(utterance, str)]
tokenset = list(set(str.split(' '.join(valid_utterances))))

print(len(tokenizer))
print(tokenset)

50265
['ɨ', 'ˌ', 'V', 'I', 'x', 'S', 'ɫ', 'o', '~', 'ð', '̟', 'ɒ', 'ɜ', '*', 'ʍ', 'q', 'θ', 'd', 'u', 'b', '-', 'e', 'ʋ', 'ʂ', 'ʣ', 'ʰ', 'r', 'ʒ', 'ɾ', 'ɵ', 'C', '̜', '(', '͡', 'ʎ', 'ɖ', 'p', '̪', 'h', '˺', 'ˑ', 'ɱ', 'a', 'f', 'ː', '̆', '̰', '̤', 'ɲ', 'ɘ', 'l', '̣', 'ɪ', 'ʉ', '̬', 'v', 'ʌ', 'i', 'w', 'ʊ', '˞', 'ʧ', 'ʤ', 'ʙ', 'ɐ', 'ɚ', '^', 'ɡ', 'χ', 'A', 'ɹ', 'ɤ', 'ˠ', 'ɣ', '̃', '̩', 'y', '_', '̯', 'ə', 'ɑ', 'ʝ', 'ç', 'ŋ', 'n', 'æ', 'ʲ', 'WORD_BOUNDARY', '3', 'k', 'ʔ', '̚', 'ˀ', 'ɦ', 'ˈ', 'ɥ', 'ɛ', ')', '̠', 't', 's', 'm', 'E', 'c', 'g', 'β', 'j', 'ɔ', 'ɰ', 'O', 'ʴ', 'ʃ', 'ɕ', '.', '%', 'ʦ', 'ʷ', 'ɸ', ':', 'z', '̥', 'ɝ']


In [16]:
print(tokenizer)

BartTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)


In [17]:
# 将刚提取的唯一音标标记添加到tokenizer的词汇表中。
tokenizer.add_tokens(tokenset)
print(len(tokenizer))  # 28997
print(tokenizer)

50340
BartTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)


In [48]:
# 调整模型的嵌入层大小，以适应扩展后的词汇表。
model.resize_token_embeddings(len(tokenizer))

Embedding(50351, 768)

In [18]:
# 定义一个函数，用于将原始数据批量转换为模型可用的特征格式。
def convert_examples_to_features(example_batch):
   # 将输入文本编码为token ID，最大长度1024，超长则截断。
   input_texts = [str(text) if not isinstance(text, str) else text for text in example_batch["text"]]
   input_encodings = tokenizer(example_batch["text"], max_length=1024,
                               truncation=True)
   # Ensure "child_utterance" values are strings before tokenizing as target
   child_utterances = [str(utterance) if not isinstance(utterance, str) else utterance for utterance in example_batch["child_utterance"]]
   # 使用特殊上下文管理器将tokenizer设置为目标模式，然后对目标音标序列进行编码。
   with tokenizer.as_target_tokenizer():
       target_encodings = tokenizer(example_batch["child_utterance"], max_length=1024,
                                    truncation=True)

   return {"input_ids": input_encodings["input_ids"],
           "attention_mask": input_encodings["attention_mask"],
           "labels": target_encodings["input_ids"]}

# 将转换函数应用到整个数据集，使用批处理模式以提高效率。
# 对ds数据集中的每个样本应用 convert_examples_to_features 函数
ds_pt = your_ds_split.map(convert_examples_to_features, batched=True)
columns = ["input_ids", "labels", "attention_mask"]
#选择需要的列并将数据集转换为PyTorch张量格式
ds_pt.set_format(type="torch", columns=columns)

Map:   0%|          | 0/54236 [00:00<?, ? examples/s]



Map:   0%|          | 0/6780 [00:00<?, ? examples/s]

Map:   0%|          | 0/6780 [00:00<?, ? examples/s]

In [52]:
# 导入训练所需的Transformers库组件
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# 创建序列到序列的数据整理器，处理不同长度序列的批次
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

'''
 设置训练参数，包括：
输出目录、训练轮数(5)、预热步数(500)
每设备批量大小(1)、权重衰减(0.01)
日志记录步数(10)、评估策略和步数
梯度累积步数(128)，用于模拟更大批量
'''
training_args = TrainingArguments(
   output_dir='text-to-phon', num_train_epochs=1, warmup_steps=500,
   per_device_train_batch_size=1, per_device_eval_batch_size=1,
   weight_decay=0.01, logging_steps=10, push_to_hub=False,
   evaluation_strategy='steps', eval_steps=2500, save_steps=1e6,gradient_accumulation_steps=128)

# 初始化训练器，组合模型、参数、数据集等组件，准备训练
trainer = Trainer(model=model, args=training_args,
                 tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                 train_dataset=ds_pt["train"],
                 eval_dataset=ds_pt["valid"])


In [26]:
!pip install wandb



In [27]:
import wandb
from huggingface_hub import notebook_login

notebook_login()
wandb.init(mode="disabled")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [51]:
import os

dir_path = os.path.dirname('/content/drive/MyDrive/my_models/dialogue-summ-model-bart_age1_2')
if not os.path.exists(dir_path):
    os.makedirs(dir_path)



In [53]:
# hide_output
import torch
torch.cuda.empty_cache()
trainer.train()
# To save your fine-tuned model:
trainer.save_model("/content/drive/MyDrive/my_models/dialogue-summ-model-bart_age1_2")

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


To use a pre-tuned (for one-epoch only so performance isn't great yet) model run the following

In [None]:
!gdown 1VAJMdR2kfHQkJRYQToLT8oHXEYfKJbZP

!gunzip bart-text-to-phon.tar.gz
!tar xf bart-text-to-phon.tar

Downloading...
From (original): https://drive.google.com/uc?id=1VAJMdR2kfHQkJRYQToLT8oHXEYfKJbZP
From (redirected): https://drive.google.com/uc?id=1VAJMdR2kfHQkJRYQToLT8oHXEYfKJbZP&confirm=t&uuid=138b334c-382a-4ef0-9ec1-128619ea7046
To: /content/bart-text-to-phon.tar.gz
100% 518M/518M [00:11<00:00, 45.0MB/s]


In [19]:
from transformers import AutoTokenizer, BartForConditionalGeneration
model_ckpt="/content/drive/MyDrive/my_models/dialogue-summ-model-bart_age1_2"
device="cuda"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)

### Translating new input to IPA

In [29]:
input_utterance = "dog dog"
input_ = tokenizer(input_utterance, max_length=1024, truncation=True, return_tensors="pt")
device="cuda"
input_ids = input_['input_ids']
input_mask = input_['attention_mask']
responses = model.generate(input_ids=input_ids.to(device),
                         attention_mask=input_mask.to(device),
                         num_beams=100,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                         num_return_sequences=1,
                         max_length=1024)
tokenizer.batch_decode(responses, skip_special_tokens=True)

['d a  WORD_BOUNDARY  d a']

Evaluation

In [31]:
!pip install evaluate
!pip install rouge_score

[31mERROR: Could not find a version that satisfies the requirement bleu_score (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bleu_score[0m[31m
[0m

In [32]:
from evaluate import load
import torch
from tqdm import tqdm  # 用于显示进度

# Load BLEU and ROUGE metrics
bleu = load("bleu")
rouge = load("rouge")

def compute_metrics(pred):
    # Decode the predictions and labels
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Replace -100 with the pad_token_id since we can't decode -100
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    # Decode the predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Calculate BLEU score
    bleu_result = bleu.compute(predictions=pred_str, references=[[ref] for ref in labels_str])

    # Calculate ROUGE score
    rouge_result = rouge.compute(predictions=pred_str, references=labels_str, rouge_types=["rouge1", "rouge2", "rougeL"])

    # Return the results as a dictionary
    return {
        "bleu": bleu_result["bleu"],
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"]
    }

# Define a function to evaluate in batches
def evaluate_in_batches(dataset, batch_size=1):
    # Initialize lists to store all predictions and labels
    all_preds = []
    all_labels = []

    # Create a DataLoader to load the dataset in batches
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    # Prepare a progress bar
    progress_bar = tqdm(total=min(len(dataloader), 1000), desc="Evaluating")

    # Loop through each batch, but only process the first 100 batches
    for i, batch in enumerate(dataloader):
        if i >= 1000:
            break  # Exit after processing 100 batches

        # Move batch to the appropriate device (e.g., GPU)
        batch = {k: v.to('cuda') for k, v in batch.items()}

        # Perform a forward pass without gradient calculation
        with torch.no_grad():
            outputs = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=1024
            )

        # Collect predictions and labels
        all_preds.extend(outputs.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

        # Clear CUDA cache after each batch to free up memory
        torch.cuda.empty_cache()

        # Update the progress bar
        progress_bar.update(1)

    progress_bar.close()  # Close the progress bar

    # Convert all predictions and labels to a format suitable for compute_metrics
    dummy_pred = type('Pred', (object,), {'predictions': all_preds, 'label_ids': all_labels})
    return compute_metrics(dummy_pred)

# Set environment variable to reduce fragmentation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Evaluate the first 100 samples of the test set in batches of 1
test_dataset_subset = ds_pt["test"].select(range(1000))  # Select the first 100 samples
test_results = evaluate_in_batches(test_dataset_subset, batch_size=1)

print("Test set evaluation results:", test_results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Evaluating: 100%|██████████| 1000/1000 [01:59<00:00,  8.37it/s]


Test set evaluation results: {'bleu': 0.04059764002186778, 'rouge1': np.float64(0.5485707049786068), 'rouge2': np.float64(0.21218112102036268), 'rougeL': np.float64(0.5442118493836998)}
