In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
#dataset value in utf-8 formate
train_set = pd.read_csv('tweet_qa-train.csv',encoding = "utf-8")
test_set = pd.read_csv('tweet_qa-test.csv',encoding = "utf-8")
validation_set = pd.read_csv('tweet_qa-validation.csv',encoding = "utf-8")

In [None]:
#lower case all letters
columns_to_lowercase = ['text', 'context', 'gold_label_str']

def convert_columns_to_lowercase(dataset, columns):
    for column in columns:
        if column in dataset.columns:
            dataset[column] = dataset[column].str.lower()
    return dataset

train_set = convert_columns_to_lowercase(train_set, columns_to_lowercase)
test_set = convert_columns_to_lowercase(test_set, columns_to_lowercase)
validation_set = convert_columns_to_lowercase(validation_set, columns_to_lowercase)

In [None]:
#rename columns according to column_mapping, for easy understanding

# Define a function to rename columns in a dataset
def rename_columns(dataset, column_mapping):
    return dataset.rename(columns=column_mapping)

column_mapping = {
    'text': 'text',
    'context': 'question',
    'gold_label_str': 'answer'
}

train_set = rename_columns(train_set, column_mapping)
test_set = rename_columns(test_set, column_mapping)
validation_set = rename_columns(validation_set, column_mapping)

In [None]:
#Clean up columns: remove duplicates, drop missing rows and ensure answers are found in text
def dataset_cleaning(dataset, cleaning_columns):
    dataset = dataset.drop_duplicates(subset=cleaning_columns)
    dataset = dataset.dropna(subset=cleaning_columns)
    
    # Ensure answers are found in the text
    dataset = dataset[dataset.apply(
        lambda row: row['answer'] in row['text'], axis=1
    )]
    return dataset

#Columns for cleaning
cleaning_columns = ['text', 'question', 'answer']

train_set_cleaned = dataset_cleaning(train_set, cleaning_columns)
test_set_cleaned = dataset_cleaning(test_set, cleaning_columns)
validation_set_cleaned = dataset_cleaning(validation_set, cleaning_columns)

In [None]:
# Function to normalize column lengths to the average length of each column
def normalization_question_text(dataset, columns, padding_char=" "):
    def adjust_length(value, target_length):
        # Truncate if longer than target_length
        if len(value) > target_length:
            return value[:target_length]
        # Pad if shorter than target_length
        else:
            return value + padding_char * (target_length - len(value))
    
    # Process each column
    for column in columns:
        if column in dataset.columns:
            avg_length = int(dataset[column].apply(len).mean())
            dataset[column] = dataset[column].apply(
                lambda x: adjust_length(x, avg_length) if isinstance(x, str) else x
            )
    
    return dataset

train_set_normalized = normalization_question_text(train_set_cleaned, ['context', 'text'])
test_set_normalized = normalization_question_text(test_set_cleaned, ['context', 'text'])
validation_set_normalized = normalization_question_text(validation_set_cleaned, ['context', 'text'])

In [1]:
# 转换为Huggingface Dataset
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)
test_dataset = Dataset.from_pandas(test_data)

# 检查数据集内容
print(train_dataset[0])
print(validation_dataset[0])
print(test_dataset[0])

{'text': '"So much of The Post is Ben," Mrs. Graham said in 1994, three years after Bradlee retired as editor. "He created it as we know it today."— Ed O\'Keefe (@user) October 21, 2014', 'context': 'what did bradlee retire as?', 'gold_label_str': 'editor'}
{'text': '"@Reid2962: @realDonaldTrump@FoxNews I expected better from @megynkelly, wondering what is her hidden agenda.— Donald J. Trump (@realDonaldTrump) August 7, 2015', 'context': 'who do you expect better from?', 'gold_label_str': '@megynkelly'}
{'text': '5 years in 5 seconds. Darren Booth (@darbooth) January 25, 2013', 'context': 'what site does the link take you to?', 'gold_label_str': 'vine'}


In [2]:
def preprocess_function(examples):
    inputs = [f"question: {text} context: {context}" for text, context in zip(examples['text'], examples['context'])]
    targets = examples['gold_label_str']
    return {'input_text': inputs, 'target_text': targets}

# 应用预处理
train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/9489 [00:00<?, ? examples/s]

Map:   0%|          | 0/1086 [00:00<?, ? examples/s]

Map:   0%|          | 0/1203 [00:00<?, ? examples/s]

In [4]:
from transformers import T5Tokenizer

# 加载T5的tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    model_inputs = tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize数据集
train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9489 [00:00<?, ? examples/s]

Map:   0%|          | 0/1086 [00:00<?, ? examples/s]

Map:   0%|          | 0/1203 [00:00<?, ? examples/s]

In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# 加载T5模型
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# 设置训练参数
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",     # 每个 epoch 进行验证
    learning_rate=2e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",           # 每个 epoch 保存模型
    load_best_model_at_end=True,
#     fp16=True  
)





In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,         # 使用训练集
    eval_dataset=validation_dataset,     # 使用验证集
)

# 开始训练
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1342,0.064232
2,0.0665,0.060553


In [None]:
# 评估模型在测试集上的性能
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

In [None]:
from tqdm import tqdm
# 定义生成答案的函数
def generate_answer(batch):
    # 对输入进行tokenization
    inputs = tokenizer(batch['input_text'], return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # 使用模型生成输出
    outputs = model.generate(inputs.input_ids, max_length=128)
    
    # 解码token ids为文本，并存入生成答案字段
    generated_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    # 将解码后的答案替代之前的数字存入batch
    batch['generated_answer'] = generated_answers
    return batch

# 对测试集生成答案并替代
tqdm.pandas()  # 添加进度条显示
test_results = test_dataset.map(generate_answer, batched=True, batch_size=8)

# 保存结果
test_results.to_csv("test_results_with_generated_answers.csv", index=False)


In [None]:
# ! pip install evaluate
# ! pip install rouge_score

In [None]:
import evaluate

# 加载 BLEU 和 ROUGE 评估指标
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# 从 test_results 中提取参考答案和生成答案
references = [[item['gold_label_str']] for item in test_results]  # 每个参考答案需要嵌套在一个列表中
predictions = [item['generated_answer'] for item in test_results]  # 生成答案直接使用未分词的句子

# 确保预测和参考数据非空
if not references or not predictions:
    raise ValueError("References or predictions are empty. Please check the test_results data.")

# 计算 BLEU
bleu_score = bleu_metric.compute(
    predictions=predictions,  # 直接传入预测文本
    references=references     # 每个参考答案是一个嵌套列表
)
print(f"BLEU Score: {bleu_score['bleu']:.2f}")

# 计算 ROUGE
rouge_score = rouge_metric.compute(
    predictions=predictions,  # 直接使用预测文本
    references=[r[0] for r in references]  # 从嵌套列表中取出第一个参考答案
)
print(f"ROUGE-1: {rouge_score['rouge1']:.2f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.2f}")

In [None]:
# 将生成结果转换为 pandas DataFrame
test_results_df = test_results.to_pandas()

# 保存为 CSV 文件
test_results_df.to_csv("generated_answers.csv", index=False)

In [None]:
import torch

# 检查是否有可用的 GPU
if torch.cuda.is_available():
    print("GPU 可用！")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"显存: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
else:
    print("GPU 不可用，使用 CPU。")

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

try:
    print("尝试加载 T5-base 模型...")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    print("模型加载成功！")
except Exception as e:
    print("模型加载失败，错误信息：", e)