In [2]:
# Mount Google Drive at /content/drive; later cells read the dataset from and
# save the model to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
import numpy as np
import os
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import precision_score, recall_score, f1_score

# Path configuration
DRIVE_DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data'  # storage directory on Google Drive

# Dataset files: split name -> (npz file name, array keys to read from that file)
file_paths = dict(
    train=('imdb_train.npz', ['x', 'y']),
    test=('imdb_test.npz', ['x', 'y']),
    val=('imdb_val.npz', ['x', 'y']),
)

# Locate the dataset directory, uploading the files to Google Drive when missing
def check_data_source():
    """Return the directory holding the dataset files, uploading them if needed.

    Checks whether every file listed in ``file_paths`` exists under
    ``DRIVE_DATA_PATH``. If so, that directory is returned right away.
    Otherwise one Colab upload dialog is opened per expected file and every
    uploaded file is moved into ``DRIVE_DATA_PATH``.

    Returns:
        str: ``DRIVE_DATA_PATH``.
    """
    import shutil  # local import: only the upload path needs it

    # Check whether Google Drive already has every expected file
    drive_files_exist = all(
        os.path.exists(os.path.join(DRIVE_DATA_PATH, file_name))
        for file_name, _keys in file_paths.values()
    )

    if drive_files_exist:
        print("Uso de datos persistentes en Google Drive")
        return DRIVE_DATA_PATH

    # No data found: ask the user to upload it
    print("""
    ¡Archivo de datos no encontrado!
    Por favor seleccione una fuente de datos:
    1. Subir desde local a Google Drive (recomendado)
    """)
    from google.colab import files

    # The target directory may not exist yet (e.g. first run on a fresh Drive)
    os.makedirs(DRIVE_DATA_PATH, exist_ok=True)

    for file_info in file_paths.values():
        print(f"Subiendo {file_info[0]}...")
        uploaded = files.upload()
        for fn in uploaded.keys():
            # shutil.move rather than os.rename: the uploaded file is referenced
            # by a relative path in the working directory, which on Colab is
            # typically a different filesystem than the Drive mount, and
            # os.rename cannot cross filesystem boundaries.
            shutil.move(fn, os.path.join(DRIVE_DATA_PATH, fn))
    return DRIVE_DATA_PATH

# Resolve the data directory (this may open upload dialogs if files are missing)
DATA_SOURCE = check_data_source()

# Model / training hyperparameters
max_length = 200  # passed to the tokenizer as max_length (padding/truncation target)
batch_size = 16
learning_rate = 2e-5
number_of_epochs = 10
bert_weight_name = 'bert-base-uncased'

# Initialize the tokenizer from the same pretrained weights name used for the model
tokenizer = BertTokenizer.from_pretrained(bert_weight_name)

# Data loading
def get_raw_dataset():
    """Load the train, test and validation arrays from ``DATA_SOURCE``.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test, x_val, y_val)`` -- for each
        split, in that order, the arrays stored under the keys configured in
        ``file_paths``.

    Raises:
        FileNotFoundError: If one of the expected ``.npz`` files is missing.
        Exception: Any other loading error is printed and re-raised unchanged.
    """
    def load_data(file_name, keys):
        """Read the arrays named by ``keys`` from one ``.npz`` file."""
        path = os.path.join(DATA_SOURCE, file_name)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Archivos de datos {path} No existe, verifique si la ruta y el nombre del archivo son correctos")
        # np.load on an .npz keeps the archive open; the context manager closes
        # it once the requested arrays have been materialized into the tuple.
        with np.load(path) as data:
            return tuple(data[key] for key in keys)

    try:
        return (
            *load_data(*file_paths['train']),
            *load_data(*file_paths['test']),
            *load_data(*file_paths['val'])
        )
    except Exception as e:
        print(f"Error en la carga de datos：{e}")
        raise

# Data preprocessing
def convert_example_to_feature(review):
    """Tokenize one review into fixed-length BERT inputs.

    The text is wrapped in special tokens, truncated to ``max_length`` and
    padded up to ``max_length``.

    Args:
        review: Text of a single review.

    Returns:
        The tokenizer's encoding; callers in this file read its
        ``input_ids``, ``token_type_ids`` and ``attention_mask`` entries.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_length,
        # `padding='max_length'` is the supported replacement for the
        # deprecated `pad_to_max_length=True` flag.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack the three encoded inputs into a feature dict paired with the label.

    Args:
        input_ids: Token ids of one example.
        attention_masks: Attention mask of the same example.
        token_type_ids: Token type ids of the same example.
        label: Target label, passed through untouched.

    Returns:
        tuple: ``(features, label)`` where ``features`` maps ``"input_ids"``,
        ``"token_type_ids"`` and ``"attention_mask"`` to the given values.
    """
    features = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }
    return features, label

def encode_examples(x, y, limit=-1):
    """Tokenize (review, label) pairs and wrap them in a ``tf.data.Dataset``.

    Args:
        x: Sequence of review texts.
        y: Sequence of labels aligned with ``x``.
        limit: Maximum number of examples to encode. ``None`` or any negative
            value (including the default ``-1``) means "use every example".

    Returns:
        tf.data.Dataset: Elements are ``(features, label)`` pairs as produced
        by ``map_example_to_dict``; each label is wrapped in a one-element list.
    """
    # A negative limit is a "no limit" sentinel. Using it directly as a slice
    # bound (x[:-1]) would silently drop the last example of every split.
    if limit is None or limit < 0:
        limit = None

    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for review, label in zip(x[:limit], y[:limit]):
        bert_input = convert_example_to_feature(review)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])

    # Tuple order must match map_example_to_dict's positional parameters.
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    ).map(map_example_to_dict)

# Dataset construction
def get_dataset():
    """Build the batched train, test and validation datasets.

    The raw arrays come from ``get_raw_dataset``; each split is encoded with
    ``encode_examples`` and batched with ``batch_size``. Only the training
    split is shuffled (buffer size 10000) before batching.

    Returns:
        tuple: ``(train, test, val)`` datasets, in that order.
    """
    train_x, train_y, test_x, test_y, val_x, val_y = get_raw_dataset()

    train_ds = encode_examples(train_x, train_y)
    train_ds = train_ds.shuffle(10000).batch(batch_size)

    test_ds = encode_examples(test_x, test_y).batch(batch_size)
    val_ds = encode_examples(val_x, val_y).batch(batch_size)

    return train_ds, test_ds, val_ds

# Model construction
def build_model():
    """Load the pretrained BERT sequence classifier and compile it.

    The model is compiled with Adam at ``learning_rate``, sparse categorical
    cross-entropy computed from logits, and a sparse categorical accuracy
    metric named ``'accuracy'``.

    Returns:
        The compiled ``TFBertForSequenceClassification`` model.
    """
    classifier = TFBertForSequenceClassification.from_pretrained(bert_weight_name)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    return classifier

Uso de datos persistentes en Google Drive


In [None]:
# Part 1: initialize the model
print("\nInicializar el modelo...")
model = build_model()  # pretrained BERT classifier, already compiled
model.summary()


初始化模型...


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_113 (Dropout)       multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Part 2: load the datasets
print("\nCarga de conjuntos de datos ...")
# get_dataset() returns the splits in (train, test, val) order
train_data, test_data, val_data = get_dataset()


Carga de conjuntos de datos ...


In [None]:
# Part 3: train the model
print("\nEmpezar a entrenar...")
# Fit on the training split and validate on the validation split each epoch;
# the return value of fit() is kept in `history`.
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=number_of_epochs
)


开始训练...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Part 4: save the model (optional)
model_save_path = '/content/drive/MyDrive/Colab Notebooks/model/saved_model'
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)  # make sure the parent directory exists
model.save_pretrained(model_save_path)
print(f"Modelo guardado en {model_save_path}")

模型已保存到 /content/drive/MyDrive/Colab Notebooks/model/saved_model


In [None]:
# Load the model (from the saved files, if needed)
from transformers import TFBertForSequenceClassification
# Same location the save cell writes to
model_load_path = '/content/drive/MyDrive/Colab Notebooks/model/saved_model'
model = TFBertForSequenceClassification.from_pretrained(model_load_path)
print("Modelo cargado desde archivo")

Some layers from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/model/saved_model were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/model/saved_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.

Modelo cargado desde archivo


In [None]:
# Recompile the model: the freshly loaded model gets the same optimizer, loss
# and metric configuration that build_model() uses.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Final evaluation on the test split.
print("\nEvaluación final：")
# NOTE: this rebinds `loss` from the loss object created in the recompile cell
# to the value returned by evaluate(); `acc` is the compiled accuracy metric.
loss, acc = model.evaluate(test_data, verbose=0)

print(f"Loss: {loss:.4f}")
print(f"Accuracy: {acc:.4f}")


Evaluación final：
Loss: 0.4820
Accuracy: 0.9064


In [None]:
# Part 7: evaluate the model - precision, recall, F1 score and accuracy
y_true = []
y_pred = []

for batch in test_data:
    inputs, labels = batch
    predictions = model(inputs).logits
    # Each label was wrapped in a one-element list when the dataset was built,
    # so a batch of labels has a trailing singleton axis; flatten it so that
    # y_true and y_pred are both flat sequences of class ids.
    y_true.extend(labels.numpy().ravel())
    y_pred.extend(tf.argmax(predictions, axis=1).numpy())

precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
# Accuracy is computed from the same predictions so this cell does not rely on
# an `acc` variable left behind by the earlier evaluate cell.
accuracy = float(np.mean(np.asarray(y_true) == np.asarray(y_pred)))

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Precision: 0.9065
Recall: 0.9064
F1 Score: 0.9063
