In [1]:
!pip install transformers



In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from transformers import BertTokenizer


In [3]:
train_path = "/content/Train_Dataset.csv"
test_path = "/content/Test_Dataset.csv"


In [4]:
# 加载 Bert Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def dataset_embedding(dataset_path, tokenizer, batch_size=32):
    dataset = pd.read_csv(dataset_path)[["tweet", "sarcastic"]].dropna()

    tokenized_tweets = tokenizer(list(dataset["tweet"]), truncation=True, padding=True, max_length=128, return_tensors="tf")

    processed_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_tweets), dataset["sarcastic"].values)).batch(batch_size)
    return processed_dataset

# 处理数据
train_data = dataset_embedding(train_path, tokenizer)
test_data = dataset_embedding(test_path, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
import tensorflow.keras.backend as K

def recall_m(y_true, y_pred):
    y_true = K.cast(y_true, dtype='float32')
    y_pred = K.cast(y_pred, dtype='float32')
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    y_true = K.cast(y_true, dtype='float32')
    y_pred = K.cast(y_pred, dtype='float32')
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    f1 = 2 * ((precision * recall) / (precision + recall + K.epsilon()))
    return K.clip(K.mean(f1), 0, 1)  # 归一化 F1-score，确保在 0-1 之间


In [14]:
class LSTM_CNN_MODEL(tf.keras.Model):

    def __init__(self, vocabulary_size, embedding_dimensions=32, cnn_filters=50, dnn_units=512, dropout_rate=0.1, training=False, name="lstm_cnn_model"):
        super(LSTM_CNN_MODEL, self).__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size, embedding_dimensions)

        self.lstm1 = layers.LSTM(32, return_sequences=True)
        self.lstm2 = layers.LSTM(32, return_sequences=True)
        self.lstm3 = layers.LSTM(32, return_sequences=True)

        self.time1 = layers.TimeDistributed(layers.Dense(32, activation='relu'))
        self.time2 = layers.TimeDistributed(layers.Dense(32, activation='relu'))

        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters, kernel_size=2, padding="valid", activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters, kernel_size=3, padding="valid", activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters, kernel_size=4, padding="valid", activation="relu")
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.last_dense = layers.Dense(units=1, activation="sigmoid")

    def call(self, inputs, training=False):
        # 确保 inputs 只包含 input_ids
        if isinstance(inputs, dict):
            inputs = inputs["input_ids"]  # 只取 input_ids 作为输入

        inputs = tf.cast(inputs, dtype=tf.int32)  # 确保数据类型正确

        ll = self.lstm1(self.embedding(inputs))
        ll = self.time1(ll)
        ll = self.lstm2(ll)
        ll = self.time2(ll)
        ll = self.lstm3(ll)

        l = ll
        l_1 = self.pool(self.cnn_layer1(l))
        l_2 = self.pool(self.cnn_layer2(l))
        l_3 = self.pool(self.cnn_layer3(l))

        concatenated = tf.concat([l_1, l_2, l_3], axis=-1)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training=training)
        model_output = self.last_dense(concatenated)

        return model_output




In [15]:
import time

device = '/GPU:0' if tf.config.experimental.list_physical_devices('GPU') else '/CPU:0'
print(f"Using device: {device}")

class F1ScoreCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print(f"Epoch {epoch+1} - Loss: {logs['loss']:.4f}, Accuracy: {logs['accuracy']:.4f}, "
              f"Val Loss: {logs['val_loss']:.4f}, Val Accuracy: {logs['val_accuracy']:.4f}, "
              f"F1-score: {logs.get('f1_m', 0):.4f}")

start_time = time.time()

with tf.device(device):
    model = LSTM_CNN_MODEL(vocabulary_size=tokenizer.vocab_size)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m])
    model.fit(train_data, epochs=10, validation_data=test_data, class_weight={1:4, 0:1}, callbacks=[F1ScoreCallback()])

end_time = time.time()
print(f"✅ total time: {end_time - start_time:.2f} 秒")


Using device: /CPU:0
Epoch 1/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step - accuracy: 0.7469 - f1_m: 0.4277 - loss: 2.4134Epoch 1 - Loss: 2.0923, Accuracy: 0.7172, Val Loss: 1.8221, Val Accuracy: 0.8571, F1-score: 0.1982
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 257ms/step - accuracy: 0.7467 - f1_m: 0.4266 - loss: 2.4119 - val_accuracy: 0.8571 - val_f1_m: 0.0000e+00 - val_loss: 1.8221
Epoch 2/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.5330 - f1_m: 0.2502 - loss: 5.8073Epoch 2 - Loss: 2.6521, Accuracy: 0.5672, Val Loss: 1.3097, Val Accuracy: 0.8571, F1-score: 0.1659
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 226ms/step - accuracy: 0.5332 - f1_m: 0.2498 - loss: 5.7928 - val_accuracy: 0.8571 - val_f1_m: 0.0000e+00 - val_loss: 1.3097
Epoch 3/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step - accuracy: 0.3757 - f1_m: 0.2768 - l