In [3]:
!pip install transformers






In [18]:
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer, TFBertModel

import numpy as np
import pandas as pd
import random
from tensorflow.keras import layers
import tensorflow.keras.backend as K
import os

In [32]:
import tensorflow.keras.backend as K

def recall_m(y_true, y_pred):
    y_true = K.cast(y_true, dtype='float32')
    y_pred = K.cast(y_pred, dtype='float32')

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    y_true = K.cast(y_true, dtype='float32')
    y_pred = K.cast(y_pred, dtype='float32')

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)

    f1 = 2 * ((precision * recall) / (precision + recall + K.epsilon()))
    return K.clip(f1, 0, 1)  # 确保 F1-score 在 [0,1] 之间


In [33]:
def dataset_embedding(dataset_path, tokenizer, batch_size=32):
    dataset = pd.read_csv(dataset_path)[["tweet", "sarcastic"]]
    dataset = dataset.dropna()

    tokenized_tweets = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweet)) for tweet in dataset['tweet']]

    tweets_with_len = [[tweet, dataset['sarcastic'].iloc[i], len(tweet)] for i, tweet in enumerate(tokenized_tweets)]
    random.Random(42).shuffle(tweets_with_len)

    tweets_with_len.sort(key=lambda x: x[2])  # 按长度排序
    sorted_tweets_labels = [(tweet_lab[0], tweet_lab[1]) for tweet_lab in tweets_with_len]

    processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tweets_labels, output_types=(tf.int32, tf.int32))

    return processed_dataset.padded_batch(batch_size, padded_shapes=((None,), ()))



In [34]:
from transformers import BertTokenizer

def prepare_datasets(train_path, test_path):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # 读取数据集
    train_df = pd.read_csv(train_path)[["tweet", "sarcastic"]].dropna()
    test_df = pd.read_csv(test_path)[["tweet", "sarcastic"]].dropna()

    # Tokenize 数据
    train_encodings = tokenizer(list(train_df["tweet"]), truncation=True, padding=True, max_length=128, return_tensors="tf")
    test_encodings = tokenizer(list(test_df["tweet"]), truncation=True, padding=True, max_length=128, return_tensors="tf")

    # 创建 TensorFlow 数据集
    train_data = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_df["sarcastic"].values)).batch(32)
    test_data = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_df["sarcastic"].values)).batch(32)

    return train_data, test_data, tokenizer


In [35]:

train_path = "/content/Train_Dataset.csv"
test_path = "/content/Test_Dataset.csv"

# 运行数据准备
train_data, test_data, tokenizer = prepare_datasets(train_path, test_path)


In [36]:
class CNN_MODEL(tf.keras.Model):

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 dropout_rate=0.1,
                 training=False,
                 name="cnn_model"):
        super(CNN_MODEL, self).__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size, embedding_dimensions)

        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters, kernel_size=2, padding="valid", activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters, kernel_size=3, padding="valid", activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters, kernel_size=4, padding="valid", activation="relu")
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.last_dense = layers.Dense(units=1, activation="sigmoid")

    def call(self, inputs, training=False):
        # 只提取 input_ids，忽略其他无关参数
        input_ids = inputs["input_ids"]

        l = self.embedding(input_ids)
        l_1 = self.pool(self.cnn_layer1(l))
        l_2 = self.pool(self.cnn_layer2(l))
        l_3 = self.pool(self.cnn_layer3(l))

        concatenated = tf.concat([l_1, l_2, l_3], axis=-1)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training=training)  # training 需显式传递
        model_output = self.last_dense(concatenated)

        return model_output



In [37]:
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m])
cnn.fit(train_data, epochs=10, validation_data=test_data, class_weight={1:4, 0:1}, callbacks=[F1ScoreCallback()])


Epoch 1/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.7100 - f1_m: 0.4388 - loss: 1.9583Epoch 1 - Loss: 2.1422, Accuracy: 0.7414, Val Loss: 1.5665, Val Accuracy: 0.8564, F1-score: 0.2166
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 105ms/step - accuracy: 0.7102 - f1_m: 0.4378 - loss: 1.9591 - val_accuracy: 0.8564 - val_f1_m: 0.0227 - val_loss: 1.5665
Epoch 2/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.6025 - f1_m: 0.3229 - loss: 5.2301Epoch 2 - Loss: 2.4540, Accuracy: 0.7381, Val Loss: 1.8831, Val Accuracy: 0.8564, F1-score: 0.2028
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 109ms/step - accuracy: 0.6031 - f1_m: 0.3223 - loss: 5.2174 - val_accuracy: 0.8564 - val_f1_m: 0.0227 - val_loss: 1.8831
Epoch 3/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.6316 - f1_m: 0.3827 - loss: 4.4152Epoch 3 - Loss: 1.

<keras.src.callbacks.history.History at 0x7d74204d9390>