In [2]:
# Print version of tensorflow
import tensorflow as tf
print(tf.__version__)
# Print version of keras
import keras
print(keras.__version__)

2.17.0
3.4.1


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pyspark

In [5]:
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

Num GPUs Available:  1


In [16]:
# 添加必要的导入
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, concat_ws
from pyspark.sql.types import IntegerType, FloatType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover

import tensorflow as tf
import glob
import os

# -------------------------------
# 1. 使用PySpark加载和预处理全量数据
# -------------------------------

# 创建SparkSession
spark = SparkSession.builder \
    .appName("SentimentAnalysisWithTensorFlow") \
    .getOrCreate()

# 加载数据集
df = spark.read.csv("/content/drive/MyDrive/ggg_sg.csv", header=True, inferSchema=True, multiLine=True, escape='"')

# 数据清洗
df = df.filter(df.ContextualText.isNotNull())
df = df.filter(df.DocTone.isNotNull())
df = df.withColumn("DocTone", df["DocTone"].cast(FloatType()))

# 创建情感标签：正面（2），中性（1），负面（0）
def sentiment_label(score):
    if score > 1.9910:
        return 2
    elif score < -2.0202:
        return 0
    else:
        return 1

sentiment_udf = udf(sentiment_label, IntegerType())
df = df.withColumn("label", sentiment_udf(col("DocTone")))


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `text` cannot be resolved. Did you mean one of the following? [`Lat`, `Lon`, `URL`, `Title`, `label`].;
'Project [DateTime#362, URL#363, Title#364, SharingImage#365, LangCode#366, DocTone#395, DomainCountryCode#368, Location#369, Lat#370, Lon#371, CountryCode#372, Adm1Code#373, Adm2Code#374, GeoType#375, ContextualText#376, the_geom#377, label#413, cast('text as string) AS text#431]
+- Project [DateTime#362, URL#363, Title#364, SharingImage#365, LangCode#366, DocTone#395, DomainCountryCode#368, Location#369, Lat#370, Lon#371, CountryCode#372, Adm1Code#373, Adm2Code#374, GeoType#375, ContextualText#376, the_geom#377, sentiment_label(DocTone#395)#412 AS label#413]
   +- Project [DateTime#362, URL#363, Title#364, SharingImage#365, LangCode#366, cast(DocTone#367 as float) AS DocTone#395, DomainCountryCode#368, Location#369, Lat#370, Lon#371, CountryCode#372, Adm1Code#373, Adm2Code#374, GeoType#375, ContextualText#376, the_geom#377]
      +- Filter isnotnull(DocTone#367)
         +- Filter isnotnull(ContextualText#376)
            +- Relation [DateTime#362,URL#363,Title#364,SharingImage#365,LangCode#366,DocTone#367,DomainCountryCode#368,Location#369,Lat#370,Lon#371,CountryCode#372,Adm1Code#373,Adm2Code#374,GeoType#375,ContextualText#376,the_geom#377] csv


In [18]:
df = df.withColumn("label", col("label").cast(IntegerType()))


# 文本预处理
tokenizer = Tokenizer(inputCol="ContextualText", outputCol="words")
df = tokenizer.transform(df)

remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)

df = df.withColumn("text", concat_ws(" ", col("filtered_words")))
df = df.withColumn("text", col("text").cast(StringType()))
df = df.filter(df.text.isNotNull())
df = df.filter(df.label.isNotNull())
df = df.select("text", "label")

In [20]:
df.select("text", "label").write.csv("processed_data.csv", header=True)

In [22]:
import tensorflow as tf

# 定义解析 CSV 行的函数
def parse_csv_line(line):
    record_defaults = ['', 0]  # 默认值
    text, label = tf.io.decode_csv(line, record_defaults)
    return text, label

# 创建 Dataset 对象
dataset = tf.data.TextLineDataset("/content/processed_data.csv/part-00000-910940aa-cebb-4a1e-ac10-17dd23e14f5b-c000.csv")
dataset = dataset.map(parse_csv_line)

# 将 Dataset 转换为 TFRecord
def serialize_example(text, label):
    feature = {
        'text': tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.compat.as_bytes(text)])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(label)]))
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

tfrecord_filename = 'data.tfrecord'
with tf.io.TFRecordWriter(tfrecord_filename) as writer:
    for text, label in dataset:
        example = serialize_example(text.numpy(), label.numpy())
        writer.write(example)


InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Error in user-defined function passed to MapDataset:7 transformation with iterator: Iterator::Root::ParallelMapV2: Field 1 in record 0 is not a valid int32: label
	 [[{{node DecodeCSV}}]] [Op:IteratorGetNext] name: 

In [None]:
# -------------------------------
# 3. 使用TensorFlow的tf.data API加载和处理数据
# -------------------------------

# 获取所有TFRecord文件的路径
tfrecord_files = glob.glob("tfrecord_data/*.tfrecord")

# 定义解析TFRecord的函数
def parse_tfrecord(serialized_example):
    feature_description = {
        'text': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(serialized_example, feature_description)
    text = example['text']
    label = example['label']
    return text, label

# 创建TFRecordDataset
raw_dataset = tf.data.TFRecordDataset(tfrecord_files)

# 解析数据
parsed_dataset = raw_dataset.map(parse_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)

# 定义Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token="<OOV>")

# 提取部分文本数据用于拟合Tokenizer
sample_texts = []
for text, label in parsed_dataset.take(100000):  # 取10万条数据
    sample_texts.append(text.numpy().decode('utf-8'))

# 拟合Tokenizer
tokenizer.fit_on_texts(sample_texts)

# 定义数据预处理函数
max_seq_length = 100

def preprocess(text, label):
    text = tf.strings.lower(text)
    seq = tokenizer.texts_to_sequences([text.numpy().decode('utf-8')])
    padded_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_seq_length, padding='post', truncating='post')
    return padded_seq[0], label

def tf_preprocess(text, label):
    text, label = tf.py_function(preprocess, inp=[text, label], Tout=(tf.int32, tf.int64))
    text.set_shape([max_seq_length])
    label.set_shape([])
    return text, label

# 应用预处理函数并创建批处理数据集
dataset = parsed_dataset.map(tf_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.shuffle(100000).batch(64).prefetch(tf.data.AUTOTUNE)

# -------------------------------
# 4. 构建和训练TensorFlow模型
# -------------------------------

# 构建模型
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=128, input_length=max_seq_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')  # 三个情感类别
])

# 编译模型
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 检查是否使用GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# 训练模型
epochs = 5
history = model.fit(dataset, epochs=epochs)

# -------------------------------
# 5. 模型评估
# -------------------------------

# 收集真实标签和预测标签
true_labels = []
predicted_labels = []

for texts, labels in dataset.take(100):  # 取100个批次的数据
    predictions = model.predict(texts)
    predicted_classes = tf.argmax(predictions, axis=1)
    true_labels.extend(labels.numpy())
    predicted_labels.extend(predicted_classes.numpy())

# 计算分类报告
from sklearn.metrics import classification_report
print(classification_report(true_labels, predicted_labels, digits=4))