In [5]:
!pip install tensorflow_decision_forests

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.0 kB)
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting ydf (from tensorflow_decision_forests)
  Downloading ydf-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Downloading tensorflow_decision_forests-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading wurlitzer-3.1.1-py3-none-any.whl (8.6 kB)
Downloading ydf-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ydf, wurlitzer, tensorf

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import DebertaTokenizer, TFAutoModel
import tensorflow as tf
import optuna  # 用于贝叶斯优化

# 假设数据文件位于 'ggg_sg.csv'
data_path = "/content/drive/MyDrive/ggg_sg.csv"

# 获取文件的总行数（可选，如果文件过大，可跳过此步骤）
# total_rows = sum(1 for line in open(data_path, 'r', encoding='utf-8')) - 1  # 减去标题行

# 计算需要读取的行数（三分之一）
# one_third_rows = total_rows // 3

# 使用 Pandas 读取 CSV 文件的三分之一
# 如果 total_rows 已知，可以使用 nrows 参数
# df = pd.read_csv(data_path, nrows=one_third_rows)

# 如果 total_rows 未知，直接随机采样读取部分数据
df = pd.read_csv(data_path, usecols=['DateTime', 'Title', 'DomainCountryCode', 'ContextualText', 'DocTone'])

# 随机采样三分之一的数据
df_sampled = df#.sample(frac=0.33, random_state=42).reset_index(drop=True)

# 确保 'ContextualText' 和 'DocTone' 列存在
df_filtered = df_sampled[['ContextualText', 'DocTone']].dropna(subset=['ContextualText', 'DocTone'])

# 将 'DocTone' 列转换为浮点型
df_filtered['DocTone'] = df_filtered['DocTone'].astype(float)

# 定义百分位数列表
percentiles = [0.2, 0.4, 0.6, 0.8]

# 计算百分位数
quantiles = df_filtered['DocTone'].quantile(percentiles).values

# 输出百分位数阈值
print("DocTone percentile threshold：", quantiles)

# 获取阈值
q1, q2, q3, q4 = quantiles

def label_sentiment(score):
    if score <= q1:
        return 'Strongly Negative'  # 强烈负面
    elif q1 < score <= q2:
        return 'Negative'  # 负面
    elif q2 < score <= q3:
        return 'Neutral'  # 中性
    elif q3 < score <= q4:
        return 'Positive'  # 正面
    else:
        return 'Strongly Positive'  # 强烈正面

# 应用标签函数
data = df_filtered.copy()
data['Sentiment'] = data['DocTone'].apply(label_sentiment)

# 标签编码
label_encoder = LabelEncoder()
data['SentimentLabel'] = label_encoder.fit_transform(data['Sentiment'])

DocTone percentile threshold： [-2.58706468 -0.65502183  0.79928952  2.47191011]


In [8]:
!pip install keras_tuner tensorflow_decision_forests

Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import keras_tuner as kt
import optuna

# Extract texts
texts = data['ContextualText'].astype(str).tolist()

# Initialize the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to the same length
max_sequence_length = 100  # You can adjust this value
word_index = tokenizer.word_index
data_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Vocabulary size
vocab_size = len(word_index) + 1  # Adding 1 because of reserved 0 index

# Embedding dimension
embedding_dim = 100  # You can adjust this value

# Define the embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length)

# Get the embedding weights (randomly initialized)
embedding_matrix = embedding_layer.get_weights()[0]

# Function to compute average embeddings
def get_average_embeddings(data_padded, embedding_matrix):
    embeddings = []
    for sequence in data_padded:
        valid_embeddings = [embedding_matrix[idx] for idx in sequence if idx != 0]
        if valid_embeddings:
            avg_embedding = np.mean(valid_embeddings, axis=0)
        else:
            avg_embedding = np.zeros(embedding_dim)
        embeddings.append(avg_embedding)
    return np.array(embeddings)

# Compute average embeddings for all texts
features = get_average_embeddings(data_padded, embedding_matrix)




IndexError: list index out of range