In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import DebertaTokenizer, TFAutoModel
import tensorflow as tf
import optuna  # 用于贝叶斯优化

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the source CSV, relative to the notebook's working directory.
data_path = "./Downloads/ggg_sg.csv"

# Load only the columns this notebook works with.
# (Alternative for very large files: count the rows first and pass
#  `nrows=<row count> // 3` to `pd.read_csv` to read just the first third.)
df = pd.read_csv(
    data_path,
    usecols=[
        'DateTime',
        'Title',
        'DomainCountryCode',
        'ContextualText',
        'DocTone',
    ],
)

# Sub-sampling is currently switched off, so the complete frame is used.
# To work on a random third instead, replace the right-hand side with:
#     df.sample(frac=0.33, random_state=42).reset_index(drop=True)
df_sampled = df


In [3]:
# Keep only the text and tone columns, drop rows where either is missing,
# and make sure the tone score is stored as a float.
df_filtered = (
    df_sampled.loc[:, ['ContextualText', 'DocTone']]
    .dropna(subset=['ContextualText', 'DocTone'])
    .astype({'DocTone': float})
)

# Quintile boundaries: the 20th, 40th, 60th and 80th percentiles of DocTone.
percentiles = [0.2, 0.4, 0.6, 0.8]
quantiles = df_filtered['DocTone'].quantile(percentiles).to_numpy()

# Show the computed cut points.
print("DocTone percentile threshold：", quantiles)

# Unpack the four cut points for use by the labelling function.
q1, q2, q3, q4 = quantiles

def label_sentiment(score, thresholds=None):
    """Map a tone score to one of five sentiment labels.

    Parameters
    ----------
    score : float
        The tone score to classify.
    thresholds : sequence of four numbers, optional
        Ascending cut points ``(t1, t2, t3, t4)``. When omitted, the
        notebook-level values ``q1, q2, q3, q4`` are used, so the function can
        still be passed directly to ``Series.apply``.

    Returns
    -------
    str
        'Strongly Negative' if ``score <= t1``, 'Negative' if
        ``t1 < score <= t2``, 'Neutral' if ``t2 < score <= t3``, 'Positive' if
        ``t3 < score <= t4``, otherwise 'Strongly Positive'.

    Raises
    ------
    ValueError
        If ``score`` is missing (NaN/None). Every comparison with NaN is
        False, so without this guard a missing score would fall through to
        the final branch and be labelled 'Strongly Positive'.
    """
    if pd.isna(score):
        raise ValueError("label_sentiment received a missing (NaN/None) score")

    if thresholds is None:
        # Default to the cut points computed earlier in the notebook.
        thresholds = (q1, q2, q3, q4)
    t1, t2, t3, t4 = thresholds

    # Each guard only runs when the previous upper bound was exceeded, so the
    # explicit lower-bound checks are unnecessary.
    if score <= t1:
        return 'Strongly Negative'
    if score <= t2:
        return 'Negative'
    if score <= t3:
        return 'Neutral'
    if score <= t4:
        return 'Positive'
    return 'Strongly Positive'

# Build the working frame: a copy of the filtered data with a textual
# sentiment label derived from each row's DocTone score.
data = df_filtered.assign(Sentiment=df_filtered['DocTone'].apply(label_sentiment))

# Encode the textual labels as integer class ids for the classifier.
label_encoder = LabelEncoder()
data['SentimentLabel'] = label_encoder.fit_transform(data['Sentiment'])

DocTone percentile threshold： [-2.58706468 -0.65502183  0.79928952  2.47191011]


In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import keras_tuner as kt
import optuna

# Tunable sizes for the text representation.
max_sequence_length = 100  # length every token sequence is padded/cut to
embedding_dim = 100        # width of each word vector

# Raw documents as plain Python strings.
texts = data['ContextualText'].astype(str).tolist()

# Fit the word-level tokenizer on the corpus and keep its word -> id mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Encode every document as a list of word ids, then bring all of them to the
# same length (padding is appended at the end: padding='post').
sequences = tokenizer.texts_to_sequences(texts)
data_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# One extra slot because id 0 is reserved and never assigned to a word.
vocab_size = 1 + len(word_index)

# Define the embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length)

# Keras layers create their weights lazily (on first call or on build()).
# Without this explicit build, get_weights() returns an empty list and the
# [0] lookup below raises IndexError.
embedding_layer.build((None, max_sequence_length))

# Get the embedding weights (randomly initialized).
# NOTE(review): no pretrained weights, training step or seed is applied here,
# so these vectors are random and differ between runs — confirm that random
# embeddings are really the intended features.
embedding_matrix = embedding_layer.get_weights()[0]

# Function to compute average embeddings
def get_average_embeddings(data_padded, embedding_matrix):
    """Average the embedding vectors of the non-padding tokens of each sequence.

    Parameters
    ----------
    data_padded : sequence of integer sequences
        One row of token ids per document; id 0 is treated as padding and
        ignored.
    embedding_matrix : array-like of shape (vocab_size, dim)
        Row ``i`` is the vector for token id ``i``.

    Returns
    -------
    numpy.ndarray of shape (len(data_padded), dim)
        The mean vector per document. Documents made up only of padding get an
        all-zero vector. The width is taken from ``embedding_matrix`` itself
        rather than from a global, so any matrix width works and empty input
        yields an array of shape ``(0, dim)``.
    """
    matrix = np.asarray(embedding_matrix)
    dim = matrix.shape[1]
    # Floating-point output even for an integer matrix; float32 stays float32.
    out_dtype = np.result_type(matrix.dtype, np.float32)
    averaged = np.zeros((len(data_padded), dim), dtype=out_dtype)

    # Rows are processed one at a time: gathering vectors for the whole corpus
    # at once would need an (n_docs, seq_len, dim) intermediate array.
    for row_idx, sequence in enumerate(data_padded):
        token_ids = np.asarray(sequence)
        token_ids = token_ids[token_ids != 0]
        if token_ids.size:
            averaged[row_idx] = matrix[token_ids].mean(axis=0)
        # else: keep the zero vector already stored for this row
    return averaged

# Collapse every padded token sequence into a single averaged embedding
# vector; the resulting array is the feature matrix for the classifier.
features = get_average_embeddings(
    data_padded=data_padded,
    embedding_matrix=embedding_matrix,
)




NotFoundError: D:\anaconda3\envs\NUS\lib\site-packages\tensorflow_decision_forests\tensorflow\ops\inference\inference.so not found

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space for the random forest hyperparameters.
# `randint(low, high)` draws integers from the half-open range [low, high).
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(5, 50),
    'min_samples_leaf': randint(1, 10),
    'min_samples_split': randint(2, 15),
    # 'auto' was removed from RandomForestClassifier in scikit-learn 1.3 and
    # was only ever an alias of 'sqrt' for classifiers, so it is omitted here
    # to keep the search from failing on current releases.
    'max_features': ['sqrt', 'log2'],
}

# Base estimator for the search: a random forest with a fixed seed that uses
# all available CPU cores.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Number of parameter settings sampled by the randomized search.
# The RandomizedSearchCV object itself is constructed once, further down in
# this cell; the earlier duplicate construction here was immediately
# overwritten and has been removed.
n_iter_search = 20

# Create a custom tqdm callback
# NOTE(review): `tqdm` is not imported in any cell visible in this notebook;
# confirm it is in scope before this class statement runs, otherwise it raises
# NameError.
class TQDMProgressBar(tqdm):
    """A `tqdm` subclass whose instances can be called to advance by one step."""

    def __init__(self, total=None, **kwargs):
        """Forward `total` and any extra keyword arguments to the base class.

        `total` is also assigned to `self.total` after the base initializer
        has run.
        """
        super().__init__(total=total, **kwargs)
        self.total = total

    def __call__(self, *args, **kwargs):
        """Advance the bar by one step; all arguments are ignored."""
        self.update(1)

# Randomized hyperparameter search: `n_iter_search` sampled settings, each
# scored by 3-fold cross-validated accuracy.
# Progress is reported by scikit-learn itself via `verbose`. RandomizedSearchCV
# has no per-candidate callback, so an external progress bar cannot track it.
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=3,
    scoring='accuracy',
    random_state=42,
    verbose=2,
)

# A single call to fit() runs the complete search (all sampled settings and
# all folds). The previous version called fit() inside a loop of
# `n_iter_search` iterations, which repeated the whole search that many times.
# NOTE(review): X_train / y_train are not defined in any cell visible in this
# notebook — confirm they are created (e.g. with train_test_split on the
# feature matrix and encoded labels) before this cell runs.
random_search.fit(X_train, y_train)

# Output the best parameters
print("Best parameters found by random search:")
print(random_search.best_params_)