In [1]:
# Mount Google Drive at /content/drive so files stored there can be accessed
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
# Install Optuna and its Keras integration into the kernel's environment.
# Versions are pinned to the ones this notebook was last run with (4.0.0).
%pip install -q optuna==4.0.0 "optuna-integration[tfkeras]==4.0.0"

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting optuna-integration[tfkeras]
  Downloading optuna_integration-4.0.0-py3-none-any.whl.metadata (11 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading optuna_integration-4.0.0-py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [3]:
# Step 1: Import Libraries and Set Up Environment
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel

from sklearn.utils.class_weight import compute_class_weight

import optuna
from optuna.integration import TFKerasPruningCallback

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier

import os
import random
from tqdm import tqdm  # Added tqdm for progress visualization
import h5py  # Added h5py for saving data
import gc

from concurrent.futures import ThreadPoolExecutor

# Turn on memory growth for every visible GPU and report how many physical
# and logical GPU devices TensorFlow sees. Nothing is printed without a GPU.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "physical GPUs,", len(logical_gpus), "logical GPUs.")
    except RuntimeError as err:
        # Best effort: report the RuntimeError and carry on.
        print(err)

1 physical GPUs, 1 logical GPUs.


In [4]:
# Recursively copy everything under the Drive folder "CS5344 Project Data/trick/" into /content/.
# NOTE(review): presumably this is where the roberta_*.h5 files read later come from — confirm.
!cp -r "/content/drive/MyDrive/CS5344 Project Data/trick/"* /content/

In [5]:
# Use the 'mixed_float16' Keras policy globally
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

max_length = 256
num_labels = 5


def _read_h5_dataset(path, key):
    """Open the HDF5 file at `path` read-only and return dataset `key` fully read into memory."""
    with h5py.File(path, 'r') as h5_file:
        return h5_file[key][:]


# Each array is read eagerly and its file closed right away. (An earlier
# variant of this cell kept the HDF5 files open and used the dataset handles
# directly; its note said the data was reprocessed with max_length=256.)
X_train_ids = _read_h5_dataset('roberta_X_train_ids.h5', 'X_train_ids')
X_test_ids = _read_h5_dataset('roberta_X_test_ids.h5', 'X_test_ids')

X_train_masks = _read_h5_dataset('roberta_X_train_masks.h5', 'X_train_masks')
X_test_masks = _read_h5_dataset('roberta_X_test_masks.h5', 'X_test_masks')

y_train = _read_h5_dataset('roberta_y_train.h5', 'y_train')
y_test = _read_h5_dataset('roberta_y_test.h5', 'y_test')

In [6]:
# Drop the first 30% of training rows (count truncated to an int) from the
# ids, masks and labels alike, so the three arrays stay aligned.
# NOTE: each run slices the current arrays, so re-running this cell without
# reloading the data removes a further 30%.
num_samples = X_train_ids.shape[0]
num_to_remove = int(num_samples * 0.3)

kept_rows = slice(num_to_remove, None)
X_train_ids, X_train_masks, y_train = (
    X_train_ids[kept_rows],
    X_train_masks[kept_rows],
    y_train[kept_rows],
)

In [7]:
# Load the pretrained 'roberta-base' tokenizer and TensorFlow encoder.
from transformers import RobertaTokenizer, TFAutoModel

model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
transformer_model = TFAutoModel.from_pretrained(model_name)

# Mark the encoder as non-trainable; this notebook only calls predict() on it.
transformer_model.trainable = False

class EmbeddingExtractor(tf.keras.layers.Layer):
    """Keras layer that runs a transformer and returns its first-token hidden state."""

    def __init__(self, transformer_model, **kwargs):
        """Keep a reference to the wrapped transformer.

        Any extra keyword arguments are forwarded to the Layer base class.
        """
        super().__init__(**kwargs)
        self.transformer = transformer_model

    def call(self, inputs):
        """Return the last hidden state at sequence position 0.

        `inputs` is a pair `(input_ids, attention_mask)`; both are passed
        to the wrapped transformer.
        """
        token_ids, mask = inputs
        encoded = self.transformer(token_ids, attention_mask=mask)
        # Position 0 along the sequence axis is the leading special token.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return first_token_state

# Symbolic inputs for token ids and attention mask. Their width comes from
# `max_length` (set in the data-loading cell) instead of a repeated literal,
# so the model input stays in sync with that setting.
input_ids_in = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
input_masks_in = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

# Use the custom embedding extractor around the non-trainable transformer
embedding_extractor = EmbeddingExtractor(transformer_model)
embeddings = embedding_extractor([input_ids_in, input_masks_in])

# Model mapping (input_ids, attention_mask) to the extracted embedding
embedding_model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.weight', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [8]:
# Applies the 'mixed_float16' global policy; the data-loading cell earlier in the notebook makes the same call.
# NOTE(review): looks redundant unless the policy is changed in between — confirm before removing.
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

In [9]:
# Extract embeddings for the training set chunk by chunk, so each predict()
# call handles only a slice of the data, then save the merged result.
# This cell handles the training data only; the message says so.
print("Extracting embeddings for training data...")

num_splits = 20  # number of chunks the training set is divided into
train_embeddings_path = '/content/drive/MyDrive/CS5344 Project Data/trick/X_train_embeddings.npy'

# Ids and masks are chunked independently, so they must have the same length.
assert len(X_train_ids) == len(X_train_masks), "ids and masks must have the same number of rows"

all_embeddings = []
# np.array_split yields num_splits nearly equal chunks in order. With fewer
# rows than chunks some chunks are empty; those are skipped, not predicted.
for X_batch_ids, X_batch_masks in zip(
    np.array_split(X_train_ids, num_splits),
    np.array_split(X_train_masks, num_splits),
):
    if len(X_batch_ids) == 0:
        continue
    # Predict this chunk (Keras progress output stays on) and collect the result.
    embeddings = embedding_model.predict([X_batch_ids, X_batch_masks], batch_size=2)
    all_embeddings.append(embeddings)

# Merge the per-chunk embeddings in their original order and save as one .npy file.
X_train_embeddings = np.concatenate(all_embeddings, axis=0)
np.save(train_embeddings_path, X_train_embeddings)

Extracting embeddings for training and test data...
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m567s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m568s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m569s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m568s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m571s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m571s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m570s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m571s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m570s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m571s[0m 4ms/step
[1m160743/160743[0m [32m━━━━━━━━

In [10]:
# Extract embeddings for the test set chunk by chunk, so each predict()
# call handles only a slice of the data, then save the merged result.
print("Extracting embeddings for test data...")

num_splits = 30  # number of chunks the test set is divided into
test_embeddings_path = '/content/drive/MyDrive/CS5344 Project Data/trick/X_test_embeddings.npy'

# Ids and masks are chunked independently, so they must have the same length.
assert len(X_test_ids) == len(X_test_masks), "ids and masks must have the same number of rows"

all_embeddings = []
# np.array_split yields num_splits nearly equal chunks in order. With fewer
# rows than chunks some chunks are empty; those are skipped, not predicted.
for X_batch_ids, X_batch_masks in zip(
    np.array_split(X_test_ids, num_splits),
    np.array_split(X_test_masks, num_splits),
):
    if len(X_batch_ids) == 0:
        continue
    embeddings = embedding_model.predict([X_batch_ids, X_batch_masks], batch_size=4)
    all_embeddings.append(embeddings)

# Merge the per-chunk embeddings in their original order and save as one .npy file.
X_test_embeddings = np.concatenate(all_embeddings, axis=0)
np.save(test_embeddings_path, X_test_embeddings)

[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 5ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 5ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4ms/step
[1m22964/22964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4ms/step
[1m22964/22964[0m [32m━━━

In [11]:
# def rf_objective(trial):
#     n_estimators = trial.suggest_int('n_estimators', 100, 500, step=100)
#     max_depth = trial.suggest_int('max_depth', 5, 30, step=5)
#     min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
#     min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
#     max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

#     rf = RandomForestClassifier(
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_samples_split=min_samples_split,
#         min_samples_leaf=min_samples_leaf,
#         max_features=max_features,
#         random_state=42,
#         n_jobs=-1
#     )
#     rf.fit(X_train_embeddings, y_train)
#     y_pred_rf = rf.predict(X_test_embeddings)
#     return accuracy_score(y_test, y_pred_rf)

# rf_study = optuna.create_study(direction='maximize')
# rf_study.optimize(rf_objective, n_trials=10)

# print("Random Forest Best Hyperparameters:")
# print(rf_study.best_params)

In [12]:
# # 6. Train the random forest with the best parameters
# best_rf_params = rf_study.best_params
# rf = RandomForestClassifier(
#     n_estimators=best_rf_params['n_estimators'],
#     max_depth=best_rf_params['max_depth'],
#     min_samples_split=best_rf_params['min_samples_split'],
#     min_samples_leaf=best_rf_params['min_samples_leaf'],
#     max_features=best_rf_params['max_features'],
#     random_state=42,
#     n_jobs=-1
# )
# print("Training Random Forest Classifier with embeddings...")
# rf.fit(X_train_embeddings, y_train)

# # 7. Evaluate the model
# y_pred_rf = rf.predict(X_test_embeddings)
# print(f"Test accuracy for Random Forest: {accuracy_score(y_test, y_pred_rf)}")

In [None]:
# Wait one hour, then call runtime.unassign() to give up the Colab runtime.
# NOTE(review): the delay presumably gives Drive writes time to finish — confirm.
import time
from google.colab import runtime

SHUTDOWN_DELAY_SECONDS = 60 * 60
time.sleep(SHUTDOWN_DELAY_SECONDS)
runtime.unassign()