In [None]:
# !pip install pyspark

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# conda install -c conda-forge cudatoolkit=11.2 cudnn=8.1.0

In [None]:
# !pip install "numpy<2.0" "tensorflow==2.10"

In [None]:
# !pip install pandas transformers optuna scikit-learn

In [None]:
!pip install optuna optuna-integration[tfkeras]

Collecting optuna-integration[tfkeras]
  Downloading optuna_integration-4.0.0-py3-none-any.whl.metadata (11 kB)
Downloading optuna_integration-4.0.0-py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.9/96.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.0.0


In [1]:
# Step 1: Import Libraries and Set Up Environment
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel

from sklearn.utils.class_weight import compute_class_weight

import optuna
from optuna.integration import TFKerasPruningCallback

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier

import os
import random
from tqdm import tqdm  # Added tqdm for progress visualization
import h5py  # Added h5py for saving data

import gc

from concurrent.futures import ThreadPoolExecutor

# Initialize GPU settings if available
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "physical GPUs,", len(logical_gpus), "logical GPUs.")
    except RuntimeError as e:
        print(e)

1 physical GPUs, 1 logical GPUs.


In [2]:
# Load the entire dataset (without chunking)
print("Loading the dataset...")
data_path = "./Downloads/ggg_sg.csv"
usecols = ['DateTime', 'Title', 'DomainCountryCode', 'ContextualText', 'DocTone']
df = pd.read_csv(data_path, usecols=usecols)
df_filtered = df[['ContextualText', 'DocTone']].dropna(subset=['ContextualText', 'DocTone'])
df_filtered['DocTone'] = df_filtered['DocTone'].astype(float)
# Initialize variables to store label encoder classes
label_encoder = LabelEncoder()

# Compute quantiles
percentiles = [0.2, 0.4, 0.6, 0.8]
quantiles = df_filtered['DocTone'].quantile(percentiles)
print("DocTone quantile thresholds:", quantiles.values)
q1, q2, q3, q4 = quantiles.values

# Function to label sentiment
def label_sentiment(score):
    if score <= q1:
        return 'Strongly Negative'
    elif q1 < score <= q2:
        return 'Negative'
    elif q2 < score <= q3:
        return 'Neutral'
    elif q3 < score <= q4:
        return 'Positive'
    else:
        return 'Strongly Positive'

# Apply the sentiment labeling
df_filtered['Sentiment'] = df_filtered['DocTone'].apply(label_sentiment)

# Encode sentiments
df_filtered['SentimentLabel'] = label_encoder.fit_transform(df_filtered['Sentiment'])
num_labels = len(label_encoder.classes_)

Loading the dataset...
DocTone quantile thresholds: [-2.58706468 -0.65502183  0.79928952  2.47191011]


In [None]:
# print(num_labels)

5


In [None]:
# h5f.close()

In [3]:
# Step 2: Load and Preprocess Data
# Prepare h5py file to store processed data
h5_file = 'processed_data.h5'
if os.path.exists(h5_file):
    os.remove(h5_file)

h5f = h5py.File(h5_file, 'w')

# We will need to determine the max_length for tokenization
max_length = 256  # Adjust as needed

# Initialize tokenizer and model
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
transformer_model = TFRobertaModel.from_pretrained(model_name)

# Create datasets in h5py file with maxshape set to None to allow resizing
input_ids_dataset = h5f.create_dataset('input_ids', shape=(0, max_length), maxshape=(None, max_length), dtype='int32')
attention_masks_dataset = h5f.create_dataset('attention_masks', shape=(0, max_length), maxshape=(None, max_length), dtype='int32')
labels_dataset = h5f.create_dataset('labels', shape=(0,), maxshape=(None,), dtype='int32')

# Tokenize the texts and save to h5py in chunks
print("Tokenizing texts and saving to h5py file in batches...")
texts = df_filtered['ContextualText'].tolist()
labels = df_filtered['SentimentLabel'].values

batch_size = 2  # Adjust based on your memory capacity
total_samples = 0  # Keep track of the total number of samples processed

for i in tqdm(range(0, len(texts), batch_size), desc="Processing Batches"):
    texts_batch = texts[i:i+batch_size]
    labels_batch = labels[i:i+batch_size]
    encoded = tokenizer(
        texts_batch,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='np'  # Return numpy arrays
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    batch_size_actual = input_ids.shape[0]  # In case the last batch is smaller

    # Resize datasets to accommodate new data
    input_ids_dataset.resize((total_samples + batch_size_actual, max_length))
    input_ids_dataset[total_samples:total_samples + batch_size_actual] = input_ids

    attention_masks_dataset.resize((total_samples + batch_size_actual, max_length))
    attention_masks_dataset[total_samples:total_samples + batch_size_actual] = attention_masks

    labels_dataset.resize((total_samples + batch_size_actual,))
    labels_dataset[total_samples:total_samples + batch_size_actual] = labels_batch

    total_samples += batch_size_actual  # Update total samples processed

# Close the h5py file
h5f.close()



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Tokenizing texts and saving to h5py file in batches...


Processing Batches: 100%|████████████████████████████████████████████████| 4592653/4592653 [1:03:01<00:00, 1214.50it/s]


In [4]:
# Step 3: Load Processed Data from h5py File
print("Loading processed data from h5py file...")
h5_file = 'processed_data.h5'
h5f = h5py.File(h5_file, 'r')
input_ids = np.array(h5f['input_ids'])
attention_masks = np.array(h5f['attention_masks'])
labels = h5f['labels']

# Convert labels to numpy array
labels = np.array(labels)

# Step 4: Split Data into Train and Test Sets
print("Splitting data into train and test sets...")
labels = np.array(labels)
X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = train_test_split(
    input_ids,
    attention_masks,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels
)
X_train_ids = np.concatenate([X_train_ids, X_test_ids])
X_train_masks = np.concatenate([X_train_masks, X_test_masks])
y_train = np.concatenate([y_train, y_test])

Loading processed data from h5py file...
Splitting data into train and test sets...


In [5]:
# Save in one shot
with h5py.File('roberta_train_test_data.h5', 'w') as f:
    f.create_dataset('X_train_ids', data=X_train_ids)
    f.create_dataset('X_test_ids', data=X_test_ids)
    f.create_dataset('X_train_masks', data=X_train_masks)
    f.create_dataset('X_test_masks', data=X_test_masks)
    f.create_dataset('y_train', data=y_train)
    f.create_dataset('y_test', data=y_test)

# Save by train/test
with h5py.File('roberta_train_data.h5', 'w') as train_file:
    train_file.create_dataset('X_train_ids', data=X_train_ids)
    train_file.create_dataset('X_train_masks', data=X_train_masks)
    train_file.create_dataset('y_train', data=y_train)

with h5py.File('roberta_test_data.h5', 'w') as test_file:
    test_file.create_dataset('X_test_ids', data=X_test_ids)
    test_file.create_dataset('X_test_masks', data=X_test_masks)
    test_file.create_dataset('y_test', data=y_test)

# Save individually
with h5py.File('roberta_X_train_ids.h5', 'w') as f:
    f.create_dataset('X_train_ids', data=X_train_ids)

with h5py.File('roberta_X_test_ids.h5', 'w') as f:
    f.create_dataset('X_test_ids', data=X_test_ids)

with h5py.File('roberta_X_train_masks.h5', 'w') as f:
    f.create_dataset('X_train_masks', data=X_train_masks)

with h5py.File('roberta_X_test_masks.h5', 'w') as f:
    f.create_dataset('X_test_masks', data=X_test_masks)

with h5py.File('roberta_y_train.h5', 'w') as f:
    f.create_dataset('y_train', data=y_train)

with h5py.File('roberta_y_test.h5', 'w') as f:
    f.create_dataset('y_test', data=y_test)

In [6]:
from transformers import TFAutoModel, RobertaTokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
transformer_model = TFAutoModel.from_pretrained(model_name)
transformer_model.trainable = False

class EmbeddingExtractor(tf.keras.layers.Layer):
    def __init__(self, transformer_model, **kwargs):
        super(EmbeddingExtractor, self).__init__(**kwargs)
        self.transformer = transformer_model

    def call(self, inputs):
        input_ids, attention_mask = inputs
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]  # 提取 [CLS] token 的嵌入

input_ids_in = tf.keras.Input(shape=(256,), dtype=tf.int32, name='input_ids')
input_masks_in = tf.keras.Input(shape=(256,), dtype=tf.int32, name='attention_mask')

# 使用自定义的嵌入提取器
embedding_extractor = EmbeddingExtractor(transformer_model)
embeddings = embedding_extractor([input_ids_in, input_masks_in])
embedding_model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=embeddings)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [8]:
print("Extracting embeddings for training and test data...")
X_train_embeddings = embedding_model.predict([X_train_ids, X_train_masks], batch_size=16)
X_test_embeddings = embedding_model.predict([X_test_ids, X_test_masks], batch_size=16)

Extracting embeddings for training and test data...
    93/574082 [..............................] - ETA: 8:37:26

KeyboardInterrupt: 

In [None]:
!cp -r "/content/drive/MyDrive/CS5344 Project Data/max_length(token)512/Test Train Split/Each/"* /content/

In [None]:
# Set up mixed precision training
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

max_length = 256
num_labels = 5

# Open H5 files (after reprocessing data with max_length=256)
# train_ids_file = h5py.File('roberta_X_train_ids.h5', 'r')
# X_train_ids = train_ids_file['X_train_ids']

# train_masks_file = h5py.File('roberta_X_train_masks.h5', 'r')
# X_train_masks = train_masks_file['X_train_masks']

# y_train_file = h5py.File('roberta_y_train.h5', 'r')
# y_train = y_train_file['y_train']

# test_ids_file = h5py.File('roberta_X_test_ids.h5', 'r')
# X_test_ids = test_ids_file['X_test_ids']

# test_masks_file = h5py.File('roberta_X_test_masks.h5', 'r')
# X_test_masks = test_masks_file['X_test_masks']

# y_test_file = h5py.File('roberta_y_test.h5', 'r')
# y_test = y_test_file['y_test']

with h5py.File('./256/roberta_X_train_ids.h5', 'r') as f:
    X_train_ids = f['X_train_ids'][:]

with h5py.File('./256/roberta_X_test_ids.h5', 'r') as f:
    X_test_ids = f['X_test_ids'][:]

with h5py.File('./256/roberta_X_train_masks.h5', 'r') as f:
    X_train_masks = f['X_train_masks'][:]

with h5py.File('./256/roberta_X_test_masks.h5', 'r') as f:
    X_test_masks = f['X_test_masks'][:]

with h5py.File('./256/roberta_y_train.h5', 'r') as f:
    y_train = f['y_train'][:]

with h5py.File('./256/roberta_y_test.h5', 'r') as f:
    y_test = f['y_test'][:]

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4080 SUPER, compute capability 8.9


In [None]:
# Step 5: Compute Class Weights
print("Computing class weights...")
labels = y_train[:]
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weight_dict = dict(enumerate(class_weights))

Computing class weights...


In [None]:
# # Step 6: Define Function to Build the Model
# from transformers import TFAutoModel, RobertaTokenizer

# # Define the model
# class TransformerLayer(tf.keras.layers.Layer):
#     def __init__(self, transformer_model, **kwargs):
#         super(TransformerLayer, self).__init__(**kwargs)
#         self.transformer = transformer_model

#     def call(self, inputs):
#         input_ids, attention_mask = inputs
#         outputs = self.transformer(input_ids, attention_mask=attention_mask)
#         return outputs.last_hidden_state

# def build_model(transformer_model, learning_rate, dropout_rate, dense_units):
#     input_ids_in = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
#     input_masks_in = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

#     # Use the custom Transformer layer
#     transformer_layer = TransformerLayer(transformer_model)
#     transformer_outputs = transformer_layer([input_ids_in, input_masks_in])

#     cls_token = transformer_outputs[:, 0, :]  # [CLS] token

#     x = tf.keras.layers.Dropout(dropout_rate)(cls_token)
#     x = tf.keras.layers.Dense(dense_units, activation='relu')(x)
#     x = tf.keras.layers.Dropout(dropout_rate)(x)
#     output = tf.keras.layers.Dense(num_labels, activation='softmax', dtype='float32')(x)

#     model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=output)
#     optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
#     optimizer = mixed_precision.LossScaleOptimizer(optimizer, dynamic=True)
#     model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
#     return model

# # Define the data generator without shuffling indices
# def data_generator(ids_dataset, masks_dataset, labels_dataset, batch_size):
#     dataset_size = labels_dataset.shape[0]

#     for start_idx in range(0, dataset_size, batch_size):
#         end_idx = min(start_idx + batch_size, dataset_size)
#         batch_input_ids = ids_dataset[start_idx:end_idx]
#         batch_attention_masks = masks_dataset[start_idx:end_idx]
#         batch_labels = labels_dataset[start_idx:end_idx]

#         yield ({'input_ids': batch_input_ids, 'attention_mask': batch_attention_masks}, batch_labels)

# # Define the objective function for Optuna
# def objective(trial, X_train_ids):
#     # Clear session and collect garbage
#     tf.keras.backend.clear_session()
#     gc.collect()

#     learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
#     dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.3)
#     dense_units = trial.suggest_int('dense_units', 64, 128, step=32)
#     batch_size = trial.suggest_categorical('batch_size', [1, 2, 4])

#     model = build_model(transformer_model, learning_rate, dropout_rate, dense_units)

#     # Recreate datasets with the new batch_size
#     train_dataset = tf.data.Dataset.from_generator(
#         lambda: data_generator(X_train_ids, X_train_masks, y_train, batch_size),
#         output_types=({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.int32),
#         output_shapes=(
#             {'input_ids': (None, max_length), 'attention_mask': (None, max_length)},
#             (None,)
#         )
#     )
#     # Shuffle the dataset
#     train_dataset = train_dataset.shuffle(buffer_size=10000).prefetch(tf.data.AUTOTUNE)

#     val_dataset = tf.data.Dataset.from_generator(
#         lambda: data_generator(X_test_ids, X_test_masks, y_test, batch_size),
#         output_types=({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.int32),
#         output_shapes=(
#             {'input_ids': (None, max_length), 'attention_mask': (None, max_length)},
#             (None,)
#         )
#     )
#     val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

#     epochs = 3
#     history = model.fit(
#         train_dataset,
#         validation_data=val_dataset,
#         epochs=epochs,
#         steps_per_epoch=len(X_train_ids) // batch_size // 20,
#         validation_steps=len(X_test_ids) // batch_size // 20,
#         class_weight=class_weight_dict,
#         callbacks=[TFKerasPruningCallback(trial, 'val_accuracy')],
#         verbose=1
#     )

#     val_accuracy = max(history.history['val_accuracy'])

#     # Clean up
#     del model
#     tf.keras.backend.clear_session()
#     gc.collect()

#     return val_accuracy

# # Initialize tokenizer and model
# model_name = 'roberta-base'
# tokenizer = RobertaTokenizer.from_pretrained(model_name)
# transformer_model = TFAutoModel.from_pretrained(model_name)
# transformer_model.trainable = False  # Freeze Transformer layers

# print("Starting hyperparameter optimization with Optuna...")
# n_trials = 10
# study = optuna.create_study(direction='maximize')

# # Initialize tqdm progress bar
# with tqdm(total=n_trials, desc="Optuna Trials") as progress_bar:
#     def objective_with_progress(trial):
#         result = objective(trial)
#         progress_bar.update(1)
#         return result

#     study.optimize(objective_with_progress, n_trials=n_trials)

# print("Best hyperparameters:")
# print(study.best_params)

# # Close H5 files after training
# train_ids_file.close()
# train_masks_file.close()
# y_train_file.close()
# test_ids_file.close()
# test_masks_file.close()
# y_test_file.close()

In [None]:
# Step 8: Build and Train the Final Model with Best Hyperparameters
from transformers import TFAutoModel, RobertaTokenizer
# best_params = study.best_params
learning_rate = 4e-5 #best_params['learning_rate']
dropout_rate = 0.2 #best_params['dropout_rate']
dense_units = 128 #best_params['dense_units']
batch_size = 8 #best_params['batch_size']
epochs = 5

model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
transformer_model = TFAutoModel.from_pretrained(model_name)

from transformers import TFAutoModel, RobertaTokenizer

# Define the model
class TransformerLayer(tf.keras.layers.Layer):
    def __init__(self, transformer_model, **kwargs):
        super(TransformerLayer, self).__init__(**kwargs)
        self.transformer = transformer_model

    def call(self, inputs):
        input_ids, attention_mask = inputs
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

def build_model(transformer_model, learning_rate, dropout_rate, dense_units):
    input_ids_in = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    input_masks_in = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    # Use the custom Transformer layer
    transformer_layer = TransformerLayer(transformer_model)
    transformer_outputs = transformer_layer([input_ids_in, input_masks_in])

    cls_token = transformer_outputs[:, 0, :]  # [CLS] token

    x = tf.keras.layers.Dropout(dropout_rate)(cls_token)
    x = tf.keras.layers.Dense(dense_units, activation='relu')(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    output = tf.keras.layers.Dense(num_labels, activation='softmax', dtype='float32')(x)

    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    optimizer = mixed_precision.LossScaleOptimizer(optimizer, dynamic=True)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

model = build_model(transformer_model, learning_rate, dropout_rate, dense_units)

# Set up callbacks
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1)

print("Training the final model...")

def get_data_subset(ids_dataset, masks_dataset, labels_dataset, epoch, total_epochs):
    dataset_size = labels_dataset.shape[0]
    start_idx = epoch * dataset_size // total_epochs
    end_idx = (epoch + 1) * dataset_size // total_epochs
    return (
        ids_dataset[start_idx:end_idx],
        masks_dataset[start_idx:end_idx],
        labels_dataset[start_idx:end_idx]
    )

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Get subset of training data for current epoch
    X_train_ids_epoch, X_train_masks_epoch, y_train_epoch = get_data_subset(
        X_train_ids, X_train_masks, y_train, epoch, epochs
    )

    # Get subset of validation data for current epoch
    X_test_ids_epoch, X_test_masks_epoch, y_test_epoch = get_data_subset(
        X_test_ids, X_test_masks, y_test, epoch, epochs
    )

    # Create training and validation dataset
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': X_train_ids_epoch, 'attention_mask': X_train_masks_epoch},
        y_train_epoch
    )).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    val_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': X_test_ids_epoch, 'attention_mask': X_test_masks_epoch},
        y_test_epoch
    )).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Train model
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=1,
        class_weight=class_weight_dict,
        callbacks=[early_stopping, reduce_lr]
    )

    # Early Stopping
    if early_stopping.stopped_epoch > 0:
        print("Stop Training Early")
        break
model.save("saved_model")
model.save("final_model.h5")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'roberta.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training the final model...
Epoch 1/5
  7892/183706 [>.............................] - ETA: 3:45:15 - loss: 1.2293 - accuracy: 0.4553

In [None]:
# Step 9: Evaluate the Transformer Model
print("Evaluating the transformer model...")
y_pred_probs = model.predict([X_test_ids, X_test_masks], batch_size=batch_size)
y_pred = np.argmax(y_pred_probs, axis=1)

# Output performance metrics
print("Transformer Model Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("Transformer Model Accuracy:", accuracy_score(y_test, y_pred))
print("Transformer Model Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Transformer Model Recall:", recall_score(y_test, y_pred, average='weighted'))
print("Transformer Model F1 Score:", f1_score(y_test, y_pred, average='weighted'))

In [None]:
# # Step 10: Extract Embeddings and Train Random Forest Classifier
# print("Extracting embeddings from the fine-tuned model...")
# # Define a new model to output embeddings
# embedding_model = tf.keras.Model(inputs=model.inputs, outputs=model.layers[-3].output)  # Output before the last dense layer

# print("Getting embeddings for training and test data...")
# X_train_embeddings = embedding_model.predict([X_train_ids, X_train_masks], batch_size=batch_size)
# X_test_embeddings = embedding_model.predict([X_test_ids, X_test_masks], batch_size=batch_size)

# # Save embeddings to h5py file
# print("Saving embeddings to h5py file...")
# embeddings_file = 'embeddings.h5'
# if os.path.exists(embeddings_file):
#     os.remove(embeddings_file)
# emb_h5f = h5py.File(embeddings_file, 'w')
# emb_h5f.create_dataset('X_train_embeddings', data=X_train_embeddings)
# emb_h5f.create_dataset('X_test_embeddings', data=X_test_embeddings)
# emb_h5f.close()


In [None]:
# # Step 11: Hyperparameter Tuning and Training Random Forest Classifier
# print("Starting hyperparameter tuning for Random Forest with Optuna...")
# def rf_objective(trial):
#     n_estimators = trial.suggest_int('n_estimators', 100, 500, step=100)
#     max_depth = trial.suggest_int('max_depth', 5, 30, step=5)
#     min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
#     min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
#     max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

#     rf = RandomForestClassifier(
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_samples_split=min_samples_split,
#         min_samples_leaf=min_samples_leaf,
#         max_features=max_features,
#         random_state=42,
#         n_jobs=-1
#     )
#     rf.fit(X_train_embeddings, y_train)
#     y_pred_rf = rf.predict(X_test_embeddings)
#     return accuracy_score(y_test, y_pred_rf)

# rf_study = optuna.create_study(direction='maximize')
# rf_study.optimize(rf_objective, n_trials=10)

# print("Random Forest Best Hyperparameters:")
# print(rf_study.best_params)

# # Train Random Forest with best parameters
# best_rf_params = rf_study.best_params
# rf = RandomForestClassifier(
#     n_estimators=best_rf_params['n_estimators'],
#     max_depth=best_rf_params['max_depth'],
#     min_samples_split=best_rf_params['min_samples_split'],
#     min_samples_leaf=best_rf_params['min_samples_leaf'],
#     max_features=best_rf_params['max_features'],
#     random_state=42,
#     n_jobs=-1
# )
# print("Training Random Forest Classifier...")
# rf.fit(X_train_embeddings, y_train)

In [None]:
# # Step 12: Evaluate the Random Forest Model
# print("Evaluating the Random Forest model...")
# y_pred_rf = rf.predict(X_test_embeddings)

# print("Random Forest Classification Report:")
# print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))

# print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("Random Forest Precision:", precision_score(y_test, y_pred_rf, average='weighted'))
# print("Random Forest Recall:", recall_score(y_test, y_pred_rf, average='weighted'))
# print("Random Forest F1 Score:", f1_score(y_test, y_pred_rf, average='weighted'))

# # Close the h5py files
# h5f.close()