In [1]:
from google.colab import drive

# Attach Google Drive at /content/drive so the project data folders used by
# the following cells are reachable; force_remount=True is passed explicitly.
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Recursively copy everything inside the Drive folder below into /content
# (presumably the working directory, since later cells open files by bare name — confirm).
!cp -r "/content/drive/MyDrive/CS5344 Project Data/max_length(token)256/W o Train RF Prepare File/"* /content

In [3]:
# Recursively copy everything inside the "Test Train Split/Each" Drive folder into /content.
# The directory part is quoted because it contains spaces and parentheses; the trailing * is
# left outside the quotes so the shell expands it.
!cp -r "/content/drive/MyDrive/CS5344 Project Data/max_length(token)256/Test Train Split/Each/"* /content

In [4]:
!pip install optuna optuna-integration[tfkeras]

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting optuna-integration[tfkeras]
  Downloading optuna_integration-4.0.0-py3-none-any.whl.metadata (11 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading optuna_integration-4.0.0-py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [5]:
# Step 1: Import Libraries and Set Up Environment
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel

from sklearn.utils.class_weight import compute_class_weight

import optuna
from optuna.integration import TFKerasPruningCallback

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier

import os
import random
from tqdm import tqdm  # Added tqdm for progress visualization
import h5py  # Added h5py for saving data
import gc

from concurrent.futures import ThreadPoolExecutor

# GPU configuration: when TensorFlow reports at least one physical GPU, turn on
# memory growth for each of them and print how many physical and logical
# devices are visible.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "physical GPUs,", len(logical_gpus), "logical GPUs.")
    except RuntimeError as err:
        # A RuntimeError from the calls above is reported, not propagated.
        print(err)

1 physical GPUs, 1 logical GPUs.


In [6]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import ParameterGrid
import optuna

# Relative paths of the .npy feature arrays and the .h5 label files used below.
X_train_embeddings_path = 'X_train_embeddings.npy'
X_test_embeddings_path = 'X_test_embeddings.npy'
y_train_path = 'roberta_y_train.h5'
y_test_path = 'roberta_y_test.h5'

# Read both feature arrays with np.load; the shapes are printed further down.
X_train = np.load(X_train_embeddings_path)
X_test = np.load(X_test_embeddings_path)
def load_h5_dataset(file_path, dataset_name):
    """Read one named dataset out of an HDF5 file.

    Args:
        file_path: Path of the HDF5 file; it is opened read-only.
        dataset_name: Key of the dataset inside the file.

    Returns:
        The result of slicing the dataset with ``[:]``, taken while the file
        is still open.
    """
    with h5py.File(file_path, 'r') as h5_file:
        return h5_file[dataset_name][:]

# Labels are stored under the 'y_train' / 'y_test' keys of their HDF5 files.
y_train = load_h5_dataset(y_train_path, 'y_train')
y_test = load_h5_dataset(y_test_path, 'y_test')

# Print the shape of every array so features and labels can be compared by eye.
print("Data shapes:")
for label, array in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(label, array.shape)

Data shapes:
X_train: (7348244, 768)
X_test: (1837061, 768)
y_train: (7348244,)
y_test: (1837061,)


In [None]:
# Hyperparameter search with Optuna (sampled trials, not an exhaustive grid)
def objective(trial):
    """Fit one random forest with trial-suggested hyperparameters and score it.

    Args:
        trial: Optuna trial object used to sample each hyperparameter.

    Returns:
        Accuracy of the fitted forest on (X_test, y_test).
    """
    # NOTE(review): every trial is scored on X_test/y_test, the same data the
    # final classification report is computed on, so that report is
    # optimistically biased. Consider scoring trials on a held-out validation
    # split of the training data instead.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        # The lower bound is 5 so that the value Optuna records is the value
        # the model is trained with. Sampling from 1 and clamping afterwards
        # would make study.best_params disagree with the forest that produced
        # the best score, and would spend trials on identical configurations.
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }
    # Fixed random_state so differences between trials come from the parameters.
    rf = RandomForestClassifier(**params, n_jobs=-1, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Run the search: the study maximises the value returned by `objective`.
n_trials = 20
study = optuna.create_study(direction='maximize')

# The callback is passed to study.optimize and advances the progress bar by one
# step each time it is called.
with tqdm(desc="Optuna Tuning", total=n_trials) as pbar:
    def callback(_study, _trial):
        pbar.update(1)
    study.optimize(objective, callbacks=[callback], n_trials=n_trials)

best_params = study.best_params
print("Best hyperparameters found by Optuna:")
print(best_params)
print(f"Best accuracy: {study.best_value}")

# Refit a forest with the best parameters and report its performance on the
# test arrays.
best_rf = RandomForestClassifier(n_jobs=-1, random_state=42, **best_params)
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[I 2024-11-06 16:27:42,514] A new study created in memory with name: no-name-1a576395-caa3-4cf9-a963-d2a7e940e836
Optuna Tuning:   0%|          | 0/20 [00:00<?, ?it/s][I 2024-11-06 18:33:32,865] Trial 0 finished with value: 0.5248187185945377 and parameters: {'n_estimators': 159, 'max_depth': 11, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.5248187185945377.
Optuna Tuning:   5%|▌         | 1/20 [2:05:50<39:50:56, 7550.35s/it][I 2024-11-06 19:55:48,640] Trial 1 finished with value: 0.49062061629962206 and parameters: {'n_estimators': 124, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.5248187185945377.
Optuna Tuning:  10%|█         | 2/20 [3:28:06<30:03:42, 6012.36s/it]

In [None]:
import joblib

# Persist the refitted forest to Drive with joblib.
# NOTE(review): the pickle cell in this notebook writes to this exact path, so
# whichever of the two cells runs last decides the format of the file on disk.
joblib.dump(
    best_rf,
    '/content/drive/MyDrive/CS5344 Project Data/max_length(token)256/best_random_forest_model.pkl',
)

In [None]:
import pickle

# Persist the refitted forest to Drive with the standard pickle module.
# NOTE(review): the joblib cell in this notebook writes to this exact path, so
# running both cells leaves only the output of whichever ran last — confirm
# which format downstream loaders expect.
with open(
    '/content/drive/MyDrive/CS5344 Project Data/max_length(token)256/best_random_forest_model.pkl',
    'wb',
) as out_file:
    pickle.dump(best_rf, out_file)