In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy everything under the Drive project folder into /content so later cells can use relative paths.
!cp -r "/content/drive/MyDrive/CS5344 Project Data/trick/new/"* /content

In [3]:
# Pin the versions so a fresh runtime resolves the same packages, and use %pip so the
# install targets the running kernel's environment. The requirement with an extra is
# quoted so the shell never treats the square brackets as a glob pattern.
%pip install -q optuna==4.0.0 "optuna-integration[tfkeras]==4.0.0"

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting optuna-integration[tfkeras]
  Downloading optuna_integration-4.0.0-py3-none-any.whl.metadata (11 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading optuna_integration-4.0.0-py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import ParameterGrid
import optuna
import joblib
import pickle

# Embedding matrices stored as NumPy .npy files, addressed relative to the working directory.
X_train_embeddings_path = 'X_train_embeddings.npy'
X_test_embeddings_path = 'X_test_embeddings.npy'

In [5]:
import h5py
# Label files in HDF5 format, one per split.
y_train_path = 'roberta_y_train.h5'
y_test_path = 'roberta_y_test.h5'

# Read both embedding matrices from disk into NumPy arrays.
X_train = np.load(X_train_embeddings_path)
X_test = np.load(X_test_embeddings_path)
def load_h5_dataset(file_path, dataset_name):
    """Read one named dataset out of an HDF5 file and return its full contents.

    Args:
        file_path: Path of the HDF5 file to open read-only.
        dataset_name: Key of the dataset inside the file.

    Returns:
        The dataset's contents, materialized with a full slice before the
        file is closed.
    """
    with h5py.File(file_path, mode='r') as h5_file:
        # The slice copies the data out, so the result stays valid after close.
        return h5_file[dataset_name][:]

# Pull the label vectors for both splits out of their HDF5 files.
y_train = load_h5_dataset(file_path=y_train_path, dataset_name='y_train')
y_test = load_h5_dataset(file_path=y_test_path, dataset_name='y_test')

# Show the array shapes so mismatched feature/label lengths are easy to spot.
print("Data shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

Data shapes:
X_train: (6429714, 768)
X_test: (2755592, 768)
y_train: (6429714,)
y_test: (2755592,)


In [None]:
# Hyperparameter search with Optuna (sampler-driven trials, not an exhaustive grid)
def objective(trial):
    """Train one random forest with trial-suggested hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted forest on (X_test, y_test).

    NOTE(review): trials are scored on the same X_test/y_test that the final
    report is computed on, so that reported accuracy is optimistically biased
    by the selection. Consider scoring trials on a validation split taken
    from the training data instead.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        # Sample the leaf size from [5, 10] directly instead of sampling from
        # [1, 10] and clamping afterwards: the trial then records the value the
        # forest was really trained with, so a model rebuilt from
        # study.best_params matches the best trial.
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }
    rf = RandomForestClassifier(**params, n_jobs=-1, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Seed the sampler so repeated runs propose the same sequence of trials; TPE is
# Optuna's default sampler, so only the seeding differs from the default study.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

n_trials = 20
with tqdm(total=n_trials, desc="Optuna Tuning") as pbar:
    def callback(study, trial):
        """Advance the progress bar by one step each time Optuna invokes the callback."""
        pbar.update(1)
    study.optimize(objective, n_trials=n_trials, callbacks=[callback])

# Summarize the outcome of the search.
print("Best hyperparameters found by Optuna:")
print(study.best_params)
print("Best accuracy:", study.best_value)

# Refit a forest with the winning hyperparameters and report on the test split.
best_params = study.best_params
best_rf = RandomForestClassifier(n_jobs=-1, random_state=42, **best_params)
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, y_pred))

[I 2024-11-06 15:08:36,995] A new study created in memory with name: no-name-fbc8d69a-766c-48e2-8e15-54d621cca7ee
Optuna Tuning:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Persist the tuned model exactly once. Dumping it with joblib and then again with
# pickle to this same path only makes the second write replace the first, so the
# single pickle write below leaves the same file behind with half the I/O.
with open('/content/drive/MyDrive/CS5344 Project Data/trick/new/best_random_forest_model_copy.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)
