In [2]:
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# Upper bound on the number of tokens per input; extract_features() passes it
# to the tokenizer as max_length with truncation enabled.
MAX_LEN = 512

# Pre-trained uncased BERT tokenizer and encoder. The encoder is switched to
# evaluation mode right after loading (eval() returns the module itself).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').eval()
def extract_features(text):
    """Embed one text as the final-layer hidden state of its first token.

    The text is tokenized with special tokens added and truncated to at most
    MAX_LEN tokens, wrapped into a batch of one, and run through the
    module-level ``model`` with gradient tracking disabled.

    Args:
        text: The input passed to ``tokenizer.encode``.

    Returns:
        A NumPy array holding the hidden state at sequence position 0 (the
        [CLS] token). The batch axis is kept, so the leading dimension is 1.
    """
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
    )
    # The model expects a batch dimension, hence the extra list nesting.
    batch = torch.tensor([token_ids])

    with torch.no_grad():
        encoded = model(batch)

    # Position 0 of every sequence in the batch is the [CLS] token.
    return encoded.last_hidden_state[:, 0, :].numpy()

In [12]:
# Load the essays dataset 
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "essays"
INPUT_COL = "essay"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Named `dataset_dir` rather than `dir` so the builtin dir() stays usable
# for the rest of the notebook session.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory before the slow extraction loop: np.save does
# not create missing parent directories, so without this a fresh checkout
# would only fail after all features for a split had been computed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for the essays dataset, one split at a time
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    # One extract_features() result per row of the input column.
    features = [extract_features(sentence) for sentence in tqdm(data[INPUT_COL])]

    # Save the extracted features; np.array stacks the per-row arrays along
    # a new leading axis.
    np.save(os.path.join(OUTPUT_DIR, f"{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

100%|██████████| 11678/11678 [27:22<00:00,  7.11it/s]


Saved train data to /home/samsoup/Work/WrapperBox/data/embeddings/essays


100%|██████████| 1298/1298 [03:32<00:00,  6.11it/s]

Saved test data to /home/samsoup/Work/WrapperBox/data/embeddings/essays





In [13]:
# Load the tweets dataset 
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "tweets"
INPUT_COL = "text"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Named `dataset_dir` rather than `dir` so the builtin dir() stays usable
# for the rest of the notebook session.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory before the slow extraction loop: np.save does
# not create missing parent directories, so without this a fresh checkout
# would only fail after all features for a split had been computed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for the tweets dataset, one split at a time
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    # One extract_features() result per row of the input column.
    features = [extract_features(sentence) for sentence in tqdm(data[INPUT_COL])]

    # Save the extracted features; np.array stacks the per-row arrays along
    # a new leading axis.
    np.save(os.path.join(OUTPUT_DIR, f"{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

100%|██████████| 18000/18000 [10:56<00:00, 27.40it/s]


Saved train data to /home/samsoup/Work/WrapperBox/data/embeddings/tweets


100%|██████████| 1000/1000 [00:37<00:00, 26.96it/s]

Saved test data to /home/samsoup/Work/WrapperBox/data/embeddings/tweets





In [4]:
# Load the hatespeech dataset
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "hatespeech"
INPUT_COL = "text"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Named `dataset_dir` rather than `dir` so the builtin dir() stays usable
# for the rest of the notebook session.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory before the slow extraction loop: np.save does
# not create missing parent directories, so without this a fresh checkout
# would only fail after all features for a split had been computed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for the hatespeech dataset, one split at a time
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    # One extract_features() result per row of the input column.
    features = [extract_features(sentence) for sentence in tqdm(data[INPUT_COL])]

    # Save the extracted features; np.array stacks the per-row arrays along
    # a new leading axis.
    np.save(os.path.join(OUTPUT_DIR, f"{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

  0%|          | 0/9632 [00:00<?, ?it/s]

100%|██████████| 9632/9632 [08:41<00:00, 18.48it/s]


Saved train data to /home/samsoup/Work/WrapperBox/data/embeddings/hatespeech


100%|██████████| 1071/1071 [00:58<00:00, 18.20it/s]

Saved test data to /home/samsoup/Work/WrapperBox/data/embeddings/hatespeech





In [15]:
# Load the emotion dataset 
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "emotion"
INPUT_COL = "text"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Named `dataset_dir` rather than `dir` so the builtin dir() stays usable
# for the rest of the notebook session.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory before the slow extraction loop: np.save does
# not create missing parent directories, so without this a fresh checkout
# would only fail after all features for a split had been computed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for the emotion dataset, one split at a time
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    # One extract_features() result per row of the input column.
    features = [extract_features(sentence) for sentence in tqdm(data[INPUT_COL])]

    # Save the extracted features; np.array stacks the per-row arrays along
    # a new leading axis.
    np.save(os.path.join(OUTPUT_DIR, f"{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

  0%|          | 0/9025 [00:00<?, ?it/s]

100%|██████████| 9025/9025 [05:39<00:00, 26.56it/s]


Saved train data to /home/samsoup/Work/WrapperBox/data/embeddings/emotion


100%|██████████| 1003/1003 [00:41<00:00, 24.38it/s]

Saved test data to /home/samsoup/Work/WrapperBox/data/embeddings/emotion





In [5]:
# Load the sst dataset
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "sst"
INPUT_COL = "document"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Named `dataset_dir` rather than `dir` so the builtin dir() stays usable
# for the rest of the notebook session.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory before the slow extraction loop: np.save does
# not create missing parent directories, so without this a fresh checkout
# would only fail after all features for a split had been computed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for the sst dataset, one split at a time
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    # One extract_features() result per row of the input column.
    features = [extract_features(sentence) for sentence in tqdm(data[INPUT_COL])]

    # Save the extracted features; np.array stacks the per-row arrays along
    # a new leading axis.
    np.save(os.path.join(OUTPUT_DIR, f"{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

100%|██████████| 6920/6920 [04:03<00:00, 28.40it/s]


Saved train data to /home/samsoup/Work/WrapperBox/data/embeddings/sst


100%|██████████| 872/872 [00:31<00:00, 27.44it/s]

Saved test data to /home/samsoup/Work/WrapperBox/data/embeddings/sst



