In [2]:
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# Load the pre-trained 'bert-base-uncased' tokenizer and encoder once at module
# level; extract_features() below reuses these two objects on every call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Put the model in evaluation mode (disables dropout): it is only used for
# inference in this notebook, never trained.
model.eval()

# Token-count cap passed as `max_length` to the tokenizer in extract_features();
# longer inputs are truncated there.
MAX_LEN = 512
def extract_features(text):
    """Return the final-layer [CLS] embedding of ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (special tokens
    added, truncated to ``MAX_LEN`` tokens) and run through the module-level
    ``model`` as a batch of one, with gradient tracking disabled.

    Args:
        text: The raw text to embed.

    Returns:
        The last hidden state at sequence position 0 for the single-item
        batch, converted with ``.numpy()``.
    """
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
    )
    # Wrap the id list in another list so the tensor carries a batch axis of size 1.
    batch = torch.tensor([token_ids])

    with torch.no_grad():
        last_hidden = model(batch).last_hidden_state

    # Sequence position 0 is the [CLS] token.
    cls_vector = last_hidden[:, 0, :]
    return cls_vector.numpy()

In [7]:
# Load the essays dataset 
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "essays"
INPUT_COL = "essay"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Directory holding this dataset's train.csv / test.csv.
# Named `dataset_dir` (not `dir`) so the builtin dir() is not shadowed in the kernel.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory up front so a missing folder cannot make np.save
# fail only after the slow extraction has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for each split of the essays dataset
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    features = [
        extract_features(sentence)
        for sentence in tqdm(data[INPUT_COL], desc=f"{NAME}/{split}")
    ]

    # Save every split under its own file name; with a single shared name the
    # test split would silently overwrite the train split.
    np.save(os.path.join(OUTPUT_DIR, f"{NAME}_{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

  0%|          | 10/11678 [00:02<39:55,  4.87it/s]


KeyboardInterrupt: 

In [6]:
# Load the tweets dataset 
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "tweets"
INPUT_COL = "text"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Directory holding this dataset's train.csv / test.csv.
# Named `dataset_dir` (not `dir`) so the builtin dir() is not shadowed in the kernel.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory up front so a missing folder cannot make np.save
# fail only after the slow extraction has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for each split of the tweets dataset
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    features = [
        extract_features(sentence)
        for sentence in tqdm(data[INPUT_COL], desc=f"{NAME}/{split}")
    ]

    # Save every split under its own file name; with a single shared name the
    # test split would silently overwrite the train split.
    np.save(os.path.join(OUTPUT_DIR, f"{NAME}_{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

  0%|          | 36/18000 [00:01<11:08, 26.87it/s]


KeyboardInterrupt: 

In [5]:
# Load the hatespeech dataset
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "hatespeech"
INPUT_COL = "text"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Directory holding this dataset's train.csv / test.csv.
# Named `dataset_dir` (not `dir`) so the builtin dir() is not shadowed in the kernel.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory up front so a missing folder cannot make np.save
# fail only after the slow extraction has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for each split of the hatespeech dataset
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    features = [
        extract_features(sentence)
        for sentence in tqdm(data[INPUT_COL], desc=f"{NAME}/{split}")
    ]

    # Save every split under its own file name; with a single shared name the
    # test split would silently overwrite the train split.
    np.save(os.path.join(OUTPUT_DIR, f"{NAME}_{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

  0%|          | 42/9632 [00:01<05:46, 27.66it/s]


KeyboardInterrupt: 

In [4]:
# Load the emotion dataset
from tqdm import tqdm
from utils.constants.directory import WORK_DIR, EMBEDDINGS_DIR
import os
import pandas as pd
import numpy as np

NAME = "emotion"
INPUT_COL = "text"
OUTPUT_DIR = os.path.join(EMBEDDINGS_DIR, NAME)

# Directory holding this dataset's train.csv / test.csv.
# Named `dataset_dir` (not `dir`) so the builtin dir() is not shadowed in the kernel.
dataset_dir = os.path.join(WORK_DIR, f"data/datasets/{NAME}")

train_data = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

# Create the output directory up front so a missing folder cannot make np.save
# fail only after the slow extraction has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extract features for each split of the emotion dataset
splits = [("train", train_data), ("test", test_data)]

for split, data in splits:
    features = [
        extract_features(sentence)
        for sentence in tqdm(data[INPUT_COL], desc=f"{NAME}/{split}")
    ]

    # Save every split under its own file name; with a single shared name the
    # test split would silently overwrite the train split.
    np.save(os.path.join(OUTPUT_DIR, f"{NAME}_{split}.npy"), np.array(features))
    print(f"Saved {split} data to {OUTPUT_DIR}")

  0%|          | 41/9025 [00:01<05:23, 27.80it/s]


KeyboardInterrupt: 