In [1]:
import pandas as pd

In [2]:
# Read the train / test / validation CSVs in that order, keeping only the
# requirement text column and its label column from each file.
train_df, test_df, valid_df = (
    pd.read_csv(csv_path, usecols=["Requirement", "Req/Not Req"])
    for csv_path in (
        "../../../Datasets/RegExpPURE/PURE_train.csv",
        "../../../Datasets/RegExpPURE/PURE_test.csv",
        "../../../Datasets/RegExpPURE/PURE_valid.csv",
    )
)

In [3]:
# Raw requirement sentences for each split, as NumPy object arrays.
train_X, test_X, valid_X = (
    frame["Requirement"].values for frame in (train_df, test_df, valid_df)
)

# Binary targets: "Req" -> 1, "Not_Req" -> 0.
# Series.map leaves NaN for any label that is not one of these two keys.
train_y, test_y, valid_y = (
    frame["Req/Not Req"].map({"Req": 1, "Not_Req": 0}).values
    for frame in (train_df, test_df, valid_df)
)

## Running RoBERTa with Hugging Face Transformers

In [4]:
from transformers import RobertaModel, RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Tokenizer is resolved from the "roberta-base" identifier.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Encoder weights are read from the copy stored under the local Models folder.
roberta_model = RobertaModel.from_pretrained("../../../Models/Roberta_Pretrained")

In [6]:
# Save the model in the Model folder
# roberta_model.save_pretrained("../../../Models/Roberta_Pretrained")

In [7]:
# Display the training example at index 1 (sliced, so it shows as a one-element array).
train_X[1:2]

array(['The help should be accessible to the users both in the offline and online mode.'],
      dtype=object)

In [8]:
import torch
import numpy as np

In [9]:
# Number of texts sliced off per iteration of the embedding loops (also the
# stride used to name and later reload the per-batch .npy files).
BATCH_SIZE = 128
# Token length passed to the tokenizer as max_length (used with
# padding="max_length" and truncation=True).
MAX_LENGTH = 128

In [10]:
# Encode the training texts batch by batch and write each batch's final-layer
# hidden states to its own .npy file, named after the batch's start index.

# from_pretrained already returns the model in eval mode; re-asserting it keeps
# dropout off even if this cell is re-run after the model's mode was changed.
roberta_model.eval()

for i in range(0, len(train_X), BATCH_SIZE):
    batch = train_X[i : i + BATCH_SIZE]
    inputs = tokenizer(
        batch.tolist(),
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Pure feature extraction: disable autograd so no computation graph is
    # built and held in memory for each batch. The model and inputs stay on
    # the CPU (the result is converted straight to NumPy), so CUDA-only
    # helpers such as autocast / empty_cache are not needed here.
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    last_hidden_states = outputs.last_hidden_state.numpy()
    np.save(f"../../../States/Roberta_Pretrained/Train/train_{i}.npy", last_hidden_states)

In [11]:
# Encode the test texts batch by batch and write each batch's final-layer
# hidden states to its own .npy file, named after the batch's start index.

# from_pretrained already returns the model in eval mode; re-asserting it keeps
# dropout off even if this cell is re-run after the model's mode was changed.
roberta_model.eval()

for i in range(0, len(test_X), BATCH_SIZE):
    batch = test_X[i : i + BATCH_SIZE]
    inputs = tokenizer(
        batch.tolist(),
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Pure feature extraction: disable autograd so no computation graph is
    # built and held in memory for each batch. The model and inputs stay on
    # the CPU (the result is converted straight to NumPy), so CUDA-only
    # helpers such as autocast / empty_cache are not needed here.
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    last_hidden_states = outputs.last_hidden_state.numpy()
    np.save(
        f"../../../States/Roberta_Pretrained/Test/test_{i}.npy", last_hidden_states
    )

In [12]:
# Encode the validation texts batch by batch and write each batch's final-layer
# hidden states to its own .npy file, named after the batch's start index.

# from_pretrained already returns the model in eval mode; re-asserting it keeps
# dropout off even if this cell is re-run after the model's mode was changed.
roberta_model.eval()

for i in range(0, len(valid_X), BATCH_SIZE):
    batch = valid_X[i : i + BATCH_SIZE]
    inputs = tokenizer(
        batch.tolist(),
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Pure feature extraction: disable autograd so no computation graph is
    # built and held in memory for each batch. The model and inputs stay on
    # the CPU (the result is converted straight to NumPy), so CUDA-only
    # helpers such as autocast / empty_cache are not needed here.
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    last_hidden_states = outputs.last_hidden_state.numpy()
    np.save(
        f"../../../States/Roberta_Pretrained/Validation/validation_{i}.npy",
        last_hidden_states,
    )

In [13]:
# Paths of the per-batch test embedding files, in the order they were written.
test_batch_files = [
    f"../../../States/Roberta_Pretrained/Test/test_{offset}.npy"
    for offset in range(0, len(test_X), BATCH_SIZE)
]

# Stitch the batches back together along the first (example) axis.
test_states = np.concatenate([np.load(npy_path) for npy_path in test_batch_files])

# The example count and the leading dimension of the array should agree.
print(len(test_X),test_states.shape)

1534 (1534, 128, 768)


In [14]:
# Paths of the per-batch training embedding files, in the order they were written.
train_batch_files = [
    f"../../../States/Roberta_Pretrained/Train/train_{offset}.npy"
    for offset in range(0, len(train_X), BATCH_SIZE)
]

# Stitch the batches back together along the first (example) axis.
train_states = np.concatenate([np.load(npy_path) for npy_path in train_batch_files])

# The example count and the leading dimension of the array should agree.
print(len(train_X),train_states.shape)

5306 (5306, 128, 768)


In [15]:
# Paths of the per-batch validation embedding files, in the order they were written.
valid_batch_files = [
    f"../../../States/Roberta_Pretrained/Validation/validation_{offset}.npy"
    for offset in range(0, len(valid_X), BATCH_SIZE)
]

# Stitch the batches back together along the first (example) axis.
validat_states = np.concatenate([np.load(npy_path) for npy_path in valid_batch_files])

# The example count and the leading dimension of the array should agree.
print(len(valid_X),validat_states.shape)

905 (905, 128, 768)
