In [1]:
import pandas as pd

In [2]:
# Read the three PURE splits in a fixed order (train, test, validation),
# keeping only the requirement text and its Req / Not Req label column.
train_df, test_df, valid_df = (
    pd.read_csv(csv_path, usecols=["Requirement", "Req/Not Req"])
    for csv_path in (
        "../../../Datasets/RegExpPURE/PURE_train.csv",
        "../../../Datasets/RegExpPURE/PURE_test.csv",
        "../../../Datasets/RegExpPURE/PURE_valid.csv",
    )
)

In [3]:
def _texts_and_labels(frame):
    """Split one dataset frame into parallel (texts, labels) arrays.

    Texts are the raw values of the "Requirement" column. Labels come from
    the "Req/Not Req" column, mapped as "Req" -> 1 and "Not_Req" -> 0.
    """
    texts = frame["Requirement"].values
    labels = frame["Req/Not Req"].map({"Req": 1, "Not_Req": 0}).values
    return texts, labels


train_X, train_y = _texts_and_labels(train_df)
test_X, test_y = _texts_and_labels(test_df)
valid_X, valid_y = _texts_and_labels(valid_df)

## Running DistilBERT with Hugging Face Transformers

In [4]:
from transformers import DistilBertTokenizer, DistilBertModel

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the DistilBERT weights from the local checkpoint directory.
distilbert_model = DistilBertModel.from_pretrained("../../../Models/DistilBERT_Pretrained")


# Create the tokenizer from the "distilbert-base-uncased" identifier.
# NOTE(review): the model is read from a local directory while the tokenizer is
# resolved by name — presumably the local weights are distilbert-base-uncased;
# confirm the vocabularies match.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
# Save the model in the Model folder
# distilbert_model.save_pretrained("../../../Models/DistilBERT_Pretrained")

In [7]:
# Display the training text at index 1 (the slice keeps it as a length-1 array).
train_X[1:2]

array(['The help should be accessible to the users both in the offline and online mode.'],
      dtype=object)

In [8]:
import torch
import numpy as np

In [9]:
# Number of texts sent through the model per forward pass; each batch is also
# written to (and later read back from) its own .npy file.
BATCH_SIZE = 128
# Token length that every text is padded or truncated to by the tokenizer calls.
MAX_LENGTH = 128

In [10]:
# Embed the training texts with DistilBERT one batch at a time and save each
# batch's last hidden states to its own .npy file. Files are named by the
# batch's starting offset, which the loading cell uses to read them back.
for i in range(0, len(train_X), BATCH_SIZE):
    batch = train_X[i : i + BATCH_SIZE]
    # Pad/truncate every text to exactly MAX_LENGTH tokens so all saved arrays
    # share the same sequence length and can be concatenated later.
    inputs = tokenizer(
        batch.tolist(), return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH
    )
    # NOTE(review): neither the model nor the inputs are moved to a GPU in the
    # visible cells, so the CUDA cache/autocast calls look like no-ops here —
    # confirm before removing them.
    torch.cuda.empty_cache()
    # Pure feature extraction: no_grad() stops autograd from recording a graph
    # for the forward pass, which would otherwise hold activations for the
    # whole batch in memory only to be thrown away.
    with torch.no_grad(), torch.cuda.amp.autocast():
        outputs = distilbert_model(**inputs)
    # Under no_grad the output carries no graph, so it converts to NumPy directly.
    last_hidden_states = outputs.last_hidden_state.numpy()
    np.save(f"../../../States/DistilBERT_Pretrained/Train/train_{i}.npy", last_hidden_states)

In [26]:
# Embed the test texts with DistilBERT one batch at a time and save each
# batch's last hidden states to its own .npy file. Files are named by the
# batch's starting offset, which the loading cell uses to read them back.
for i in range(0, len(test_X), BATCH_SIZE):
    batch = test_X[i : i + BATCH_SIZE]
    # Pad/truncate every text to exactly MAX_LENGTH tokens so all saved arrays
    # share the same sequence length and can be concatenated later.
    inputs = tokenizer(
        batch.tolist(),
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # NOTE(review): neither the model nor the inputs are moved to a GPU in the
    # visible cells, so the CUDA cache/autocast calls look like no-ops here —
    # confirm before removing them.
    torch.cuda.empty_cache()
    # Pure feature extraction: no_grad() stops autograd from recording a graph
    # for the forward pass, which would otherwise hold activations for the
    # whole batch in memory only to be thrown away.
    with torch.no_grad(), torch.cuda.amp.autocast():
        outputs = distilbert_model(**inputs)
    # Under no_grad the output carries no graph, so it converts to NumPy directly.
    last_hidden_states = outputs.last_hidden_state.numpy()
    np.save(
        f"../../../States/DistilBERT_Pretrained/Test/Test_{i}.npy", last_hidden_states
    )

In [27]:
# Embed the validation texts with DistilBERT one batch at a time and save each
# batch's last hidden states to its own .npy file. Files are named by the
# batch's starting offset, which the loading cell uses to read them back.
for i in range(0, len(valid_X), BATCH_SIZE):
    batch = valid_X[i : i + BATCH_SIZE]
    # Pad/truncate every text to exactly MAX_LENGTH tokens so all saved arrays
    # share the same sequence length and can be concatenated later.
    inputs = tokenizer(
        batch.tolist(),
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # NOTE(review): neither the model nor the inputs are moved to a GPU in the
    # visible cells, so the CUDA cache/autocast calls look like no-ops here —
    # confirm before removing them.
    torch.cuda.empty_cache()
    # Pure feature extraction: no_grad() stops autograd from recording a graph
    # for the forward pass, which would otherwise hold activations for the
    # whole batch in memory only to be thrown away.
    with torch.no_grad(), torch.cuda.amp.autocast():
        outputs = distilbert_model(**inputs)
    # Under no_grad the output carries no graph, so it converts to NumPy directly.
    last_hidden_states = outputs.last_hidden_state.numpy()
    np.save(
        f"../../../States/DistilBERT_Pretrained/Validation/Validation_{i}.npy",
        last_hidden_states,
    )

In [28]:
# Reassemble the per-batch test embeddings into a single array. The offsets
# mirror the ones used when the batches were written, so the row order of the
# result follows test_X.
test_batch_offsets = range(0, len(test_X), BATCH_SIZE)
test_states = np.concatenate(
    [
        np.load(f"../../../States/DistilBERT_Pretrained/Test/Test_{offset}.npy")
        for offset in test_batch_offsets
    ],
    axis=0,
)

# The first dimension of the stacked array should equal the number of texts.
print(len(test_X), test_states.shape)

1534 (1534, 128, 768)


In [29]:
# Reassemble the per-batch training embeddings into a single array. The offsets
# mirror the ones used when the batches were written, so the row order of the
# result follows train_X.
train_batch_offsets = range(0, len(train_X), BATCH_SIZE)
train_states = np.concatenate(
    [
        np.load(f"../../../States/DistilBERT_Pretrained/Train/train_{offset}.npy")
        for offset in train_batch_offsets
    ],
    axis=0,
)

# The first dimension of the stacked array should equal the number of texts.
print(len(train_X), train_states.shape)

5306 (5306, 128, 768)


In [30]:
# Reassemble the per-batch validation embeddings into a single array. The
# offsets mirror the ones used when the batches were written, so the row order
# of the result follows valid_X.
valid_batch_offsets = range(0, len(valid_X), BATCH_SIZE)
validat_states = np.concatenate(
    [
        np.load(f"../../../States/DistilBERT_Pretrained/Validation/Validation_{offset}.npy")
        for offset in valid_batch_offsets
    ],
    axis=0,
)

# The first dimension of the stacked array should equal the number of texts.
print(len(valid_X), validat_states.shape)

905 (905, 128, 768)


In [31]:
# # Tokenize the train_X data
# train_encoded = tokenizer(
#     train_X.tolist(),
#     truncation=True,
#     padding="max_length",
#     max_length=64,
#     return_tensors="pt",
# )

In [32]:
# torch.cuda.empty_cache()

In [33]:
# with torch.cuda.amp.autocast():
#     # Pass the encoded data to the model
#     outputs = distilbert_model(**train_encoded)
#     last_hidden_state = outputs.last_hidden_state

# outputs.last_hidden_state.shape