In [1]:
!pip install tqdm
!pip install transformers sentencepiece



In [2]:
import torch
import sentencepiece
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [4]:
print(torch.cuda.is_available())
device = torch.device("cpu")
device = torch.device("cuda:0")

True


In [5]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Extracting Features like:
1. Sum of Log Probabilities for input text
2. Entropy
3. Kurtosis
4. Skewness
5. Perplexity

In [6]:
import gc
import math
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from scipy.stats import entropy

MAX_LEN = 512

def extract_features_batched(texts, model, tokenizer, device):

    num_texts = len(texts)
    num_batches = math.ceil(num_texts / BATCH_SIZE)

    results = []

    for batch_idx in range(num_batches):
        start_idx = batch_idx * BATCH_SIZE
        end_idx = (batch_idx + 1) * BATCH_SIZE
        batch_texts = texts[start_idx:end_idx]

        input_ids = tokenizer(
            batch_texts, truncation=True, max_length=MAX_LEN, return_tensors="pt", padding=True
        ).input_ids.to(device)
        output_ids = tokenizer(
            batch_texts, truncation=True, max_length=MAX_LEN, return_tensors="pt", padding=True
        ).input_ids.to(device)

        input_tokens = len(input_ids)
        output_tokens = len(output_ids)

        outputs = model(input_ids, labels=output_ids)

        loss = outputs.loss
        probs = outputs.logits.softmax(-1).detach().cpu()
        ids = output_ids.tolist()

        del input_tokens, output_tokens
        gc.collect()
        torch.cuda.empty_cache()

        for i, text_probs in enumerate(probs):
          logprobs = [math.log(probs[0, j, ids[i][j]].item()) for j in range(len(ids[i]))]
          maxp = np.max(logprobs)
          minp = np.min(logprobs)
          rangep = maxp - minp
          meanp = np.mean(logprobs)
          stdp = np.std(logprobs)
          entropyp = entropy(np.exp(logprobs))
          if stdp != 0:
              kurtosisp = np.mean((logprobs - meanp) ** 4) / stdp ** 4
              skewnessp = np.mean((logprobs - meanp) ** 3) / stdp ** 3
          else:
              kurtosisp = 0
              skewnessp = 0
          perplexityp = np.exp(-np.mean(logprobs))

          results.append([
              sum(logprobs),
              maxp,
              minp,
              rangep,
              meanp,
              stdp,
              entropyp,
              kurtosisp,
              skewnessp,
              perplexityp,
          ])

    return results

# Example usage:
# features = extract_features_batched(["text1", "text2", ...], model, tokenizer, device)


In [3]:
import json
import pandas as pd
import torch

file_path_dev = "/content/subtaskA_dev_monolingual.jsonl"
df_dev = pd.read_json(file_path_dev, lines=True)
df_dev.head()

file_path_train = "/content/subtaskA_train_monolingual.jsonl"
df_train = pd.read_json(file_path_train, lines=True)
df_train.head()

Unnamed: 0,text,label,model,source,id
0,Forza Motorsport is a popular racing game that...,1,chatGPT,wikihow,0
1,Buying Virtual Console games for your Nintendo...,1,chatGPT,wikihow,1
2,Windows NT 4.0 was a popular operating system ...,1,chatGPT,wikihow,2
3,How to Make Perfume\n\nPerfume is a great way ...,1,chatGPT,wikihow,3
4,How to Convert Song Lyrics to a Song'\n\nConve...,1,chatGPT,wikihow,4


In [None]:
from tqdm import tqdm

# Assuming you have a DataFrame df_train with a 'text' column
# and the 'extract_features_batched' function is defined as in the previous response

# Specify the device
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Specify the batch size
BATCH_SIZE = 21  # Adjust based on your GPU capacity

train_features_list = []

# Iterate over the rows in batches
for batch_start in tqdm(range(0, len(df_train), BATCH_SIZE), desc="Processing train batches"):
    batch_end = min(batch_start + BATCH_SIZE, len(df_train))
    batch_texts = df_train['text'].iloc[batch_start:batch_end].tolist()

    # Process the batch
    batch_features = extract_features_batched(batch_texts, model, tokenizer, device)

    train_features_list.append(batch_features)
    # flat_batch_features = [item for sublist in batch_features for item in sublist]

    # Update the DataFrame with the computed features
    # for i, index in enumerate(range(batch_start, batch_end)):
      # df_train.at[index, 'sll'] = flat_batch_features[i]

# Make sure to save or use the modified DataFrame (df_train) as needed


Processing train batches:  84%|████████▍ | 4777/5703 [2:40:26<32:36,  2.11s/it]

In [None]:
import pickle
flat_batch_features = [item for sublist in train_features_list for item in sublist]
# Saving the objects:
with open('train-features-t5-small.pkl', 'wb') as f:
    pickle.dump(flat_batch_features, f)

In [10]:
from tqdm import tqdm

# Assuming you have a DataFrame df_train with a 'text' column
# and the 'extract_features_batched' function is defined as in the previous response

# Specify the device
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Specify the batch size
BATCH_SIZE = 20  # Adjust based on your GPU capacity

dev_features_list = []

# Iterate over the rows in batches
for batch_start in tqdm(range(0, len(df_dev), BATCH_SIZE), desc="Processing dev batches"):
    batch_end = min(batch_start + BATCH_SIZE, len(df_dev))
    batch_texts = df_dev['text'].iloc[batch_start:batch_end].tolist()

    # Process the batch
    batch_features = extract_features_batched(batch_texts, model, tokenizer, device)

    dev_features_list.append(batch_features)

    flat_batch_features = [item for sublist in batch_features for item in sublist]

    # # Update the DataFrame with the computed features
    # for i, index in enumerate(range(batch_start, batch_end)):
    #   df_dev.at[index, 'sll'] = flat_batch_features[i]

# Make sure to save or use the modified DataFrame (df_train) as needed


Processing dev batches: 100%|██████████| 250/250 [07:28<00:00,  1.79s/it]


In [11]:
len(dev_features_list)

250

In [12]:
print(dev_features_list)

[[[-2396.5781878104267, -5.960466253896608e-07, -33.50983832653613, 33.50983773048951, -5.802852754988935, 8.958391731896823, 5.6293196773020915, 2.608843120521655, -1.1166840849965907, 331.2431689297531], [-5071.324666292343, -2.39613542706634e-05, -31.61002022710535, 31.60999626575108, -12.279236480126727, 10.424220833100309, 5.024913946966304, 1.4473296523094532, 0.03060513115780683, 215181.35864625315], [-4659.774708530235, -9.775209520368969e-06, -38.21374196467522, 38.2137321894657, -11.282747478281431, 11.276033639660849, 5.180558888943926, 1.8081736100236725, -0.3974802225468951, 79439.21986826583], [-5299.07211464996, -3.385601134070609e-05, -31.429682323057555, 31.429648467046214, -12.830683086319505, 10.144991955445919, 4.916475637458536, 1.5011345977051695, 0.1520795233413748, 373503.6613504849], [-4649.751850436067, -1.7643130492627094e-05, -29.617287700844514, 29.61727005771402, -11.258479056745909, 9.947450887119262, 5.078036339767301, 1.4447568430934168, -0.043959222171

In [19]:
flat_batch_features = [item for sublist in dev_features_list for item in sublist]

In [20]:
print(flat_batch_features)

[[-2396.5781878104267, -5.960466253896608e-07, -33.50983832653613, 33.50983773048951, -5.802852754988935, 8.958391731896823, 5.6293196773020915, 2.608843120521655, -1.1166840849965907, 331.2431689297531], [-5071.324666292343, -2.39613542706634e-05, -31.61002022710535, 31.60999626575108, -12.279236480126727, 10.424220833100309, 5.024913946966304, 1.4473296523094532, 0.03060513115780683, 215181.35864625315], [-4659.774708530235, -9.775209520368969e-06, -38.21374196467522, 38.2137321894657, -11.282747478281431, 11.276033639660849, 5.180558888943926, 1.8081736100236725, -0.3974802225468951, 79439.21986826583], [-5299.07211464996, -3.385601134070609e-05, -31.429682323057555, 31.429648467046214, -12.830683086319505, 10.144991955445919, 4.916475637458536, 1.5011345977051695, 0.1520795233413748, 373503.6613504849], [-4649.751850436067, -1.7643130492627094e-05, -29.617287700844514, 29.61727005771402, -11.258479056745909, 9.947450887119262, 5.078036339767301, 1.4447568430934168, -0.0439592221716

In [21]:
len(flat_batch_features)

5000

In [22]:
import pickle

# Saving the objects:
with open('dev-features-t5-small.pkl', 'wb') as f:
    pickle.dump(flat_batch_features, f)