In [1]:
import pandas as pd
import numpy as np
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset
from sklearn import metrics
import os

In [2]:
# Load datasets.
# The data directory defaults to the original local path, but it can be
# overridden through the FIBE_DATA_DIR environment variable so the notebook
# also runs on machines with a different directory layout.
DATA_DIR = os.environ.get(
    'FIBE_DATA_DIR', 'C:/Users/91898/Code/fibe/dataset_fibe')
# All three CSV files are read with the same ISO-8859-1 encoding.
CSV_ENCODING = 'ISO-8859-1'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv', encoding=CSV_ENCODING)
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', encoding=CSV_ENCODING)
sample_submission = pd.read_csv(
    f'{DATA_DIR}/sample_submission.csv', encoding=CSV_ENCODING)

In [3]:
# Preview the first five rows of the raw training frame
train_df.head(5)

Unnamed: 0,text,target,Word Count
0,"python courses python courses, python exercise...",academic interests,125
1,the learning point open digital education. a r...,academic interests,147
2,"tech news, latest technology, mobiles, laptops...",academic interests,143
3,the best it certification materials in usa | k...,academic interests,364
4,"bioland scientific, for your research needs bi...",academic interests,176


In [2]:
# Load the entire training dataset (no splitting)
from transformers import ElectraTokenizer
from datasets import Dataset
from transformers import AutoTokenizer
# Re-read the complete training CSV (ISO-8859-1 encoded)
full_train_csv = 'C:/Users/91898/Code/fibe/dataset_fibe/train.csv'
train_df = pd.read_csv(full_train_csv, encoding='ISO-8859-1')

# Normalise the text column: lower-case it, then trim surrounding whitespace
lowercased_text = train_df['text'].str.lower()
train_df['text'] = lowercased_text.str.strip()

# Replace the string class names in 'target' with integer ids
label_encoder = LabelEncoder()
label_encoder.fit(train_df['target'])
train_df['target'] = label_encoder.transform(train_df['target'])

# Tokenizer matching the pretrained ELECTRA-small discriminator checkpoint
tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-discriminator')


def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of ``examples`` with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    fixed_length_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(examples['text'], **fixed_length_options)


# Wrap the text/target columns in a Hugging Face dataset and tokenize it in batches
full_train_dataset = Dataset.from_pandas(train_df[['text', 'target']]).map(
    tokenize_function, batched=True)

# Expose only the tensor columns, formatted as PyTorch tensors
full_train_columns = ['input_ids', 'attention_mask', 'target']
full_train_dataset.set_format(type='torch', columns=full_train_columns)

Map:   0%|          | 0/697527 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader

# Restore the classifier stored under 'electra/saved_model'; the size of the
# classification head is taken from the fitted label encoder
num_classes = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    'electra/saved_model', num_labels=num_classes)
tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-discriminator')

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

# Create a TrainingArguments object for inference with a larger eval batch size.
# Mixed precision is requested only when CUDA is available: the device
# selection above explicitly supports a CPU fallback, and an unconditional
# fp16=True is rejected on machines without a GPU.
training_args = TrainingArguments(
    output_dir='.tettt/results',           # Output directory
    per_device_eval_batch_size=64,         # Increase this if your GPU has enough memory
    fp16=torch.cuda.is_available()         # Mixed precision only on CUDA
)

# Initialize the Trainer with these arguments (used for prediction only, so no
# train/eval datasets are passed)
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer
)

# Run inference over the whole training set without tracking gradients
with torch.no_grad():
    electra_full_train_predictions = trainer.predict(full_train_dataset)

# Turn the raw prediction scores into per-class probabilities
full_train_logits = torch.tensor(electra_full_train_predictions.predictions)
electra_full_train_probabilities = torch.nn.functional.softmax(
    full_train_logits, dim=-1).numpy()

# Persist the probability matrix as a .npy file
np.save('electra_full_train_probabilities.npy', electra_full_train_probabilities)

In [4]:
# Text normalisation helper
def preprocess_text(text):
    """Return ``text`` converted to ``str``, lower-cased, with surrounding whitespace removed."""
    as_string = str(text)
    lowered = as_string.lower()
    return lowered.strip()


# Apply preprocessing.
# Missing values are replaced by an empty string and everything is cast to str
# first; otherwise NaN / non-string entries pass through the .str accessors as
# NaN and end up in the datasets handed to the tokenizer.
train_df['text'] = train_df['text'].fillna('').astype(str).str.lower().str.strip()
test_df['text'] = test_df['text'].fillna('').astype(str).str.lower().str.strip()

# Encode target labels.
# The encoding overwrites train_df['target'] in place, so running this cell a
# second time would fit the encoder on integer ids and lose the class names
# (label_encoder.classes_ / inverse_transform would then return integers).
# Fail fast instead of silently corrupting the label mapping.
if pd.api.types.is_numeric_dtype(train_df['target']):
    raise RuntimeError(
        "train_df['target'] is already numerically encoded; reload train_df "
        "before running this cell again.")
label_encoder = LabelEncoder()
train_df['target'] = label_encoder.fit_transform(train_df['target'])

# Split into training and validation sets (10% held out, fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'], train_df['target'], test_size=0.1, random_state=42
)

# Check the results of preprocessing and encoding
print(train_df.head())
print("Classes:", label_encoder.classes_)

                                                text  target  Word Count
0  python courses python courses, python exercise...       0         125
1  the learning point open digital education. a r...       0         147
2  tech news, latest technology, mobiles, laptops...       0         143
3  the best it certification materials in usa | k...       0         364
4  bioland scientific, for your research needs bi...       0         176
Classes: ['academic interests' 'arts and culture' 'automotives'
 'books and literature' 'business and finance' 'careers'
 'family and relationships' 'food and drinks' 'health' 'healthy living'
 'hobbies and interests' 'home and garden' 'movies' 'music and audio'
 'news and politics' 'personal finance' 'pets'
 'pharmaceuticals, conditions, and symptoms' 'real estate' 'shopping'
 'sports' 'style and fashion' 'technology and computing' 'television'
 'travel' 'video gaming']


In [5]:
# Convert pandas DataFrames to Hugging Face datasets.
# preserve_index=False keeps the pandas index out of the datasets: the
# train/validation Series carry the shuffled index produced by the split,
# which would otherwise be stored as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': train_texts, 'labels': train_labels}),
    preserve_index=False)
val_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': val_texts, 'labels': val_labels}),
    preserve_index=False)
test_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': test_df['text']}),
    preserve_index=False)

# Display first few entries to verify datasets
print(train_dataset[0])
print(val_dataset[0])
print(test_dataset[0])

{'text': 'lockdown perfect time tollywood stars introspect line up outside profession actor actress began drive back first love music shruti hasaan others like pranitha subhash turned help others tough times various opening samantha akkineni find out foretell horticulture growing bring about said last found something passionate break job starting get tired answering people asked hobby reply represent counterargument job hobby oh baby actress began journey first harvest cabbage microgreens lockdown carry insta explained one need declamatory lawn backyard gardening using space uncommitted domicile initially used window sill bedroom grow microgreens said interested growing require tray cocopeat seeds cool room used bedroom window lets sunlight partly also gave guide fans farsighted takes cum sprout number days tray needs covered use lamp case one enough sunlight room inspired tollywood diva take gardening first piazza pandemic fear able-bodied feed oneself cut back houses said often hear 

In [6]:
import os
from datasets import load_from_disk, Dataset
from transformers import AutoTokenizer

# Initialize the tokenizer.
# `use_fast` is an AutoTokenizer option; ElectraTokenizer is the slow
# implementation regardless of that flag, so AutoTokenizer is used to actually
# obtain the fast tokenizer for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'google/electra-small-discriminator', use_fast=True)

# On-disk locations of the cached tokenized datasets
train_dataset_path = 'electra/tokenized_train_dataset'
val_dataset_path = 'electra/tokenized_val_dataset'
test_dataset_path = 'electra/tokenized_test_dataset'


# Tokenize from scratch unless all three cached datasets already exist on disk
cached_dataset_paths = (train_dataset_path, val_dataset_path, test_dataset_path)
if not all(os.path.exists(cached_path) for cached_path in cached_dataset_paths):
    def tokenize_function(examples):
        """Tokenize the 'text' entries, truncating and padding to 512 tokens."""
        return tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=512,
        )

    # Batched tokenization in the current process (no multiprocessing)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Expose the model inputs as PyTorch tensors; the test set has no labels
    input_columns = ['input_ids', 'attention_mask']
    labelled_columns = input_columns + ['labels']
    train_dataset.set_format(type='torch', columns=labelled_columns)
    val_dataset.set_format(type='torch', columns=labelled_columns)
    test_dataset.set_format(type='torch', columns=input_columns)

    # Write the tokenized datasets to the cache directory
    os.makedirs('electra', exist_ok=True)
    train_dataset.save_to_disk(train_dataset_path)
    val_dataset.save_to_disk(val_dataset_path)
    test_dataset.save_to_disk(test_dataset_path)
    print("Tokenized datasets saved successfully to disk.")
else:
    # Cache hit: read the previously tokenized datasets back
    train_dataset = load_from_disk(train_dataset_path)
    val_dataset = load_from_disk(val_dataset_path)
    test_dataset = load_from_disk(test_dataset_path)
    print("Tokenized datasets loaded successfully from disk.")

# Inspect one tokenized example from the training and validation sets
print(train_dataset[0])
print(val_dataset[0])

Tokenized datasets loaded successfully from disk.
{'labels': tensor(10), 'input_ids': tensor([  101,  5843,  7698,  3819,  2051,  9565, 26985,  3340, 17174, 13102,
        22471,  2240,  2039,  2648,  9518,  3364,  3883,  2211,  3298,  2067,
         2034,  2293,  2189, 14021, 22134,  2072,  2038, 14634,  2500,  2066,
        10975,  7088,  8322,  4942, 14949,  2232,  2357,  2393,  2500,  7823,
         2335,  2536,  3098, 11415, 17712,  4939, 18595,  2424,  2041, 18921,
        23567,  7570, 28228, 14561,  3652,  3288,  2055,  2056,  2197,  2179,
         2242, 13459,  3338,  3105,  3225,  2131,  5458, 10739,  2111,  2356,
        17792,  7514,  5050,  4675,  2906, 22850,  4765,  3105, 17792,  2821,
         3336,  3883,  2211,  4990,  2034, 11203, 28540, 12702, 28637,  3619,
         5843,  7698,  4287, 16021,  2696,  4541,  2028,  2342, 11703, 10278,
        14049, 10168, 16125, 21529,  2478,  2686,  4895,  9006, 22930,  3064,
        14383, 28775,  2571,  3322,  2109,  3332,  9033,

In [7]:
# Load the pretrained ELECTRA-small discriminator with a sequence-classification
# head sized to the number of encoded classes
num_labels = len(label_encoder.classes_)
model = ElectraForSequenceClassification.from_pretrained(
    'google/electra-small-discriminator', num_labels=num_labels)

# Place the model on the GPU when CUDA is available, otherwise on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

# Set training arguments.
# Mixed precision is enabled only when CUDA is available: the device selection
# above supports a CPU fallback, and an unconditional fp16=True is rejected on
# machines without a GPU.
training_args = TrainingArguments(
    dataloader_num_workers=4,
    output_dir='./electra/results',        # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='none',
    logging_steps=5000,
    evaluation_strategy="steps",
    save_steps=1000,                       # same interval as eval_steps
    eval_steps=1000,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
    # NOTE: resume_from_checkpoint is intentionally not set here. Trainer does
    # not use this TrainingArguments field (it exists for training scripts and
    # expects a path, not a bool); resuming is requested via
    # trainer.train(resume_from_checkpoint=...) instead.
)

# Display model details
print(model)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Li



In [8]:
# Define a function for computing metrics
from transformers import Trainer
def compute_metrics(eval_pred):
    """Compute weighted F1 and accuracy for an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the arg-max over the last axis of the logits.
    Returns a dict with the keys ``'f1'`` and ``'accuracy'``.
    """
    scores, gold_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=-1)
    return {
        'f1': metrics.f1_score(gold_labels, predicted_labels, average='weighted'),
        'accuracy': metrics.accuracy_score(gold_labels, predicted_labels),
    }


# Define a Trainer subclass that makes parameters contiguous before saving


class ContiguousTrainer(Trainer):
    """Trainer whose ``save_model`` makes model parameters contiguous first."""

    def save_model(self, output_dir=None, _internal_call=False):
        """Rewrite non-contiguous parameters (when this process should save), then delegate to ``Trainer.save_model``."""
        # Parameters are only touched when args.should_save is set
        candidates = self.model.parameters() if self.args.should_save else ()
        for tensor in candidates:
            if tensor.is_contiguous():
                continue
            tensor.data = tensor.data.contiguous()
        super().save_model(output_dir, _internal_call)


# Build the trainer from ContiguousTrainer (not the stock Trainer), with an
# early-stopping callback configured with a patience of 5
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)
trainer = ContiguousTrainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    tokenizer=tokenizer, compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Print the trainer object as a sanity check
print(trainer)

<__main__.ContiguousTrainer object at 0x0000029C2CBC87F0>


In [9]:
import torch
# Report whether PyTorch can see a CUDA device (should output True)
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [10]:
# trainer.train()

In [11]:
# Continue training from the saved checkpoint directory
checkpoint_dir = './electra/results/checkpoint-14712'
trainer.train(resume_from_checkpoint=checkpoint_dir)

  0%|          | 0/14712 [00:00<?, ?it/s]

{'train_runtime': 0.1421, 'train_samples_per_second': 13252532.485, 'train_steps_per_second': 103525.185, 'train_loss': 0.0, 'epoch': 3.0}


TrainOutput(global_step=14712, training_loss=0.0, metrics={'train_runtime': 0.1421, 'train_samples_per_second': 13252532.485, 'train_steps_per_second': 103525.185, 'total_flos': 5.54367488451625e+16, 'train_loss': 0.0, 'epoch': 2.999898052808645})

In [12]:
# Score the test set and take the highest-scoring class id for each row
predictions = trainer.predict(test_dataset)
test_scores = predictions.predictions
pred_labels = np.argmax(test_scores, axis=1)

# Map the integer class ids back to the original label strings
pred_labels = label_encoder.inverse_transform(pred_labels)

# Assemble the submission table: one 'Article_<row index>' id per test row
article_ids = 'Article_' + test_df.index.astype(str)
submission_df = pd.DataFrame({'Index': article_ids, 'target': pred_labels})
submission_df.to_csv('electra_submission.csv', index=False)

# Preview the first rows of the submission
submission_df.head()

  0%|          | 0/2725 [00:00<?, ?it/s]

Unnamed: 0,Index,target
0,Article_0,academic interests
1,Article_1,careers
2,Article_2,health
3,Article_3,academic interests
4,Article_4,academic interests


In [16]:
# Trainer subclass that handles non-contiguous tensors when saving
class ContiguousTrainer(Trainer):
    """Trainer that converts non-contiguous parameters to contiguous ones before saving."""

    def save_model(self, output_dir=None, _internal_call=False):
        """Make parameters contiguous if ``args.should_save`` is set, then call the parent ``save_model``."""
        if self.args.should_save:
            non_contiguous = [
                weight for weight in self.model.parameters()
                if not weight.is_contiguous()
            ]
            for weight in non_contiguous:
                weight.data = weight.data.contiguous()
        super().save_model(output_dir, _internal_call)


# Rebuild the trainer on top of ContiguousTrainer with the same configuration
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)
trainer = ContiguousTrainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    tokenizer=tokenizer, compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Write the model through the trainer and the tokenizer to its own directory
trainer.save_model('electra/saved_model')
tokenizer.save_pretrained('electra/saved_tokenizer')

print("ELECTRA model and tokenizer saved successfully.")

ELECTRA model and tokenizer saved successfully.


In [17]:
# Score the test set with the ELECTRA trainer
electra_predictions = trainer.predict(test_dataset)

# Convert the raw scores to class probabilities and store them
electra_test_logits = torch.tensor(electra_predictions.predictions)
electra_probabilities = torch.nn.functional.softmax(
    electra_test_logits, dim=-1).numpy()
np.save('electra_probabilities.npy', electra_probabilities)

# Store the most probable class id for each test row
electra_pred_labels = np.argmax(electra_probabilities, axis=1)
np.save('electra_pred_labels.npy', electra_pred_labels)

  0%|          | 0/2725 [00:00<?, ?it/s]

In [12]:
# Score the training split with the ELECTRA trainer
electra_train_predictions = trainer.predict(train_dataset)

# Convert the raw scores to class probabilities and store them
electra_train_logits = torch.tensor(electra_train_predictions.predictions)
electra_train_probabilities = torch.nn.functional.softmax(
    electra_train_logits, dim=-1).numpy()
np.save('electra_train_probabilities.npy', electra_train_probabilities)

# Store the most probable class id for each training row
electra_train_pred_labels = np.argmax(electra_train_probabilities, axis=1)
np.save('electra_train_pred_labels.npy', electra_train_pred_labels)

  0%|          | 0/9809 [00:00<?, ?it/s]

In [None]:
# import pandas as pd
# import numpy as np
# import torch
# from tqdm import tqdm
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from sklearn.preprocessing import LabelEncoder

# # Assuming you have loaded your test_df, model, and tokenizer already

# # Set model to evaluation mode
# model.eval()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Define the batch size
# batch_size = 64  # Increase this based on your GPU/CPU capacity


# def preprocess_batch(batch_texts):
#     return tokenizer(batch_texts, truncation=True, padding='max_length', max_length=512, return_tensors="pt")


# # Make predictions in batches
# all_predictions = []

# for i in tqdm(range(0, len(test_df), batch_size), desc="Predicting in batches"):
#     batch_texts = test_df['text'][i:i + batch_size].tolist()
#     inputs = preprocess_batch(batch_texts)
#     inputs = {key: val.to(device) for key, val in inputs.items()}

#     with torch.no_grad():
#         outputs = model(**inputs)

#     batch_predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
#     all_predictions.extend(batch_predictions)

# # Decode the predictions back to original labels
# pred_labels = label_encoder.inverse_transform(all_predictions)

# # Create the submission file
# submission_df = pd.DataFrame({
#     'Index': 'Article_' + test_df.index.astype(str),
#     'target': pred_labels
# })
# submission_df.to_csv('electra_submission.csv', index=False)

# print("Submission file generated as 'electra_submission.csv'.")