In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset
from sklearn import metrics
import os

In [2]:
# Read the train, test and sample-submission CSVs (all ISO-8859-1 encoded)
# in one pass; the generator is unpacked in the order the paths are listed.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        'C:/Users/91898/Code/fibe/dataset_fibe/train.csv',
        'C:/Users/91898/Code/fibe/dataset_fibe/test.csv',
        'C:/Users/91898/Code/fibe/dataset_fibe/sample_submission.csv',
    )
)

In [3]:
# Preview the first rows of the training frame; as the cell's last expression
# the result is rendered as a table.
train_df.head()

Unnamed: 0,text,target,Word Count
0,"python courses python courses, python exercise...",academic interests,125
1,the learning point open digital education. a r...,academic interests,147
2,"tech news, latest technology, mobiles, laptops...",academic interests,143
3,the best it certification materials in usa | k...,academic interests,364
4,"bioland scientific, for your research needs bi...",academic interests,176


In [8]:
import torch

# Ask PyTorch to release the cached GPU memory it is holding but not using.
# Memory that is still referenced by live tensors is not freed by this call.
torch.cuda.empty_cache()

# NOTE(review): per the PyTorch docs, ipc_collect() reclaims GPU memory that was
# shared via CUDA IPC and has since been released; it does not free ordinary
# tensor allocations. Confirm it is actually needed in this notebook.
torch.cuda.ipc_collect()

In [4]:
# Preprocess the text data
def preprocess_text(text):
    """Return ``text`` as a lowercase string with surrounding whitespace removed.

    The value is passed through ``str()`` first, so non-string inputs are
    converted to their string representation rather than rejected.
    """
    as_text = str(text)
    lowered = as_text.lower()
    return lowered.strip()


# Normalise the text column of both frames in place: lowercase first, then
# trim leading/trailing whitespace (vectorised through the pandas .str accessor).
for frame in (train_df, test_df):
    frame['text'] = frame['text'].str.lower().str.strip()

# Replace the category names in 'target' with integer class ids. The fitted
# encoder is kept so ids can be mapped back to names at prediction time.
label_encoder = LabelEncoder()
train_df['target'] = label_encoder.fit_transform(train_df['target'])

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    train_df['text'],
    train_df['target'],
    test_size=0.1,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = split_parts

# Sanity-check the preprocessed frame and the label vocabulary
print(train_df.head())
print("Classes:", label_encoder.classes_)

                                                text  target  Word Count
0  python courses python courses, python exercise...       0         125
1  the learning point open digital education. a r...       0         147
2  tech news, latest technology, mobiles, laptops...       0         143
3  the best it certification materials in usa | k...       0         364
4  bioland scientific, for your research needs bi...       0         176
Classes: ['academic interests' 'arts and culture' 'automotives'
 'books and literature' 'business and finance' 'careers'
 'family and relationships' 'food and drinks' 'health' 'healthy living'
 'hobbies and interests' 'home and garden' 'movies' 'music and audio'
 'news and politics' 'personal finance' 'pets'
 'pharmaceuticals, conditions, and symptoms' 'real estate' 'shopping'
 'sports' 'style and fashion' 'technology and computing' 'television'
 'travel' 'video gaming']


In [5]:

# Tokenizer for the DeBERTa-v3-xsmall checkpoint (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-xsmall', use_fast=True)


# Assemble one frame per split. Train/validation carry a 'labels' column,
# the test split has text only.
train_frame = pd.DataFrame({'text': train_texts, 'labels': train_labels})
val_frame = pd.DataFrame({'text': val_texts, 'labels': val_labels})
test_frame = pd.DataFrame({'text': test_df['text']})

# Wrap each frame in a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_frame)
val_dataset = Dataset.from_pandas(val_frame)
test_dataset = Dataset.from_pandas(test_frame)

# Show the first record of each split as a quick sanity check
for split in (train_dataset, val_dataset, test_dataset):
    print(split[0])



{'text': 'lockdown perfect time tollywood stars introspect line up outside profession actor actress began drive back first love music shruti hasaan others like pranitha subhash turned help others tough times various opening samantha akkineni find out foretell horticulture growing bring about said last found something passionate break job starting get tired answering people asked hobby reply represent counterargument job hobby oh baby actress began journey first harvest cabbage microgreens lockdown carry insta explained one need declamatory lawn backyard gardening using space uncommitted domicile initially used window sill bedroom grow microgreens said interested growing require tray cocopeat seeds cool room used bedroom window lets sunlight partly also gave guide fans farsighted takes cum sprout number days tray needs covered use lamp case one enough sunlight room inspired tollywood diva take gardening first piazza pandemic fear able-bodied feed oneself cut back houses said often hear 

In [6]:
import os
from datasets import load_from_disk, Dataset
from transformers import AutoTokenizer

# Tokenizer for the checkpoint used throughout this notebook
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-xsmall')

# On-disk locations of the cached, already-tokenized splits
train_dataset_path = 'deberta/tokenized_train_dataset'
val_dataset_path = 'deberta/tokenized_val_dataset'
test_dataset_path = 'deberta/tokenized_test_dataset'
cached_paths = (train_dataset_path, val_dataset_path, test_dataset_path)

if not all(os.path.exists(cached_path) for cached_path in cached_paths):
    def tokenize_function(examples):
        """Tokenize the 'text' field of a batch, truncating and padding to 512 tokens."""
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    # Tokenize every split in batches (no num_proc is passed, so no multiprocessing)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Expose only the model inputs (plus labels where present) as torch tensors
    model_inputs = ['input_ids', 'attention_mask']
    train_dataset.set_format(type='torch', columns=model_inputs + ['labels'])
    val_dataset.set_format(type='torch', columns=model_inputs + ['labels'])
    test_dataset.set_format(type='torch', columns=model_inputs)

    # Cache the tokenized splits so later runs can skip tokenization
    os.makedirs('deberta', exist_ok=True)
    train_dataset.save_to_disk(train_dataset_path)
    val_dataset.save_to_disk(val_dataset_path)
    test_dataset.save_to_disk(test_dataset_path)
    print("Tokenized datasets saved successfully to disk.")
else:
    # All three caches exist: reuse them instead of re-tokenizing
    train_dataset = load_from_disk(train_dataset_path)
    val_dataset = load_from_disk(val_dataset_path)
    test_dataset = load_from_disk(test_dataset_path)
    print("Tokenized datasets loaded successfully from disk.")

# Show one tokenized record from the train and validation splits
print(train_dataset[0])
print(val_dataset[0])

Tokenized datasets loaded successfully from disk.
{'labels': tensor(10), 'input_ids': tensor([     1,  47600,    801,    326,   1941,  59864,   2906,  16669,  37831,
           683,    322,    954,   6392,   4435,   6704,   1196,   1168,    396,
           362,    472,    755,  12432,  77926,    303,  22669,    690,    334,
           845,  28633,  14342,   2698,  42849,   1387,    408,    690,   2770,
           631,    847,   1802,  20782,  37431,    266,   1165,   7164,  25514,
           433,    321,  77867,  46376,   1479,    861,    314,    357,    437,
           505,    491,   5106,   1464,    688,   1392,    350,   4111,   9652,
           355,    921,  10169,   4806,   2993,   3118,  55987,    688,  10169,
          6359,   1483,   6704,   1196,   1930,    362,   8839,  17069,   4014,
          9496,    268,  47600,   1886,  59107,   2996,    311,    389,    718,
        100610,  19101,   6680,   7093,  10962,    478,    754, 118135,  63710,
          4114,    427,   1775,  4

In [7]:
# Load DeBERTa-v3-xsmall for sequence classification. The classification head
# is newly initialised and sized to the number of classes in the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-xsmall', num_labels=len(label_encoder.classes_))

#model.gradient_checkpointing_enable()

# Use the GPU when one is present, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Training configuration.
# - fp16 follows CUDA availability: mixed precision is only supported on GPU, so
#   hard-coding fp16=True would make TrainingArguments fail on the CPU fallback.
# - resume_from_checkpoint is intentionally not set here: TrainingArguments only
#   stores it for user scripts and the Trainer does not read it. Resuming is
#   requested via trainer.train(resume_from_checkpoint=...) instead.
# - No metric_for_best_model is given, so "best model" selection relies on the
#   library default. NOTE(review): confirm that default is the intended criterion.
training_args = TrainingArguments(
    dataloader_num_workers=8,
    output_dir='./deberta/results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=4,  # 8 * 4 = 32 samples per optimizer step per device
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='none',
    logging_steps=5000,
    evaluation_strategy="steps",
    save_steps=2000,  # same interval as eval_steps so checkpoints line up with evaluations
    eval_steps=2000,
    load_best_model_at_end=True,
    fp16=use_cuda,
    report_to="none",
)

# Display model details
print(model)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 384, padding_idx=0)
      (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=384, out_features=384, bias=True)
              (key_proj): Linear(in_features=384, out_features=384, bias=True)
              (value_proj): Linear(in_features=384, out_features=384, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine



In [8]:
# Inspect the classification head; its out_features should equal the number of
# label classes passed as num_labels when the model was created.
print(model.classifier)

Linear(in_features=384, out_features=26, bias=True)


In [9]:
from transformers import DataCollatorWithPadding

# Batch collator bound to the notebook's tokenizer, configured with
# padding='longest' (pad each batch to its longest sequence).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')


# Define a function for computing metrics


def compute_metrics(eval_pred):
    """Score evaluation predictions with weighted F1 and accuracy.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        dict with keys ``'f1'`` (weighted-average F1) and ``'accuracy'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {
        'f1': metrics.f1_score(labels, predicted_classes, average='weighted'),
        'accuracy': metrics.accuracy_score(labels, predicted_classes),
    }


# Assemble the Trainer from the model, training arguments, train/validation
# splits, metric function and batch collator. data_collator is passed explicitly
# so the padding strategy configured above is the one actually used, rather than
# whatever collator the Trainer would build by default from the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop early after 5 consecutive evaluations without improvement
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Verify trainer configuration
print(trainer)

<transformers.trainer.Trainer object at 0x0000018410BF1FD0>


In [10]:
import torch
# Confirm that PyTorch can see a CUDA device before starting training
print(torch.cuda.is_available())  # Should output True

True


In [11]:
# Resume from the saved checkpoint directory. In the recorded run the result is
# global_step=58854 with a sub-second runtime and train_loss=0.0, i.e. no further
# optimisation steps were executed by this call. Presumably it is used here only
# to restore the already-trained state; verify before relying on it.
trainer.train(resume_from_checkpoint='./deberta/results/checkpoint-58854')

  0%|          | 0/58854 [00:00<?, ?it/s]

{'train_runtime': 0.3953, 'train_samples_per_second': 4764453.088, 'train_steps_per_second': 148889.633, 'train_loss': 0.0, 'epoch': 3.0}


TrainOutput(global_step=58854, training_loss=0.0, metrics={'train_runtime': 0.3953, 'train_samples_per_second': 4764453.088, 'train_steps_per_second': 148889.633, 'total_flos': 1.2411746375635354e+17, 'train_loss': 0.0, 'epoch': 3.0})

In [12]:
# trainer.train()

In [13]:
# import joblib
# joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [None]:
# Run inference on the test split and keep the highest-scoring class id per row
predictions = trainer.predict(test_dataset)
pred_class_ids = np.argmax(predictions.predictions, axis=1)

# Map the integer class ids back to the original category names
pred_labels = label_encoder.inverse_transform(pred_class_ids)

# Build the submission: one row per test example, identified as 'Article_<row index>'
article_ids = 'Article_' + test_df.index.astype(str)
submission_df = pd.DataFrame({'Index': article_ids, 'target': pred_labels})
submission_df.to_csv('deberta_submission.csv', index=False)

# Preview the first rows of the submission
submission_df.head()

In [None]:
# Persist the model and the tokenizer, each to its own directory under deberta/
for artifact, target_dir in (
    (model, 'deberta/saved_model'),
    (tokenizer, 'deberta/saved_tokenizer'),
):
    artifact.save_pretrained(target_dir)
print("DeBERTa-v3-xsmall model and tokenizer saved successfully.")

In [None]:










# Score the test split with the current trainer
deberta_predictions = trainer.predict(test_dataset)

# Turn the model outputs into per-class probabilities (softmax over the last
# axis) and persist them
test_scores = torch.tensor(deberta_predictions.predictions)
deberta_probabilities = torch.softmax(test_scores, dim=-1).numpy()
np.save('deberta_probabilities.npy', deberta_probabilities)

# Persist the most probable class id for each test row
deberta_pred_labels = deberta_probabilities.argmax(axis=1)
np.save('deberta_pred_labels.npy', deberta_pred_labels)

In [12]:
# Score the training split with the current trainer
deberta_train_predictions = trainer.predict(train_dataset)

# Turn the model outputs into per-class probabilities (softmax over the last
# axis) and persist them
train_scores = torch.tensor(deberta_train_predictions.predictions)
deberta_train_probabilities = torch.softmax(train_scores, dim=-1).numpy()
np.save('deberta_train_probabilities.npy', deberta_train_probabilities)

# Persist the most probable class id for each training row
deberta_train_pred_labels = deberta_train_probabilities.argmax(axis=1)
np.save('deberta_train_pred_labels.npy', deberta_train_pred_labels)

  0%|          | 0/9809 [00:00<?, ?it/s]

In [None]:
# Load the entire training dataset (no splitting)
from transformers import DebertaV2Tokenizer

from datasets import Dataset
from transformers import AutoTokenizer
# Re-read the raw training CSV so the whole file (no train/validation split)
# can be scored.
train_df = pd.read_csv(
    'C:/Users/91898/Code/fibe/dataset_fibe/train.csv', encoding='ISO-8859-1')

# Same text normalisation as the earlier preprocessing cell: lowercase, then trim
train_df['text'] = train_df['text'].str.lower().str.strip()

# A fresh LabelEncoder is fitted here (the earlier instance is replaced).
# NOTE(review): this should reproduce the earlier class -> id mapping as long as
# the CSV is unchanged, since the encoder is fitted on the same label column.
label_encoder = LabelEncoder()
train_df['target'] = label_encoder.fit_transform(train_df['target'])

# Load the tokenizer through AutoTokenizer, exactly as the training cells do, so
# the full-train inputs are encoded the same way as the data the model was
# trained on (the explicit DebertaV2Tokenizer class is a different implementation).
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-xsmall')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, truncating and padding to 512 tokens."""
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)


# Convert to a Hugging Face dataset, tokenize in batches, and expose the model
# inputs plus the encoded 'target' column as torch tensors.
full_train_dataset = Dataset.from_pandas(train_df[['text', 'target']])
full_train_dataset = full_train_dataset.map(tokenize_function, batched=True)
full_train_dataset.set_format(
    type='torch', columns=['input_ids', 'attention_mask', 'target'])

In [None]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader

# Load the fine-tuned DeBERTa model saved earlier in this notebook
model = AutoModelForSequenceClassification.from_pretrained(
    'deberta/saved_model', num_labels=len(label_encoder.classes_)
)
tokenizer = AutoTokenizer.from_pretrained(
    'microsoft/deberta-v3-xsmall', use_fast=True)

# Use the GPU when one is present, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Inference-only arguments with a large eval batch. fp16 follows CUDA
# availability: mixed precision is only supported on GPU, so hard-coding
# fp16=True would make TrainingArguments fail on the CPU fallback above.
# NOTE(review): output_dir '.tettt/results' looks like a scratch/typo path; confirm.
training_args = TrainingArguments(
    output_dir='.tettt/results',           # Output directory
    per_device_eval_batch_size=64,    # Increase this if your GPU has enough memory
    fp16=use_cuda                     # Mixed precision only when running on CUDA
)

# Initialize the Trainer with these arguments
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer
)

# Make predictions on the full training set with gradient computation disabled
with torch.no_grad():
    deberta_full_train_predictions = trainer.predict(full_train_dataset)

# Apply softmax over the last axis to turn the model outputs into probabilities
deberta_full_train_probabilities = torch.nn.functional.softmax(
    torch.tensor(deberta_full_train_predictions.predictions), dim=-1
).numpy()

# Save these probabilities
np.save('deberta_full_train_probabilities.npy',
        deberta_full_train_probabilities)