In [None]:


import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset

# The 20 newsgroup names in class-id order: a name's position in this tuple is its id.
_NEWSGROUP_NAMES = (
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
    "talk.religion.misc",
)

# Number of target classes handed to the classification model.
NUM_LABELS = len(_NEWSGROUP_NAMES)

# Forward lookup: integer class id -> newsgroup name.
id2label = dict(enumerate(_NEWSGROUP_NAMES))

# Reverse lookup: newsgroup name -> integer class id.
label2id = {name: class_id for class_id, name in id2label.items()}
from torch import cuda

# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# BERT sequence-classification model built from the "bert-base-uncased" checkpoint,
# configured with NUM_LABELS outputs and the id <-> name label tables.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
# Last expression of the cell: moves the weights to `device` and displays the model.
model.to(device)

In [None]:
from datasets import load_dataset
import datasets
import random
# Newsgroup category names; a name's position in the list is its numeric label id.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
    "talk.religion.misc",
]

# Instruction prompt: a fixed introduction, one "<id>: <name>" line per category,
# then a fixed closing sentence.
system_message = "".join(
    [
        "You are a text classification assistant. As the user presents a block of text, "
        "your task is to categorize it based on the following labels:\n",
        *(f"{label_id}: {name}\n" for label_id, name in enumerate(categories)),
        "Your role is to carefully read the text and respond with the correct ID associated with the category.",
    ]
)


# --- Disabled one-off data-preparation script (kept for provenance) ----------
# When uncommented, it writes "train_dataset_20_newsgroups_noisy.json", the same
# file name the training-data cell loads.
# Label noise: for each category, roughly the first tenth of the rows
# (int(len/10) entries minus the skipped index 0) receive a label drawn with
# random.randint(0, 19), which may happen to equal the true one; all remaining
# rows keep the category's own index.
# Each example is stored as a system / user / assistant "messages" triple, with
# newlines in the text replaced by spaces and the label written as a string.
# NOTE(review): no random seed is set here, so re-running would produce a
# different noisy file.
# datasets_1 = []
# for idx, category in enumerate(categories):
#     dataset = load_dataset("newsgroup", f"bydate_{category}", split="train")
#     arr = [random.randint(0,19) for _ in range(int(len(dataset)/10)) if _ !=0]
#     dataset = dataset.add_column("label", arr + ([idx] * (len(dataset)-len(arr))))
#     datasets_1.append(dataset)

# # Concatenate all datasets
# combined_dataset = datasets.concatenate_datasets(datasets_1)

# # Shuffle the combined dataset
# combined_dataset = combined_dataset.shuffle()

# # Function to create conversation format
# def create_conversation(sample):
#     return {
#         "messages": [
#             {"role": "system", "content": system_message},
#             {"role": "user", "content": sample["text"].replace("\n", " ")},
#             {"role": "assistant", "content": str(sample["label"])}
#         ]
#     }

# # Map and format the dataset
# combined_dataset = combined_dataset.map(lambda example: {"text": example["text"].replace("\n", " ")})
# combined_dataset = combined_dataset.map(create_conversation, remove_columns=combined_dataset.features, batched=False)

# # Save dataset to JSON
# combined_dataset.to_json("train_dataset_20_newsgroups_noisy.json", orient="records")


In [None]:
from datasets import load_dataset
# Read the chat-formatted training file. In every record, messages[1] holds the
# input text and messages[2] holds the label text (converted with int() later on).
dataset = load_dataset("json", data_files="train_dataset_20_newsgroups_noisy.json", split="train")
train_texts = [record["messages"][1]["content"] for record in dataset]
train_labels = [record["messages"][2]["content"] for record in dataset]



Generating train split: 11314 examples [00:00, 123494.15 examples/s]


In [None]:
from datasets import load_dataset
# Read the chat-formatted test file the same way as the training file:
# messages[1] is taken as the input text and messages[2] as the label text.
# Note that this rebinds `dataset`, which previously referred to the training split.
dataset = load_dataset("json", data_files="test_dataset_20_newsgroups.json", split="train")
test_texts = [record["messages"][1]["content"] for record in dataset]
test_labels = [record["messages"][2]["content"] for record in dataset]

In [None]:


# Longest token sequence fed to the model; longer texts are cut at this length.
# Matches the max_length used by predict().
MAX_SEQ_LENGTH = 512

# The tokenizer-level length limit is `model_max_length`; a `max_length` keyword
# given to from_pretrained() is not the truncation setting, so the limit is set
# here under its proper name and also passed explicitly to each encode call.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", model_max_length=MAX_SEQ_LENGTH)

# Encode both splits. padding=True pads every sequence in the call to the longest
# one in that call; truncation=True + max_length caps the length explicitly.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH)

In [None]:
! nvidia-smi

In [None]:
class DataLoader(Dataset):
    """
    Map-style dataset pairing tokenizer output with one label per example.

    Despite its name this is a torch.utils.data.Dataset subclass (it provides
    __getitem__ / __len__), not a batching torch DataLoader.
    """

    def __init__(self, encodings, labels):
        """
        Store the tokenized inputs and their labels.

        Args:
            encodings (dict): Mapping from field name (e.g. 'input_ids',
                              'token_type_ids', 'attention_mask') to a sequence
                              holding one entry per example.
            labels (list): One label per example; each value must be accepted
                           by int() (integers or numeric strings).
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """
        Report the dataset size.

        Returns:
            (int): Number of labels, i.e. number of examples.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Assemble the tensors for one example.

        Args:
            idx (int): Position of the example to fetch.

        Returns:
            item (dict): One tensor per encoding field plus a 'labels' tensor
                         holding the example's label converted with int().
        """
        item = {}
        # Copy every encoding field for this example into its own tensor.
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        # The label goes under the key 'labels'.
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

In [None]:
# Wrap each split's encodings and labels in the custom Dataset class defined in this
# notebook. `train_dataloader` is therefore a Dataset, not a batching torch DataLoader.
train_dataloader = DataLoader(encodings=train_encodings, labels=train_labels)

test_dataset = DataLoader(encodings=test_encodings, labels=test_labels)

In [None]:


from transformers import TrainingArguments, Trainer



from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Score one evaluation pass with accuracy and macro-averaged precision, recall and F1.

    Args:
        pred (obj): Object exposing two attributes:
            - label_ids (array-like): The true class ids.
            - predictions (array-like): Per-class scores for every observation;
              the predicted class is the argmax over the last axis.

    Returns:
        dict: The metrics keyed as follows:
            - 'Accuracy' (float): Share of observations whose predicted class
              equals the true class.
            - 'F1' (float): Macro-averaged F1 score.
            - 'Precision' (float): Macro-averaged precision.
            - 'Recall' (float): Macro-averaged recall.
    """
    y_true = pred.label_ids
    # Highest-scoring class per observation.
    y_pred = pred.predictions.argmax(-1)

    # The fourth returned element is not needed here.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    precision, recall, f1 = macro_scores[0], macro_scores[1], macro_scores[2]

    metrics = {'Accuracy': accuracy_score(y_true, y_pred)}
    metrics['F1'] = f1
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    return metrics
# Sanity check: show only the first few training labels instead of dumping the
# entire label list into the cell output.
train_dataloader.labels[:10]

In [None]:
training_args = TrainingArguments(
    # --- where results go ---
    # Directory receiving checkpoints and model predictions
    output_dir='./BERT_Text_classification_noisy_FULL',
    push_to_hub=True,

    # --- what to run ---
    do_train=True,
    do_eval=True,

    # --- optimisation schedule ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Linear learning-rate warmup over the first 100 steps
    warmup_steps=100,
    weight_decay=0.001,
    fp16=False,

    # --- logging (TensorBoard) ---
    report_to='tensorboard',
    logging_dir='./multi-class-logs',
    logging_strategy='steps',
    logging_steps=100,

    # --- evaluation and checkpointing, both step-based ---
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    load_best_model_at_end=True,
)

In [None]:
# Bundle the model, the hyper-parameters, both data splits and the metric function.
# NOTE(review): eval_dataset is the test split, and training_args asks for the best
# checkpoint to be reloaded at the end, so checkpoint selection looks at test data.
# Confirm this is intended.
trainer = Trainer(
    model=model,                      # pre-trained model to fine-tune
    args=training_args,               # TrainingArguments built in the previous cell
    compute_metrics=compute_metrics,  # accuracy + macro precision/recall/F1
    train_dataset=train_dataloader,
    eval_dataset=test_dataset,
)


In [None]:
# Fine-tune `model` with the settings in `training_args`; evaluation on `test_dataset`
# is scheduled every `eval_steps` steps.
trainer.train()

In [None]:


# Local directory that receives the fine-tuned model and the tokenizer files.
model_path = "BERT-20_newsgroup_classifier_noisy_FULL"
# Write the trainer's model to that directory.
trainer.save_model(model_path)
# The Trainer was built without a tokenizer, so the tokenizer is saved explicitly.
# As the cell's last expression, the tuple of written file paths is displayed.
tokenizer.save_pretrained(model_path)

('BERT-20_newsgroup_classifier_noisy_FULL/tokenizer_config.json',
 'BERT-20_newsgroup_classifier_noisy_FULL/special_tokens_map.json',
 'BERT-20_newsgroup_classifier_noisy_FULL/vocab.txt',
 'BERT-20_newsgroup_classifier_noisy_FULL/added_tokens.json',
 'BERT-20_newsgroup_classifier_noisy_FULL/tokenizer.json')

In [None]:
def predict(text):
    """
    Predicts the class label for a given input text.

    Uses the notebook-level `tokenizer` and `model`. The model is switched to
    evaluation mode (and stays in it after the call) and the forward pass runs
    without gradient tracking, so the returned tensors carry no autograd graph.

    Args:
        text (str): The input text for which the class label needs to be predicted.
            Only a single text is supported: the predicted index is taken over
            the flattened probability tensor.

    Returns:
        probs (torch.Tensor): Class probabilities for the input text, shape
            (1, num_classes), on the model's device.
        pred_label_idx (torch.Tensor): Scalar tensor holding the index of the
            predicted class label.
        pred_label (str): The predicted class label.
    """
    # Inference only: disable dropout so repeated calls give the same result.
    model.eval()

    # Tokenize the input text and move the tensors to wherever the model weights
    # live (GPU or CPU); mismatched devices would make the forward pass fail.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Get model output without recording gradients (less memory, no grad_fn).
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs[0] holds the raw logits with shape (batch_size, num_classes); here
    # batch_size is 1. Softmax over dimension 1 (the class dimension) normalizes
    # the logits so that they sum to 1 and can be read as class probabilities.
    probs = outputs[0].softmax(1)

    # Index of the class with the highest probability. With no dimension given,
    # argmax() works on the flattened tensor, which is correct for a single text.
    pred_label_idx = probs.argmax()

    # Map the predicted class index to the class name; .item() extracts the
    # Python scalar from the single-value tensor.
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [None]:
# Smoke test on one hand-written sentence; the cell displays the
# (probabilities, class index, class name) tuple returned by predict().
text="Planets and stars are extremely interesting to me."
predict(text)

(tensor([[0.0026, 0.0234, 0.0063, 0.0088, 0.0056, 0.0082, 0.0086, 0.0049, 0.0037,
          0.0094, 0.0020, 0.0037, 0.0148, 0.0055, 0.8702, 0.0065, 0.0034, 0.0025,
          0.0072, 0.0030]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 tensor(14, device='cuda:0'),
 'sci.space')

In [None]:

# Ask PyTorch to release cached GPU memory.
# NOTE(review): the fine-tuned model is still referenced by `trainer`, so this may
# free little until that reference is dropped.
torch.cuda.empty_cache()

# Reload the model and tokenizer from the directory they were saved to. This rebinds
# the notebook-level `model` and `tokenizer` that predict() uses; the reloaded model
# is not moved to `device` here.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Text-classification pipeline around the reloaded model. "text-classification" is the
# canonical task name; the previous "sentiment-analysis" alias selects the same pipeline
# but is misleading for a 20-class topic classifier.
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
! nvidia-smi

In [None]:
# Classify the same sentence through the pipeline wrapper; the cell displays the
# returned list of label/score entries.
nlp(text)

[{'label': 'sci.space', 'score': 0.9027161598205566}]

In [None]:
# Spot-check accuracy on the first N_EVAL_SAMPLES test examples, or on all of them
# when the test set is smaller, so the loop cannot index past the end of the data.
N_EVAL_SAMPLES = 1000
n_eval = min(N_EVAL_SAMPLES, len(test_texts))

correct = 0
for i in range(n_eval):
    # predict() returns (probs, index, name); compare the predicted class name
    # with the name of the true label (labels are stored as numeric strings).
    prediction = predict(test_texts[i])[-1]
    if prediction == id2label[int(test_labels[i])]:
        correct += 1

# The percentage is derived from the number of examples actually scored, so it
# stays correct if N_EVAL_SAMPLES or the test set size changes.
accuracy_pct = 100 * correct / n_eval if n_eval else float("nan")
print(f"Accuracy = {accuracy_pct:.2f} %")

Accuracy = 87.60 %
