### <font color='blue'>Part 1: Preprocessing and Exploration of the AG_News Dataset</font>

In [2]:
from torch import cuda

# Ask PyTorch whether CUDA is usable in this environment and report the answer.
cuda_is_available = cuda.is_available()
print("Cuda availablity is:", cuda_is_available)

Cuda availablity is: False


#### Load the Dataset

In [8]:
import pandas as pd

# Locations of the AG News train/test CSV files on GitHub
train_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv"
test_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv"

# header=None makes pandas treat the first CSV row as data, so the column
# names are supplied explicitly and shared by both splits.
column_names = ["label", "title", "description"]
train_df = pd.read_csv(train_url, header=None, names=column_names)
test_df = pd.read_csv(test_url, header=None, names=column_names)

# Report (rows, columns) for each split
print("Train dataset shape:", train_df.shape)
print("Test dataset shape:", test_df.shape)

Train dataset shape: (120000, 3)
Test dataset shape: (7600, 3)


#### Combine the title and description columns in both the train and test dataframes

In [9]:
# Add a 'text' column to each split: the title, a single space, then the description.
for frame in (train_df, test_df):
    frame['text'] = frame['title'] + " " + frame['description']

#### Tokenize the text using the Hugging Face Transformers library

In [10]:
from transformers import AutoTokenizer


# Choose a pre-trained model architecture (e.g., BERT).
# A lighter alternative checkpoint is "distilbert-base-uncased".
model_name = "bert-base-uncased"

# Instantiate the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the test split. Sequences longer than 256 tokens are truncated and
# shorter ones are padded to the longest sequence in the call.
#
# The full training split is deliberately NOT tokenized here: the DataLoader
# section below draws a small random sample of the training rows and tokenizes
# only that sample, so encoding every training row at this point would spend
# time and memory on a result that is overwritten before it is ever read.
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=256)

#### Convert the labels into numerical format

In [11]:
# The CSV labels run from 1 to 4; shift them down by one so they run from 0 to 3.
label_offset = 1
train_labels = train_df['label'].to_numpy() - label_offset
test_labels = test_df['label'].to_numpy() - label_offset

#### Create PyTorch DataLoader objects for training and testing

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class AGNewsDataset(Dataset):
    """Map-style dataset that pairs per-example encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (every field is indexed with the same
            example index).
        labels: Indexable collection with one label per example; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'``
        entry with the example's label.
        """
        item = {}
        for field_name, field_values in self.encodings.items():
            item[field_name] = torch.tensor(field_values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Draw a small random subset (frac=0.001) of the training rows; the fixed
# random_state makes the same rows come out on every run.
train_df_sample = train_df.sample(frac=0.001, random_state=42)

# Tokenizer settings shared by both splits
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=256)

# Tokenize the sampled training text and the full test text
train_encodings = tokenizer(train_df_sample['text'].tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(test_df['text'].tolist(), **tokenizer_kwargs)

# Zero-based labels for the sampled training rows and for the test rows
train_labels = train_df_sample['label'].to_numpy() - 1
test_labels = test_df['label'].to_numpy() - 1

# Wrap encodings and labels in dataset objects
train_dataset = AGNewsDataset(train_encodings, train_labels)
test_dataset = AGNewsDataset(test_encodings, test_labels)

# Batch both datasets; only the training batches are shuffled
loader_batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

### <font color='blue'>Part 2: Train and validate the text classification model using PyTorch</font>

#### Define the model, loss function, and optimizer

In [13]:
import torch
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification

# Seed PyTorch's global random number generator before any random weights are
# created, so that the newly initialised classification head (and anything
# else that later draws from the global generator, such as DataLoader
# shuffling) is repeatable after a kernel restart. 42 matches the
# random_state used for the pandas sampling.
torch.manual_seed(42)

# Instantiate the model for sequence classification with 4 output classes
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the loss function and optimizer.
# AdamW now comes from torch.optim: the copy that used to be exported by
# transformers was deprecated in favour of the PyTorch implementation and is
# no longer importable from recent transformers releases.
# eps and weight_decay are passed explicitly because torch.optim.AdamW has
# different defaults (eps=1e-8, weight_decay=0.01); the values below are the
# ones the transformers implementation used by default, which keeps the
# optimizer hyperparameters of this notebook unchanged.
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### Train the model on the preprocessed dataset for several epochs

In [14]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``logits`` attribute.
        dataloader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Loss callable applied as ``criterion(logits, labels)``.
        optimizer: Optimizer whose gradients are reset before, and whose
            ``step()`` is called after, every batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        # Move the three tensors this step needs onto the target device
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Every batch counts equally, regardless of how many examples it holds
    return sum(batch_losses, 0.0) / len(dataloader)

# Number of full passes over the training DataLoader
num_epochs = 20

for epoch in range(num_epochs):
    # One pass over the training batches; the return value is the mean batch loss
    train_loss = train_epoch(
        model=model,
        dataloader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}")

Epoch 1/20, Loss: 1.3002
Epoch 2/20, Loss: 1.0319
Epoch 3/20, Loss: 0.7631
Epoch 4/20, Loss: 0.5259
Epoch 5/20, Loss: 0.2969
Epoch 6/20, Loss: 0.1466
Epoch 7/20, Loss: 0.0614
Epoch 8/20, Loss: 0.0355
Epoch 9/20, Loss: 0.0224
Epoch 10/20, Loss: 0.0168
Epoch 11/20, Loss: 0.0137
Epoch 12/20, Loss: 0.0117
Epoch 13/20, Loss: 0.0103
Epoch 14/20, Loss: 0.0094
Epoch 15/20, Loss: 0.0083
Epoch 16/20, Loss: 0.0077
Epoch 17/20, Loss: 0.0069
Epoch 18/20, Loss: 0.0067
Epoch 19/20, Loss: 0.0063
Epoch 20/20, Loss: 0.0055


#### Evaluate the model's performance on the test set

In [15]:
def evaluate(model, dataloader, device):
    """Predict a class for every example in ``dataloader`` without gradient tracking.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``logits`` attribute; it is switched to eval mode.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(true_labels, pred_labels)`` pair of lists, in dataloader order.
        Each prediction is the index of the largest logit along dimension 1.
    """
    model.eval()
    true_labels, pred_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            labels = batch['labels'].to(device)
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            # Keep only the position of the maximum logit, not its value
            preds = logits.max(dim=1).indices

            # Bring results back to the CPU and accumulate them element by element
            true_labels += list(labels.cpu().numpy())
            pred_labels += list(preds.cpu().numpy())

    return true_labels, pred_labels

# Collect the reference labels and the model's predictions for the test DataLoader
true_labels, pred_labels = evaluate(model=model, dataloader=test_loader, device=device)

#### Calculate performance metrics

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fraction of test examples whose predicted class equals the reference class
accuracy = accuracy_score(true_labels, pred_labels)

# Precision, recall and F1 all use the same 'weighted' averaging mode across classes
precision, recall, f1 = (
    score_fn(true_labels, pred_labels, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Print the performance metrics, four decimal places each
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.8828
Precision: 0.8833
Recall: 0.8828
F1-score: 0.8825
