**1. Install Required Libraries**

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

**2. Load Data**

In [None]:
import pandas as pd
# Read the train and test splits from CSV files in the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Plain-text preview of the first rows of each split, one after the other.
print(df_train.head(), df_test.head(), sep='\n')

   Class Index                                              Title  \
0            3  Wall St. Bears Claw Back Into the Black (Reuters)   
1            3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2            3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3            3  Iraq Halts Oil Exports from Main Southern Pipe...   
4            3  Oil prices soar to all-time record, posing new...   

                                         Description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  
   Class Index                                              Title  \
0            3                  Fears for T N pension after talks   
1            4  The Race is On: Second Private Team Sets Launc...   
2            4      Ky. Company Wins Grant to St

**3. Preprocessing the text**

In [None]:
def combine_title_and_description(df):
  """Merge the ``Title`` and ``Description`` columns into a single ``text`` column.

  The two columns are joined as ``"<Title>. <Description>"``. Missing values
  are treated as empty strings so a single NaN does not abort the whole
  transformation. The input frame is left untouched.

  Args:
    df: DataFrame containing ``Title`` and ``Description`` columns.

  Returns:
    A new DataFrame without ``Title``/``Description`` and with ``text``
    appended as the last column.

  Raises:
    KeyError: If ``Title`` or ``Description`` is missing from ``df``.
  """
  title = df['Title'].fillna('').astype(str)
  description = df['Description'].fillna('').astype(str)
  combined = title + '. ' + description
  # drop() returns a copy, so assigning onto it never mutates the caller's frame.
  return df.drop(columns=['Title', 'Description']).assign(text=combined)

In [None]:
# Replace Title/Description with the merged `text` column in both splits.
# Re-running this cell without reloading the CSVs raises KeyError, because
# the source columns are gone after the first pass.
df_train, df_test = (
    combine_title_and_description(df_train),
    combine_title_and_description(df_test),
)
df_train.head()

Unnamed: 0,Class Index,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# membership checks in `remove_stopwords` are constant-time.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def remove_stopwords(text, stop_word_set=None):
  """Return ``text`` with stop words removed.

  The text is split on whitespace and a token is dropped when its lower-cased
  form is in the stop-word collection. Tokens keep their original casing and
  any attached punctuation (so ``"the,"`` is not treated as ``"the"``).
  Surviving tokens are re-joined with single spaces.

  Args:
    text: String to filter.
    stop_word_set: Optional collection of lower-case stop words. When omitted,
      the module-level ``stop_words`` set is used.

  Returns:
    The filtered string.
  """
  if stop_word_set is None:
    stop_word_set = stop_words
  return " ".join(
      token for token in text.split() if token.lower() not in stop_word_set
  )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Overwrite the `text` column of both splits with its stop-word-free version.
for split_df in (df_train, df_test):
  split_df['text'] = split_df['text'].apply(remove_stopwords)

In [None]:
# Derive `label` by shifting `Class Index` down by one, so labels start at 0.
df_train['label'] = df_train['Class Index'].sub(1)
df_test['label'] = df_test['Class Index'].sub(1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split repeatable across runs.
all_texts = df_train['text'].tolist()
all_labels = df_train['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the training frame after stop-word removal and label creation.
df_train.head()

Unnamed: 0,Class Index,text,label
0,3,Wall St. Bears Claw Back Black (Reuters). Reut...,2
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,3,Oil Economy Cloud Stocks' Outlook (Reuters). R...,2
3,3,Iraq Halts Oil Exports Main Southern Pipeline ...,2
4,3,"Oil prices soar all-time record, posing new me...",2


**4. Tokenization with BERT**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the `bert-base-uncased` checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
  """Tokenize ``examples['text']`` with the module-level ``tokenizer``.

  Sequences are truncated at 512 tokens and padded with the
  ``'max_length'`` strategy.
  NOTE(review): not called anywhere in the visible notebook.
  """
  encoded = tokenizer(
      examples['text'],
      truncation=True,
      max_length=512,
      padding='max_length',
  )
  return encoded

# Identical tokenizer settings for both splits: truncate at 512 tokens and
# let the tokenizer pad each call's sequences (padding=True).
encode_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **encode_kwargs)
val_encodings = tokenizer(val_texts, **encode_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**5. Convert Tokenized Data into PyTorch tensors**

In [None]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
  """Map-style dataset pairing tokenizer encodings with per-example labels.

  Args:
    encodings: Mapping of feature name to a per-example sequence; it must
      contain ``'input_ids'`` (for instance the output of the tokenizer).
    labels: Sequence holding one label per example.

  Raises:
    KeyError: If ``encodings`` has no ``'input_ids'`` entry.
    ValueError: If the number of labels differs from the number of examples.
  """

  def __init__(self, encodings, labels):
    # Fail fast here instead of part-way through training: __len__ relies on
    # 'input_ids', and __getitem__ indexes labels by example position.
    if 'input_ids' not in encodings:
      raise KeyError("encodings must contain 'input_ids'")
    num_examples = len(encodings['input_ids'])
    if len(labels) != num_examples:
      raise ValueError(
          f"Got {len(labels)} labels for {num_examples} encoded examples"
      )
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    """Return the number of encoded examples."""
    return len(self.encodings['input_ids'])

  def __getitem__(self, idx):
    """Return a dict of tensors for example ``idx``, including its ``'labels'``."""
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

In [None]:
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
# NOTE: despite its name, `test_dataset` wraps the validation split here.
test_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)

# Both sizes should be non-zero.
train_size = len(train_dataset)
val_size = len(test_dataset)
print(f"Training dataset size: {train_size}")
print(f"Validation dataset size: {val_size}")

Training dataset size: 96000
Validation dataset size: 24000


In [None]:
# List the feature names returned by the tokenizer; 'input_ids' must be among
# them because NewsDataset reads it to determine its length.
print(train_encodings.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


**6. Load Pretrained BERT Model**

In [None]:
from transformers import AutoModelForSequenceClassification

# Load pretrained BERT with a 4-way classification head (num_labels=4).
# The classifier weights are newly initialised, so the model must be
# fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Multi-GPU: the model is deliberately NOT wrapped in torch.nn.DataParallel.
# The Hugging Face Trainer handles data parallelism itself, and a manually
# wrapped model no longer exposes `save_pretrained`, which the save cell calls.
if torch.cuda.device_count() > 1:
  print(f"Using {torch.cuda.device_count()} GPUs!")

**7. Train the Model**

In [None]:
import os
# Turn off Weights & Biases logging.
# NOTE(review): the Trainer warns that WANDB_DISABLED is deprecated and
# suggests the `report_to` setting instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import AdamW

# Define Training Arguments
training_args = TrainingArguments(
    # Mixed precision requires CUDA; fall back to fp32 on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    output_dir='./results',
    # Evaluate and checkpoint once per epoch (`eval_strategy` is the current
    # name of the former `evaluation_strategy` argument).
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # Learning rate and weight decay are given here so the Trainer builds the
    # AdamW optimizer itself; a hand-built optimizer would bypass `weight_decay`.
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    logging_steps=10,
    # Disable external experiment trackers (wandb etc.).
    report_to="none",
)

# Create a Trainer; `test_dataset` holds the validation split at this point.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-dd002018d705>", line 30, in <cell line: 0>
    trainer.train()
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2171, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2531, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3712, in training_step
    self.accelerator.backward(loss, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2246, in backward
    loss.backward(**kwargs)
  File "/usr/local/lib/pytho

TypeError: object of type 'NoneType' has no len()

**8. Evaluate and Save the Model**

In [None]:
# Evaluate on the dataset that was passed as `eval_dataset` when the Trainer
# was created (the validation split).
trainer.evaluate()

In [None]:
# `model` may be a torch.nn.DataParallel wrapper, which has no
# `save_pretrained`; unwrap to the underlying model before saving.
model_to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
model_to_save.save_pretrained('./saved_model')
# Store the tokenizer files next to the weights so the directory is self-contained.
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

**9. Predict on the Test Dataset**

In [None]:
# Preview the preprocessed test frame before running inference.
df_test.head()

Unnamed: 0,Class Index,text
0,3,Fears N pension talks. Unions representing wor...
1,4,Race On: Second Private Team Sets Launch Date ...
2,4,Ky. Company Wins Grant Study Peptides (AP). AP...
3,4,Prediction Unit Helps Forecast Wildfires (AP)....
4,4,Calif. Aims Limit Farm-Related Smog (AP). AP -...


In [None]:
# Tokenise the held-out test split with the same settings used for training.
test_texts = df_test['text'].tolist()
test_labels = df_test['label'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)
# From here on `test_dataset` refers to the real test split.
test_dataset = NewsDataset(test_encodings, test_labels)

# Run inference and take the highest-scoring class per row.
predictions = trainer.predict(test_dataset)
logits = torch.tensor(predictions.predictions)
predicted_labels = logits.argmax(dim=1)

# Convert back to 1-based index before writing the results to disk.
df_test['Predicted Class'] = predicted_labels.numpy() + 1
df_test.to_csv('predictions.csv', index=False)

**10. Calculate the Accuracy**

In [None]:
from sklearn.metrics import accuracy_score

# Compare the 0-based gold labels with the 0-based predictions.
y_true = df_test['label']
y_pred = predicted_labels.numpy()
predicted_accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {predicted_accuracy:.4f}")