### Requirements

In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import  TensorDataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


##### Data Preparation

In [2]:
# 0  world
# 1  sport
# 2  business
# 3  tech
train_data = pd.read_csv(r"projects\text-classification\data\train.csv")
test_data = pd.read_csv(r"projects\text-classification\data\test.csv")

test_data.rename(columns={
    'Class Index':'label',
    'Title':'title',
    'Description':'text',
}, inplace=True)

train_data.rename(columns={
    'Class Index':'label',
     'Title':'title',
      'Description':'text',
}, inplace=True)

train_data.drop(axis=1, columns='title', inplace=True)
test_data.drop(axis=1, columns='title', inplace=True)

train_data = train_data.sample(frac=0.0051, random_state=0)
test_data = test_data.sample(frac=0.0051, random_state=0)

train_data['label'].replace({1:0, 2:1, 3:2, 4: 3}, inplace=True)
test_data['label'].replace({1:0, 2:1, 3:2, 4: 3}, inplace=True)

x_train = train_data['text'].tolist()
y_train = train_data['label'].tolist()

x_test = test_data['text'].tolist()
y_test = test_data['label'].tolist()

train_data['label'].value_counts()

0    165
2    158
1    145
3    144
Name: label, dtype: int64

##### Data Preprocessing

In [3]:
# Define the model name and number of labels
num_labels = 4

# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

train_encodings = tokenizer(x_train, truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(x_test, truncation=True, padding=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
## These are the actual sentence not the cleaned sentence
x_train[0]

'London - British airline magnate Richard Branson announced a plan on Monday for the world #39;s first commercial space flights, saying  quot;thousands quot; of fee-paying astronauts could be sent into orbit in the near future.'

In [5]:
print(train_encodings['input_ids'][0])

tensor([  101,  1498,   118,  1418,  8694, 12477, 21772,  2055,   139,  4047,
         2142,  1717,   170,  2197,  1113,  6356,  1111,  1103,  1362,   108,
         3614,   132,   188,  1148,  2595,  2000,  7306,   117,  2157,   186,
        11848,  1204,   132,  4674,   186, 11848,  1204,   132,  1104,  7216,
          118,  6573, 27149,  1116,  1180,  1129,  1850,  1154,  8895,  1107,
         1103,  1485,  2174,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [6]:
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(y_train))

val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(y_test))



# Fine-tune the BERT model
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

  train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
  torch.tensor(train_encodings['attention_mask']),


##### Model Training

In [None]:
for epoch in range(2):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluate the model
    model.eval()
    val_loader = DataLoader(val_dataset, batch_size=16)
    val_accuracy = 0
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            val_accuracy += torch.sum(predictions == labels).item()
    val_accuracy /= len(x_test)
    print(f'Epoch {epoch + 1} val accuracy: {val_accuracy:.4f}')


##### Save Tokenizer and Trained model

In [None]:
model.save_pretrained('bert_classifier')
tokenizer.save_pretrained('bert_classifier')


##### Load Tokenizer and trained Model

In [3]:
# Load saved model and tokenizer
model = BertForSequenceClassification.from_pretrained(r'E:\Space\Code\data science\bert_classifier')
tokenizer = BertTokenizer.from_pretrained(r'E:\Space\Code\data science\bert_classifier')
    

##### Make Prediction using new data

In [None]:
# Tokenize input text
input_text = """

According to a SamMobile report, Samsung is likely to roll out its purported Android 14-based One UI 6 update to select Galaxy phones including the latest Galaxy S23 series. It will be released for some Galaxy handsets that are 3 years old, as the company promises four years of OS updates for recently launched models. This suggests that the older Galaxy S20 series, S20 FE, Galaxy S10 Lite and a few other phones will not get the next major OS update and will only get the security updates.

The Samsung Galaxy S20, Galaxy S20+, Galaxy S20 Ultra, Galaxy S10 Lite, and Galaxy S20 FE will reportedly not get the Android 14 update. Meanwhile, Samsung's foldable handsets that are part of the Galaxy Z series, including the Samsung Galaxy Z Fold 2, and Galaxy Z Flip, will also not receive the update as per the report.

"""
input_ids = tokenizer.encode(input_text, truncation=True, padding=True)
attention_mask = [int(i != 0) for i in input_ids]

# Make a prediction
output = model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))

# Get predicted label
predicted_label = torch.argmax(output[0]).item()
predicted_label

#### ------------------------------------------- END -------------------------------------------