# English Text Classification with BERT

## 1. Environment Setup

In [2]:
# Install required packages
# !pip install transformers datasets torch

## 2. Data Preparation (English Examples)

In [3]:
from datasets import Dataset
import pandas as pd

# English sentiment analysis dataset
data = pd.DataFrame({
    "text": [
        "The product quality is excellent, highly recommended",
        "The service attitude was terrible, very disappointing",
        "Fast shipping with intact packaging",
        "Completely different from the product description",
        "Great value for the price, will buy again",
        "Slow customer service response, problem not resolved"
    ],
    "label": [1, 0, 1, 0, 1, 0]  # 1=Positive, 0=Negative
})

# Split dataset (70% train, 30% test)
dataset = Dataset.from_pandas(data).train_test_split(test_size=0.3)
train_data, test_data = dataset["train"], dataset["test"]

## 3. Initialize BERT Model

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load tokenizer for English
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load pre-trained model with classification layer
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2  # Binary classification
)

# Tokenization example
sample = "This is a test example"
print(f"Tokenized: {tokenizer.tokenize(sample)}")
print(f"Encoded: {tokenizer.encode(sample)}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenized: ['this', 'is', 'a', 'test', 'example']
Encoded: [101, 2023, 2003, 1037, 3231, 2742, 102]


## 4. Data Preprocessing

In [5]:
def preprocess_function(examples):
    """Tokenize text and prepare model inputs"""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

# Apply to datasets
train_data = train_data.map(preprocess_function, batched=True)
test_data = test_data.map(preprocess_function, batched=True)

# Format for PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|██████████| 4/4 [00:00<00:00, 254.23 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 357.91 examples/s]


## 5. Model Training

In [6]:
from transformers import Trainer
from transformers import TrainingArguments

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    eval_strategy="epoch",
    logging_steps=10
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data
)

# Start training
trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,0.836272
2,No log,0.845876
3,No log,0.837666


  return forward_call(*args, **kwargs)


TrainOutput(global_step=3, training_loss=0.4993127981821696, metrics={'train_runtime': 22.7114, 'train_samples_per_second': 0.528, 'train_steps_per_second': 0.132, 'total_flos': 789333166080.0, 'train_loss': 0.4993127981821696, 'epoch': 3.0})

In [None]:
%%sql


## 6. Making Predictions

In [7]:
from transformers import pipeline

# Create classification pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# English test samples
test_samples = [
    "Extremely satisfied with this purchase",
    "The product arrived damaged, terrible quality",
    "Average experience, nothing special"
]

# Get predictions
for sample in test_samples:
    result = classifier(sample)[0]
    print(f"Text: {sample}")
    print(f"Label: {'Positive' if result['label'] == 'LABEL_1' else 'Negative'}")
    print(f"Confidence: {result['score']:.4f}\n")

Device set to use mps:0
  return forward_call(*args, **kwargs)


Text: Extremely satisfied with this purchase
Label: Negative
Confidence: 0.5301

Text: The product arrived damaged, terrible quality
Label: Positive
Confidence: 0.6651

Text: Average experience, nothing special
Label: Negative
Confidence: 0.5130



## 7. Expected Output

```
Text: Extremely satisfied with this purchase
Label: Positive
Confidence: 0.9271

Text: The product arrived damaged, terrible quality
Label: Negative
Confidence: 0.8819

Text: Average experience, nothing special
Label: Negative
Confidence: 0.7123
```