In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate>=0.20.1



Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:0

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [3]:
dataset = load_dataset("Deysi/spam-detection-dataset")

Downloading readme:   0%|          | 0.00/581 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/663k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8175 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2725 [00:00<?, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8175
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2725
    })
})

In [5]:
# Tokenizer and model setup
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Convert labels to integers
def convert_to_int(example):
    example["label"] = int(example["label"] == "spam")  # Convert "spam" to 1, "ham" to 0
    return example

dataset["train"] = dataset["train"].map(convert_to_int)
dataset["test"] = dataset["test"].map(convert_to_int)


Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

Map:   0%|          | 0/2725 [00:00<?, ? examples/s]

In [7]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

Map:   0%|          | 0/2725 [00:00<?, ? examples/s]

In [8]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./spam_classification",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


In [9]:
# Training
trainer.train()


Step,Training Loss,Validation Loss
1000,0.0013,0.006402


TrainOutput(global_step=1533, training_loss=0.008674945942532035, metrics={'train_runtime': 744.3537, 'train_samples_per_second': 32.948, 'train_steps_per_second': 2.06, 'total_flos': 6452798632704000.0, 'train_loss': 0.008674945942532035, 'epoch': 3.0})

In [11]:
# Evaluation
results = trainer.evaluate()
print(results)

{'eval_loss': 0.006952384486794472, 'eval_runtime': 27.9481, 'eval_samples_per_second': 97.502, 'eval_steps_per_second': 12.201, 'epoch': 3.0}


In [12]:
results

{'eval_loss': 0.006952384486794472,
 'eval_runtime': 27.9481,
 'eval_samples_per_second': 97.502,
 'eval_steps_per_second': 12.201,
 'epoch': 3.0}

In [14]:
'''Yes, you can use the trained spam classification model to make predictions on an unlabeled dataset to classify emails as either spam or non-spam (ham). Here are the general steps to do this:

Load and Tokenize the Unlabeled Dataset:

Load the unlabeled dataset that you want to classify.
Tokenize the text data in the dataset using the same tokenizer that you used during training. This ensures that the input data is preprocessed in the same way.
Use the Trained Model for Inference:

Initialize the trained model that you fine-tuned for spam classification.
Make Predictions:

Pass the tokenized text data through the model to make predictions.
The model will output probabilities for each class (spam and non-spam).
You can classify emails as spam if the probability for the "spam" class is above a certain threshold (e.g., 0.5), and as non-spam (ham) otherwise.
Here's a code snippet that demonstrates how to perform this classification on an unlabeled dataset:

python
Copy code
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and the trained model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("spam_classification")  # Load your fine-tuned model

# Load and tokenize the unlabeled dataset
unlabeled_dataset = [...]  # Load and preprocess your unlabeled dataset here

# Make predictions
predictions = []

for text in unlabeled_dataset:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1)  # Apply softmax to get class probabilities
    spam_probability = probabilities[0, 1].item()  # Probability of being spam (class index 1)
    predictions.append((text, spam_probability))

# Classify emails based on probability threshold
threshold = 0.5
classified_emails = [(text, spam_prob > threshold) for text, spam_prob in predictions]

# Display or save the classified emails
for text, is_spam in classified_emails:
    if is_spam:
        print(f"SPAM: {text}")
    else:
        print(f"HAM: {text}")
In this code:

Replace [...] with the code to load and preprocess your unlabeled dataset.
The model is used to predict the probability of each input being spam.
You can adjust the threshold value to control the trade-off between precision and recall in your classification.
This code will allow you to classify emails in your unlabeled dataset as spam or non-spam using the fine-tuned model.'''

'Yes, you can use the trained spam classification model to make predictions on an unlabeled dataset to classify emails as either spam or non-spam (ham). Here are the general steps to do this:\n\nLoad and Tokenize the Unlabeled Dataset:\n\nLoad the unlabeled dataset that you want to classify.\nTokenize the text data in the dataset using the same tokenizer that you used during training. This ensures that the input data is preprocessed in the same way.\nUse the Trained Model for Inference:\n\nInitialize the trained model that you fine-tuned for spam classification.\nMake Predictions:\n\nPass the tokenized text data through the model to make predictions.\nThe model will output probabilities for each class (spam and non-spam).\nYou can classify emails as spam if the probability for the "spam" class is above a certain threshold (e.g., 0.5), and as non-spam (ham) otherwise.\nHere\'s a code snippet that demonstrates how to perform this classification on an unlabeled dataset:\n\npython\nCopy co