Setup Google Colab Environment

In [4]:
!pip install transformers pandas numpy matplotlib seaborn




In [5]:
!pip install scikit-learn



In [6]:
import sklearn
print(sklearn.__version__)


1.6.0


In [7]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Load the YouTube Spam Collection Dataset

In [8]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

# Load the dataset into a DataFrame
df = pd.read_csv("Youtube02-KatyPerry.csv")

# Display the first few rows
print(df.head())




Saving Youtube02-KatyPerry.csv to Youtube02-KatyPerry.csv
                              COMMENT_ID        AUTHOR                 DATE  \
0      z12pgdhovmrktzm3i23es5d5junftft3f   lekanaVEVO1  2014-07-22T15:27:50   
1    z13yx345uxepetggz04ci5rjcxeohzlrtf4      Pyunghee  2014-07-27T01:57:16   
2  z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k    Erica Ross  2014-07-27T02:51:43   
3    z13jcjuovxbwfr0ge04cev2ipsjdfdurwck  Aviel Haimov  2014-08-01T12:27:48   
4  z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k    John Bello  2014-08-01T21:04:03   

                                             CONTENT  CLASS  
0  i love this so much. AND also I Generate Free ...      1  
1  http://www.billboard.com/articles/columns/pop-...      1  
2  Hey guys! Please join me in my fight to help a...      1  
3  http://psnboss.com/?ref=2tGgp3pV6L this is the...      1  
4  Hey everyone. Watch this trailer!!!!!!!!  http...      1  


In [9]:
#Clean the Data
df = df[['CONTENT', 'CLASS']] #keep only text and label

In [10]:
#Preprocess the text
import re

def clean_text(text):
  text = re.sub(r"htttp\S+", "", text)#Remove URLs
  test = re.sub(r"[^A-Za-z0-9\s]","", text) #Remove special characters
  text = text.lower().strip() #Lowercase and trim
  return text
df['CONTENT'] = df['CONTENT'].apply(clean_text)

Model Preparation
- Install and Import Required Libraries

In [11]:
!pip install transformers
!pip install torch scikit-learn

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader




- Tokenize Text Using BERT

In [12]:
#Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Tokenize text with padding/trunaction
def tokenize_texts(texts, max_len=128):
  return tokenizer(
      texts.tolist(),
      max_length=max_len,
      padding = 'max_length',
      truncation=True,
      return_tensors='pt'
  )
tokens = tokenize_texts(df['CONTENT'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Dataset Splitting
- Create Training and Validation Sets



In [13]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['CONTENT'], df['CLASS'], test_size=0.2, random_state=42
)

Prepare Dataset for PyTorch
- Define a custom dataset class:

In [14]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            texts.tolist(),
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels.values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = SpamDataset(train_texts, train_labels)
val_dataset = SpamDataset(val_texts, val_labels)


Model Training
- Load Pre-trained BERT

In [15]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


- Define Training Parameters

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps"
)

In [17]:
import os
os.getcwd()


'/content'

- Train the Model

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
10,0.7574,0.784713
20,0.7326,0.74946
30,0.7417,0.689153
40,0.6543,0.624216
50,0.5986,0.522912
60,0.5303,0.47876
70,0.4555,0.418888
80,0.407,0.355858
90,0.3222,0.300649
100,0.2762,0.245864


TrainOutput(global_step=105, training_loss=0.535962944939023, metrics={'train_runtime': 1573.2917, 'train_samples_per_second': 0.534, 'train_steps_per_second': 0.067, 'total_flos': 55253321625600.0, 'train_loss': 0.535962944939023, 'epoch': 3.0})

Model Evaluation

In [19]:
from sklearn.metrics import classification_report

# Predict on validation set
predictions = trainer.predict(val_dataset)
pred_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Evaluate
print(classification_report(val_labels, pred_labels))


              precision    recall  f1-score   support

           0       0.87      1.00      0.93        34
           1       1.00      0.86      0.93        36

    accuracy                           0.93        70
   macro avg       0.94      0.93      0.93        70
weighted avg       0.94      0.93      0.93        70



Deployment
- Save the Model

In [20]:
model.save_pretrained('./spam_detector_model')
tokenizer.save_pretrained('./spam_detector_model')


('./spam_detector_model/tokenizer_config.json',
 './spam_detector_model/special_tokens_map.json',
 './spam_detector_model/vocab.txt',
 './spam_detector_model/added_tokens.json')

Load the Model for Real-Time Predictions

In [21]:
from transformers import pipeline

spam_detector = pipeline("text-classification", model='./spam_detector_model', tokenizer=tokenizer)

# Test on new data
new_comment = "She named the tiger Kitty Purry  No, seriously, she did, check the video"
print(spam_detector(new_comment))


Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.8272313475608826}]


Deploy Using Flask (Optional)

In [22]:
import pickle




In [23]:
filename = 'spam_detector_model.sav'
pickle.dump(spam_detector, open(filename, 'wb'))

In [24]:
#loading the saved model
loaded_model = pickle.load(open('spam_detector_model.sav', 'rb'))