In [2]:
# !pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [4]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import evaluate
from datasets import Dataset
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from transformers import pipeline
from transformers import AutoTokenizer

In [5]:
masterDF = pd.read_csv('/kaggle/input/telegram-spam-or-ham/dataset.csv')

In [6]:
masterDF.head(5)

Unnamed: 0,text_type,text
0,spam,naturally irresistible your corporate identity...
1,spam,the stock trading gunslinger fanny is merrill ...
2,spam,unbelievable new homes made easy im wanting to...
3,spam,4 color printing special request additional in...
4,spam,do not have money get software cds from here s...


In [7]:
masterDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20348 entries, 0 to 20347
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  20348 non-null  object
 1   text       20348 non-null  object
dtypes: object(2)
memory usage: 318.1+ KB


In [8]:
masterDF.isnull().sum()

text_type    0
text         0
dtype: int64

In [9]:
masterDF['text_type'].value_counts()

text_type
ham     14337
spam     6011
Name: count, dtype: int64

In [10]:
X = masterDF['text']
y = masterDF['text_type']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
def remove_stopWords_and_punct(text):
    stop_words = set(stopwords.words('english'))
    remove_punct = "".join([char for char in text if char not in string.punctuation])
    word_tokens = word_tokenize(remove_punct)
    remove_stopWords = " ".join([w for w in word_tokens if w.lower() not in stop_words])
    return remove_stopWords

In [13]:
vectorizer = TfidfVectorizer(preprocessor=remove_stopWords_and_punct)
linearSVCpipeline = Pipeline([('tfidf', vectorizer),
                              ('clf', LinearSVC(dual=True))])

randomFORESTpipeline = Pipeline([('tfidf', vectorizer),
                              ('clf', RandomForestClassifier(n_estimators=100))])

In [14]:
linearSVCpipeline.fit(X_train, y_train)
predictions = linearSVCpipeline.predict(X_test)
print(metrics.classification_report(y_test,predictions))
print(f"Accuracy score:{metrics.accuracy_score(y_test,predictions)}")

              precision    recall  f1-score   support

         ham       0.96      0.97      0.96      4338
        spam       0.91      0.89      0.90      1767

    accuracy                           0.94      6105
   macro avg       0.93      0.93      0.93      6105
weighted avg       0.94      0.94      0.94      6105

Accuracy score:0.9441441441441442


In [15]:
randomFORESTpipeline.fit(X_train, y_train)
predictions = randomFORESTpipeline.predict(X_test)
print(metrics.classification_report(y_test,predictions))
print(f"Accuracy score:{metrics.accuracy_score(y_test,predictions)}")

              precision    recall  f1-score   support

         ham       0.92      0.99      0.96      4338
        spam       0.97      0.80      0.88      1767

    accuracy                           0.94      6105
   macro avg       0.95      0.90      0.92      6105
weighted avg       0.94      0.94      0.93      6105

Accuracy score:0.9351351351351351


# Using transformers

In [16]:
masterDF = masterDF.replace({'ham': 0, 'spam': 1})
masterDF = masterDF.rename(columns={'text_type': 'labels'})
dataset = Dataset.from_pandas(masterDF)

  masterDF = masterDF.replace({'ham': 0, 'spam': 1})


In [17]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [18]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

In [19]:
tokenized_masterDF = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/20348 [00:00<?, ? examples/s]

In [20]:
tokenized_masterDF = tokenized_masterDF.train_test_split(test_size=0.3)

In [21]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [24]:
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args = TrainingArguments(
    output_dir="spamDetectionBERT",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_masterDF['train'],
    eval_dataset=tokenized_masterDF['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1981,0.101514,0.976085
2,0.046,0.095289,0.979853


TrainOutput(global_step=1782, training_loss=0.10558480820404292, metrics={'train_runtime': 642.9305, 'train_samples_per_second': 44.306, 'train_steps_per_second': 2.772, 'total_flos': 2743446640908000.0, 'train_loss': 0.10558480820404292, 'epoch': 2.0})

In [27]:
text = "upside only treasury linked annuity upside of annual increases in 5 year t note bonus crediting over normal treasury notes alternative for large municipal bond or t note buyers call or e mail us today or please fill out the form below for more information name e mail phone city state for deposits over 100 000 we don t want anybody to receive our mailings who does not wish to receive them this is professional communication sent to insurance professionals to be removed from this mailing list do not reply to this message instead go he http www insurancemail net legal notice"

model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/spamDetectionBERT/checkpoint-1782")
new_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

classifier = pipeline("text-classification", model=model, tokenizer=new_tokenizer)
classifier(text)

[{'label': 'spam', 'score': 0.9995502829551697}]