### Binary Classification - SpamFilter-sm

#### Import the Necessary Libraries

In [None]:
import os
import re

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score

import torch
import torch.nn
# !pip install datasets
from datasets import Dataset, DatasetDict

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import TrainingArguments, Trainer, get_scheduler, set_seed

import subprocess
import os

# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value

#### Versions of Important Libraries Used

In [None]:
# Report the versions of the core libraries this notebook runs against,
# one aligned "Name: version" line per library.
for library_label, library_module in (("Pandas:", pd), ("NumPy:", np), ("Torch:", torch)):
    print(library_label.ljust(18), library_module.__version__)

#### Basic Constants/Values Used

In [None]:
# Base checkpoint that the tokenizer and every model in this notebook load.
model_ckpt = "distilbert-base-uncased"

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Fixed seed for the rest of the notebook.
set_seed(42)

#### Ingest & Preprocess Data

In [None]:
# training file
# train_data = pd.read_csv('SMS_train.csv', encoding='unicode_escape')
def clean_text(text: str):
    """Return ``text`` unchanged; hook for optional message normalisation.

    Every normalisation step is currently disabled, so this function is the
    identity. The disabled steps are kept below for experimentation.

    Args:
        text: Raw message text.

    Returns:
        The same ``text`` object that was passed in.
    """
    # Disabled steps (lower-casing, then replacing e-mail addresses, URLs,
    # domain names and digit runs with placeholder words):
    # text =  text.lower()
    # text = re.sub(r'\S+@\S+', ' email ', text)
    # text = re.sub(r'(http|www)\S+', ' website ', text)
    # text = re.sub(r'[\S+]\.(com|net)', ' website ', text)
    # text = re.sub(r'\d+', ' number ', text)
    cleaned = text
    return cleaned

# Load the labelled training messages.
train_data = pd.read_csv('train.csv', encoding='utf')
# Alternative source:
# train_data = pd.read_csv('full.csv', encoding='utf')

# Blank out missing cells, then give the raw columns readable names.
train_data = train_data.fillna("")
train_data = train_data.rename(columns={'v2': 'text', 'v1': 'label'})
# Column names used by the alternative source:
# train_data.rename(columns={'Message': 'text', 'Category': 'label'}, inplace=True)

# Rebuild `text` as the concatenated string form of every column from
# position 2 onward in each row.
train_data['text'] = train_data.apply(
    lambda row: ''.join(str(value) for value in row[2:]),
    axis=1,
)
# Optional cleaning pass (disabled here):
# train_data['text'] = train_data['text'].apply(clean_text)

# Keep only the two columns the rest of the notebook uses.
train_data = train_data[['text', 'label']]

# Dump the prepared frame for manual inspection and print a summary.
train_data.to_csv("temp.csv", index=True)
train_data.info()

In [None]:
# testing file
# test_data = pd.read_csv('SMS_test.csv', encoding='unicode_escape')
test_data = pd.read_csv('test.csv', encoding='utf')

test_data.fillna("",inplace=True)

# Concatenate the string form of every column of a row into one message.
test_data['text'] = test_data.apply(lambda row: ''.join(map(str, row[0:])), axis=1)

# Fix: this cell used to clean only `train_data`, so the test messages never
# went through `clean_text` and would be preprocessed differently from the
# training messages as soon as `clean_text` does real work.
test_data['text'] = test_data['text'].apply(clean_text)
# The training-side cleaning call stays here because the equivalent line in
# the training cell is commented out; this keeps the training text unchanged
# from before this fix.
train_data['text'] = train_data['text'].apply(clean_text)

# Keep only the text and attach a placeholder label of 0, since the test
# frame is given no real labels in this notebook.
test_data = pd.DataFrame(test_data['text'])
test_data['label'] = 0
test_data.info()

#### Visualize Message Length (By Result Type)

In [None]:
# Class balance of the training set.
print(train_data.value_counts('label'))

# Temporary helper column: whitespace-separated word count per message.
train_data['message_len'] = train_data['text'].str.split().map(len)
train_data.boxplot(
    column="message_len",
    by="label",
    grid=False,
    showfliers=False,
    color="Blue",
)

# Replace the automatic "grouped by" super-title and axis label with our own.
plt.suptitle('')
plt.title('Message Length by Output')
plt.xlabel('')
plt.ylabel('Words / Tweet')
plt.show()

# The helper column was only needed for the plot.
del train_data['message_len']


#### Convert String Label to Integer Values

In [None]:
# Map the string classes to integer ids (ham -> 0, spam -> 1).
# Fix: the result is assigned back to the column. The previous
# `train_data['label'].replace(..., inplace=True)` modified a column selection
# in place, which is not guaranteed to update `train_data` itself.
train_data['label'] = train_data['label'].replace({'ham': 0, 'spam': 1})

#### Convert Pandas DataFrame to Dataset

In [None]:
# Convert the prepared DataFrame into a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train_data)
# Fix: `Dataset.shuffle` returns a new dataset rather than shuffling in place,
# so the result has to be re-bound; it was previously discarded.
train_dataset = train_dataset.shuffle(seed=42)
print(train_dataset)
print(train_dataset.features)

#### Split into Train/Test/Valid

In [None]:

# Hold out 20% of the labelled messages for validation; the test frame
# becomes its own split.
train_valid = train_dataset.train_test_split(test_size=0.2)
test_dataset = Dataset.from_pandas(test_data)

# Note the naming: the held-out part of the split above ('test') is exposed
# as 'valid', while 'test' is the separately loaded test file.
split_map = {
    'train': train_valid['train'],
    'test': test_dataset,
    'valid': train_valid['test'],
}
ds = DatasetDict(split_map)

for split_title, split_key in (("Training", 'train'), ("Validation", 'valid'), ("Testing", 'test')):
    print(f"{split_title} dataset shape:", ds[split_key].shape)

#### Tokenize Entire Dataset

In [None]:
# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(example):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        example: Mapping with a ``text`` entry holding the messages.

    Returns:
        The tokenizer output for those messages, truncated to at most
        128 tokens and padded.
    """
    return tokenizer(
        example['text'],
        max_length=128,
        truncation=True,
        padding=True,
    )


# Tokenize every split; with batch_size=None each split is handed to
# `tokenize` in a single call.
encoded_data = ds.map(tokenize, batch_size=None, batched=True)

In [None]:
# Smoke test: run one sample message through cleaning and an
# encode -> decode -> tokenize round trip to see how it is split up.
temp_text = 'hmm...bad news...hype park PLA $ number  studio taken...only left  number  bedrm-$ 15 28 ...,ham'

cleaned_sample = clean_text(temp_text)
print(cleaned_sample)

sample_ids = tokenizer.encode(cleaned_sample)
round_tripped = tokenizer.decode(sample_ids)
print(tokenizer.tokenize(round_tripped))


#### Print Sample to Ensure Edits Worked as Expected

In [None]:
# Inspect the tokenized training split: its columns and one arbitrary row.
encoded_train = encoded_data['train']
print(encoded_train.column_names)
print(encoded_train[240])

#### Instantiate Model

In [None]:
# Class id -> class name, and the number of classes the head must predict.
labels_dict = {0: 'ham', 1: 'spam'}
num_labels = 2

# Load the base checkpoint with a sequence-classification head sized for
# `num_labels` classes, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

#### Make Sure That git lfs is Installed; Remember to Enter Access Token

In [None]:
# Access Token ... (placeholder only; enter the token when prompted rather than storing it in the notebook)

# Notebook shell escape, not Python. Per the original author's note: the first
# run installs git lfs; on later runs it only reports that git lfs has already
# been initialized.
!git lfs install

#### Create Function to Compute Metrics

In [None]:
def compute_metrics(pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

#### Instantiate TrainingArguments

In [None]:
batch_size = 128
num_of_epochs = 5
# Half the number of batches in one pass over the training split; used below
# as the checkpoint interval (`save_steps`).
# Fix: `round(x, 0)` returns a float, so the old expression produced a float
# step count, and a training split smaller than about three batches produced
# 0. `round(x)` rounds the same way but returns an int, and the result is
# clamped to at least 1 so it is always a usable step interval.
logging_steps = max(1, round(len(ds["train"]) / batch_size) // 2)
model_name = f"{model_ckpt}-SpamFilter-DunnBC22"
learning_rate = 5e-5


def model_init():
    """Build a fresh classifier from the base checkpoint.

    Returns:
        A sequence-classification model loaded from ``model_ckpt`` with
        ``num_labels`` output classes, moved to ``device``.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_ckpt,
        num_labels=num_labels,
    )
    return fresh_model.to(device)

# !pip install transformers[torch]
args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir=model_name,
    save_strategy="steps",
    save_steps=logging_steps,
    save_total_limit=12,
    # --- schedule ---
    num_train_epochs=num_of_epochs,
    evaluation_strategy="epoch",
    # --- optimisation ---
    learning_rate=learning_rate,
    weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    group_by_length=True,
    # --- console output ---
    disable_tqdm=False,
    # --- disabled options, kept for reference ---
    # compute_metrics=compute_metrics,
    # push_to_hub=True,
    # hub_strategy="every_save",
    # logging_steps=logging_steps,
    # logging_dir="./logs",
    # log_level="error",
)

#### Instantiate Optimizer (& Learning Rate Scheduler)

In [None]:
# optimizer_name = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# num_training_steps = num_of_epochs * len(ds["train"])

# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer_name,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

# optimizer = [optimizer_name, lr_scheduler]

#### Instantiate Trainer

In [None]:
# Install/upgrade the Hub client if it is missing:
# !pip install --upgrade huggingface_hub
import huggingface_hub

# Authenticate against the Hugging Face Hub; the notebook ends with
# `trainer.push_to_hub(...)`, which presumably needs this login - confirm.
# Notebook alternative that was tried and disabled:
# Notebook_login()
huggingface_hub.login()


class MyTrainer(Trainer):
    """Trainer that uses a plain AdamW optimizer with a linear LR decay."""

    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """Create the optimizer and scheduler and store them on the trainer.

        Fixes over the previous override:

        * The optimizer is built from ``self.model`` (the model this trainer
          actually trains, created by ``model_init``) instead of the
          unrelated module-level ``model``.
        * The optimizer and scheduler are assigned to ``self.optimizer`` and
          ``self.lr_scheduler``; the base ``Trainer`` reads those attributes
          and ignores this method's return value.
        * The schedule length is the ``num_training_steps`` passed in by the
          trainer. The old code overwrote it with epochs x number of
          training samples, which is not a step count.
        * Learning rate and weight decay come from ``self.args``, so values
          set per trial by the hyperparameter search take effect.

        Args:
            num_training_steps: Total number of optimisation steps the
                schedule should span.

        Returns:
            ``[optimizer, lr_scheduler]``, kept for compatibility with the
            previous return value.
        """
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.args.learning_rate,
            weight_decay=self.args.weight_decay,
        )

        # Linear decay from the initial learning rate, no warm-up.
        self.lr_scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )

        return [self.optimizer, self.lr_scheduler]


# The trainer builds its model through `model_init` (rather than receiving a
# ready-made `model=`), so a fresh model can be created for each run.
trainer = MyTrainer(
    model_init=model_init,
    # model=model,
    args=args,
    train_dataset=encoded_data["train"],
    eval_dataset=encoded_data["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # optimizers=optimizer,
)

#### Train Model

In [None]:
# !pip install optuna

# trainer.hyperparameter_search(
#     direction="maximize",
#     backend="ray",
#     n_trials=10 ,# number of trials
# )
def optuna_hp_space(trial):
    """Describe the hyperparameter search space for one Optuna trial.

    Args:
        trial: Trial object providing ``suggest_float`` and
            ``suggest_categorical``.

    Returns:
        Dict with a log-scaled ``learning_rate`` drawn from [1e-5, 5e-5]
        and a ``per_device_train_batch_size`` chosen from 16, 32, 64, 128.
    """
    sampled_lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    sampled_batch_size = trial.suggest_categorical(
        "per_device_train_batch_size", [16, 32, 64, 128]
    )
    return {
        "learning_rate": sampled_lr,
        "per_device_train_batch_size": sampled_batch_size,
    }

# Run 20 Optuna trials over the space defined by `optuna_hp_space`,
# maximising the search objective.
# NOTE(review): the plain `trainer.train()` below is disabled, so no run is
# made with the winning hyperparameters; presumably the trainer is left
# holding the model of the last trial rather than the best one - confirm
# before relying on the predictions that follow.
best_trials = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    backend="optuna",
    direction="maximize",
    n_trials=20,
    # compute_objective=compute_objective,
)
# trainer.train()

#### Inference Using Fine-Tuned Model

In [None]:
# Run the trainer's current model over the validation split. The bare
# `.metrics` expression is the cell's displayed output.
valid_predictions = trainer.predict(encoded_data['valid'])
valid_predictions.metrics

In [None]:
# Predict on the test split. Its `label` column was filled with the
# placeholder value 0 when the test frame was built, so metrics computed
# against it are not meaningful and their display stays disabled.
test_predictions = trainer.predict(encoded_data['test'])
# test_predictions.metrics

In [None]:
# One row per test message; columns 0 and 1 hold the two per-class scores
# returned by the model, and `label` is the index of the larger one.
test_scores = test_predictions.predictions
output_dataFrame = pd.DataFrame(test_scores)
output_dataFrame['label'] = np.argmax(test_scores, axis=1)


def _class_name_for_row(row):
    """Name the class of one prediction row by comparing its two scores."""
    if row.iloc[0] > row.iloc[1]:
        return 'ham'
    return 'spam'


# Write one class name per line, with no index and no header.
submission = output_dataFrame.apply(_class_name_for_row, axis=1)
submission.to_csv("submission.txt", index=False, header=False)

In [None]:
# Bare expression: shows the prediction table as the cell's output.
output_dataFrame

#### Create Function to Display Confusion Matrix

In [None]:
def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a confusion matrix normalised over the true classes.

    Args:
        y_preds: Predicted class ids.
        y_true: Reference class ids.
        labels: Display names for the classes. Either a sequence ordered by
            class id, or a mapping ``{class_id: name}`` such as
            ``{0: 'ham', 1: 'spam'}``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Iterating a mapping yields its keys, so passing a dict straight through
    # as display labels would lose the class names. Resolve it to a list of
    # names ordered by class id first.
    if isinstance(labels, dict):
        labels = [labels[class_id] for class_id in sorted(labels)]

    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax)
    plt.title('Normalized Confusion Matrix')
    plt.show()

#### Calculate Baseline Values for Confusion Matrix

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize ``batch['text']`` and return the encodings as NumPy arrays.

    NOTE(review): this calls the module-level ``tokenizer``, not the
    ``tokenizer2`` created just above - confirm which one is intended.

    Args:
        batch: Mapping with a ``text`` entry holding the messages.

    Returns:
        The tokenizer output, truncated to at most 128 tokens and padded.
    """
    encoding_options = dict(
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="np",
    )
    return tokenizer(batch['text'], **encoding_options)


# Re-tokenize every split (this re-binds `encoded_data`).
encoded_data = ds.map(tokenize, batched=True, batch_size=None)

# Base model without a classification head, used only to read hidden states.
model2 = (AutoModel.from_pretrained(model_ckpt, num_labels=num_labels, output_hidden_states=True).to(device))


def extract_hidden_states(batch):
    """Return the hidden state at the first token position for each example.

    Fix: the parameter used to default to ``True``, which could never work
    because the body calls ``batch.items()``. ``batch`` is now required.

    Args:
        batch: Mapping of column name to tensor. Only the entries named in
            ``tokenizer.model_input_names`` are passed to the model.

    Returns:
        Dict with one key, ``"hidden_state"``, holding a NumPy array taken
        from position 0 of the model's last hidden state.
    """
    # Move just the model inputs to the model's device.
    inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    # Inference only, so no gradients are needed.
    with torch.no_grad():
        last_hidden_state = model2(**inputs).last_hidden_state
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

# Expose the model inputs and labels as torch tensors, then add a
# `hidden_state` column to every split.
encoded_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
hidden_data = encoded_data.map(extract_hidden_states, batched=True)

# Feature matrices and label vectors for the train and validation splits.
hidden_train = hidden_data["train"]
hidden_valid = hidden_data["valid"]

X_train = np.array(hidden_train["hidden_state"])
X_valid = np.array(hidden_valid["hidden_state"])
y_train = np.array(hidden_train["label"])
y_valid = np.array(hidden_valid["label"])
X_train.shape, X_valid.shape

#### Plot Confusion Matrix for Fine-Tuned Model

In [None]:
# Predicted class = index of the largest score for each validation message.
valid_scores = valid_predictions.predictions
y_preds = np.argmax(valid_scores, axis=1)
labels = {0: 'ham', 1: 'spam'}

# Sanity check: predictions and reference labels should have equal length.
for item_count in (len(y_preds), len(y_valid)):
    print(item_count)
print(valid_scores.shape)

plot_confusion_matrix(y_preds, y_valid, labels=labels)

#### Push Fine-Tuned Model to HuggingFace Hub (My Profile)

In [None]:
# Upload the trainer's model to the Hugging Face Hub with this commit message
# (presumably relies on the earlier `huggingface_hub.login()` - confirm).
trainer.push_to_hub(commit_message="All Done")