# BERT Slang Identification

In [38]:
!pip install datasets
!pip install evaluate



### imports

In [39]:
import pandas as pd
from datasets import DatasetDict, Dataset, load_dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

### load data

In [40]:
dataset_dict = load_dataset("SohailaMohammed/BERTSlangDetectionInitial")

In [41]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2490
    })
    dev: Dataset({
        features: ['text', 'labels'],
        num_rows: 534
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 534
    })
})

### Train Teacher Model

In [42]:
# Load model directly
model_path = "google-bert/bert-large-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

id2label = {0: "Not Slang", 1: "Slang"}
label2id = {"Not Slang": 0, "Slang": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id,)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze base model

In [43]:
# print layers
for name, param in model.named_parameters():
   print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True


In [44]:
# freeze base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [45]:
# print layers
for name, param in model.named_parameters():
   print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

#### Preprocess text

In [46]:
# define text preprocessing
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [47]:
# tokenize all datasetse
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

In [48]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Evaluation

In [49]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred

    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],3)

    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)

    return {"Accuracy": acc, "AUC": auc}

#### Train model

In [50]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

training_args = TrainingArguments(
    output_dir="bert-slang-detector",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [51]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.1285,0.046029,0.981,0.999
2,0.0537,0.033784,0.985,1.0
3,0.0386,0.024812,0.989,1.0
4,0.0287,0.021564,0.991,1.0
5,0.0289,0.030174,0.991,1.0
6,0.0219,0.020639,0.991,1.0
7,0.0167,0.024137,0.989,1.0
8,0.0183,0.026354,0.989,1.0
9,0.021,0.024136,0.989,1.0
10,0.0181,0.024296,0.989,1.0


TrainOutput(global_step=3120, training_loss=0.03743554307864262, metrics={'train_runtime': 592.1156, 'train_samples_per_second': 42.053, 'train_steps_per_second': 5.269, 'total_flos': 2059062303347448.0, 'train_loss': 0.03743554307864262, 'epoch': 10.0})

### Apply Model to Validation Dataset

In [52]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_data["dev"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

{'Accuracy': 0.994, 'AUC': 0.999}


In [53]:
# Evaluate the model on the test dataset
predictions = trainer.predict(tokenized_data["test"])

# Extract logits, true labels, and metrics
logits = predictions.predictions
labels = predictions.label_ids
predicted_labels = np.argmax(logits, axis=-1)

# Print evaluation metrics
print("Test Metrics:", predictions.metrics)

Test Metrics: {'test_loss': 0.020639292895793915, 'test_Accuracy': 0.991, 'test_AUC': 1.0, 'test_runtime': 5.1863, 'test_samples_per_second': 102.963, 'test_steps_per_second': 12.919}


In [54]:
results_df = pd.DataFrame({
    "text": tokenized_data["test"]["text"],  # Original text from test set
    "true_label": labels,
    "predicted_label": predicted_labels
})

# Display the first few rows
print(results_df.head())

# Save results to a CSV file for detailed inspection
results_df.to_csv("test_results.csv", index=False)

                                                text  true_label  \
0            She's absolutely stunning, a real BUBU.           1   
1  Vista Chemical Co., with three chemical plants...           0   
2  The base rate on corporate loans at large U.S....           0   
3           Things got heated last night, total UDS.           1   
4  Either way it was a pity, because Mr. Stolzman...           0   

   predicted_label  
0                1  
1                0  
2                0  
3                1  
4                0  


### Push to hub

In [55]:
# push model to hub
# trainer.push_to_hub()