In [None]:
!pip install datasets


In [None]:
!unzip train.csv.zip
!unzip test.csv.zip
!unzip test_labels.csv.zip


In [None]:
import pandas as pd


In [4]:

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
test_labels_df = pd.read_csv('test_labels.csv')

# Display the first few rows of the train dataset
train_df.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:

X_train = train_df['comment_text']
y_train = train_df['toxic']


In [6]:

filtered_test_df = test_labels_df[test_labels_df['toxic'] != -1]
filtered_test_df = filtered_test_df.merge(test_df, on="id")


In [7]:
X_test = filtered_test_df['comment_text']
y_test = filtered_test_df["toxic"]


In [None]:
from transformers import AutoTokenizer

# Load the TinyBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

# Tokenize the datasets with truncation and shorter sequences
def tokenize_data(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128)

train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)


In [9]:
import torch


In [10]:

class ToxicCommentsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [11]:

# Create the PyTorch datasets
train_dataset = ToxicCommentsDataset(train_encodings, y_train.values)
test_dataset = ToxicCommentsDataset(test_encodings, y_test.values)


In [11]:
from transformers import TrainerCallback


In [12]:
# Custom callback to make sure weights are contiguous before saving
class MakeWeightsContiguousCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        # Ensure that model weights are contiguous
        model = kwargs['model']
        for param in model.parameters():
            param.data = param.data.contiguous()


In [13]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


In [14]:
# Load the pretrained TinyBERT model for sequence classification (2 labels: toxic, non-toxic)
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)


pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",           # Output directory
    evaluation_strategy="epoch",      # Evaluate at each epoch
    learning_rate=2e-5,               # Learning rate
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=3,               # Number of training epochs
    weight_decay=0.01,                # Weight decay for optimization
    fp16=True                         # Mixed precision training
)




In [16]:
# Initialize the Trainer with the custom callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[MakeWeightsContiguousCallback()]  # Add the callback here
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [17]:
for name, param in model.named_parameters():
    if not param.is_contiguous():
        print(f"Non-contiguous tensor found: {name}")
        param.data = param.data.contiguous()


Non-contiguous tensor found: bert.encoder.layer.0.attention.self.query.weight
Non-contiguous tensor found: bert.encoder.layer.0.attention.self.key.weight
Non-contiguous tensor found: bert.encoder.layer.0.attention.self.value.weight
Non-contiguous tensor found: bert.encoder.layer.0.attention.output.dense.weight
Non-contiguous tensor found: bert.encoder.layer.0.intermediate.dense.weight
Non-contiguous tensor found: bert.encoder.layer.0.output.dense.weight
Non-contiguous tensor found: bert.encoder.layer.1.attention.self.query.weight
Non-contiguous tensor found: bert.encoder.layer.1.attention.self.key.weight
Non-contiguous tensor found: bert.encoder.layer.1.attention.self.value.weight
Non-contiguous tensor found: bert.encoder.layer.1.attention.output.dense.weight
Non-contiguous tensor found: bert.encoder.layer.1.intermediate.dense.weight
Non-contiguous tensor found: bert.encoder.layer.1.output.dense.weight
Non-contiguous tensor found: bert.pooler.dense.weight


In [18]:
# Make model weights contiguous in-place
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()


In [19]:
# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.1459,0.275235
2,0.1465,0.267949
3,0.1133,0.325081


TrainOutput(global_step=59841, training_loss=0.1410714025789558, metrics={'train_runtime': 792.2575, 'train_samples_per_second': 604.239, 'train_steps_per_second': 75.532, 'total_flos': 152049665594880.0, 'train_loss': 0.1410714025789558, 'epoch': 3.0})

In [22]:
# Evaluate the model on the test set
results = trainer.evaluate()


In [23]:
# Print the evaluation results (accuracy, loss, etc.)
print(results)


{'eval_loss': 0.3250805139541626, 'eval_runtime': 31.2457, 'eval_samples_per_second': 2047.576, 'eval_steps_per_second': 255.971, 'epoch': 3.0}


In [26]:

# Function to predict if a query is toxic or not
def predict_toxicity(query):
    # Set device (use GPU if available)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Move model to the correct device
    model.to(device)

    # Tokenize the query and move input tensors to the same device
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # Forward pass through the model
    outputs = model(**inputs)

    # Get the predicted class
    logits = outputs.logits
    prediction = logits.argmax(-1).item()  # 0 for non-toxic, 1 for toxic

    return "Toxic" if prediction == 1 else "Non-Toxic"


In [27]:
# Example of a new query
query = "I loved my cat Nixy."
result = predict_toxicity(query)
print(f"The query is classified as: {result}")


The query is classified as: Non-Toxic


In [42]:
# Example of a new query
query_two = "I hate pinguins."
result_two = predict_toxicity(query_two)
print(f"The query is classified as: {result_two}")


The query is classified as: Toxic


In [28]:
# Define the directory where you want to save the model and tokenizer
save_directory = "./fine_tuned_tinybert"

# Save the fine-tuned TinyBERT model
model.save_pretrained(save_directory)

# Save the tokenizer as well
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


Model and tokenizer saved to ./fine_tuned_tinybert


In [29]:
!zip -r fine_tuned_tinybert.zip fine_tuned_tinybert


  adding: fine_tuned_tinybert/ (stored 0%)
  adding: fine_tuned_tinybert/tokenizer_config.json (deflated 75%)
  adding: fine_tuned_tinybert/special_tokens_map.json (deflated 42%)
  adding: fine_tuned_tinybert/tokenizer.json (deflated 71%)
  adding: fine_tuned_tinybert/model.safetensors (deflated 7%)
  adding: fine_tuned_tinybert/config.json (deflated 49%)
  adding: fine_tuned_tinybert/vocab.txt (deflated 53%)


In [30]:
from google.colab import files
files.download('fine_tuned_tinybert.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>