In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

# Load the dataset
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/dataset.csv")

# Define the labels
label_dict = {
    "Cy-Flaming": 0,
    "Cy-Racism": 1,
    "Cy-Threat": 2,
    "Cy-Pull-a-Pig": 3,
    "Not Bullying": 4
}

# Convert labels to numeric
df['label'] = df['label'].map(label_dict)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Tokenize the data
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

# Load the model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=5)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)

# Evaluate the model
results = model.evaluate(test_dataset.batch(16))
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

# Make predictions
predictions = model.predict(test_dataset.batch(16))
predicted_labels = np.argmax(predictions.logits, axis=1)

# Print classification report
print(classification_report(y_test, predicted_labels, target_names=list(label_dict.keys())))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Loss: 0.6543238162994385
Test Accuracy: 0.7586206793785095
               precision    recall  f1-score   support

   Cy-Flaming       0.65      0.72      0.69       136
    Cy-Racism       0.79      0.87      0.82       119
    Cy-Threat       0.77      0.77      0.77        99
Cy-Pull-a-Pig       0.84      0.66      0.74        89
 Not Bullying       0.81      0.76      0.78       108

     accuracy                           0.76       551
    macro avg       0.77      0.76      0.76       551
 weighted avg       0.76      0.76      0.76       551

