In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
import tensorflow as tf


In [7]:
df.head()

Unnamed: 0,content,intensity
0,"Sometimes I’m not angry, I’m hurt and there’s ...",angriness
1,Not available for busy people☺,angriness
2,I do not exist to impress the world. I exist t...,angriness
3,Everything is getting expensive except some pe...,angriness
4,My phone screen is brighter than my future 🙁,angriness


In [8]:
# Load data
angriness_df = pd.read_csv('data/angriness.csv')
happiness_df = pd.read_csv('data/happiness.csv')
sadness_df = pd.read_csv('data/sadness.csv')
df = pd.concat([angriness_df, happiness_df, sadness_df], ignore_index=True)
df['intensity'] = df['intensity'].map({"anger": 0, "happiness": 1, "sadness": 2})  # Map labels to integers

In [9]:
# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['content'], df['intensity'], test_size=0.2, random_state=42
)

In [10]:
# Tokenize data
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained("./model/tokenizer")
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)


In [11]:
# Convert to TensorFlow dataset
def create_dataset(encodings, labels):
    inputs = {key: tf.constant(val) for key, val in encodings.items()}
    dataset = tf.data.Dataset.from_tensor_slices((inputs, tf.constant(labels)))
    return dataset

In [12]:
train_dataset = create_dataset(train_encodings, train_labels.tolist()).shuffle(1000).batch(16)
test_dataset = create_dataset(test_encodings, test_labels.tolist()).batch(16)

W0000 00:00:1734354069.875964   77284 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [13]:
# Load pre-trained model
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [24]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=["accuracy"])

ValueError: Could not interpret optimizer identifier: <keras.src.optimizers.adam.Adam object at 0x7f991ffcf2b0>

In [20]:
import tensorflow as tf
print(tf.__version__)


2.18.0


In [None]:
# Train the model
model.fit(train_dataset, epochs=3, validation_data=test_dataset)

In [None]:
# Save the model
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

In [None]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

In [None]:
# Generate classification report
preds = model.predict(test_dataset)["logits"]
predicted_labels = tf.argmax(preds, axis=1)
print(classification_report(test_labels, predicted_labels.numpy(), target_names=["Anger", "Happiness", "Sadness"]))