In [1]:
# imports
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


2025-05-24 10:05:49.980356: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-24 10:05:59.362309: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-24 10:05:59.377528: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-24 10:06:00.998687: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 10:06:04.681268: I tensorflow/core/platform/cpu_feature_guar

In [7]:
dataset=load_dataset("go_emotions", cache_dir="data/cached_data")

Downloading data: 100%|██████████| 2.77M/2.77M [00:01<00:00, 2.57MB/s]
Downloading data: 100%|██████████| 350k/350k [00:00<00:00, 559kB/s]t]
Downloading data: 100%|██████████| 347k/347k [00:00<00:00, 632kB/s]s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.30it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 68.72it/s]
Generating train split: 100%|██████████| 43410/43410 [00:00<00:00, 315271.10 examples/s]
Generating validation split: 100%|██████████| 5426/5426 [00:00<00:00, 176485.18 examples/s]
Generating test split: 100%|██████████| 5427/5427 [00:00<00:00, 187598.80 examples/s]


In [None]:
# Random see
tf.random.set_seed(42)

# Simplify to single-label (take first label)
def process_labels(example):
    if len(example['labels']) > 0:
        example['labels'] = example['labels'][0]  # Use first label
    else:
        example['labels']= -1
    return example

dataset = dataset.map(process_labels)

# Filter out examples with no labels
dataset = dataset.filter(lambda x: x['labels'] != -1)

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='tf'
    )

# Tokenize and convert to TensorFlow format
tokenized_dataset = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['validation']

# Convert to TensorFlow datasets
def to_tf_dataset(dataset, split, batch_size=8):
    features = {
        'input_ids': tf.convert_to_tensor(dataset[split]['input_ids'], dtype=tf.int32),
        'attention_mask': tf.convert_to_tensor(dataset[split]['attention_mask'], dtype=tf.int32)
    }
    labels = tf.convert_to_tensor(dataset[split]['labels'], dtype=tf.int32)
    tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    tf_dataset = tf_dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return tf_dataset

train_tf_dataset = to_tf_dataset(tokenized_dataset, 'train', batch_size=8)
eval_tf_dataset = to_tf_dataset(tokenized_dataset, 'validation', batch_size=8)

# Initialize model
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=28  # GoEmotions has 28 emotions
)
# Custom metrics for precision, recall, and F1 score
class CustomMetrics(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        val_pred = []
        val_true = []
        for batch in self.validation_data:
            features, labels = batch
            predictions = self.model.predict(features, verbose=0)
            val_pred.extend(tf.argmax(predictions.logits, axis=1).numpy())
            val_true.extend(labels.numpy())
        
        precision, recall, f1, _ = precision_recall_fscore_support(val_true, val_pred, average='weighted', zero_division=0)
        logs['val_precision'] = precision
        logs['val_recall'] = recall
        logs['val_f1'] = f1
        print(f"\nEpoch {epoch + 1}: val_precision: {precision:.4f}, val_recall: {recall:.4f}, val_f1: {f1:.4f}")


# Compile model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy']
)

# Define callbacks
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath='ai_model/sentiment_analysis/checkpoints/model_{epoch}',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=1,
        restore_best_weights=True
    ),
    CustomMetrics(validation_data=eval_tf_dataset)
]

# Train model
history = model.fit(
    train_tf_dataset,
    validation_data=eval_tf_dataset,
    epochs=3,
    callbacks=callbacks
)

# Evaluate model
eval_results = model.evaluate(eval_tf_dataset)
print(f"Validation Loss: {eval_results[0]}, Validation Accuracy: {eval_results[1]}")

# Save model and tokenizer
model.save_pretrained('ai_model/sentiment_analysis/sentiment_model')
tokenizer.save_pretrained('ai_model/sentiment_analysis/sentiment_tokenizer')

# Save training history (optional)
pd.DataFrame(history.history).to_csv('data/training_history.csv')



Map: 100%|██████████| 43410/43410 [00:01<00:00, 28751.51 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 33367.53 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 35540.29 examples/s]
Filter: 100%|██████████| 43410/43410 [00:00<00:00, 349642.12 examples/s]
Filter: 100%|██████████| 5426/5426 [00:00<00:00, 231605.93 examples/s]
Filter: 100%|██████████| 5427/5427 [00:00<00:00, 228332.71 examples/s]
Downloading tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 245kB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 12.1MB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.26MB/s]
Downloading config.json: 100%|██████████| 483/483 [00:00<00:00, 3.38MB/s]
Map:   0%|          | 0/43410 [00:00<?, ? examples/s]2025-05-23 13:58:25.481733: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built withou

Epoch 1/3


2025-05-23 13:59:19.969840: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fc89811f0e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-05-23 13:59:19.969876: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-05-23 13:59:20.107816: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-05-23 13:59:20.306059: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1747988960.446107   10483 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


INFO:tensorflow:Assets written to: ai_model/sentiment_analysis/checkpoints/model_1/assets


INFO:tensorflow:Assets written to: ai_model/sentiment_analysis/checkpoints/model_1/assets



Epoch 1: val_precision: 0.5609, val_recall: 0.5675, val_f1: 0.5366
Epoch 2/3
Epoch 2: val_precision: 0.5609, val_recall: 0.5675, val_f1: 0.5366
Validation Loss: 1.4190527200698853, Validation Accuracy: 0.5674530267715454


OSError: Cannot save file into a non-existent directory: '../ai_model/data'

In [3]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1
