<a href="https://colab.research.google.com/github/Vamsiratnala/Fine-Tuned-LLM/blob/main/cleared_metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Conda in Colab
!pip install -q condacolab
import condacolab
condacolab.install()


In [None]:
# Install pinned, older library versions with conda.
# `conda install` adds these packages to the currently active conda
# environment; this command does not create a separate environment.
# NOTE(review): this also pins `python=3.10`; changing the interpreter version
# of the environment the kernel runs in may break the runtime — confirm that
# 3.10 matches the Python version condacolab installed.
!conda install -y python=3.10 numpy=1.24.3 tensorflow=2.13.0 transformers=4.38.2


In [None]:
import numpy as np
import tensorflow as tf
import transformers

# Report the versions of the libraries that were actually imported, so the
# pinned installs can be checked at a glance.
print("✅ Current Library Versions:")
for version_label, library in (
    ("NumPy version      : ", np),
    ("TensorFlow version : ", tf),
    ("Transformers version: ", transformers),
):
    print(f"{version_label}{library.__version__}")


In [None]:

import pandas as pd
# Read the tab-separated file; it has no header row, so the two columns are
# named explicitly.
df = pd.read_csv(
    '/content/SMSSpamCollection.csv',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# Quick look: first rows, (rows, columns), and how many rows per label.
for summary in (df.head(), df.shape, df['label'].value_counts()):
    print(summary)

In [None]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}
print(df['label'].unique())

# Series.map turns every value missing from the dict into NaN, so running this
# cell a second time (when the column already holds 0/1) would wipe the labels.
# Only map while the column still contains something other than the encoded ids.
already_encoded = df['label'].isin(list(label_to_id.values())).all()
if not already_encoded:
    encoded_labels = df['label'].map(label_to_id)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        # Fail loudly instead of letting NaN labels flow into the split/training.
        raise ValueError(
            "Unexpected label values (expected 'ham' or 'spam'): "
            f"{df.loc[unmapped, 'label'].unique()!r}. "
            "Reload the CSV if the column was modified earlier."
        )
    df['label'] = encoded_labels.astype(int)

print(df['label'].value_counts())


In [None]:
from sklearn.model_selection import train_test_split

# Pull the two columns out of the DataFrame as plain Python lists.
all_texts = df['message'].tolist()
all_labels = df['label'].tolist()

# First split: keep 70% for training and hold out 30%, stratified by label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.3,
    stratify=all_labels,
    random_state=42,
)

# Second split: cut the held-out part in half to get validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

In [None]:

from transformers import DistilBertTokenizerFast
# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize each split with the same settings (truncation and padding enabled).
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
#define a conversion function

def convert_to_tf_dataset(encodings, labels):
    """Build a tf.data.Dataset of (features, label) pairs.

    Args:
        encodings: Mapping that provides 'input_ids' and 'attention_mask'
            entries; only these two entries are used.
        labels: Labels passed through unchanged as the second tuple element.

    Returns:
        The result of tf.data.Dataset.from_tensor_slices on
        ({'input_ids': ..., 'attention_mask': ...}, labels).
    """
    feature_keys = ('input_ids', 'attention_mask')
    features = {key: encodings[key] for key in feature_keys}
    return tf.data.Dataset.from_tensor_slices((features, labels))


In [None]:
# Wrap each split's encodings and labels in a tf.data.Dataset.
train_dataset, val_dataset, test_dataset = (
    convert_to_tf_dataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
BATCH_SIZE = 8

# Rebuild every dataset from its encodings before batching. Batching the
# existing `*_dataset` variables in place would make this cell non-idempotent:
# running it twice would batch already-batched data.
# Only the training split is shuffled (buffer covers the whole split).
train_dataset = (
    convert_to_tf_dataset(train_encodings, train_labels)
    .shuffle(len(train_labels))
    .batch(BATCH_SIZE)
)
val_dataset = convert_to_tf_dataset(val_encodings, val_labels).batch(BATCH_SIZE)
test_dataset = convert_to_tf_dataset(test_encodings, test_labels).batch(BATCH_SIZE)

In [None]:
from transformers import TFDistilBertForSequenceClassification

# Load the pretrained checkpoint with a classification head.
# Two labels because this is binary classification: spam vs ham.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


In [None]:
from sklearn.utils import class_weight


# Encoded labels: 0 = ham, 1 = spam.
# Compute one weight per class in scikit-learn's 'balanced' mode from the
# training labels only.
classes = np.unique(train_labels)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_labels
)

# Key each weight by its actual class id (paired with `classes`) instead of by
# its position in the array, and store plain Python numbers for clean printing.
class_weights_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(classes, class_weights)
}
print(class_weights_dict)


In [None]:
import tensorflow as tf

# Adam optimizer used by the training loop; learning rate is 5e-5.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=5e-5,
)


In [None]:
# Custom training loop with class-weighted loss.
# Expects class_weights_dict to be defined already, e.g.
# class_weights_dict = {0: 0.55, 1: 3.56}  (example)

# Per-example (un-reduced) loss so each example can be scaled by its class weight.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction=tf.keras.losses.Reduction.NONE
)

epochs = 1

# Lookup table where position i holds the weight of class i. It does not change
# between batches, so it is built once here instead of inside the loop.
# Class ids are assumed to be 0..n-1, which the sparse loss requires anyway
# because each label indexes into the logits.
class_weight_table = tf.constant(
    [class_weights_dict[class_id] for class_id in range(len(class_weights_dict))],
    dtype=tf.float32,
)

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    epoch_loss = 0
    batch_count = 0

    for inputs, labels in train_dataset:
        with tf.GradientTape() as tape:
            outputs = model(inputs, training=True)
            logits = outputs.logits

            # Step 1: un-reduced loss, one value per example in the batch.
            per_example_loss = loss_fn(labels, logits)

            # Step 2: look up the class weight for each label in the batch.
            weights = tf.gather(class_weight_table, labels)

            # Step 3: apply the weights and average over the batch.
            weighted_loss = tf.reduce_mean(per_example_loss * weights)

        gradients = tape.gradient(weighted_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        epoch_loss += weighted_loss.numpy()
        batch_count += 1

    # Give a clear error instead of a bare ZeroDivisionError when the dataset is empty.
    if batch_count == 0:
        raise ValueError("train_dataset produced no batches; nothing to train on.")

    print(f"✅ Epoch {epoch+1} completed | Average Loss: {epoch_loss / batch_count:.4f}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import tensorflow as tf

# Evaluate on the validation split: collect one prediction and one true label
# per example, then print the metrics.
all_preds = []
all_labels = []

for inputs, labels in val_dataset:
    outputs = model(inputs, training=False)
    logits = outputs.logits
    # Predicted class = index of the largest logit.
    preds = tf.argmax(logits, axis=1)

    all_preds.extend(preds.numpy())
    # extend (not append): append would store one array per batch, producing a
    # 2-D / ragged structure that does not line up with the flat all_preds.
    all_labels.extend(labels.numpy())

# Convert to flat numpy arrays of equal length
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Classification report
# labels=[0, 1] keeps the rows aligned with target_names even if one class
# happens to be absent from the labels or predictions.
print("📊 Classification Report:")
print(classification_report(all_labels, all_preds, labels=[0, 1], target_names=["ham", "spam"]))

# Confusion matrix (rows = true class, columns = predicted class, order 0 then 1)
print("🧾 Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds, labels=[0, 1]))


In [None]:
# Evaluate on the held-out test split, mirroring the validation evaluation.
test_preds = []
# Collected into a separate list so the `test_labels` list produced by the
# train/val/test split is not overwritten.
test_true_labels = []

for inputs, labels in test_dataset:
    # `output` is the model's output object; the raw scores are in `.logits`.
    output = model(inputs, training=False)
    logits = output.logits
    # Predicted class = index of the largest logit.
    preds = tf.argmax(logits, axis=1)
    test_preds.extend(preds.numpy())
    # atleast_1d handles both a batch of labels and a single scalar label.
    test_true_labels.extend(np.atleast_1d(labels.numpy()))

test_preds = np.array(test_preds)
test_true_labels = np.array(test_true_labels)

# labels=[0, 1] keeps the rows aligned with target_names even if one class
# happens to be absent from the labels or predictions.
print("📊 Test Classification Report:")
print(classification_report(test_true_labels, test_preds, labels=[0, 1], target_names=["ham", "spam"]))

print("🧾 Test Confusion Matrix:")
print(confusion_matrix(test_true_labels, test_preds, labels=[0, 1]))

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
save_dir = "distilbert-sms-spam"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)