# 6. 5-Fold Cross-Validation with DistilBERT

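**Objective:** Fine-tune DistilBERT inside a 5-fold stratified cross-validation loop to obtain out-of-fold predictions (for a more reliable performance estimate) and fold-averaged test predictions (for a more robust submission file).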

In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

# Configuration
MODEL_CHECKPOINT = "distilbert-base-uncased"  # Hugging Face checkpoint loaded for the tokenizer and for every fold's model
BATCH_SIZE = 16  # batch size used for the train / validation / test tf.data pipelines
N_SPLITS = 5  # number of stratified cross-validation folds
SEED = 42  # seed for the Python, NumPy and TensorFlow random number generators

# Seed every RNG once, up front, so that a Restart Kernel -> Run All gives results that are
# as repeatable as the backend allows (batch shuffling and the randomly initialised
# classification head are otherwise different on every run).
tf.keras.utils.set_random_seed(SEED)

## 6.1. Load and Prepare Full Dataset

In [2]:
# Read the three competition CSVs (train, test, sample submission) in one pass
train_df, test_df, sample_submission_df = (
  pd.read_csv(csv_path)
  for csv_path in (
    '../data/train.csv',
    '../data/test.csv',
    '../data/sample_submission.csv',
  )
)

# Define clean3
def clean3(text):
  """Normalize a raw text string into lowercase alphanumeric words separated by single spaces.

  The steps run in this order: lowercase the text; drop the '#' of hashtags while
  keeping the word; delete 'http...' and 'www....' URLs; delete @mentions; replace
  every character that is not a-z, 0-9 or whitespace with a space; collapse runs
  of whitespace into one space and trim both ends.
  """
  substitutions = (
    (r"#([a-z0-9_]+)", r"\1"),  # hashtag -> plain word
    (r'http\S+', ""),           # http / https URLs
    (r"www\.\S+", ""),          # URLs that start with www.
    (r'@\w+', ""),              # @mentions
    (r"[^a-z0-9\s]", " "),      # anything other than a-z, 0-9, whitespace
    (r"\s+", " "),              # squeeze whitespace runs into one space
  )
  cleaned = text.lower()
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.strip()

# Run the same text cleaner over the 'text' column of both frames
for frame in (train_df, test_df):
  frame['text'] = frame['text'].apply(clean3)

# The CV loop reads the target from a column named 'label', so rename it here
train_df = train_df.rename(columns={'target': 'label'})

print("Data loaded and prepared.")


Data loaded and prepared.


## 6.2. Tokenizer and CV Setup

In [3]:
# Tokenizer that belongs to the checkpoint being fine-tuned
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Shuffled, stratified K-fold splitter with a fixed random_state
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Zero-filled 1-D buffers: one slot per training row (out-of-fold predictions)
# and one slot per test row (fold-averaged predictions)
oof_preds = np.zeros(shape=len(train_df))
test_preds = np.zeros(shape=len(test_df))

print("Tokenizer and CV strategy are ready.")

Tokenizer and CV strategy are ready.


## 6.3. 5-Fold CV Training Loop

In [4]:
# 5-fold CV loop: fine-tune one model per fold, store its out-of-fold (OOF)
# probabilities for the held-out rows and add its test probabilities to a running average.

# Reset the accumulators so this cell is safe to re-run on its own (e.g. after an
# interrupted run): test_preds is built with `+=`, so leftovers from a previous
# execution would otherwise be silently mixed into the average.
oof_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))


def tokenize_function(examples):
  """Tokenize the "text" field of a batch of examples with truncation enabled."""
  return tokenizer(examples["text"], truncation = True)


def predict_positive_proba(model, tf_dataset):
  """Run `model` over `tf_dataset` and return the softmax probability of class 1 per row."""
  logits = model.predict(tf_dataset).logits
  return tf.nn.softmax(logits, axis=1).numpy()[:, 1]


# Pads each batch to the longest sequence in that batch and returns TensorFlow tensors
data_collator = DataCollatorWithPadding(tokenizer = tokenizer, return_tensors= "tf")

# The test set is the same for every fold, so tokenize and batch it once, outside the loop
test_dataset = Dataset.from_pandas(test_df)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched = True)
tf_test_dataset = tokenized_test_dataset.to_tf_dataset(
  columns=["attention_mask", "input_ids"], shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
  )

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
  print(f"===== FOLD {fold+1} =====")

  # 1. Splitting data
  train_fold_df = train_df.iloc[train_idx]
  val_fold_df = train_df.iloc[val_idx]

  ## Converting to Hugging Face datasets; preserve_index=False keeps the
  ## non-default pandas index (left by iloc) from being stored as an extra column
  train_dataset = Dataset.from_pandas(train_fold_df, preserve_index=False)
  val_dataset = Dataset.from_pandas(val_fold_df, preserve_index=False)

  # 2. Tokenizing the data
  tokenized_train_dataset = train_dataset.map(tokenize_function, batched = True)
  tokenized_val_dataset = val_dataset.map(tokenize_function, batched = True)

  # 3. Preparing TensorFlow datasets
  tf_train_dataset = tokenized_train_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
  # shuffle=False keeps the validation rows in val_idx order, which the OOF assignment below relies on
  tf_val_dataset = tokenized_val_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"], shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
    )

  # 4. Loading a fresh copy of the pretrained checkpoint (not yet fine-tuned) for this fold
  model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels = 2)
  optimizer = tf.keras.optimizers.legacy.Adam(learning_rate = 5e-5)
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
  model.compile(optimizer = optimizer, loss = loss)

  # 5. Fine-tuning the model
  model.fit(tf_train_dataset, validation_data = tf_val_dataset, epochs = 3)

  # 6. Getting predictions
  # OOF predictions for the validation rows of this fold
  oof_preds[val_idx] = predict_positive_proba(model, tf_val_dataset)

  # Test predictions from this fold's model, added to the running average
  test_preds += predict_positive_proba(model, tf_test_dataset) / N_SPLITS

  # 7. Release this fold's model before the next one is loaded, so models do not pile up in memory
  del model
  tf.keras.backend.clear_session()

print("\nCV loop finished!")



===== FOLD 1 =====


Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

2025-06-13 13:42:14.743345: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-06-13 13:42:14.743667: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-06-13 13:42:14.743691: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1749811334.744294   71311 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1749811334.744921   71311 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected

Epoch 1/3


2025-06-13 13:42:23.818514: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 