# Glove + Bidirectional GRU Workflow

This workflow describes how to fine-tune Word2Vec embeddings on your text data and integrate them into a bidirectional GRU network for text classification.

---

## Overview

This project implements a text classification pipeline using:
1. **Word2Vec embeddings** trained on your domain-specific corpus
2. **Bidirectional GRU** network for sequence classification
3. **Transfer learning** approach where pre-trained embeddings can be fine-tuned or frozen

**Dataset**: Emotion classification (6 classes: 0-5) from text data

---

In [43]:
# imports
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import gensim.downloader as api
from tqdm import tqdm
from nltk.tokenize import TweetTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import mixed_precision
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid 

In [42]:
# Configuration
MAX_LEN = 50       # Max words per tweet (Twitter is short, 50 is usually safe)
EMBED_DIM = 200    
GENSIM_MODEL = "glove-twitter-200" # Corresponds to glove.twitter.27B.200d
# GPU-optimized batch sizes (larger batches = better GPU utilization)
BATCH_SIZE = 64 if len(tf.config.list_physical_devices('GPU')) > 0 else 32
# Epoch Settings: Grid Search (Fast) vs Final Run (Thorough)
GRID_WARMUP_EPOCHS = 2
GRID_FINETUNE_EPOCHS = 4

FINAL_WARMUP_EPOCHS = 5 
FINAL_FINETUNE_EPOCHS = 15
MAX_WORD = 20000
TRAIN_PATH = r"train.csv"
VAL_PATH = r"validation.csv"

In [14]:
# Get glove model
# 'glove-twitter-200' is the 200d version trained on 2B tweets
try:
    glove_model = api.load(GENSIM_MODEL)
except Exception as e:
    print(f"Error loading Gensim model: {e}")
    glove_model = None
print("Gensim model loaded.")


Gensim model loaded.


In [None]:
# Variables and Hyperparameters for tuning (GPU-optimized)
PARAM_GRID = {
    # Larger networks benefit more from GPU
    'gru_units': [64, 128, 256],          # Increased capacity
    'dropout': [0.3, 0.4, 0.5],
    'fine_tune_lr': [1e-4, 5e-5, 1e-5],
    'spatial_dropout': [0.2, 0.3],
    'dense_units': [32, 64, 128],             
}

LABEL_MAP = {
    0: 'sadness', 1: 'joy', 2: 'love',
    3: 'anger', 4: 'fear', 5: 'surprise'
}

# Grid size calculation
total_combinations = (len(PARAM_GRID['gru_units']) * len(PARAM_GRID['dropout']) *
                     len(PARAM_GRID['fine_tune_lr']) * len(PARAM_GRID['spatial_dropout']) *
                     len(PARAM_GRID['dense_units']))

print(f"Total grid search combinations: {total_combinations}")

# Estimate time based on device
if len(tf.config.list_physical_devices('GPU')) > 0:
    est_time = total_combinations * 0.5  # ~30 seconds per config on GPU
    print(f"Estimated GPU training time: ~{est_time:.0f} minutes")
else:
    est_time = total_combinations * 2  # ~2 minutes per config on CPU
    print(f"Estimated CPU training time: ~{est_time:.0f} minutes")

Total grid search combinations: 108


In [44]:
# GPU Configuration and Setup


# Check GPU availability
gpus = tf.config.list_physical_devices('GPU')
print(f"Number of GPUs Available: {len(gpus)}")

if gpus:
    try:
        # Enable memory growth to prevent TensorFlow from allocating all GPU memory
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        # Set GPU as visible device
        tf.config.set_visible_devices(gpus[0], 'GPU')
        
        print(f"✓ Using GPU: {gpus[0].name}")
        print(f"  Device type: {gpus[0].device_type}")
        
        # Print GPU details
        gpu_details = tf.config.experimental.get_device_details(gpus[0])
        if gpu_details:
            print(f"  Compute Capability: {gpu_details.get('compute_capability', 'N/A')}")
        
        # Enable mixed precision for faster training on modern GPUs
        
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_global_policy(policy)
        print("✓ Mixed precision enabled (FP16) for faster training")
        
    except RuntimeError as e:
        print(f"⚠️ GPU configuration error: {e}")
else:
    print("⚠️ No GPU detected - using CPU")
    print("  Training will be significantly slower")


Number of GPUs Available: 0
⚠️ No GPU detected - using CPU
  Training will be significantly slower


In [16]:
# Tokenizer
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)



In [17]:
# Preprocessing functions
def preprocess_text(text):
    """
    Minimal preprocessing for already-clean emotion text
    """
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    # Normalize excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text
def tokenize_text(text_list):
    """
    Converts raw text list into list of token lists using NLTK.
    Example: "I am happy :)" -> ['i', 'am', 'happy', ':)']
    """
    tokenized = []
    for text in text_list:
        tokens = tweet_tokenizer.tokenize(str(text))
        tokenized.append(tokens)
    return tokenized

def build_vocab(tokenized_texts):
    """
    Builds a dictionary mapping words to integers.
    """
    word_index = {'<PAD>': 0, '<UNK>': 1}
    idx = 2
    for tokens in tokenized_texts:
        for word in tokens:
            if word not in word_index:
                word_index[word] = idx
                idx += 1
    return word_index

def text_to_sequences(tokenized_texts, word_index):
    """
    Converts token lists to integer sequences.
    """
    sequences = []
    for tokens in tokenized_texts:
        # each sequence is a list of integers, with unknown words mapped to 1
        seq = [word_index.get(word, word_index['<UNK>']) for word in tokens]
        sequences.append(seq)
    return sequences

In [35]:
# Loading DATA
def load_and_prep_data(filepath, word_index=None, is_train=True):
    """
    Load and preprocess data.
    If is_train=True, builds new vocabulary.
    If is_train=False, uses provided word_index.
    """
    print(f"Loading {filepath}...")
    df = pd.read_csv(filepath)
    
    # Preprocess text first
    texts = df['text'].apply(preprocess_text).values
    labels = df['label'].values
    
    # Tokenize
    tokenized = tokenize_text(texts)
    
    if is_train:
        # Build vocab for training data
        word_vocab = build_vocab(tokenized)
    else:
        # Use provided vocab for validation data
        word_vocab = word_index
    
    # Convert text to sequences
    sequences = text_to_sequences(tokenized, word_vocab)
    
    # Pad for fixed sized Inputs
    X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    
    return X, labels, word_vocab

In [None]:
# Create matrix embedding
def load_glove_matrix(word_index, embed_dim):
    print("Loading GloVe vectors...")
    embeddings_index = {}
    # get glove model, will load only once
    glove_model = api.load(GENSIM_MODEL)
    # Parse the GloVe file
    
    print(f"Found {len(embeddings_index)} word vectors in GloVe.")
    
    # Create the matrix
    vocab_size = len(word_index)
    embedding_matrix = np.zeros((vocab_size, embed_dim), dtype="float32")
    
    hits = 0
    misses = 0
    
    for word, i in tqdm(word_index.items(), desc="Building Embedding Matrix"):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Initialize <UNK> and misses with random normal or zeros
            # Random is better for 'learning' unknown words later
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embed_dim,))
            misses += 1

    # Delete the huge gensim model to free up RAM
    del glove_model
    print(f"Embedding Matrix Ready: {hits} hits, {misses} misses (coverage: {hits/vocab_size:.2%})")
    return embedding_matrix

In [39]:
# Build Model Architecture (Updated)
def build_bi_gru(vocab_size, embedding_matrix, gru_units=64, dropout=0.5, 
                 spatial_dropout=0.2, dense_units=64):
    """
    Build Bidirectional GRU model with configurable hyperparameters.
    
    Args:
        vocab_size: Size of vocabulary
        embedding_matrix: Pre-trained word embeddings
        gru_units: Number of GRU units (per direction)
        dropout: Dropout rate after Dense layer
        spatial_dropout: Spatial dropout rate (drops entire embedding dimensions)
        dense_units: Number of units in hidden Dense layer
    """
    model = Sequential([
        # Layer 1: Embeddings
        Embedding(
            input_dim=vocab_size,
            output_dim=EMBED_DIM,
            weights=[embedding_matrix],
            trainable=False,
            mask_zero=True
        ),
        
        # Layer 2: Spatial Dropout
        SpatialDropout1D(spatial_dropout),
        
        # Layer 3: Bidirectional GRU
        Bidirectional(GRU(gru_units, return_sequences=False)),
        
        # Layer 4: Dense Hidden Layer
        Dense(dense_units, activation='relu'),
        Dropout(dropout),
        
        # Layer 5: Output
        Dense(6, activation='softmax')
    ])
    
    return model

In [34]:
# Training function with Freeze-Thaw
def train_model(params, X_train, y_train, X_val, y_val, vocab_size, embedding_matrix, warmup_epochs, finetune_epochs):
    """
    Runs the full Freeze-Thaw training cycle.
    Returns the trained model and history objects.
    
    Args:
        params: dict with 'gru_units', 'dropout', 'fine_tune_lr'
        X_train, y_train: Training data
        X_val, y_val: Validation data
        vocab_size: Vocabulary size
        embedding_matrix: Pre-trained embeddings
        warmup_epochs: Number of epochs for Phase 1 (frozen embeddings)
        finetune_epochs: Number of epochs for Phase 2 (unfrozen embeddings)
    """
    # Build
    model = build_bi_gru(
        vocab_size, 
        embedding_matrix, 
        gru_units=params['gru_units'], 
        dropout=params['dropout']
    )
    
    # Phase 1: Warm-up (Embeddings Frozen)
    model.layers[0].trainable = False
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    history1 = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=warmup_epochs,
        batch_size=BATCH_SIZE,
        verbose=0
    )
    
    # Phase 2: Fine-tuning (Embeddings Unfrozen)
    model.layers[0].trainable = True
    model.compile(
        optimizer=Adam(learning_rate=params['fine_tune_lr']), 
        loss='sparse_categorical_crossentropy', 
        metrics=['accuracy']
    )
    
    callbacks = [
        EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=1e-6)
    ]
    
    history2 = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=finetune_epochs,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        verbose=0
    )
    
    return model, history1, history2

In [40]:
# Training function with Freeze-Thaw (Updated)
def train_model(params, X_train, y_train, X_val, y_val, vocab_size, embedding_matrix, warmup_epochs, finetune_epochs):
    """
    Runs the full Freeze-Thaw training cycle with configurable architecture.
    """
    # Build with all hyperparameters
    model = build_bi_gru(
        vocab_size, 
        embedding_matrix, 
        gru_units=params['gru_units'], 
        dropout=params['dropout'],
        spatial_dropout=params.get('spatial_dropout', 0.2),  # Default if not in grid
        dense_units=params.get('dense_units', 64)            # Default if not in grid
    )
    
    # Phase 1: Warm-up (Embeddings Frozen)
    model.layers[0].trainable = False
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    history1 = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=warmup_epochs,
        batch_size=BATCH_SIZE,
        verbose=0
    )
    
    # Phase 2: Fine-tuning (Embeddings Unfrozen)
    model.layers[0].trainable = True
    model.compile(
        optimizer=Adam(learning_rate=params['fine_tune_lr']), 
        loss='sparse_categorical_crossentropy', 
        metrics=['accuracy']
    )
    
    callbacks = [
        EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=1e-6)
    ]
    
    history2 = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=finetune_epochs,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        verbose=0
    )
    
    return model, history1, history2

In [20]:
# Evaluation function
def evaluate_model(model, X_val, y_val):
    """
    Evaluates the model on validation data.
    Returns loss and accuracy.
    """
    loss, acc = model.evaluate(X_val, y_val, verbose=0)
    return loss, acc

In [29]:
# Grid Search function
def run_grid_search(param_grid, X_train, y_train, X_val, y_val, vocab_size, embedding_matrix):
    """
    Iterates through all parameter combinations using 'GRID_' epoch settings.
    """
    grid = list(ParameterGrid(param_grid))
    print(f"\n--- Starting Grid Search over {len(grid)} combinations ---")
    
    best_acc = 0.0
    best_params = None
    
    for i, params in enumerate(grid):
        print(f"[{i+1}/{len(grid)}] Testing params: {params}...", end=" ")
        
        # 1. Train with LOW epochs for speed
        model, _, _ = train_model(
            params, X_train, y_train, X_val, y_val, vocab_size, embedding_matrix,
            GRID_WARMUP_EPOCHS,  # ✓ Positional argument
            GRID_FINETUNE_EPOCHS  # ✓ Positional argument
        )
        
        # 2. Evaluate
        _ ,val_acc = evaluate_model(model, X_val, y_val)
        
        print(f"Val Acc: {val_acc:.4f}")
        
        if val_acc > best_acc:
            best_acc = val_acc
            best_params = params
            
    print("\n" + "="*30)
    print(f"BEST RESULT: {best_acc:.4f}")
    print(f"BEST PARAMS: {best_params}")
    print("="*30)
    
    return best_params

In [30]:
# Main function
def main():
    # A. Load Data
    print("\n=== STEP 1: Loading and Preprocessing Data ===")
    X_train, y_train, word_index = load_and_prep_data(TRAIN_PATH, is_train=True)
    X_val, y_val, _ = load_and_prep_data(VAL_PATH, word_index=word_index, is_train=False)
    
    print(f"\nTraining samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")
    print(f"Vocabulary size: {len(word_index)}")
    
    # B. Build Embedding Matrix
    print("\n=== STEP 2: Building Embedding Matrix ===")
    vocab_size = len(word_index)
    embedding_matrix = load_glove_matrix(word_index, EMBED_DIM)
    
    # C. Run Grid Search (Fast Mode)
    print("\n=== STEP 3: Grid Search ===")
    best_params = run_grid_search(
        PARAM_GRID, X_train, y_train, X_val, y_val, vocab_size, embedding_matrix
    )
    
    # D. Retrain Best Model (Thorough Mode)
    print(f"\n=== STEP 4: Final Training ===")
    print(f"Retraining with best params: {best_params}")
    print(f"Epochs: Warmup={FINAL_WARMUP_EPOCHS}, Finetune={FINAL_FINETUNE_EPOCHS}")
    
    final_model, h1, h2 = train_model(
        best_params, X_train, y_train, X_val, y_val, vocab_size, embedding_matrix,
        FINAL_WARMUP_EPOCHS,  # ✓ Positional argument
        FINAL_FINETUNE_EPOCHS  # ✓ Positional argument
    )
    
    # E. Final Evaluation & Plotting
    print("\n=== STEP 5: Final Evaluation ===")
    predictions = final_model.predict(X_val)
    y_pred = np.argmax(predictions, axis=1)
    
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, target_names=list(LABEL_MAP.values())))
    
    # F. Plot Training History
    print("\n=== STEP 6: Plotting Results ===")
    acc = h1.history['accuracy'] + h2.history['accuracy']
    val_acc = h1.history['val_accuracy'] + h2.history['val_accuracy']
    
    plt.figure(figsize=(10, 5))
    plt.plot(acc, label='Training Accuracy')
    plt.plot(val_acc, label='Validation Accuracy')
    plt.axvline(x=len(h1.history['accuracy']), color='r', linestyle='--', label='Fine-Tuning Start')
    plt.title(f'Final Model Training History\nParams: {best_params}')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('best_model_history.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ Saved plot to best_model_history.png")
    
    print("\n=== DONE ===")
    return final_model, best_params

In [33]:
main()


=== STEP 1: Loading and Preprocessing Data ===
Loading train.csv...
Loading validation.csv...

Training samples: 16000
Validation samples: 2000
Vocabulary size: 15204

=== STEP 2: Building Embedding Matrix ===
Building embedding matrix from GloVe...


Building Embedding Matrix: 100%|██████████| 15204/15204 [00:00<00:00, 322343.58it/s]

Embedding Matrix Ready: 14300 hits, 904 misses (coverage: 94.05%)

=== STEP 3: Grid Search ===

--- Starting Grid Search over 12 combinations ---
[1/12] Testing params: {'dropout': 0.2, 'fine_tune_lr': 0.0001, 'gru_units': 16}... 




KeyboardInterrupt: 