In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

data = pd.read_csv('Dataset/train.txt', sep=';')
data.columns = ['Text', 'Emotions']  
data.head()

Unnamed: 0,Text,Emotions
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love
3,i am feeling grouchy,anger
4,ive been feeling a little burdened lately wasn...,sadness


In [None]:
data['Text'].dy

In [6]:
texts = data['Text'].tolist()
emotions = data['Emotions'].tolist()

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [13]:
sequences = tokenizer.texts_to_sequences(texts)

# Understanding pad_sequences

`pad_sequences` is a utility function from TensorFlow's Keras preprocessing module that standardizes the length of sequences (like tokenized text) for neural network processing.

## Why it's necessary:

1. **Uniform Input Size**: Neural networks require fixed-size inputs, but text documents naturally vary in length.

2. **Batch Processing**: To efficiently process data in batches, all sequences in a batch must have the same length.

## Key Parameters:

- **maxlen**: Maximum sequence length (longer sequences are truncated, shorter ones are padded)
- **padding**: Where to add padding ('pre' or 'post')
- **truncating**: Where to truncate ('pre' or 'post')
- **value**: Padding value (default is 0)

## Example visualization below:

In [8]:
# Example demonstrating pad_sequences with some sample sentences
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example sequences of different lengths (after tokenization)
sample_sequences = [
    [1, 2, 3, 4, 5],               # 5 tokens
    [1, 2],                         # 2 tokens
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # 10 tokens
]

print("Original sequences:")
for seq in sample_sequences:
    print(f"Length: {len(seq)}, Sequence: {seq}")

# Padding with default settings (pad with 0s at the beginning)
padded_sequences = pad_sequences(sample_sequences)
print("\nAfter default padding (pre, maxlen=longest):")
for seq in padded_sequences:
    print(f"Length: {len(seq)}, Sequence: {seq}")

# Padding with custom maxlen and post padding
padded_sequences = pad_sequences(sample_sequences, maxlen=7, padding='post')
print("\nWith maxlen=7 and padding='post':")
for seq in padded_sequences:
    print(f"Length: {len(seq)}, Sequence: {seq}")

# Padding with custom maxlen and truncating
padded_sequences = pad_sequences(sample_sequences, maxlen=4, truncating='pre')
print("\nWith maxlen=4 and truncating='pre':")
for seq in padded_sequences:
    print(f"Length: {len(seq)}, Sequence: {seq}")

Original sequences:
Length: 5, Sequence: [1, 2, 3, 4, 5]
Length: 2, Sequence: [1, 2]
Length: 10, Sequence: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

After default padding (pre, maxlen=longest):
Length: 10, Sequence: [0 0 0 0 0 1 2 3 4 5]
Length: 10, Sequence: [0 0 0 0 0 0 0 0 1 2]
Length: 10, Sequence: [ 1  2  3  4  5  6  7  8  9 10]

With maxlen=7 and padding='post':
Length: 7, Sequence: [1 2 3 4 5 0 0]
Length: 7, Sequence: [1 2 0 0 0 0 0]
Length: 7, Sequence: [ 4  5  6  7  8  9 10]

With maxlen=4 and truncating='pre':
Length: 4, Sequence: [2 3 4 5]
Length: 4, Sequence: [0 0 1 2]
Length: 4, Sequence: [ 7  8  9 10]


In [14]:
# Apply pad_sequences to our text data sequences

# First, let's see the length distribution of our sequences
seq_lengths = [len(seq) for seq in sequences]

print(f"Average sequence length: {np.mean(seq_lengths):.1f}")
print(f"Maximum sequence length: {max(seq_lengths)}")
print(f"Minimum sequence length: {min(seq_lengths)}")
print(f"Median sequence length: {np.median(seq_lengths):.1f}")

# Let's choose a maxlen that covers most of our data
# A common approach is to use a length that covers 95-99% of your sequences
maxlen = int(np.percentile(seq_lengths, 95))
print(f"\nUsing maxlen={maxlen} (95th percentile)")

# Pad our sequences
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

print(f"\nShape of padded_sequences: {padded_sequences.shape}")
print(f"All sequences now have the same length: {padded_sequences.shape[1]}")

# Let's look at a couple of examples to see the effect
import random
for _ in range(3):
    idx = random.randint(0, len(sequences)-1)
    print(f"\nOriginal sequence length: {len(sequences[idx])}")
    print(f"Padded sequence length: {len(padded_sequences[idx])}")
    if len(sequences[idx]) < maxlen:
        print(f"Padding added: {maxlen - len(sequences[idx])} zeros")
    elif len(sequences[idx]) > maxlen:
        print(f"Truncated: {len(sequences[idx]) - maxlen} tokens removed")

Average sequence length: 19.2
Maximum sequence length: 66
Minimum sequence length: 2
Median sequence length: 17.0

Using maxlen=41 (95th percentile)

Shape of padded_sequences: (15999, 41)
All sequences now have the same length: 41

Original sequence length: 47
Padded sequence length: 41
Truncated: 6 tokens removed

Original sequence length: 27
Padded sequence length: 41
Padding added: 14 zeros

Original sequence length: 28
Padded sequence length: 41
Padding added: 13 zeros


## Why pad_sequences is Critical for Model Training

### Technical Reasons:

1. **Shape Consistency**: Neural networks expect inputs of consistent dimensions
2. **Batch Processing Efficiency**: GPUs/TPUs process batches faster when all inputs have the same shape
3. **Tensor Operations**: TensorFlow/Keras operations require regular tensors, not ragged arrays

### Effect on Model Performance:

1. **Information Preservation**: Setting appropriate `maxlen` preserves important content while removing noise
2. **Padding Strategy**: `padding='post'` is often better for text as most important words come earlier
3. **Memory Usage**: Too large `maxlen` can waste memory and slow training

### In Text Emotion Classification:

For our text emotion classifier, pad_sequences is essential because:
- Emotions can be expressed in texts of varying lengths
- The standardized sequence length allows the model to learn patterns regardless of original text length
- The embedding layer (which follows) expects fixed-length sequences as input