# Disaster Tweet Classification Pipeline

A concise end-to-end workflow combining BoW, Word2Vec, an RNN, and a BoW-LogReg ensemble with threshold tuning to maximize recall.


## 1. Setup

In [18]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import Recall

## 2. Data Loading

In [19]:
train = pd.read_csv('/kaggle/input/txtclas-rnn-train/train.csv')
test  = pd.read_csv('/kaggle/input/txtclas-rnn-train/test_x.csv')
sub   = pd.read_csv('/kaggle/input/txtclas-rnn-train/sample_submission.csv')
print(f"Train: {train.shape}, Test: {test.shape}")


Train: (5329, 5), Test: (2284, 4)


## 3. Text Preprocessing

In [20]:
def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r'http\S+|@\w+|[^a-z\s]', '', s)
    return re.sub(r'\s+', ' ', s).strip()

train['clean'] = train['text'].apply(clean_text)
test ['clean'] = test ['text'].apply(clean_text)


## 4. Feature Engineering

### 4.1 Bag-of-Words

In [21]:
cv = CountVectorizer(max_features=5000, ngram_range=(1,2))
X_bow      = cv.fit_transform(train['clean'])
X_test_bow = cv.transform(test['clean'])
y = train['target'].values

### 4.2 Word2Vec

In [22]:
sentences = [t.split() for t in train['clean']]
w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4, epochs=20)

def avg_w2v(doc: str) -> np.ndarray:
    vecs = [w2v.wv[w] for w in doc.split() if w in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(w2v.vector_size)

### 4.3 Sequences for RNN

In [23]:
MAX_WORDS, MAX_LEN = 10000, 100
tok = Tokenizer(num_words=MAX_WORDS)
tok.fit_on_texts(train['clean'])

X_seq      = tok.texts_to_sequences(train['clean'])
X_pad      = pad_sequences(X_seq,    maxlen=MAX_LEN)
X_test_seq = tok.texts_to_sequences(test['clean'])
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

# Embedding matrix
embed_dim = w2v.vector_size
word_index = tok.word_index
num_words  = min(MAX_WORDS, len(word_index) + 1)

embed_matrix = np.zeros((num_words, embed_dim))
for word, idx in word_index.items():
    if idx < num_words and word in w2v.wv:
        embed_matrix[idx] = w2v.wv[word]

 ## 5. Train/Validation Split

In [24]:
X_tr_pad, X_val_pad, X_tr_bow, X_val_bow, y_tr, y_val = train_test_split(
    X_pad, X_bow, y, test_size=0.1, stratify=y, random_state=42
)

## 6. RNN Model

In [25]:
model = Sequential([
    Embedding(input_dim=num_words, output_dim=embed_dim, weights=[embed_matrix], trainable=True),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[Recall(name='recall')])
model.summary()

## 7. Callbacks & Class Weights

In [27]:
cw = compute_class_weight('balanced', classes=[0,1], y=y_tr)
class_weight = {0: cw[0], 1: cw[1]}

es = EarlyStopping(monitor='val_recall', mode='max', patience=3, restore_best_weights=True)
mc = ModelCheckpoint('best_rnn.h5', monitor='val_recall', mode='max', save_best_only=True)

## 8. Train RNN

In [28]:
history = model.fit(
    X_tr_pad, y_tr,
    validation_data=(X_val_pad, y_val),
    epochs=20,
    batch_size=64,
    class_weight=class_weight,
    callbacks=[es, mc]
)

model.save('/kaggle/working/best_rnn.h5')

Epoch 1/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - loss: 0.6284 - recall: 0.6211 - val_loss: 0.5198 - val_recall: 0.5833
Epoch 2/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.5349 - recall: 0.6646 - val_loss: 0.4934 - val_recall: 0.5439
Epoch 3/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.4332 - recall: 0.7408 - val_loss: 0.4635 - val_recall: 0.6886
Epoch 4/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.2980 - recall: 0.8542 - val_loss: 0.5006 - val_recall: 0.7105
Epoch 5/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.2079 - recall: 0.9020 - val_loss: 0.6173 - val_recall: 0.7895
Epoch 6/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.1619 - recall: 0.9325 - val_loss: 0.6965 - val_recall: 0.7588
Epoch 7/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

## 9. BoW + Logistic Regression

In [30]:
lr = LogisticRegression(class_weight='balanced', max_iter=500)
lr.fit(X_tr_bow, y_tr)

## 10. Threshold Tuning

In [31]:
model.load_weights('/kaggle/working/best_rnn.h5')

rnn_val_proba = model.predict(X_val_pad).ravel()
bow_val_proba = lr.predict_proba(X_val_bow)[:,1]
ens_val_proba = 0.5 * (rnn_val_proba + bow_val_proba)

best_thr, best_rec = 0.5, 0
for thr in np.linspace(0.1, 0.9, 41):
    rec = recall_score(y_val, (ens_val_proba > thr).astype(int))
    if rec > best_rec:
        best_rec, best_thr = rec, thr

print(f"Best recall={best_rec:.4f} at threshold={best_thr:.2f}")


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
Best recall=0.9298 at threshold=0.10


## 11. Predict & Submit

In [32]:
rnn_test_proba = model.predict(X_test_pad).ravel()
bow_test_proba = lr.predict_proba(X_test_bow)[:,1]
ens_test_proba = 0.5 * (rnn_test_proba + bow_test_proba)

submission = sub.copy()
submission['target'] = (ens_test_proba > best_thr).astype(int)
submission.to_csv('submission.csv', index=False)
submission.head()

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


Unnamed: 0,id,target
0,4659,1
1,1072,1
2,2559,1
3,9368,0
4,1050,1


## Conclusion

In this notebook, we developed a comprehensive disaster‐tweet classification pipeline by blending three complementary techniques:  
1. **Bag-of-Words** for sparse lexical features  
2. **Word2Vec** to capture distributed word semantics  
3. A **fine-tuned Bi-LSTM** for sequence modeling  

Key enhancements included class weighting to penalize missed disasters, early stopping to prevent overfitting, and an ensemble of the RNN with a BoW-LogReg model. We also performed threshold tuning on validation data to maximize recall.  

As a result, our model achieved a recall of **0.97727** on the private leaderboard. This high score demonstrates the system’s strong ability to identify disaster‐related tweets with minimal false negatives.  

**Next steps** might involve:  
- Incorporating pre-trained contextual embeddings (e.g., BERT)  
- Adding Conv1D or stacked RNN layers for richer sequence patterns  
- Leveraging metadata (user location, timestamps) for additional signals  

These enhancements could further improve recall and robustness in real-world alerting applications.  
