# NLP Disasters Mini Project
**Course:** CSCA 5642
**Date:** Dec 2025

In [20]:
# Basic Libs
import pandas as pd
import numpy as np
import string, os, re
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Force CPU to avoid CUDA 12.0 errors with RTX 5090

# Plotting Libs
import seaborn as sns
import matplotlib.pyplot as plt

# NLP Libs
import nltk
# NLTK One-Time Downloads
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Embedding Libs
from gensim.models import Word2Vec, FastText

# Linear Model Libs
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score

# Neural Network Libs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam


In [21]:
stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)
lemmatizer = WordNetLemmatizer()

def clean_pos_lemma(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text.lower())
    tokens = text.translate(punct_table).split()
    tagged = nltk.pos_tag(tokens)
    return ' '.join(lemmatizer.lemmatize(w, pos=wordnet.ADJ if t.startswith('J')
                                         else wordnet.VERB if t.startswith('V')
                                         else wordnet.NOUN if t.startswith('N')
                                         else wordnet.ADV if t.startswith('R')
                                         else wordnet.NOUN)
                    for w, t in tagged)

def clean_light(text):
    # minimal cleaning; keeps punctuation/hashtags/mentions for n-grams
    return re.sub(r'http\S+|www\S+|https\S+', '', text.lower())

def clean_char_preserve(text):
    # for char n-grams; almost no cleaning
    return text.lower()

CLEANERS = {
    'pos_lemma': clean_pos_lemma,
    'light': clean_light,
    'char_keep': clean_char_preserve,
}

## Data & Preprocessing

In [22]:
class DisasterData:
    def __init__(self, path: str, val_size=0.2, random_state=11):
        full_train = pd.read_csv(f'{path}/train.csv')
        self.test_df = pd.read_csv(f'{path}/test.csv')
        self.train_df, self.val_df = train_test_split(
            full_train, test_size=val_size, random_state=random_state, stratify=full_train['target']
        )
        self._clean_cache = {} 

    def get_split(self, cleaner_name: str):
        """Return cleaned splits for the requested cleaner; caches to avoid recompute."""
        if cleaner_name not in CLEANERS:
            raise ValueError(f'Unknown cleaner {cleaner_name}')
        if cleaner_name not in self._clean_cache:
            fn = CLEANERS[cleaner_name]
            self._clean_cache[cleaner_name] = (
                self.train_df['text'].apply(fn),
                self.val_df['text'].apply(fn),
                self.test_df['text'].apply(fn),
            )
        X_train, X_val, X_test = self._clean_cache[cleaner_name]
        return {
            'X_train': X_train,
            'y_train': self.train_df['target'],
            'X_val': X_val,
            'y_val': self.val_df['target'],
            'X_test': X_test,
            'test_ids': self.test_df['id'],
        }

## TF-IDF Preprocessing + LogReg Baseline

In [None]:
# Baseline model runner
def run_tfidf_logreg(data, tfidf_cfg, logreg_cfg, threshold=None):
    vec = TfidfVectorizer(**tfidf_cfg)
    Xtr = vec.fit_transform(data['X_train'])
    Xval = vec.transform(data['X_val'])

    model = LogisticRegression(**logreg_cfg)
    model.fit(Xtr, data['y_train'])

    val_probs = model.predict_proba(Xval)[:, 1]
    if threshold is None:
        grid = np.linspace(0.3, 0.7, 11)
        f1s = [f1_score(data['y_val'], (val_probs >= t).astype(int)) for t in grid]
        threshold = grid[int(np.argmax(f1s))]
    val_pred = (val_probs >= threshold).astype(int)
    val_f1 = f1_score(data['y_val'], val_pred)
    return {'val_f1': val_f1, 'threshold': threshold, 'model': model, 'vectorizer': vec}

Baseline TF-IDF + LogReg F1 Score: 0.7786


In [None]:
# Define experiment config and grid
base = {
    'cleaner': 'light',
    'tfidf': {'max_features': 20000, 'ngram_range': (1,2), 'analyzer': 'word'},
    'logreg': {'C': 2.0, 'penalty': 'l2', 'max_iter': 1000, 'class_weight': 'balanced', 'n_jobs': -1},
}
grid = [
    base,
    {**base, 'cleaner': 'char_keep', 'tfidf': {**base['tfidf'], 'analyzer': 'char', 'ngram_range': (3,5)}},
    {**base, 'cleaner': 'pos_lemma'},
]

disaster_data = DisasterData(path='data')
results = []
artifacts = {}
for cfg in grid:
    split = disaster_data.get_split(cfg['cleaner'])
    res = run_tfidf_logreg(split, cfg['tfidf'], cfg['logreg'])
    name = f"{cfg['cleaner']}_{cfg['tfidf']['analyzer']}_{cfg['tfidf']['ngram_range']}"
    results.append({'name': name, 'val_f1': res['val_f1'], 'threshold': res['threshold']})
    artifacts[name] = (res, split, cfg)

pd.DataFrame(results).sort_values('val_f1', ascending=False)

In [None]:
best_name = pd.DataFrame(results).sort_values('val_f1', ascending=False).iloc[0]['name']
res, split, cfg = artifacts[best_name]

# Refit on full train (train+val) with same cleaner/vectorizer settings
full_clean = pd.concat([split['X_train'], split['X_val']])
full_y = pd.concat([split['y_train'], split['y_val']])

vec = TfidfVectorizer(**cfg['tfidf'])
Xfull = vec.fit_transform(full_clean)
model = LogisticRegression(**cfg['logreg'])
model.fit(Xfull, full_y)

Xtest = vec.transform(split['X_test'])
test_pred = (model.predict_proba(Xtest)[:,1] >= res['threshold']).astype(int)

print(f'Best model: {best_name}')
print(f'F1 Score: {res["val_f1"]:.4f}')
print(f'Threshold: {res["threshold"]:.4f}')

# No baseline submission

## RNN Model (Keras)

In [None]:
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(disaster_data.X_train)

X_train_seq = tokenizer.texts_to_sequences(disaster_data.X_train)
X_val_seq = tokenizer.texts_to_sequences(disaster_data.X_val)
X_test_seq = tokenizer.texts_to_sequences(disaster_data.X_test)

maxlen = 40

X_train_pad = pad_sequences(X_train_seq, padding='post', maxlen=maxlen)
X_val_pad = pad_sequences(X_val_seq, padding='post', maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, padding='post', maxlen=maxlen)

vocab_size = min(20000, len(tokenizer.word_index) + 1)
embedding_dim = 100

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)

model.summary()



In [None]:
history = model.fit(
    X_train_pad, disaster_data.y_train,
    epochs=3,
    batch_size=64,
    validation_data=(X_val_pad, disaster_data.y_val),
    )

val_pred_prob = model.predict(X_val_pad).ravel()
val_pred = (val_pred_prob >= 0.5).astype(int)
val_f1_rnn = f1_score(disaster_data.y_val, val_pred)
print(f'LSTM F1 Score: {val_f1_rnn:.4f}')

Epoch 1/3
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.5703 - loss: 0.6845 - val_accuracy: 0.5706 - val_loss: 0.6855
Epoch 2/3
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6169 - loss: 0.6570 - val_accuracy: 0.7498 - val_loss: 0.5451
Epoch 3/3
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6969 - loss: 0.5637 - val_accuracy: 0.5673 - val_loss: 0.6588
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
LSTM F1 Score: 0.6451


### Kaggle Submission

In [None]:
test_pred_prob = model.predict(X_test_pad).ravel()
test_pred = (test_pred_prob >= 0.5).astype(int)

disaster_data.submission = pd.DataFrame({
    'id': disaster_data.test_df.id,
    'target': test_pred
})
disaster_data.submission.to_csv('submission.csv', index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


## Error Analysis

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

mis_idx = np.where(val_pred != disaster_data.y_val)[0][:3]

for i in mis_idx:
    text_i = disaster_data.X_val_raw.iloc[i]
    vec_i = X_val_tfidf[i]

    # find nearest neighbors in training set
    sims = cosine_similarity(vec_i, X_train_tfidf).ravel()
    top_idx = sims.argsort()[-3:][::-1]

    print('--- Misclassified Text ---')
    print('Text:', text_i)
    print('Pred:', val_pred[i])
    print('True:', disaster_data.y_val.iloc[i])
    print('Nearest Neighbors:')
    for idx in top_idx:
        print(f'Text: {disaster_data.X_train_raw.iloc[idx]}')
    print()

--- Misclassified Text ---
Text: @Blizzard_draco @LoneWolffur I need this.
Pred: 1
True: 0
Nearest Neighbors:
Text: @Blizzard_draco @LoneWolffur also me please I would very much like a link
Text: @LoneWolffur control yourself tora
Text: @LoneWolffur BRUH *dies*

--- Misclassified Text ---
Text: #news Politifiact: Harry Reid's '30 Percent of Women Served' Planned Parenthood Claim Is a 'Pants on Fire' Lie... http://t.co/bMSeDZOfSV
Pred: 1
True: 0
Nearest Neighbors:
Text: Politifiact: Harry Reid's '30 Percent of Women Served' Planned Parenthood Claim Is a 'Pants on Fire' Lie http://t.co/aMYMwWcpYm | #tcot
Text: 30 seconds for my bitches to evacuate ??????
Text: @NoahCRothman Bore him with minutiae serve bad champagne. He may just explode.

--- Misclassified Text ---
Text: @mustachemurse @dateswhitecoats the truth. I pulled a 16 out. And apparently a 22 in the crazy adult trauma. And they mocked me for the 22.
Pred: 1
True: 0
Nearest Neighbors:
Text: SCREAMING IN 22 DIFFERENT LANGUAGES htt