In [29]:
import numpy as np
import pandas as pd

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.callbacks import EarlyStopping, ModelCheckpoint

print("Numpy version: ", np.__version__)
print("Pandas version: ", pd.__version__)
print("Sklearn version: ", sk.__version__)
print("TensorFlow version: ", tf.__version__)

Numpy version:  1.26.4
Pandas version:  2.2.3
Sklearn version:  1.6.1
TensorFlow version:  2.18.0


In [8]:
from gensim.models import Word2Vec
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /Users/zoe/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# Locations of the two CSV splits, relative to the notebook's working directory.
train_path, test_path = 'Data/train.csv', 'Data/test.csv'

# Read each split into its own DataFrame.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Show the first rows of the training split as the cell output.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Quick structural overview of the training split: shape, dtypes, null counts.
print(train_df.shape)
print("=" * 50)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_df.info()
print("=" * 50)
print(train_df.isnull().sum())

(7613, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [35]:
class NlpProcessor:
    """Fit and apply four text representations on the same corpus.

    The representations are bag-of-words counts, TF-IDF weights, averaged
    Word2Vec vectors and padded integer token sequences. Call
    ``train_text_processors`` on the training texts before calling
    ``transform_text`` on anything else, because the latter relies on the
    fitted vectorizers, the trained Word2Vec model and ``self.maxlen``.
    """

    def __init__(self):
        """Create the unfitted vectorizers, Word2Vec model and tokenizer."""
        self.bow_vectorizer = CountVectorizer(max_features=3000)
        self.tfidf_vectorizer = TfidfVectorizer(max_features=3000)
        self.w2v_model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
        self.tokenizer = Tokenizer(num_words=5000)
        # Length every token sequence is padded to; set by train_text_processors.
        self.maxlen = 0

    def preprocessing(self, df):
        """Return a copy of ``df`` with missing keyword/location set to ''.

        The input frame is left untouched so that re-running the calling cell
        (or inspecting the raw frame afterwards) still sees the original data.

        Args:
            df: DataFrame containing ``keyword`` and ``location`` columns.

        Returns:
            A new DataFrame with the same columns in the same order.
        """
        return df.assign(
            keyword=df['keyword'].fillna(''),
            location=df['location'].fillna(''),
        )

    def transform_text(self, texts):
        """Convert ``texts`` with the already-fitted processors.

        Args:
            texts: Iterable of strings that can be iterated more than once.

        Returns:
            Tuple ``(X_bow, X_tfidf, X_w2v, X_token)`` holding, in order, the
            bag-of-words output, the TF-IDF output, an array with one averaged
            Word2Vec vector per text, and the token sequences padded to
            ``self.maxlen``.
        """
        # 1. BoW
        X_bow = self.bow_vectorizer.transform(texts)

        # 2. TF-IDF
        X_tfidf = self.tfidf_vectorizer.transform(texts)

        # 3. W2V
        X_w2v = np.array([self.text_to_vec(text) for text in texts])

        # 4. Tokenization
        X_seq = self.tokenizer.texts_to_sequences(texts)
        X_token = pad_sequences(X_seq, maxlen=self.maxlen)

        return X_bow, X_tfidf, X_w2v, X_token

    def train_text_processors(self, texts):
        """Fit every processor on ``texts`` and return their transformed form.

        Args:
            texts: Iterable of strings that can be iterated more than once.

        Returns:
            The same 4-tuple as ``transform_text(texts)``.
        """
        # 1. BoW
        self.bow_vectorizer.fit(texts)

        # 2. TF-IDF
        self.tfidf_vectorizer.fit(texts)

        # 3. Word Embeddings (Word2Vec)
        tokenized_texts = [word_tokenize(text.lower()) for text in texts]
        self.w2v_model.build_vocab(tokenized_texts)
        self.w2v_model.train(tokenized_texts, total_examples=self.w2v_model.corpus_count, epochs=10)

        # 4. Tokenization
        self.tokenizer.fit_on_texts(texts)
        X_seq = self.tokenizer.texts_to_sequences(texts)
        # Pad to the longest training sequence so no training text is truncated.
        self.maxlen = max(len(seq) for seq in X_seq)

        return self.transform_text(texts)

    def text_to_vec(self, text):
        """Average the Word2Vec vectors of the words in ``text``.

        Words missing from the Word2Vec vocabulary are skipped.

        Returns:
            The element-wise mean of the found word vectors, or a zero vector
            of length ``self.w2v_model.vector_size`` when none were found.
        """
        words = word_tokenize(text.lower())
        word_vecs = []
        for word in words:
            try:
                word_vecs.append(self.w2v_model.wv[word])
            except KeyError:
                # Out-of-vocabulary word: contributes nothing to the average.
                continue

        return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(self.w2v_model.vector_size)

In [None]:
class Model:
    """Small wrapper around a Keras bidirectional-LSTM binary classifier.

    The network built by ``build_lstm`` starts with an ``Embedding`` layer, so
    its inputs are expected to be integer indices in ``[0, input_dim)``.
    """

    def __init__(self):
        """Start with an empty Sequential model; ``build_lstm`` replaces it."""
        self.model = Sequential()

    def build_lstm(self, input_dim, input_len):
        """Build and compile the BiLSTM classifier, storing it on ``self.model``.

        Args:
            input_dim: Size of the Embedding vocabulary.
            input_len: Number of timesteps (columns) in each input row.
        """
        self.model = Sequential([
            # The sequence length is supplied through model.build() below.
            # The former hard-coded input_length=50 ignored `input_len` and is
            # a deprecated Embedding argument in Keras 3.
            Embedding(input_dim=input_dim, output_dim=128),
            Bidirectional(LSTM(64, return_sequences=True)),
            Dropout(0.2),
            Bidirectional(LSTM(32)),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])

        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model.build(input_shape=(None, input_len))

    def train_lstm(self, X_train, X_val, y_train, y_val, epochs=50, batch_size=32,
                   checkpoint_path="best_model.h5"):
        """Fit ``self.model`` with early stopping and best-model checkpointing.

        Args:
            X_train, y_train: Training inputs and labels.
            X_val, y_val: Validation inputs and labels, used for monitoring.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size passed to ``fit``.
            checkpoint_path: File the best model is saved to. Pass a distinct
                path per run to avoid overwriting an earlier run's checkpoint.

        Returns:
            Whatever ``self.model.fit`` returns.
        """
        # Stop once val_loss has not improved for 5 epochs and roll the
        # weights back to the best epoch seen.
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        # Persist only the model with the lowest val_loss.
        checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

        return self.model.fit(X_train, y_train,
                              epochs=epochs,
                              batch_size=batch_size,
                              validation_data=(X_val, y_val),
                              callbacks=[early_stop, checkpoint]
                             )

In [None]:
# Processor object that owns the four text vectorizers.
NP = NlpProcessor()

# Replace missing keyword/location values with empty strings.
train_df = NP.preprocessing(train_df)

# The text column is the model input; the target column is the label.
X, y = train_df["text"], train_df["target"]

# Fit every vectorizer on the corpus and collect each representation.
(X_bow, X_tfidf, X_w2v, X_token) = NP.train_text_processors(X)

In [37]:
# Each entry maps a representation name to [vocabulary size, feature matrix].
datasets = {
    'bow': [len(NP.bow_vectorizer.vocabulary_), X_bow],
    'tfidf': [len(NP.tfidf_vectorizer.vocabulary_), X_tfidf],
    'w2v': [len(NP.w2v_model.wv.index_to_key), X_w2v],
    'tokenize': [NP.tokenizer.num_words, X_token]
}

# Validation accuracy per representation, kept in memory so later cells can
# reuse the numbers instead of transcribing them from the printed log.
results = {}

# NOTE(review): Model.build_lstm starts with an Embedding layer, which looks up
# integer token ids. Only the 'tokenize' matrix holds such ids; the BoW, TF-IDF
# and W2V matrices hold counts/weights/float vectors, so their scores from this
# loop probably do not reflect those representations. Confirm, and consider a
# dense-input model for them.
for data, (input_dim, X_convert) in datasets.items():

    # Same split for every representation: fixed seed, 20% held out.
    X_train, X_val, y_train, y_val = train_test_split(X_convert, y, test_size=0.2, random_state=42)

    model = Model()
    model.build_lstm(input_dim, X_convert.shape[1])
    history = model.train_lstm(X_train, X_val, y_train, y_val)
    loss, accuracy = model.model.evaluate(X_val, y_val)
    results[data] = accuracy

    print(f'Text processor: {data}')
    print(f'Validation Accuracy: {accuracy:.4f}')

Epoch 1/50




[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5646 - loss: 0.6856



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 2s/step - accuracy: 0.5646 - loss: 0.6856 - val_accuracy: 0.5739 - val_loss: 0.6890
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5738 - loss: 0.6854



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 2s/step - accuracy: 0.5738 - loss: 0.6854 - val_accuracy: 0.5739 - val_loss: 0.6827
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7s/step - accuracy: 0.5505 - loss: 0.6898



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2286s[0m 12s/step - accuracy: 0.5506 - loss: 0.6898 - val_accuracy: 0.5765 - val_loss: 0.6824
Epoch 4/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5597 - loss: 0.6878



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m430s[0m 2s/step - accuracy: 0.5598 - loss: 0.6878 - val_accuracy: 0.5752 - val_loss: 0.6818
Epoch 5/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5691 - loss: 0.6799



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m454s[0m 2s/step - accuracy: 0.5691 - loss: 0.6799 - val_accuracy: 0.5752 - val_loss: 0.6742
Epoch 6/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 3s/step - accuracy: 0.5781 - loss: 0.6730 - val_accuracy: 0.5739 - val_loss: 0.6746
Epoch 7/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m567s[0m 3s/step - accuracy: 0.5740 - loss: 0.6713 - val_accuracy: 0.5666 - val_loss: 0.6758
Epoch 8/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 2s/step - accuracy: 0.5706 - loss: 0.6752 - val_accuracy: 0.5752 - val_loss: 0.6823
Epoch 9/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 2s/step - accuracy: 0.5720 - loss: 0.6862 - val_accuracy: 0.5712 - val_loss: 0.6836
Epoch 10/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m416s[0m 2s/step - accuracy: 0.5718 - loss: 0.6838 - val_accuracy: 0.5745 - val_loss: 0.6823
[1m48/48[0m [32m━━━━━━━━━━━━━



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 2s/step - accuracy: 0.5696 - loss: 0.6877 - val_accuracy: 0.5739 - val_loss: 0.6827
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5754 - loss: 0.6842



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 2s/step - accuracy: 0.5754 - loss: 0.6842 - val_accuracy: 0.5739 - val_loss: 0.6826
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 2s/step - accuracy: 0.5747 - loss: 0.6830 - val_accuracy: 0.5739 - val_loss: 0.6834
Epoch 4/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5677 - loss: 0.6841



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 2s/step - accuracy: 0.5677 - loss: 0.6841 - val_accuracy: 0.5739 - val_loss: 0.6822
Epoch 5/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 2s/step - accuracy: 0.5680 - loss: 0.6853 - val_accuracy: 0.5739 - val_loss: 0.6823
Epoch 6/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m515s[0m 3s/step - accuracy: 0.5715 - loss: 0.6834 - val_accuracy: 0.5739 - val_loss: 0.6822
Epoch 7/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 2s/step - accuracy: 0.5829 - loss: 0.6795 - val_accuracy: 0.5739 - val_loss: 0.6822
Epoch 8/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 2s/step - accuracy: 0.5697 - loss: 0.6840 - val_accuracy: 0.5739 - val_loss: 0.6823
Epoch 9/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 2s/step - accuracy: 0.5728 - loss: 0.6834 - val_accuracy: 0.5739 - val_loss: 0.6824
[1m48/48[0m [32m━━━━━━━━━━━━━━



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 69ms/step - accuracy: 0.5710 - loss: 0.6810 - val_accuracy: 0.6402 - val_loss: 0.6447
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.6323 - loss: 0.6404



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 83ms/step - accuracy: 0.6324 - loss: 0.6403 - val_accuracy: 0.6573 - val_loss: 0.6265
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.6577 - loss: 0.6172



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 87ms/step - accuracy: 0.6577 - loss: 0.6172 - val_accuracy: 0.6973 - val_loss: 0.5967
Epoch 4/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 84ms/step - accuracy: 0.6629 - loss: 0.6114 - val_accuracy: 0.6848 - val_loss: 0.6017
Epoch 5/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.6759 - loss: 0.6030



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 83ms/step - accuracy: 0.6759 - loss: 0.6031 - val_accuracy: 0.6947 - val_loss: 0.5937
Epoch 6/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 85ms/step - accuracy: 0.6738 - loss: 0.6061 - val_accuracy: 0.6750 - val_loss: 0.6054
Epoch 7/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 83ms/step - accuracy: 0.6699 - loss: 0.6089 - val_accuracy: 0.6967 - val_loss: 0.5956
Epoch 8/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.6728 - loss: 0.6012



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 85ms/step - accuracy: 0.6728 - loss: 0.6013 - val_accuracy: 0.6960 - val_loss: 0.5923
Epoch 9/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 83ms/step - accuracy: 0.6726 - loss: 0.6093 - val_accuracy: 0.6940 - val_loss: 0.5934
Epoch 10/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 83ms/step - accuracy: 0.6750 - loss: 0.6045 - val_accuracy: 0.7026 - val_loss: 0.5933
Epoch 11/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 83ms/step - accuracy: 0.6850 - loss: 0.5975 - val_accuracy: 0.6842 - val_loss: 0.5939
Epoch 12/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 83ms/step - accuracy: 0.6778 - loss: 0.5958 - val_accuracy: 0.6664 - val_loss: 0.6028
Epoch 13/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 86ms/step - accuracy: 0.6846 - loss: 0.5869 - val_accuracy: 0.6927 - val_loss: 0.5961
[1m48/48[0m [32m━━━━



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.6701 - loss: 0.5992 - val_accuracy: 0.8043 - val_loss: 0.4485
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.8582 - loss: 0.3444 - val_accuracy: 0.8109 - val_loss: 0.4512
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9047 - loss: 0.2541 - val_accuracy: 0.7846 - val_loss: 0.5180
Epoch 4/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.9288 - loss: 0.2044 - val_accuracy: 0.7656 - val_loss: 0.5506
Epoch 5/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9514 - loss: 0.1522 - val_accuracy: 0.7505 - val_loss: 0.6900
Epoch 6/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9659 - loss: 0.1195 - val_accuracy: 0.7315 - val_loss: 0.8237
[1m48/48[0m [32m━━━━━━━━━━━━━━

In [None]:
# Text processor: bow
# Validation Accuracy: 0.5752

# Text processor: tfidf
# Validation Accuracy: 0.5739

# Text processor: w2v
# Validation Accuracy: 0.6960

# Text processor: tokenize
# Validation Accuracy: 0.8043