In [None]:
import os
from datetime import datetime

import numpy as np
import pandas as pd

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Input, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.callbacks import EarlyStopping, ModelCheckpoint

# Record the library versions this notebook was executed with.
for lib_name, lib_module in (
    ("Numpy", np),
    ("Pandas", pd),
    ("Sklearn", sk),
    ("TensorFlow", tf),
):
    print(f"{lib_name} version: ", lib_module.__version__)

Numpy version:  1.26.4
Pandas version:  2.2.3
Sklearn version:  1.6.1
TensorFlow version:  2.18.0


In [8]:
# Read the training and test CSV files from the local Data/ folder.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('Data/train.csv', 'Data/test.csv')
)

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Data Engineering

In [9]:
def preprocessing(df):
    """Return a copy of ``df`` with missing keyword/location values filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``keyword`` and ``location`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which NaN entries of ``keyword`` and ``location`` are
        replaced by the empty string. All other columns are carried over
        unchanged.

    Raises
    ------
    KeyError
        If ``keyword`` or ``location`` is not a column of ``df``.
    """
    # `assign` builds a new frame, so the caller's frame is left untouched and
    # re-running a cell that calls this function stays idempotent.
    return df.assign(
        keyword=df['keyword'].fillna(''),
        location=df['location'].fillna(''),
    )

In [22]:
# Sizes used by the tokenizers and by the padded sequences.
MAX_VOCAB_SIZE = 5000
TEXT_LENGTH = 50
KEYWORD_LENGTH = 5

# Fill missing keyword/location values, then pull out the model inputs and the label.
train_df = preprocessing(train_df)
X_text = train_df["text"]
X_keyword = train_df["keyword"]
X_location = train_df["location"]
y = train_df["target"]

# Text branch: fit a word-level tokenizer, then pad each sequence at the end
# up to TEXT_LENGTH.
text_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
text_tokenizer.fit_on_texts(X_text)
X_text_seq = text_tokenizer.texts_to_sequences(X_text)
X_text_padded = pad_sequences(
    X_text_seq,
    maxlen=TEXT_LENGTH,
    padding="post",
)

# Keyword branch: same recipe with its own tokenizer and a shorter length.
keyword_tokenizer = Tokenizer(num_words=1000, oov_token="<OOV>")
keyword_tokenizer.fit_on_texts(X_keyword)
X_keyword_seq = keyword_tokenizer.texts_to_sequences(X_keyword)
X_keyword_padded = pad_sequences(
    X_keyword_seq,
    maxlen=KEYWORD_LENGTH,
    padding="post",
)

# Location branch: each distinct location string becomes one integer id.
location_encoder = LabelEncoder()
X_location_encoded = location_encoder.fit_transform(X_location)

In [16]:
# Split all three feature arrays and the labels with one call so the rows stay
# aligned; 20% of the rows go to validation.
(
    X_train_text, X_val_text,
    X_train_keyword, X_val_keyword,
    X_train_location, X_val_location,
    y_train, y_val,
) = train_test_split(
    X_text_padded,
    X_keyword_padded,
    X_location_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Group the inputs in the order text, keyword, location.
X_train = [X_train_text, X_train_keyword, X_train_location]
X_val = [X_val_text, X_val_keyword, X_val_location]

## Model Training

In [None]:
# Embedding table sizes and layer widths.
VOCAB_SIZE_TEXT = text_tokenizer.num_words
VOCAB_SIZE_KEYWORD = keyword_tokenizer.num_words
VOCAB_SIZE_LOCATION = len(set(X_location))
EMBEDDING_DIM = 128
LSTM_UNITS = 64

# First input: padded tweet text -> embedding -> bidirectional LSTM.
# (Dropout after the LSTM is currently not applied.)
input_text = Input(shape=(TEXT_LENGTH,), name="input_text")
text_embedded = Embedding(input_dim=VOCAB_SIZE_TEXT, output_dim=EMBEDDING_DIM)(input_text)
text_features = Bidirectional(
    LSTM(units=LSTM_UNITS, return_sequences=False, name="lstm_1")
)(text_embedded)

# Second input: padded keyword tokens -> embedding -> bidirectional LSTM.
# (Dropout after the LSTM is currently not applied.)
input_keyword = Input(shape=(KEYWORD_LENGTH,), name="input_keyword")
keyword_embedded = Embedding(input_dim=VOCAB_SIZE_KEYWORD, output_dim=EMBEDDING_DIM)(input_keyword)
keyword_features = Bidirectional(
    LSTM(units=LSTM_UNITS, return_sequences=False, name="lstm_2")
)(keyword_embedded)

# Third input: a single location id -> embedding, pooled over the length-1 axis.
input_location = Input(shape=(1,), name="input_location")
location_embedded = Embedding(input_dim=VOCAB_SIZE_LOCATION, output_dim=EMBEDDING_DIM)(input_location)
location_features = GlobalAveragePooling1D()(location_embedded)

# Merge the three branches and classify with a sigmoid output unit.
merged = Concatenate()([text_features, keyword_features, location_features])
hidden = Dense(units=64, activation="relu")(merged)
output = Dense(units=1, activation="sigmoid")(hidden)

model = Model(
    inputs=[input_text, input_keyword, input_location],
    outputs=output,
)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [133]:
EPOCHS = 50
BATCH_SIZE = 32

# Callbacks, in order:
#   1. stop once val_loss has not improved for 5 epochs and restore the best weights;
#   2. write the model with the lowest val_loss to best_model.h5.
training_callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ModelCheckpoint("best_model.h5", monitor='val_loss', save_best_only=True),
]

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=training_callbacks,
)

# Report accuracy on the validation split after training.
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy:.4f}')

Epoch 1/50
[1m189/191[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.6627 - loss: 0.5999



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.6640 - loss: 0.5985 - val_accuracy: 0.7984 - val_loss: 0.4408
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.8759 - loss: 0.3118 - val_accuracy: 0.7748 - val_loss: 0.4904
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9396 - loss: 0.1713 - val_accuracy: 0.7754 - val_loss: 0.5927
Epoch 4/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9668 - loss: 0.0994 - val_accuracy: 0.7597 - val_loss: 0.6985
Epoch 5/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.9770 - loss: 0.0677 - val_accuracy: 0.7479 - val_loss: 0.8798
Epoch 6/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.9855 - loss: 0.0412 - val_accuracy: 0.7315 - val_loss: 1.0035
[1m48/48[0m [32m━━━━━━━━━━━━━━

In [134]:
# Score the current model on the validation split and report its accuracy.
loss, accuracy = model.evaluate(x=X_val, y=y_val)
print(f'Validation Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7918 - loss: 0.4580
Validation Accuracy: 0.7984


## Model Prediction

In [115]:
# Build the test inputs with the SAME transformations that produced the
# training inputs: the fitted tokenizers, the same padding side, and the
# location ids learned from the training data.
test_df = preprocessing(test_df)

# Text: the training sequences were padded with padding="post"; the
# pad_sequences default is "pre", so the side must be given explicitly.
X_test_text = text_tokenizer.texts_to_sequences(test_df["text"])
X_test_text = pad_sequences(X_test_text, maxlen=TEXT_LENGTH, padding="post")

# Keyword: same tokenizer and padding side as in training.
X_test_keyword = keyword_tokenizer.texts_to_sequences(test_df["keyword"])
X_test_keyword = pad_sequences(X_test_keyword, maxlen=KEYWORD_LENGTH, padding="post")

# Location: reuse the ids assigned by the encoder fitted on the training data.
# Refitting on the test set would hand out different ids for the same string.
# Locations never seen in training fall back to the id of the empty string
# (the "missing location" value written by preprocessing), or to 0 if the
# encoder has no empty-string class.
location_to_id = {loc: idx for idx, loc in enumerate(location_encoder.classes_)}
unknown_location_id = location_to_id.get("", 0)
X_test_location = np.array(
    [location_to_id.get(loc, unknown_location_id) for loc in test_df["location"]],
    dtype=np.int64,
)

# Same input order as the training lists: text, keyword, location.
X_test = [X_test_text, X_test_keyword, X_test_location]

In [131]:
# Predict a probability per test row and threshold at 0.5 to get 0/1 labels.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob >= 0.5).astype(int).flatten()

# Create the submission frame: one row per test id with its predicted label.
submission = pd.DataFrame({
    "id": test_df["id"],  # Ensure the test set has an "id" column
    "target": y_pred
})

# Make sure the output folder exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs("Prediction", exist_ok=True)

# Save to CSV with a timestamp so earlier submissions are not overwritten.
current_time = datetime.now().strftime('%Y%m%d_%H%M')
submission.to_csv("Prediction/submission_" + current_time + ".csv", index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
