In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Input, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2

from keras.callbacks import EarlyStopping, ModelCheckpoint

# Record the library versions this notebook was executed with.
for label, version in (
    ("Numpy version: ", np.__version__),
    ("Pandas version: ", pd.__version__),
    ("Sklearn version: ", sk.__version__),
    ("TensorFlow version: ", tf.__version__),
):
    print(label, version)

Numpy version:  1.26.4
Pandas version:  2.2.2
Sklearn version:  1.6.1
TensorFlow version:  2.18.0


In [2]:
# Read the train and test CSV files, then preview the first training rows.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Data Engineering

In [3]:
def preprocessing(df):
    """Return a copy of ``df`` with missing keyword/location values set to ''.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes the frame that was originally loaded.

    Args:
        df: DataFrame containing ``keyword`` and ``location`` columns.

    Returns:
        A new DataFrame with the same columns in the same order, where NaN
        entries in ``keyword`` and ``location`` are replaced by the empty
        string.

    Raises:
        KeyError: if ``keyword`` or ``location`` is not a column of ``df``.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(
        keyword=df['keyword'].fillna(''),
        location=df['location'].fillna(''),
    )

In [4]:
# Fill missing keyword/location values, then pull out the feature columns and the label.
train_df = preprocessing(train_df)
X_text, X_keyword, X_location = train_df["text"], train_df['keyword'], train_df['location']
y = train_df["target"]

# Vocabulary cap for the text tokenizer and the fixed padded lengths.
MAX_VOCAB_SIZE = 5000
TEXT_LENGTH = 50
KEYWORD_LENGTH = 5

# NOTE(review): both tokenizers are fit on every training row before the
# train/validation split in the next cell, so the validation rows contribute
# to the vocabulary -- confirm this is acceptable.

# 1. Text: fit a tokenizer, convert to integer sequences, pad at the end.
text_tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_VOCAB_SIZE)
text_tokenizer.fit_on_texts(X_text)
X_text_seq = text_tokenizer.texts_to_sequences(X_text)
X_text_padded = pad_sequences(X_text_seq, padding="post", maxlen=TEXT_LENGTH)

# 2. Keyword: same pipeline with its own tokenizer and a shorter length.
keyword_tokenizer = Tokenizer(oov_token="<OOV>", num_words=1000)
keyword_tokenizer.fit_on_texts(X_keyword)
X_keyword_seq = keyword_tokenizer.texts_to_sequences(X_keyword)
X_keyword_padded = pad_sequences(X_keyword_seq, padding="post", maxlen=KEYWORD_LENGTH)

# 3. Location: integer label per distinct location string.
# NOTE(review): X_location_encoded is not referenced by the later cells shown here.
location_encoder = LabelEncoder().fit(X_location)
X_location_encoded = location_encoder.transform(X_location)

In [5]:
# Split into training and validation sets (20% held out, fixed random_state).
(
    X_train_text, X_val_text,
    X_train_keyword, X_val_keyword,
    y_train, y_val,
) = train_test_split(
    X_text_padded,
    X_keyword_padded,
    y,
    random_state=42,
    test_size=0.2,
)

# Model inputs are [text, keyword]; location is left out.
X_train, X_val = [X_train_text, X_train_keyword], [X_val_text, X_val_keyword]


## Model Training

In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Concatenate
from tensorflow.keras.optimizers import Adam

# Embedding table sizes come from each tokenizer's num_words setting.
VOCAB_SIZE_TEXT = text_tokenizer.num_words
VOCAB_SIZE_KEYWORD = keyword_tokenizer.num_words
EMBEDDING_DIM = 128
LSTM_UNITS = 64

# First input: text -> embedding -> bidirectional LSTM -> dropout.
input_text = Input(name="input_text", shape=(TEXT_LENGTH,))
e1 = Embedding(input_dim=VOCAB_SIZE_TEXT, output_dim=EMBEDDING_DIM)(input_text)
l1_1 = Bidirectional(
    LSTM(units=LSTM_UNITS, return_sequences=False, name="lstm_1")
)(e1)
l1_1 = Dropout(rate=0.3)(l1_1)

# Second input: keyword -> embedding -> bidirectional LSTM -> dropout.
input_keyword = Input(name="input_keyword", shape=(KEYWORD_LENGTH,))
e2 = Embedding(input_dim=VOCAB_SIZE_KEYWORD, output_dim=EMBEDDING_DIM)(input_keyword)
l2_1 = Bidirectional(
    LSTM(units=LSTM_UNITS, return_sequences=False, name="lstm_2")
)(e2)
l2_1 = Dropout(rate=0.3)(l2_1)

# Merge both branches, then an L2-regularised dense layer and a sigmoid output unit.
merged = Concatenate()([l1_1, l2_1])
dense = Dense(units=64, activation="relu", kernel_regularizer=l2(0.01))(merged)
output = Dense(units=1, activation="sigmoid")(dense)

# Build and compile the two-input model.
model = Model(inputs=[input_text, input_keyword], outputs=output)
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Show the model architecture.
model.summary()


In [7]:
EPOCHS = 50
BATCH_SIZE = 128

# Early stopping on val_loss with a patience of 5 epochs; restore_best_weights is enabled.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Checkpoint to disk whenever val_loss improves (save_best_only).
checkpoint = ModelCheckpoint(
    filepath="best_model.h5",
    monitor='val_loss',
    save_best_only=True,
)

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stop, checkpoint],
)

# Score the trained model on the validation split.
loss, accuracy = model.evaluate(x=X_val, y=y_val)
print(f'Validation Accuracy: {accuracy:.4f}')

Epoch 1/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step - accuracy: 0.5886 - loss: 1.4669



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 281ms/step - accuracy: 0.5898 - loss: 1.4625 - val_accuracy: 0.7538 - val_loss: 0.8512
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step - accuracy: 0.8027 - loss: 0.7010



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 272ms/step - accuracy: 0.8030 - loss: 0.6993 - val_accuracy: 0.8194 - val_loss: 0.5593
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step - accuracy: 0.8739 - loss: 0.4133



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 273ms/step - accuracy: 0.8740 - loss: 0.4128 - val_accuracy: 0.8076 - val_loss: 0.5009
Epoch 4/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 259ms/step - accuracy: 0.9137 - loss: 0.2755 - val_accuracy: 0.7991 - val_loss: 0.5375
Epoch 5/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 259ms/step - accuracy: 0.9338 - loss: 0.2251 - val_accuracy: 0.7800 - val_loss: 0.5475
Epoch 6/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 273ms/step - accuracy: 0.9431 - loss: 0.1893 - val_accuracy: 0.7840 - val_loss: 0.5794
Epoch 7/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 242ms/step - accuracy: 0.9426 - loss: 0.1822 - val_accuracy: 0.7577 - val_loss: 0.6546
Epoch 8/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 267ms/step - accuracy: 0.9569 - loss: 0.1553 

In [8]:
# Report loss and accuracy on the validation split.
loss, accuracy = model.evaluate(x=X_val, y=y_val)
print(f'Validation Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.7942 - loss: 0.5292
Validation Accuracy: 0.8076


## Model Prediction

In [9]:
test_df = preprocessing(test_df)

# Tokenize and pad text.
# padding="post" must match the training cell: pad_sequences pads at the
# front by default, which would place the zeros on the opposite side from
# what the model was trained on.
X_test_text = text_tokenizer.texts_to_sequences(test_df["text"])
X_test_text = pad_sequences(X_test_text, maxlen=TEXT_LENGTH, padding="post")

# Tokenize and pad keyword (same post-padding as in training).
X_test_keyword = keyword_tokenizer.texts_to_sequences(test_df["keyword"])
X_test_keyword = pad_sequences(X_test_keyword, maxlen=KEYWORD_LENGTH, padding="post")

# Build the test-set inputs in the same [text, keyword] order used for training.
X_test = [X_test_text, X_test_keyword]

# Run prediction.
predictions = model.predict(X_test)


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step


In [10]:
# Make predictions and threshold the sigmoid outputs at 0.5 to get 0/1 labels.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob >= 0.5).astype(int).flatten()

# Create submission file
submission = pd.DataFrame({
    "id": test_df["id"],  # Ensure the test set has an "id" column
    "target": y_pred
})

# Save to CSV. The timestamp goes before a single ".csv" extension; the
# previous concatenation produced names like "sample_submission.csv<timestamp>.csv".
current_time = datetime.now().strftime('%Y%m%d_%H%M')
submission.to_csv(f"/content/sample_submission_{current_time}.csv", index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step
