In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


In [None]:
# Step 1: Load the raw tweet dataset into a DataFrame.
data = pd.read_csv(
    '/content/Tweet Disaster All CSV File v2.0 - Tweet Disaster All CSV File.csv.csv'
)

  data = pd.read_csv('/content/Tweet Disaster All CSV File v2.0 - Tweet Disaster All CSV File.csv.csv')


In [None]:
# Keep only the tweet text and the flood flag, drop rows where either is
# missing, then recode the flag from the dataset's 4/5 convention to 0/1
# (4 -> 0 = non-flood, 5 -> 1 = flood). Flag values other than 4 or 5 become NaN.
filtered_data = (
    data[['Tweet Text', 'FloodFlag (4:Non_Flood; 5:Flood)']]
    .dropna()
    .assign(**{
        'FloodFlag (4:Non_Flood; 5:Flood)':
            lambda frame: frame['FloodFlag (4:Non_Flood; 5:Flood)'].map({4: 0, 5: 1})
    })
)

In [None]:
# Pull the model inputs and targets out of the frame as plain arrays:
# tweets are forced to str, labels to int.
texts = (
    filtered_data.loc[:, 'Tweet Text']
    .astype(str)
    .values
)
labels = (
    filtered_data.loc[:, 'FloodFlag (4:Non_Flood; 5:Flood)']
    .astype(int)
    .values
)

In [None]:
# Hold out 20% of the tweets for validation. The split is stratified on the
# label so both parts keep the same class balance, and seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# Step 2: Tokenization and padding settings
# max_words caps the tokenizer vocabulary; max_len is the fixed length every
# tokenized tweet is padded or truncated to.
max_words = 20_000
max_len = 50


In [None]:
# Build the vocabulary from the training tweets only, so validation text does
# not influence it; words outside the vocabulary are encoded as "<UNK>".
tokenizer = Tokenizer(
    oov_token="<UNK>",
    num_words=max_words,
)
tokenizer.fit_on_texts(X_train)

In [None]:
# Encode each split as lists of integer word indices using the fitted tokenizer.
X_train_seq, X_val_seq = [
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val)
]

In [None]:
# Bring every sequence to exactly max_len tokens: shorter ones are padded at
# the end, longer ones are cut at the end.
X_train_pad = pad_sequences(
    X_train_seq,
    maxlen=max_len,
    padding='post',
    truncating='post',
)
X_val_pad = pad_sequences(
    X_val_seq,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

In [None]:
# Size of the embedding table: the number of fitted words plus one extra slot
# (Keras' Tokenizer never assigns index 0 to a word), capped at max_words.
vocab_size = min(len(tokenizer.word_index) + 1, max_words)

In [None]:
# Step 3: Build the BiLSTM model
embedding_dim = 128

# Seed Python, NumPy and TensorFlow before any weights are created so that a
# fresh "Restart & Run All" starts from the same initialisation. The value
# matches the random_state used for the train/validation split.
set_random_seed(42)

model = Sequential([
    # Declare the input shape with an explicit Input layer. The Embedding
    # `input_length` argument is deprecated in Keras 3; Input keeps the model
    # built up front so model.summary() can report shapes and parameter counts.
    Input(shape=(max_len,)),
    # Map each word index to a trainable embedding_dim-sized vector.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # Read the tweet in both directions; only the final state is kept.
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: probability that the tweet is flood-related (label 1).
    Dense(1, activation='sigmoid')
])



In [None]:
# Configure training for binary classification and print the layer overview.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


In [None]:

# Step 4: Train the model
# Stop once validation loss has not improved for 3 consecutive epochs and roll
# the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping],
)



Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 148ms/step - accuracy: 0.8949 - loss: 0.3888 - val_accuracy: 0.9308 - val_loss: 0.2471
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 149ms/step - accuracy: 0.9326 - loss: 0.2095 - val_accuracy: 0.9333 - val_loss: 0.2397
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 150ms/step - accuracy: 0.9654 - loss: 0.0918 - val_accuracy: 0.9013 - val_loss: 0.3469
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 127ms/step - accuracy: 0.9950 - loss: 0.0210 - val_accuracy: 0.9207 - val_loss: 0.4624
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 140ms/step - accuracy: 0.9980 - loss: 0.0081 - val_accuracy: 0.9013 - val_loss: 0.5799


In [None]:
# Step 5: Evaluate the model
# NOTE: this is the same validation split that fit() monitored for early
# stopping, so the score is not an estimate on untouched data.
loss, accuracy = model.evaluate(x=X_val_pad, y=y_val)
print(f"Validation Accuracy: {accuracy:.2f}")

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9298 - loss: 0.2447
Validation Accuracy: 0.93


In [None]:
# Persist the trained network together with the fitted tokenizer; both are
# needed to turn new raw text into predictions later.
model.save("bilstm_flood_model.h5")

tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w") as tokenizer_file:
    tokenizer_file.write(tokenizer_json)




In [None]:
# Manual sanity check of the validation accuracy reported by model.evaluate().

# Step 1: Predict probabilities (the model's sigmoid output for each tweet)
y_pred_prob = model.predict(X_val_pad)

# Step 2: Convert probabilities to binary class labels
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int).flatten()

# Step 3: Compare with actual labels
correct_predictions = np.sum(y_pred == y_val)
total_predictions = len(y_val)
accuracy = correct_predictions / total_predictions
print(f"Validation Accuracy (manual check): {accuracy:.2f}")

# Step 4: Display a few examples for manual verification
# Slicing (rather than indexing 0..9) shows at most 10 examples and cannot
# raise IndexError when the validation split holds fewer than 10 tweets.
n_examples = 10
for tweet, actual, predicted in zip(X_val[:n_examples], y_val[:n_examples], y_pred[:n_examples]):
    print(f"Tweet: {tweet}")
    print(f"Actual Label: {'Flood-related' if actual == 1 else 'Non-Flood-related'}")
    print(f"Predicted Label: {'Flood-related' if predicted == 1 else 'Non-Flood-related'}")
    print("-" * 50)


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
Validation Accuracy (manual check): 0.93
Tweet: Kerala is sinking under floods &amp; situation is bad, pray for Kerala @KeralaFloods
Actual Label: Flood-related
Predicted Label: Flood-related
--------------------------------------------------
Tweet: Happy Onam to all :-) Low key celebrations as Kerala limps back to life after being battered and bruised by rains .. Lets stand together and utilise the funds for Onam celebrations for flood victims
Actual Label: Flood-related
Predicted Label: Flood-related
--------------------------------------------------
Tweet: Kudumbashree workers had been active from the initial days of the flood, making packaged meals available to affected people. #KeralaFloods #KeralaFloodsRelief
Actual Label: Flood-related
Predicted Label: Flood-related
--------------------------------------------------
Actual Label: Flood-related
Predicted Label: Flood-related
--------------------------------