In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Dropout, Bidirectional, GRU
from tensorflow.keras.models import Sequential

In [2]:
# Load the tweets, keep only the text and its label, and drop rows where
# either is missing. The index is renumbered 0..n-1 afterwards so that
# integer lookups such as df['text'][i] in later cells cannot hit a label
# that dropna() removed.
df = (
    pd.read_csv("/kaggle/input/tuits-desastres/tuits_desastres.csv")[['text', 'target']]
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# Substitutions applied to every tweet, in this exact order.
_CLEANING_STEPS = (
    (r"http\S+", ""),     # URLs
    (r"@\w+", ""),        # @mentions
    (r"#", ""),           # hashtag sign (the tag word itself is kept)
    (r"[^\w\s]", ""),     # punctuation
    (r"\s+", " "),        # collapse runs of whitespace into one space
)


def _strip_noise(tweet):
    """Run every substitution in _CLEANING_STEPS over one tweet and return it."""
    for pattern, replacement in _CLEANING_STEPS:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet


# Lower-case first, scrub each tweet, then trim leading/trailing whitespace.
df['text'] = df['text'].str.lower().apply(_strip_noise).str.strip()

In [7]:
# Show the first 20 cleaned tweets. head() selects by position, so this works
# even when the index has gaps (rows removed by dropna) or when the frame
# holds fewer than 20 rows; df['text'][i] would raise KeyError in both cases.
for tweet in df['text'].head(20):
    print(tweet)

our deeds are the reason of this earthquake may allah forgive us all
forest fire near la ronge sask canada
all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected
13000 people receive wildfires evacuation orders in california
just got sent this photo from ruby alaska as smoke from wildfires pours into a school
rockyfire update california hwy 20 closed in both directions due to lake county fire cafire wildfires
flood disaster heavy rain causes flash flooding of streets in manitou colorado springs areas
im on top of the hill and i can see a fire in the woods
theres an emergency evacuation happening now in the building across the street
im afraid that the tornado is coming to our area
three people died from the heat wave so far
haha south tampa is getting flooded hah wait a second i live in south tampa what am i gonna do what am i gonna do fvck flooding
raining flooding florida tampabay tampa 18 or 19 days ive lost

In [8]:
# Vocabulary cap and the fixed length every tweet is padded/truncated to.
max_tokens = 10000
sequence_length = 100

texts = df['text'].values
y = df['target'].values

# Split the raw text BEFORE fitting the vectorizer so the vocabulary is learned
# from training tweets only; adapting on the full corpus lets validation
# vocabulary leak into the model input. Stratifying keeps the class ratio the
# same in both splits.
text_train, text_val, y_train, y_val = train_test_split(
    texts, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=sequence_length,
)
vectorizer.adapt(text_train)

# X (the whole corpus, vectorized) is kept because a later cell prints its shape.
X = vectorizer(texts)
X_train = vectorizer(text_train).numpy()
X_val = vectorizer(text_val).numpy()

In [None]:
# Seed NumPy and TensorFlow so weight initialisation and dropout are repeatable
# between runs (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Build the stacked bidirectional-LSTM binary classifier.
model = Sequential([
    # Index 0 is the padding value TextVectorization appends to short tweets;
    # mask_zero=True makes the recurrent layers skip those timesteps instead
    # of processing long runs of padding.
    Embedding(input_dim=max_tokens, output_dim=64, mask_zero=True),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    Bidirectional(LSTM(32, dropout=0.3, recurrent_dropout=0.3)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single probability output
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model. EarlyStopping (imported in the top import cell) halts
# training once val_loss has failed to improve for 2 consecutive epochs and
# restores the weights from the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)

In [None]:
# Score the trained model on the validation split.
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Precisión en validación: {accuracy:.4f}")

# Accuracy per epoch, training vs. validation, drawn on an explicit Axes.
fig, ax = plt.subplots()
for metric_key, curve_label in (('accuracy', 'Entrenamiento'), ('val_accuracy', 'Validación')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Época')
ax.set_ylabel('Precisión')
ax.set_title('Precisión durante el entrenamiento')
ax.legend()
plt.show()

In [None]:
# Class balance: number of rows per value of the 'target' column.
target_counts = df['target'].value_counts()
target_counts

In [None]:
# Sanity check: shape of the vectorized inputs and of the label array.
print(*(array.shape for array in (X, y)))

In [None]:
def plotLossAccuracy(history):
    """Plot training vs. validation loss and accuracy in two side-by-side panels.

    Args:
        history: object whose ``history`` attribute is a mapping with the keys
            'loss', 'val_loss', 'accuracy' and 'val_accuracy', each holding
            one value per epoch.
    """
    # (train key, validation key, train label, validation label, y-label, title)
    panels = (
        ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
         'Loss', 'Evolución de la pérdida'),
        ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
         'Accuracy', 'Evolución de la precisión'),
    )

    _, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (train_key, val_key, train_label, val_label, y_label, title) in zip(axes, panels):
        ax.plot(history.history[train_key], label=train_label)
        ax.plot(history.history[val_key], label=val_label)
        ax.set_xlabel('Épocas')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.set_title(title)
        ax.set_ylim(bottom=0)  # anchor the y-axis at zero

    plt.show()

In [None]:
def evaluateModel(model, X=None, y=None, class_names=('No desastre', 'Desastre')):
    """Show a confusion matrix and a classification report for a binary model.

    The earlier version referenced names that do not exist in this notebook
    (q1_test, q2_test, y_test, sns) and passed a two-input list to a
    single-input model; it now evaluates on the notebook's validation split.

    Args:
        model: fitted model whose ``predict(X)`` returns one probability per row.
        X: input features; defaults to the notebook-level ``X_val``.
        y: true 0/1 labels; defaults to the notebook-level ``y_val``.
        class_names: display names for label 0 and label 1, in that order.
            The defaults assume target == 1 marks a disaster tweet; confirm
            against the dataset description.
    """
    if X is None:
        X = X_val
    if y is None:
        y = y_val
    class_names = list(class_names)

    # Threshold the probabilities at 0.5 and flatten (n, 1) -> (n,).
    y_pred = (model.predict(X) > 0.5).astype(int).ravel()

    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent.
    conf_matrix = confusion_matrix(y, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(4, 4))
    ConfusionMatrixDisplay(conf_matrix, display_labels=class_names).plot(
        ax=ax, cmap='Blues', values_format='d', colorbar=False
    )
    ax.set_xlabel('Predicción')
    ax.set_ylabel('Real')
    ax.set_title('Matriz de confusión')
    plt.show()

    print("Informe de clasificación:")
    print(classification_report(y, y_pred, labels=[0, 1], target_names=class_names))

In [None]:
# Loss and accuracy curves for the training run stored in `history`.
plotLossAccuracy(history=history)

In [None]:
# Confusion matrix and classification report for the trained model.
evaluateModel(model=model)