In [107]:
# Generales
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import pandas as pd
import numpy as np

# Sklearn
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer  # Stopwords
from sklearn.model_selection import train_test_split

# Tensorflow y Keras
import tensorflow as tf
from keras.layers import Input, Dense, Dropout, LSTM, Bidirectional, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, losses
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [108]:
def get_dictionary(text, sw, N):
    vectorizer = CountVectorizer(stop_words=sw, max_features=N)
    vectorizer.fit_transform(text)
    dictionary = vectorizer.vocabulary_
    dictionary = {word: i + 2 for i, word in enumerate(vectorizer.get_feature_names_out())}
    dictionary['DESC'] = 0
    dictionary['PAD'] = 1
    return dictionary

In [109]:
def sanitize_tweet(text):
    text = text.lower()
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.strip()

In [110]:
texto = "Hola @usuario, visita http://example.com para más info! ;) #feliz #día"
print(sanitize_tweet(texto))

hola  visita  para ms info


In [111]:
def text_to_sequence(text, dictionary, stop_words, T):
    palabras = re.findall(r'\b\w+\b', text.lower())  # Tokenizar correctamente
    palabras = [palabra for palabra in palabras if palabra not in stop_words]
    secuencia = [dictionary.get(palabra, dictionary['DESC']) for palabra in palabras]
    secuencia = secuencia[:T]
    secuencia += [dictionary['PAD']] * (T - len(secuencia))
    return np.array(secuencia)

In [112]:
# Parámetros configurables
N = 20000  # Número de palabras del diccionario
T = 100    # Longitud prefijada de la reseña
EPOCHS = 20
NEURONAS = 128

# Usamos la lista de stopwords de sklearn
stop_words = list(ENGLISH_STOP_WORDS)

path = "/kaggle/input/tweetsdata/tuits_desastres.csv"

datos = pd.read_csv(path)

In [113]:
datos.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [114]:
datos.drop(["id", "keyword", "location"], axis=1, inplace=True)

In [115]:
datos.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [116]:
# Variable objetivo
y = datos['target']
num_clases = len(y.unique())

# Textos de reseñas
datos['text'] = datos['text'].apply(sanitize_tweet)
textos = datos['text']
X = []
diccionario = get_dictionary(textos, stop_words, N)  # Crea el diccionario con textos limpios
for texto in textos:
    X.append(text_to_sequence(texto, diccionario, stop_words, T))  # Convierte los textos a secuencias
# X contendrá las reseñas codificadas y rellenas con la misma longitud
X = np.array(X)

In [117]:
datos.head(50)

Unnamed: 0,text,target
0,our deeds are the reason of this may allah fo...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive evacuation orders in cal...,1
4,just got sent this photo from ruby as smoke f...,1
5,update california hwy 20 closed in both direc...,1
6,heavy rain causes flash flooding of streets in...,1
7,im on top of the hill and i can see a fire in ...,1
8,theres an emergency evacuation happening now i...,1
9,im afraid that the tornado is coming to our area,1


In [118]:
first_key, first_value = next(iter(diccionario.items()))
print(first_key, first_value)

0011 2


In [119]:
X.shape

(7613, 100)

In [120]:
X.shape[1]

100

In [121]:
X = pad_sequences(textos.apply(lambda t: text_to_sequence(t, diccionario, stop_words, T)), 
                  maxlen=T, padding='post')  # T debe ser la longitud deseada (ej: 100)
y = datos['target']

In [122]:
print(X.shape[0])
print(y.shape)

7613
(7613,)


In [123]:
N = 5  # Tamaño del vocabulario
D = 5  # Dimensión del embedding
embedding_layer = Embedding(input_dim=N, output_dim=D)
entrada = np.array([[4, 3, 1, 1, 3]])
embeddings = embedding_layer(entrada)
print('Representación de {}'.format(str(entrada)))
print(embeddings.numpy())

Representación de [[4 3 1 1 3]]
[[[-0.00793458 -0.01541839  0.03233547 -0.04618199  0.0069806 ]
  [-0.01567497 -0.03854574  0.01761606  0.02770319 -0.00727118]
  [-0.04176041  0.01081615 -0.02187594  0.03204694 -0.04321019]
  [-0.04176041  0.01081615 -0.02187594  0.03204694 -0.04321019]
  [-0.01567497 -0.03854574  0.01761606  0.02770319 -0.00727118]]]


In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [125]:
model = Sequential([
    Input(shape=(T,)),  # Solo longitud de secuencia (100)
    Embedding(input_dim=N+2, output_dim=D),
    Bidirectional(LSTM(units=NEURONAS, return_sequences=True)),
    Bidirectional(LSTM(units=NEURONAS)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(units=num_clases, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [126]:
history = model.fit(X_train, y_train, validation_split=0.2, epochs=EPOCHS, batch_size=32, verbose=1)

Epoch 1/20


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 1), output.shape=(None, 2)

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
# Plotting the accuracy and loss over time

# Training history
history_dict = history.history

# Seperating validation and training accuracy
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']

# Seperating validation and training loss
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# Plotting
plt.figure(figsize=(8, 4))
plt.subplot(1, 2, 1)
plt.plot(acc)
plt.plot(val_acc)
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(['Accuracy', 'Validation Accuracy'])

plt.subplot(1, 2, 2)
plt.plot(loss)
plt.plot(val_loss)
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['Loss', 'Validation Loss'])

plt.show()

In [None]:
def train_validation(model, X_test, y_test):
   
    y_pred_probs = model.predict(X_test)
    y_pred = (y_pred_probs > 0.5).astype(int).flatten()  # Convertir probabilidades en etiquetas binarias

    # Matriz de confusión
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Reporte de clasificación
    print("\nReporte de Clasificación:")
    print(classification_report(y_test, y_pred, target_names=["No Duplicado", "Duplicado"]))

    # Mostrar matriz de confusión con heatmap
    plt.figure(figsize=(6, 4))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=["No Duplicado", "Duplicado"],
                yticklabels=["No Duplicado", "Duplicado"])
    plt.title("Matriz de Confusión")
    plt.xlabel("Predicción")
    plt.ylabel("Real")
    plt.show()

# Llama a la función con el modelo ya entrenado
train_validation(model, X_test, y_test)