In [14]:
# Importar las librerias.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import re
import pickle

In [6]:
tf.__version__

'2.18.0'

In [7]:
np.__version__

'2.0.2'

In [15]:
# Cargar conjunto de datos
dataset_path = "/content/sample_data/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(dataset_path, encoding='latin1', header=None)
df.columns = ["target", "ids", "date", "flag", "user", "text"]

In [16]:
#Pre procesar el texto
def preprocess_data(df):
    df = df[["target", "text"]].copy()
    df["target"] = df["target"].replace({0: 0, 4: 2})  # 0: Negativo, 2: Positivo

    # Limpiar texto
    def clean_text(text):
        text = text.lower()
        text = re.sub(r'http\S+|www\S+', '', text)  # Eliminar URLs
        text = re.sub(r'@\w+', '', text)  # Eliminar menciones
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Eliminar caracteres especiales
        text = re.sub(r'\s+', ' ', text).strip()  # Espacios adicionales
        return text

    df["text"] = df["text"].apply(clean_text)
    return df

In [17]:
df = preprocess_data(df)

In [18]:
# Tokenización y secuencias
max_words = 10000  # Número máximo de palabras en el vocabulario
max_len = 30  # Longitud máxima de las secuencias

In [19]:
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [20]:
# Guardar el tokenizer
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
# Separar datos en entrenamiento y prueba
labels = np.array(df["target"])
train_size = int(len(df) * 0.8)
X_train, X_test = padded_sequences[:train_size], padded_sequences[train_size:]
y_train, y_test = labels[:train_size], labels[train_size:]

In [22]:
# Definir el modelo
def create_model():
    model = keras.Sequential([
        keras.layers.Embedding(max_words, 64),
        keras.layers.LSTM(64, return_sequences=True),
        keras.layers.LSTM(32),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(3, activation='softmax')  # 3 clases
    ])

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [23]:
model = create_model()

In [24]:
model.fit(X_train, y_train, epochs=15, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/15
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 9ms/step - accuracy: 0.7809 - loss: 0.4589 - val_accuracy: 0.7069 - val_loss: 0.6399
Epoch 2/15
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 9ms/step - accuracy: 0.8369 - loss: 0.3649 - val_accuracy: 0.7919 - val_loss: 0.4485
Epoch 3/15
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 9ms/step - accuracy: 0.8470 - loss: 0.3453 - val_accuracy: 0.7607 - val_loss: 0.5033
Epoch 4/15
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 9ms/step - accuracy: 0.8553 - loss: 0.3296 - val_accuracy: 0.7400 - val_loss: 0.5620
Epoch 5/15
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 9ms/step - accuracy: 0.8635 - loss: 0.3144 - val_accuracy: 0.7321 - val_loss: 0.6015
Epoch 6/15
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 9ms/step - accuracy: 0.8714 - loss: 0.2988 - val_accuracy: 0.7590 - val_loss: 

<keras.src.callbacks.history.History at 0x7a37bc5757d0>

In [25]:
model.save('sentiment_analysis_model.h5')



In [32]:
# Función para predecir emociones en frases personalizadas
def predict_sentiment(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'http\S+|www\S+', '', sentence)
    sentence = re.sub(r'@\w+', '', sentence)
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = model.predict(padded_sequence)
    sentiment_labels = ["Negativo", "Neutro", "Positivo"]
    predicted_class = np.argmax(prediction)
    return sentiment_labels[predicted_class]

# Probar el modelo con frases personalizadas
frases_prueba = [
    "Everything was perfect until i went to work, then my boss made my day awful",
    "Even though it was kinda salty, it was pretty good",
    "The movie plot is excellent, but the director kinda makes it trash",
    "Im so happy i could kill myself",
    "The food was great!"
]

for frase in frases_prueba:
    emocion_predicha = predict_sentiment(frase)
    print(f"Frase: {frase}\nEmoción predicha: {emocion_predicha}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Frase: Everything was perfect until i went to work, then my boss made my day awful
Emoción predicha: Negativo

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Frase: Even though it was kinda salty, it was pretty good
Emoción predicha: Positivo

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Frase: The movie plot is excellent, but the director kinda makes it trash
Emoción predicha: Negativo

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Frase: Im so happy i could kill myself
Emoción predicha: Positivo

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Frase: The food was great!
Emoción predicha: Positivo

