In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import json

from joblib import dump

# Load the training CSV into a dataframe
data = pd.read_csv('./dataset/train.csv')

# 1. Text preprocessing
def convert_to_numeric(value):
    """Convert a possibly abbreviated figure to a float.

    Strings containing ``K`` are scaled by one thousand and strings
    containing ``M`` by one million, after the letter is removed; ``K`` is
    checked before ``M``. Anything else is passed straight to ``float``.
    """
    if not isinstance(value, str):
        return float(value)
    for marker, scale in (("K", 1_000), ("M", 1_000_000)):
        if marker in value:
            return float(value.replace(marker, "")) * scale
    return float(value)

# Replace missing summaries with empty strings and force every entry to str
data["Summary"] = data["Summary"].fillna("").astype(str)

# Maximum sequence length used when padding
max_length = 50

# Create the tokenizer and fit it on the "Summary" column
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(data["Summary"])
word_index = tokenizer.word_index

# Save the tokenizer as JSON (the to_json() string is itself JSON-encoded)
with open("tokenizer.json", "w") as f:
    f.write(json.dumps(tokenizer.to_json()))

# Convert the summaries to sequences and pad them at the end up to max_length
sequences = tokenizer.texts_to_sequences(data["Summary"])
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

# Encode the labels (genre): integer codes first, then one-hot rows
label_encoder = LabelEncoder()
data["Genre"] = label_encoder.fit_transform(data["Genre"])
labels = to_categorical(data["Genre"])

# Split the data into training and validation sets (20% held out)
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    labels,
    random_state=42,
    test_size=0.2,
)

# 2. Define the model
model = Sequential([
    # The embedding table size is read from the tokenizer's vocabulary cap so
    # the two values cannot drift apart if num_words is changed.
    # NOTE(review): newer Keras releases reportedly deprecate `input_length`;
    # confirm against the installed version before removing it.
    Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One output unit per one-hot label column
    Dense(labels.shape[1], activation='softmax')
])

# 3. Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# 4. Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    verbose=2
)

# 5. Save the trained model with Keras' own saver (HDF5 file); joblib is not
#    used for the model.
# NOTE(review): `label_encoder` is not persisted in this section; loading the
# model in another process would need it to map predictions back to genre
# names. Confirm whether it is saved elsewhere.
model.save("game_genre_model.h5")

# 6. Evaluate the model on the validation split
loss, accuracy = model.evaluate(X_val, y_val, verbose=2)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Reload the training data. The rows predicted below come from train.csv, not
# from a held-out test set, so this is a smoke test of the pipeline rather
# than an unbiased evaluation.
train_data = pd.read_csv('./dataset/train.csv')

# Randomly select up to 100 rows for prediction. Capping at the dataset size
# avoids the ValueError that sample() raises when n exceeds the row count.
sample_size = min(100, len(train_data))
random_sample = train_data.sample(n=sample_size, random_state=42)

# Preprocess the 'Summary' column in the random sample the same way as in
# training: missing values become empty strings, everything is cast to str
random_sample["Summary"] = random_sample["Summary"].fillna("").astype(str)

# Convert the random sample 'Summary' to sequences and pad them with the
# tokenizer and max_length used for training
sample_sequences = tokenizer.texts_to_sequences(random_sample["Summary"])
sample_padded_sequences = pad_sequences(sample_sequences, maxlen=max_length, padding='post')

# Make predictions
predictions = model.predict(sample_padded_sequences, verbose=1)

# Convert predictions to genre labels: take the highest-scoring class index
# per row and map it back through the fitted label encoder
predicted_genres = label_encoder.inverse_transform(predictions.argmax(axis=1))

# Save results to a new CSV file
output = pd.DataFrame({
    'id': random_sample['id'],
    'Genre': predicted_genres
})
output.to_csv('./dataset/predicted_random_sample_genres.csv', index=False)

print("Predictions saved to './dataset/predicted_random_sample_genres.csv'")

