In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tqdm import tqdm

In [4]:
# Load the finalized dataset
df = pd.read_csv("../Datasets/pokedex_final.csv")

# Combine type1 and type2 into a list
df['type_list'] = df[['type1', 'type2']].values.tolist()
df['type_list'] = df['type_list'].apply(lambda x: [t for t in x if pd.notna(t) and t != ''])

# Encode the types with MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['type_list'])

# Tokenize the enhanced_info column
text_tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
text_tokenizer.fit_on_texts(df['enhanced_info'])

sequences = text_tokenizer.texts_to_sequences(df['enhanced_info'])
X_text = pad_sequences(sequences, maxlen=300, padding='post', truncating='post')

# Normalize numeric features
stats_cols = ['hp', 'attack', 'defense', 's_attack', 'speed', 'height', 'weight']
scaler = StandardScaler()
X_stats = scaler.fit_transform(df[stats_cols])

In [None]:
# Split into train and test sets
X_train_text, X_test_text, X_train_stats, X_test_stats, y_train, y_test = train_test_split(
    X_text, X_stats, y, test_size=0.2, random_state=42)

# Build the RNN + Stats model
input_text = Input(shape=(300,), name='text_input')
text_embed = Embedding(input_dim=5000, output_dim=64, input_length=300)(input_text)
text_lstm = LSTM(64)(text_embed)

input_stats = Input(shape=(X_stats.shape[1],), name='stats_input')

combined = Concatenate()([text_lstm, input_stats])
hidden = Dense(128, activation='relu')(combined)
output = Dense(len(mlb.classes_), activation='sigmoid')(hidden)

model = Model(inputs=[input_text, input_stats], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# Train the model
model.fit(
    [X_train_text, X_train_stats],
    y_train,
    validation_data=([X_test_text, X_test_stats], y_test),
    epochs=5,
    batch_size=32
)

# Save model and label binarizer
model.save("rnn_stats_pokemon_model")
pd.DataFrame(mlb.classes_, columns=['Type']).to_csv("/mnt/data/type_classes.csv", index=False)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text_input (InputLayer)     [(None, 300)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 300, 64)              320000    ['text_input[0][0]']          
                                                                                                  
 lstm (LSTM)                 (None, 64)                   33024     ['embedding[0][0]']           
                                                                                                  
 stats_input (InputLayer)    [(None, 7)]                  0         []                            
                                                                                              

PermissionDeniedError: /mnt; Read-only file system