In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical

# 1. Load and preprocess data
df = pd.read_csv('pokedex_with_type_count.csv')

# Drop missing or invalid info entries
df = df.dropna(subset=['info', 'type_count'])

# 2. Tokenize the text data
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['info'])

# Convert text to sequences and pad them
sequences = tokenizer.texts_to_sequences(df['info'])
max_len = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_len, padding='post')

# 3. Encode labels: 'Mono' -> 0, 'Dual' -> 1
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['type_count'])

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# 6. Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 7. Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# 8. Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.53


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load dataset
df = pd.read_csv('pokedex_with_type_count.csv')
df = df.dropna(subset=['info', 'type'])

# Convert type string into list: "{Fire,Water}" -> ['Fire', 'Water']
df['type_list'] = df['type'].str.strip('{}').str.split(',')

# Use MultiLabelBinarizer for multi-label classification
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['type_list'])  # Each column is a type

# Tokenize the 'info' column
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['info'])
sequences = tokenizer.texts_to_sequences(df['info'])
max_len = max(len(s) for s in sequences)
X = pad_sequences(sequences, maxlen=max_len, padding='post')

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_len))
model.add(LSTM(64))
model.add(Dense(len(mlb.classes_), activation='sigmoid'))  # multi-label

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Predict on test set
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)  # convert probabilities to binary

# Build per-type accuracy array
per_type_accuracy = []

for i, type_name in enumerate(mlb.classes_):
    acc = accuracy_score(y_test[:, i], y_pred_binary[:, i])
    per_type_accuracy.append(acc)

# Convert to DataFrame for correlation analysis
type_acc_df = pd.DataFrame([per_type_accuracy], columns=mlb.classes_)

# Create correlation matrix (although it's just one row; this works best if you had multiple models/runs)
corr_matrix = type_acc_df.T.corr()

# Save or display
print("Per-type accuracy:\n", type_acc_df.T)
print("\nCorrelation matrix between types:\n", corr_matrix)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Per-type accuracy:
                  0
bug       0.912195
dark      0.926829
dragon    0.936585
electric  0.921951
fairy     0.946341
fighting  0.912195
fire      0.941463
flying    0.921951
ghost     0.931707
grass     0.873171
ground    0.921951
ice       0.965854
normal    0.887805
poison    0.882927
psychic   0.892683
rock      0.931707
steel     0.936585
water     0.858537

Correlation matrix between types:
      0
0  1.0


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, TFBertModel

# BERT Model

# Load and clean data
df = pd.read_csv('pokedex_with_type_count.csv')
df = df.dropna(subset=['info', 'type'])

# Process type strings like "{Fire,Water}" → ['Fire', 'Water']
df['type_list'] = df['type'].str.strip('{}').str.split(',')

# Encode labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['type_list'])

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize 'info' column
tokens = tokenizer(
    list(df['info']), 
    padding=True, 
    truncation=True, 
    return_tensors='tf'
)

# Train-test split
X_train = {
    'input_ids': tokens['input_ids'][:int(0.8 * len(df))],
    'attention_mask': tokens['attention_mask'][:int(0.8 * len(df))]
}
X_test = {
    'input_ids': tokens['input_ids'][int(0.8 * len(df)):],
    'attention_mask': tokens['attention_mask'][int(0.8 * len(df)):]
}
y_train = y[:int(0.8 * len(df))]
y_test = y[int(0.8 * len(df)):]

# Load pre-trained BERT base model
bert = TFBertModel.from_pretrained('bert-base-uncased')

# Freeze BERT layers (optional, for faster training)
# for layer in bert.layers:
#     layer.trainable = False

# Build model
input_ids = tf.keras.Input(shape=(tokens['input_ids'].shape[1],), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.Input(shape=(tokens['attention_mask'].shape[1],), dtype=tf.int32, name='attention_mask')

embedding = bert(input_ids, attention_mask=attention_mask)[1]  # [1] gives the pooled output
output = tf.keras.layers.Dense(len(mlb.classes_), activation='sigmoid')(embedding)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16)

# Predict
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# Optional: Save model
# model.save('bert_pokemon_type_model')


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TF

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
