# Pokemon Feature Engineering and Data Preparation

This notebook covers feature engineering, data cleaning, feature selection, and preparation of the final datasets for modeling. It also includes a bootstrapping analysis of feature-importance stability, information gain (mutual information) calculations, and the creation of modeling-ready datasets.

## Setup and Data Loading

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, mutual_info_regression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
# silence every warning category for the rest of the notebook
warnings.filterwarnings('ignore')

# prevent results from being clipped: lift all four pandas display limits
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

# set up consistent color palette across all notebooks
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")
presentation_palette = sns.color_palette("Set2")  # 8 colors available

# load the pokemon dataset and report its dimensions
df = pd.read_csv('Pokemon Database.csv')
print(f"dataset loaded: {len(df.index)} rows, {len(df.columns)} columns")

# define base features (the six per-Pokemon stat columns used throughout)
base_stats = [
    'Health Stat',
    'Attack Stat',
    'Defense Stat',
    'Special Attack Stat',
    'Special Defense Stat',
    'Speed Stat',
]

## Feature Engineering

In [None]:
# Create new features for modeling.
# Input: `df` (raw dataset) and `base_stats` from the setup cell.
# Output: `df_engineered` (raw columns + engineered columns), `single_type` /
# `dual_type` (row subsets of `df_engineered`), `new_features` (added column names).
df_engineered = df.copy()

# Boolean mask reused throughout the cell: True where a secondary type is present.
has_secondary_type = df_engineered['Secondary Type'].notna()

print(f"Single-type Pokemon: {int((~has_secondary_type).sum())}")
print(f"Dual-type Pokemon: {int(has_secondary_type.sum())}")
print()

# 1. Stat ratios and differences
attack = df_engineered['Attack Stat']
defense = df_engineered['Defense Stat']
sp_attack = df_engineered['Special Attack Stat']
sp_defense = df_engineered['Special Defense Stat']

# +1 in every denominator avoids division by zero
df_engineered['Physical_Special_Ratio'] = (attack + defense) / (sp_attack + sp_defense + 1)
df_engineered['Offensive_Defensive_Ratio'] = (attack + sp_attack) / (defense + sp_defense + 1)
df_engineered['Speed_Agility_Index'] = df_engineered['Speed Stat'] / (df_engineered['Health Stat'] + 1)

# 2. Stat categories (high/medium/low) from tercile bins of each base stat
for stat in base_stats:
    stat_label = stat[:-len(' Stat')]  # 'Health Stat' -> 'Health'
    df_engineered[f'{stat_label}_Category'] = pd.qcut(df_engineered[stat], q=3, labels=['Low', 'Medium', 'High'])

# 3. Type combination: "Primary+Secondary" for dual types, just the primary otherwise
primary_type = df_engineered['Primary Type']
secondary_type = df_engineered['Secondary Type']
df_engineered['Type_Combination'] = primary_type.where(
    ~has_secondary_type, primary_type + '+' + secondary_type
)

# 4. Type effectiveness indicators (simplified): for each attacking type, the
# list of types it is super effective against. Types missing from the table
# (e.g. Normal) score 0.
type_effectiveness = {
    'Fire': ['Grass', 'Bug', 'Ice', 'Steel'],
    'Water': ['Fire', 'Ground', 'Rock'],
    'Grass': ['Water', 'Ground', 'Rock'],
    'Electric': ['Water', 'Flying'],
    'Ice': ['Grass', 'Ground', 'Flying', 'Dragon'],
    'Fighting': ['Normal', 'Ice', 'Rock', 'Dark', 'Steel'],
    'Poison': ['Grass', 'Fairy'],
    'Ground': ['Electric', 'Rock', 'Poison', 'Fire', 'Steel'],
    'Flying': ['Grass', 'Fighting', 'Bug'],
    'Psychic': ['Fighting', 'Poison'],
    'Bug': ['Grass', 'Psychic', 'Dark'],
    'Rock': ['Flying', 'Bug', 'Ice', 'Fire'],
    'Ghost': ['Psychic', 'Ghost'],
    'Dragon': ['Dragon'],
    'Dark': ['Psychic', 'Ghost'],
    'Steel': ['Ice', 'Rock', 'Fairy'],
    'Fairy': ['Fighting', 'Dragon', 'Dark']
}
effectiveness_count = {type_name: len(targets) for type_name, targets in type_effectiveness.items()}

# Effectiveness score per slot; unknown types and a missing secondary type map to 0
df_engineered['Primary_Type_Effectiveness'] = primary_type.map(effectiveness_count).fillna(0).astype(int)
df_engineered['Secondary_Type_Effectiveness'] = secondary_type.map(effectiveness_count).fillna(0).astype(int)

# Combined effectiveness for dual types
df_engineered['Combined_Type_Effectiveness'] = (
    df_engineered['Primary_Type_Effectiveness'] + df_engineered['Secondary_Type_Effectiveness']
)

# 5. Legendary indicators
# Number of base stats above that stat's own 80th percentile (the quantile
# Series aligns with the frame on column labels).
stat_p80 = df_engineered[base_stats].quantile(0.8)
df_engineered['High_Stat_Count'] = (df_engineered[base_stats] > stat_p80).sum(axis=1)
df_engineered['Stat_Variance'] = df_engineered[base_stats].var(axis=1)

# 6. Type diversity features (complementary 0/1 flags)
df_engineered['Is_Single_Type'] = (~has_secondary_type).astype(int)
df_engineered['Is_Dual_Type'] = has_secondary_type.astype(int)

# Split only AFTER all columns exist. Slicing before the engineering steps
# leaves the subsets without 'Type_Combination' and the summary below raises
# KeyError.
single_type = df_engineered[~has_secondary_type]
dual_type = df_engineered[has_secondary_type]

print(f"Original features: {len(df.columns)}")
print(f"Engineered features: {len(df_engineered.columns)}")
print(f"New features added: {len(df_engineered.columns) - len(df.columns)}")

# Show new features
new_features = [col for col in df_engineered.columns if col not in df.columns]
print(f"\nNew features created: {new_features}")

# Summary of type combinations
combo_counts = dual_type['Type_Combination'].value_counts()
print("\n=== TYPE COMBINATION SUMMARY ===")
print(f"Unique single types: {single_type['Primary Type'].nunique()}")
print(f"Unique dual-type combinations: {dual_type['Type_Combination'].nunique()}")
if combo_counts.empty:
    # value_counts() is empty when there are no dual-type rows; avoid IndexError
    print("Most common type combination: n/a (no dual-type Pokemon)")
else:
    print(f"Most common type combination: {combo_counts.index[0]} ({combo_counts.iloc[0]} Pokemon)")

## Feature Selection and Information Gain

In [None]:
# Feature selection and information gain.
# Input: `df_engineered`, `new_features`, `presentation_palette` from earlier cells.
# Output (used by later cells): `df_encoded`, `single_encoded`, `dual_encoded`,
# `numeric_features`, the ranked score tables `top_*`, `combined_threshold`,
# `X_type_all` / `y_type_all`, `X_legendary` / `y_legendary`,
# `selected_legendary_features`.

MI_RANDOM_STATE = 42  # mutual information uses a randomised estimator; pin it for reproducibility


def seeded_mutual_info(X, y):
    """mutual_info_classif with a fixed seed so repeated runs give identical scores."""
    return mutual_info_classif(X, y, random_state=MI_RANDOM_STATE)


def rank_features(X, y, score_func):
    """Score every column of X against y and return (fitted selector, ranked table).

    The table has columns 'feature' and 'score', sorted by score descending.
    """
    selector = SelectKBest(score_func=score_func, k='all')
    selector.fit(X, y)
    ranking = pd.DataFrame({
        "feature": X.columns,
        "score": selector.scores_
    }).sort_values(by="score", ascending=False)
    return selector, ranking


# prepare data for feature selection
# encode categorical variables
le_type = LabelEncoder()
le_combo = LabelEncoder()

df_encoded = df_engineered.copy()
df_encoded['Primary_Type_Encoded'] = le_type.fit_transform(df_encoded['Primary Type'])
df_encoded['Type_Combination_Encoded'] = le_combo.fit_transform(df_encoded['Type_Combination'])

# separate datasets for single-type and dual-type analysis
single_encoded = df_encoded[df_encoded['Is_Single_Type'] == 1]
dual_encoded = df_encoded[df_encoded['Is_Dual_Type'] == 1]

# select numeric features for analysis
# NOTE(review): SelectKBest rejects NaN; this assumes every numeric column of the
# raw dataset is fully populated -- confirm or impute/drop before scoring.
# NOTE(review): 'Primary_Type_Effectiveness' / 'Combined_Type_Effectiveness' are
# computed from 'Primary Type' itself, so they leak the target into the
# primary-type tasks below. They are kept to preserve the existing feature set;
# consider excluding them for type prediction.
encoded_targets = ['Primary_Type_Encoded', 'Type_Combination_Encoded']
numeric_features = [
    f for f in df_encoded.select_dtypes(include=[np.number]).columns.tolist()
    if f not in encoded_targets  # remove targets
]

print(f"total numeric features available: {len(numeric_features)}")
print(f"single-type pokemon: {len(single_encoded)}")
print(f"dual-type pokemon: {len(dual_encoded)}")

# 1. feature selection for primary type classification (all pokemon)
print("feature selection for primary type prediction (all pokemon)")

X_type_all = df_encoded[numeric_features]
y_type_all = df_encoded['Primary_Type_Encoded']

selector_f_type_all, top_f_type_all = rank_features(X_type_all, y_type_all, f_classif)
selector_mi_type_all, top_mi_type_all = rank_features(X_type_all, y_type_all, seeded_mutual_info)

# 2. feature selection for primary type (single-type pokemon only)
print("\nfeature selection for primary type prediction (single-type only)")

X_type_single = single_encoded[numeric_features]
y_type_single = single_encoded['Primary_Type_Encoded']

selector_f_type_single, top_f_type_single = rank_features(X_type_single, y_type_single, f_classif)

# 3. feature selection for type combination (dual-type pokemon only)
print("\nfeature selection for type combination prediction (dual-type only)")

X_combo_dual = dual_encoded[numeric_features]
y_combo_dual = dual_encoded['Type_Combination_Encoded']

selector_f_combo_dual, top_f_combo_dual = rank_features(X_combo_dual, y_combo_dual, f_classif)

# determine thresholds (75th percentile of the F-scores in each analysis)
f_threshold_type_all = top_f_type_all['score'].quantile(0.75)
f_threshold_type_single = top_f_type_single['score'].quantile(0.75)
f_threshold_combo_dual = top_f_combo_dual['score'].quantile(0.75)

# The most permissive of the three cut-offs; all three are F-statistics, and
# this single value is what the final dataset cell filters on.
combined_threshold = min(f_threshold_type_all, f_threshold_type_single, f_threshold_combo_dual)

# display thresholds as table (feature counts are measured against combined_threshold)
thresholds_comparison = pd.DataFrame({
    'analysis type': ['all pokemon (primary type)', 'single-type only (primary type)', 'dual-type only (combinations)'],
    'threshold (75th percentile)': [f_threshold_type_all, f_threshold_type_single, f_threshold_combo_dual],
    'features above threshold': [
        int((ranking['score'] >= combined_threshold).sum())
        for ranking in (top_f_type_all, top_f_type_single, top_f_combo_dual)
    ]
}).round(4)
print("\nfeature selection thresholds comparison")
# A bare expression only renders when it is the LAST line of a cell, so
# mid-cell tables need an explicit display() (a builtin in IPython/Jupyter kernels).
display(thresholds_comparison)

# 4. information gain for legendary classification
legendary_features_excl_target = [f for f in numeric_features if f != 'Is Legendary']
X_legendary = df_encoded[legendary_features_excl_target]
y_legendary = df_encoded['Is Legendary']

# feature selection for legendary prediction
print("\nfeature selection for legendary prediction")

# f-classif
selector_f_legendary, top_f_legendary = rank_features(X_legendary, y_legendary, f_classif)

# mutual info
selector_mi_legendary, top_mi_legendary = rank_features(X_legendary, y_legendary, seeded_mutual_info)

# determine threshold for legendary prediction
f_threshold_legendary = top_f_legendary['score'].quantile(0.75)
mi_threshold_legendary = top_mi_legendary['score'].quantile(0.75)
# Kept for the table below only. F-statistics and mutual information live on
# different scales, so their minimum must NOT be used as a shared cut-off.
combined_threshold_legendary = min(f_threshold_legendary, mi_threshold_legendary)

# display thresholds as table
thresholds_legendary = pd.DataFrame({
    'method': ['f-classif', 'mutual info', 'combined'],
    'threshold (75th percentile)': [f_threshold_legendary, mi_threshold_legendary, combined_threshold_legendary]
}).round(4)
print("\nfeature selection thresholds for legendary prediction")
display(thresholds_legendary)

# select features above threshold in both methods, each judged against ITS OWN
# 75th percentile (previously both were compared with the min of the two, which
# in practice is the MI threshold and lets almost every F-score through)
selected_f_legendary = set(top_f_legendary.loc[top_f_legendary['score'] >= f_threshold_legendary, 'feature'])
selected_mi_legendary = set(top_mi_legendary.loc[top_mi_legendary['score'] >= mi_threshold_legendary, 'feature'])
# sorted() gives a deterministic column order across kernel restarts
selected_legendary_features = sorted(selected_f_legendary & selected_mi_legendary)
if not selected_legendary_features:
    # never hand an empty feature list to the modeling cells; fall back to the union
    selected_legendary_features = sorted(selected_f_legendary | selected_mi_legendary)

# display feature selection results as table
feature_selection_legendary = pd.DataFrame({
    'selection method': ['f-classif above threshold', 'mutual info above threshold', 'both methods (intersection)'],
    'features selected': [len(selected_f_legendary), len(selected_mi_legendary), len(selected_legendary_features)],
    'feature names': [
        ', '.join(sorted(selected_f_legendary)),
        ', '.join(sorted(selected_mi_legendary)),
        ', '.join(sorted(selected_legendary_features))
    ]
})
print("\nfeature selection results for legendary prediction")
display(feature_selection_legendary)

# visualization of feature importance across different analyses
fig, axs = plt.subplots(2, 3, figsize=(20, 16))

# one bar chart (top 15 features) per analysis; panel i uses palette colour i
importance_panels = [
    (axs[0, 0], top_f_type_all, "top features - all pokemon\nprimary type prediction (f-classif)"),
    (axs[0, 1], top_f_type_single, "top features - single-type only\nprimary type prediction"),
    (axs[0, 2], top_f_combo_dual, "top features - dual-type only\ntype combination prediction"),
    (axs[1, 0], top_f_legendary, "top features - legendary prediction\n(f-classif)"),
    (axs[1, 1], top_mi_legendary, "top features - legendary prediction\n(mutual info)"),
]
for panel_idx, (panel_ax, ranking, panel_title) in enumerate(importance_panels):
    sns.barplot(x="score", y="feature", data=ranking.head(15), ax=panel_ax, color=presentation_palette[panel_idx])
    panel_ax.set_title(panel_title)

# feature comparison across analyses
axs[1,2].axis('off')  # turn off the last subplot

# number of features the final dataset cell keeps for all-Pokemon type
# prediction (replaces the former 'tbd' placeholder in the summary box)
n_type_features = int((top_f_type_all['score'] >= combined_threshold).sum())

# add summary text
summary_text = f"""
feature engineering summary:

• total pokemon: {len(df_encoded)}
• single-type: {len(single_encoded)} ({len(single_encoded)/len(df_encoded)*100:.1f}%)
• dual-type: {len(dual_encoded)} ({len(dual_encoded)/len(df_encoded)*100:.1f}%)

• features engineered: {len(new_features)}
• type combinations identified: {dual_encoded['Type_Combination'].nunique()}

• primary type prediction features: {n_type_features}
• legendary prediction features: {len(selected_legendary_features)}
"""

axs[1,2].text(0.1, 0.8, summary_text, transform=axs[1,2].transAxes,
             fontsize=10, verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))

plt.suptitle("feature importance across different pokemon type prediction tasks", fontsize=16, fontweight='bold')
plt.tight_layout()
os.makedirs('figures', exist_ok=True)  # savefig does not create missing directories
plt.savefig('figures/feature_importance_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Bootstrapping Analysis

In [None]:
# bootstrapping analysis for feature stability
def bootstrap_feature_importance(X, y, n_bootstraps=100, random_state=42):
    """Estimate random-forest feature importances and their spread by bootstrapping.

    Each round resamples the rows of ``X`` / ``y`` with replacement, fits a
    ``RandomForestClassifier`` (50 trees, max depth 10, seeded with the round
    index) and records its ``feature_importances_``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; column names become the 'feature' column of the result.
    y : array-like of length ``len(X)``
        Target labels (Series, ndarray or list). Matched to ``X`` by position.
    n_bootstraps : int, default 100
        Number of bootstrap rounds; must be at least 1.
    random_state : int, default 42
        Seed for the resampling generator.

    Returns
    -------
    pandas.DataFrame
        One row per feature, sorted by 'mean_importance' descending, with
        columns 'feature', 'mean_importance', 'std_importance', 'ci_lower' and
        'ci_upper' (2.5th / 97.5th percentiles) and 'stability'
        (mean / std; higher means more stable, ``inf`` or ``nan`` when std is 0).

    Raises
    ------
    ValueError
        If ``n_bootstraps`` is below 1 or ``X`` and ``y`` differ in length.
    """
    if n_bootstraps < 1:
        raise ValueError("n_bootstraps must be at least 1")

    y_values = np.asarray(y)  # positional indexing works for Series, arrays and lists alike
    n_samples = len(X)
    if len(y_values) != n_samples:
        raise ValueError(f"X and y must have the same length, got {n_samples} and {len(y_values)}")

    # Local legacy generator: yields the same index stream as
    # np.random.seed(random_state) + np.random.choice, but leaves the global
    # NumPy random state untouched.
    rng = np.random.RandomState(random_state)
    feature_names = X.columns
    n_features = len(feature_names)

    # store importance scores for each bootstrap (rows: rounds, columns: features)
    importance_scores = np.zeros((n_bootstraps, n_features))

    for i in range(n_bootstraps):
        # bootstrap sample: draw n_samples row positions with replacement
        indices = rng.choice(n_samples, size=n_samples, replace=True)
        X_boot = X.iloc[indices]
        y_boot = y_values[indices]

        # train model and get feature importance
        rf = RandomForestClassifier(n_estimators=50, random_state=i, max_depth=10)
        rf.fit(X_boot, y_boot)
        importance_scores[i] = rf.feature_importances_

    # calculate statistics across bootstrap rounds
    mean_importance = np.mean(importance_scores, axis=0)
    std_importance = np.std(importance_scores, axis=0)
    ci_lower = np.percentile(importance_scores, 2.5, axis=0)
    ci_upper = np.percentile(importance_scores, 97.5, axis=0)

    # mean / std is a signal-to-noise ratio, i.e. the INVERSE of the coefficient
    # of variation. A zero std yields inf/nan; silence the floating-point
    # warnings for that case instead of relying on a global warnings filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        stability = mean_importance / std_importance

    return pd.DataFrame({
        'feature': feature_names,
        'mean_importance': mean_importance,
        'std_importance': std_importance,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'stability': stability
    }).sort_values('mean_importance', ascending=False)

# Run the bootstrap importance analysis for both prediction tasks and plot the
# top features of each. Uses X_type_all / y_type_all and X_legendary /
# y_legendary from the feature-selection cell.
bootstrap_summary_cols = ['feature', 'mean_importance', 'std_importance', 'stability']

# bootstrap analysis for type prediction
print("running bootstrap analysis for type prediction (this may take a moment)...")
# The feature-selection cell names these X_type_all / y_type_all; the former
# X_type / y_type were never defined and raised NameError on a fresh kernel.
bootstrap_type = bootstrap_feature_importance(X_type_all, y_type_all, n_bootstraps=50)

print("\nbootstrap feature importance for type prediction")
# The table is sorted by mean importance, not by stability, so label it accordingly
print("top 10 features by mean importance:")
# display() is required for mid-cell tables (a builtin in IPython/Jupyter kernels)
display(bootstrap_type.head(10)[bootstrap_summary_cols].round(4))

# bootstrap analysis for legendary prediction
print("\nrunning bootstrap analysis for legendary prediction...")
bootstrap_legendary = bootstrap_feature_importance(X_legendary, y_legendary, n_bootstraps=50)

print("\nbootstrap feature importance for legendary prediction")
print("top 10 features by mean importance:")
display(bootstrap_legendary.head(10)[bootstrap_summary_cols].round(4))

# visualize bootstrap results
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

top_bootstrap_type = bootstrap_type.head(8)
top_bootstrap_leg = bootstrap_legendary.head(8)

# one bar chart per task: bar height = mean importance, error bar = one std
bootstrap_panels = [
    (axes[0], top_bootstrap_type, 'bootstrap feature importance\n(type prediction)', presentation_palette[0]),
    (axes[1], top_bootstrap_leg, 'bootstrap feature importance\n(legendary prediction)', presentation_palette[1]),
]
for panel_ax, top_features, panel_title, bar_color in bootstrap_panels:
    x_pos = np.arange(len(top_features))
    panel_ax.bar(x_pos, top_features['mean_importance'], yerr=top_features['std_importance'],
                 color=bar_color, alpha=0.7, capsize=5)
    panel_ax.set_xticks(x_pos)
    # break long feature names at underscores so the tick labels stay narrow
    panel_ax.set_xticklabels([f.replace('_', '\n') for f in top_features['feature']],
                             rotation=45, ha='right', fontsize=8)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_ylabel('importance score')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
os.makedirs('figures', exist_ok=True)  # savefig does not create missing directories
plt.savefig('figures/bootstrap_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## Final Dataset Preparation

In [None]:
# Create final modeling datasets, scale them, save them to CSV and run a quick
# sanity-check model on the selected features.
# Input: ranked score tables (`top_f_*`, columns 'feature' / 'score'),
# `combined_threshold`, `selected_legendary_features`, `numeric_features`,
# `df_encoded`, `single_encoded`, `dual_encoded`, `df_engineered`, `df`.


def features_above_threshold(ranking, threshold):
    """Return the features of a ranked score table with score >= threshold, best first.

    The score tables use lowercase 'feature' / 'score' columns. Keeping the
    ranked order (instead of list(set(...))) makes the column order of the
    saved CSV files deterministic across kernel restarts.
    """
    return ranking.loc[ranking['score'] >= threshold, 'feature'].tolist()


# 1. Single-Type Primary Type Classification Dataset
single_type_features = features_above_threshold(top_f_type_single, combined_threshold)
df_single_type_modeling = single_encoded[single_type_features + ['Primary Type', 'Primary_Type_Encoded']].copy()

# 2. Dual-Type Combination Classification Dataset
dual_combo_features = features_above_threshold(top_f_combo_dual, combined_threshold)
df_dual_combo_modeling = dual_encoded[dual_combo_features + ['Type_Combination', 'Type_Combination_Encoded']].copy()

# 3. All Pokemon Primary Type Classification Dataset
all_type_features = features_above_threshold(top_f_type_all, combined_threshold)
df_all_type_modeling = df_encoded[all_type_features + ['Primary Type', 'Primary_Type_Encoded']].copy()

# 4. Legendary Classification Dataset
legendary_features = list(selected_legendary_features)
df_legendary_modeling = df_encoded[legendary_features + ['Is Legendary']].copy()

# 5. Clustering Dataset (unsupervised): numeric features minus targets and type flags
clustering_excluded = ['Primary_Type_Encoded', 'Type_Combination_Encoded', 'Is Legendary', 'Is_Single_Type', 'Is_Dual_Type']
clustering_features = [f for f in numeric_features if f not in clustering_excluded]
df_clustering = df_encoded[clustering_features].copy()

# Display comprehensive dataset summaries
dataset_summary = pd.DataFrame({
    'Dataset': [
        'Single-Type Primary Classification',
        'Dual-Type Combination Classification',
        'All Pokemon Primary Classification',
        'Legendary Classification',
        'Clustering (Unsupervised)'
    ],
    'Samples': [
        df_single_type_modeling.shape[0],
        df_dual_combo_modeling.shape[0],
        df_all_type_modeling.shape[0],
        df_legendary_modeling.shape[0],
        df_clustering.shape[0]
    ],
    'Features': [
        len(single_type_features),
        len(dual_combo_features),
        len(all_type_features),
        len(legendary_features),
        len(clustering_features)
    ],
    'Target Variable': [
        'Primary Type',
        'Type Combination',
        'Primary Type',
        'Is Legendary',
        'None (unsupervised)'
    ],
    'Pokemon Types Covered': [
        f'{single_encoded["Primary Type"].nunique()} single types',
        f'{dual_encoded["Type_Combination"].nunique()} combinations',
        f'{df_encoded["Primary Type"].nunique()} primary types',
        'All Pokemon',
        'All Pokemon'
    ]
})

print("=== COMPREHENSIVE MODELING DATASETS CREATED ===")
print(f"Total engineered features: {len([col for col in df_engineered.columns if col not in df.columns])}")
print(f"Type combinations identified: {dual_encoded['Type_Combination'].nunique()}")
print(f"Single-type Pokemon: {len(single_encoded)} ({len(single_encoded)/len(df_encoded)*100:.1f}%)")
print(f"Dual-type Pokemon: {len(dual_encoded)} ({len(dual_encoded)/len(df_encoded)*100:.1f}%)")
print()

# A bare expression only renders when it is the LAST line of a cell, so
# mid-cell tables need an explicit display() (a builtin in IPython/Jupyter kernels).
display(dataset_summary)

# Save the engineered dataset for use in modeling
df_engineered.to_csv('pokemon_engineered.csv', index=False)
print("\n=== ENGINEERED DATASET SAVED ===")
print("File: pokemon_engineered.csv")
print(f"Shape: {df_engineered.shape}")
print(f"Features: {list(df_engineered.columns)}")

# Scale the datasets
# NOTE(review): each scaler is fitted on the full dataset, so any train/test
# split made downstream from the saved CSVs sees test-set statistics. Fit the
# scaler inside the modeling pipeline if that matters for the evaluation.
scaler_type = StandardScaler()
scaler_legendary = StandardScaler()
scaler_clustering = StandardScaler()

# The "type" dataset written to disk is the all-Pokemon primary-type dataset.
# (`type_features` / `df_type_modeling` were previously referenced without ever
# being defined, which raised NameError.)
type_features = all_type_features
df_type_modeling = df_all_type_modeling

# Scale type dataset. is_numeric_dtype also accepts int32 etc., which the
# former ['int64', 'float64'] whitelist silently skipped on some platforms.
numeric_type_features = [f for f in type_features if pd.api.types.is_numeric_dtype(df_type_modeling[f])]
df_type_modeling_scaled = df_type_modeling.copy()
df_type_modeling_scaled[numeric_type_features] = scaler_type.fit_transform(df_type_modeling[numeric_type_features])

# Scale legendary dataset
numeric_legendary_features = [f for f in legendary_features if pd.api.types.is_numeric_dtype(df_legendary_modeling[f])]
df_legendary_modeling_scaled = df_legendary_modeling.copy()
df_legendary_modeling_scaled[numeric_legendary_features] = scaler_legendary.fit_transform(df_legendary_modeling[numeric_legendary_features])

# Scale clustering dataset
df_clustering_scaled = pd.DataFrame(
    scaler_clustering.fit_transform(df_clustering),
    columns=df_clustering.columns,
    index=df_clustering.index
)

# Save final datasets
df_type_modeling_scaled.to_csv('pokemon_type_modeling.csv', index=False)
df_legendary_modeling_scaled.to_csv('pokemon_legendary_modeling.csv', index=False)
df_clustering_scaled.to_csv('pokemon_clustering.csv', index=False)

# Dataset files summary
dataset_files = pd.DataFrame({
    'File': ['pokemon_type_modeling.csv', 'pokemon_legendary_modeling.csv', 'pokemon_clustering.csv'],
    'Purpose': ['Type classification', 'Legendary classification', 'Clustering analysis'],
    'Features': [len(type_features), len(legendary_features), len(clustering_features)],
    'Target': ['Primary_Type_Encoded', 'Is Legendary', 'None (unsupervised)']
})

print("\n=== DATASET FILES SAVED ===")
display(dataset_files)

# Quick validation - test model performance with selected features
print("\n=== MODEL VALIDATION WITH SELECTED FEATURES ===")

# Type classification quick test (stratified 80/20 split)
X_type_sel = df_type_modeling_scaled[numeric_type_features]
y_type_sel = df_type_modeling_scaled['Primary_Type_Encoded']

X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(
    X_type_sel, y_type_sel, test_size=0.2, random_state=42, stratify=y_type_sel
)

rf_type = RandomForestClassifier(n_estimators=100, random_state=42)
rf_type.fit(X_train_type, y_train_type)
type_accuracy = rf_type.score(X_test_type, y_test_type)

# Legendary classification quick test (stratified 80/20 split)
# NOTE(review): this reports plain accuracy; if 'Is Legendary' is heavily
# imbalanced, a per-class report would be more informative -- confirm.
X_leg_sel = df_legendary_modeling_scaled[numeric_legendary_features]
y_leg_sel = df_legendary_modeling_scaled['Is Legendary']

X_train_leg, X_test_leg, y_train_leg, y_test_leg = train_test_split(
    X_leg_sel, y_leg_sel, test_size=0.2, random_state=42, stratify=y_leg_sel
)

rf_leg = RandomForestClassifier(n_estimators=100, random_state=42)
rf_leg.fit(X_train_leg, y_train_leg)
legendary_accuracy = rf_leg.score(X_test_leg, y_test_leg)

# Display validation results as table (last expression of the cell, so it renders)
validation_results = pd.DataFrame({
    'Task': ['Type Classification', 'Legendary Classification'],
    'Test Accuracy': [f'{type_accuracy:.3f}', f'{legendary_accuracy:.3f}'],
    'Features Used': [len(numeric_type_features), len(numeric_legendary_features)],
    'Model': ['Random Forest (100 trees)', 'Random Forest (100 trees)']
})
validation_results