# Pokémon Data Classification: Logistic Regression vs MLP

## Objective
This notebook demonstrates how to classify Pokémon as legendary or not using two machine learning models:
- Logistic Regression
- Multi-Layer Perceptron (MLP)

We will compare the performance of both models using metrics such as accuracy, precision, recall, F1-score, and AUC-ROC.

## Step 1: Load and Explore the Dataset

In [None]:
import pandas as pd
import numpy as np

# Read the Pokémon table from a CSV that is expected to sit next to this notebook.
file_path = 'pokemon.csv'
pokemon_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output.
pokemon_data.head(n=5)

## Step 2: Data Preprocessing
### 2.1 Feature Engineering

In [None]:
# Derived column: element-wise attack divided by defense.
pokemon_data['attack_to_defense_ratio'] = pokemon_data['attack'].div(pokemon_data['defense'])

# Column to predict, and the columns used to predict it
# (numeric stats, the derived ratio, and the categorical 'type').
target = 'legendary'
features = [
    'attack',
    'defense',
    'sp_attack',
    'sp_defense',
    'speed',
    'total_bs',
    'capture_rt',
    'attack_to_defense_ratio',
    'type',
]
X, y = pokemon_data[features], pokemon_data[target]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, Normalizer, MaxAbsScaler


### 2.2 Scaling Examples

### MinMaxScaler Example 

In [None]:
# Peek at the first five unscaled feature rows (head() defaults to n=5).
X.head()

In [None]:
# Min-Max scaling demo, restricted to the purely numeric columns.
target = 'legendary'
features_digit = [
    'attack',
    'defense',
    'sp_attack',
    'sp_defense',
    'speed',
    'total_bs',
    'capture_rt',
]
X_example, y_target = pokemon_data[features_digit], pokemon_data[target]

# Learn each column's min/max, then rescale that same data with them.
min_max_scaler = MinMaxScaler()
X_min_max_scaled = pd.DataFrame(
    data=min_max_scaler.fit(X_example).transform(X_example),
    columns=features_digit,
)
print("Min-Max Scaled Data:")
print(X_min_max_scaled.head())

### Normalizer Example

In [None]:
# Row-wise normalization: every sample is rescaled to unit L1 norm.
l1_normalizer = Normalizer(norm='l1')
X_l1 = l1_normalizer.fit(X_example).transform(X_example)

# Re-attach the column names to the normalized array for display.
X_normalized_l1 = pd.DataFrame(data=X_l1, columns=features_digit)
print("\nL1 Normalized Data:\n")
print(X_normalized_l1.head())


In [None]:
# Row-wise normalization: every sample is rescaled to unit L2 (Euclidean) norm.
l2_normalizer = Normalizer(norm='l2')
X_l2 = l2_normalizer.fit(X_example).transform(X_example)

# Re-attach the column names to the normalized array for display.
X_normalized_l2 = pd.DataFrame(data=X_l2, columns=features_digit)
print("\nL2 Normalized Data:\n")
print(X_normalized_l2.head())

### MaxAbsScaler (Maximum Absolute Scaler) Example

In [None]:
# Max-abs scaling demo: each column is divided by its largest absolute value.
max_abs_scaler = MaxAbsScaler()
X_max_abs_scaled = pd.DataFrame(
    data=max_abs_scaler.fit(X_example).transform(X_example),
    columns=features_digit,
)
print("\nMax-Abs Scaled Data:")
print(X_max_abs_scaled.head())

### StandardScaler Example

In [None]:
# Standardization demo: fit the scaler on the numeric columns, then transform them.
scaler = StandardScaler()
X_scaled = scaler.fit(X_example).transform(X_example)

# Wrap the resulting array in a DataFrame so the column names are kept.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=features_digit)

# Show the first five standardized rows.
print("Standardized Features (Mean = 0, Std = 1):")
print(X_scaled_df.head())

### 2.3 Data Transformation Using ColumnTransformer

In [None]:
# Split the selected columns by kind: 'type' is categorical, every other feature is numeric.
categorical_features = ['type']
numeric_features = [col for col in features if col not in categorical_features]

# Per-kind transformers.
# MaxAbsScaler divides each numeric column by its maximum absolute value.
numeric_transformer = MaxAbsScaler()
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising, so the encoder stays usable on rows it was not fitted on.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# **ColumnTransformer**:
# This handles preprocessing for both numerical and categorical features in a single step.
# - Numerical data is scaled using `MaxAbsScaler`.
# - Categorical data (like 'type') is converted into numerical format using one-hot encoding.
# Benefits:
# - Enables seamless integration of preprocessing into a single pipeline.
# - Can be fitted on the training rows only and then applied to other rows, which avoids
#   leaking validation/test statistics into the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [None]:
# Show the configured ColumnTransformer as the cell output.
preprocessor

### Prepare the training data

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw rows FIRST and fit the preprocessor on the training rows only.
# Fitting on the full dataset would let scaling statistics and one-hot categories be
# learned from validation/test rows (data leakage).

# Step 1: first split into a training set (70%) and a temporary set (30%).
# random_state=42: ensures reproducibility of the splits
# stratify=y: maintains the class distribution in the split
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 2: split the temporary set equally into validation (15%) and test (15%) sets.
# stratify=y_temp: maintains the class distribution in the split
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# A category that only occurs outside the training rows must not raise at transform time;
# 'cat' is the name of the one-hot step inside the ColumnTransformer.
preprocessor.set_params(cat__handle_unknown='ignore')

# Step 3: learn the transformation from the training rows, then apply it unchanged
# to the validation and test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(X_test_raw)

# Full dataset under the train-fitted transformation; kept so the name stays available.
X_transformed = preprocessor.transform(X)

## Step 3: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import accuracy_score

# Fit the logistic-regression baseline on the training split.
logistic_model = LogisticRegression(random_state=42, max_iter=300)
logistic_model.fit(X_train, y_train)

# Hard label predictions feed the accuracy score and the classification report.
y_val_pred_logistic = logistic_model.predict(X_val)
accuracy_score_logistic = accuracy_score(y_val, y_val_pred_logistic)

# The second predict_proba column (probability of classes_[1]) feeds the AUC.
y_val_prob_logistic = logistic_model.predict_proba(X_val)[:, 1]
auc_val_logistic = roc_auc_score(y_val, y_val_prob_logistic)

# Validation-set summary.
print("Logistic Regression - Validation Performance:")
print(classification_report(y_val, y_val_pred_logistic))
print(f"Validation AUC: {auc_val_logistic:.4f}")
print(f"Validation accuracy: {accuracy_score_logistic:.4f}")

## Step 4: Multi-Layer Perceptron (MLP)

In [None]:
from sklearn.neural_network import MLPClassifier

# Small MLP with two hidden layers (3 and 2 units) and ReLU activations;
# the solver is left at the library default.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(3, 2),
    activation='relu',
    max_iter=500,
    random_state=42,
)
mlp_model.fit(X_train, y_train)

# The second predict_proba column (probability of classes_[1]) feeds the AUC.
y_val_prob_mlp = mlp_model.predict_proba(X_val)[:, 1]
auc_val_mlp = roc_auc_score(y_val, y_val_prob_mlp)

# Hard label predictions feed the accuracy score and the classification report.
y_val_pred_mlp = mlp_model.predict(X_val)
accuracy_score_mlp = accuracy_score(y_val, y_val_pred_mlp)

# Validation-set summary.
print("MLP Classifier - Validation Performance:")
print(classification_report(y_val, y_val_pred_mlp))
print(f"Validation AUC: {auc_val_mlp:.4f}")
print(f"Validation accuracy: {accuracy_score_mlp:.4f}")

## Step 5: Comparison of Models - Training Loss Curve

### Manually record the loss of logistic regression

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
import warnings

from sklearn.exceptions import ConvergenceWarning

# scikit-learn's LogisticRegression does not expose a per-iteration loss history, so one is
# recorded by hand: max_iter=1 limits every fit() call to a single solver iteration and
# warm_start=True makes each call start from the coefficients of the previous one.
# NOTE: this rebinds `logistic_model`, replacing the model trained in Step 3.
logistic_model = LogisticRegression(
    max_iter=1,       # one solver iteration per fit() call
    solver='saga',
    random_state=42,
    warm_start=True,  # reuse the previous coefficients as the starting point
)

# Training log loss after each of the 300 single-iteration fits.
loss_curve = []

with warnings.catch_warnings():
    # Each one-iteration fit is expected to emit a ConvergenceWarning because it stops at
    # max_iter; silence only that category so the cell does not print 300 warnings.
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    for _ in range(300):
        logistic_model.fit(X_train, y_train)
        # Class probabilities on the training set are what log_loss scores.
        y_train_prob = logistic_model.predict_proba(X_train)
        loss = log_loss(y_train, y_train_prob)
        loss_curve.append(loss)


In [None]:
# Loss history recorded by the MLP during fit, and the manually recorded logistic losses.
mlp_model_loss_curve = mlp_model.loss_curve_
Logic_loss = loss_curve

# Placeholder used in the plot title, matching the "<Your Name>" placeholder of the
# other figures in this notebook. TODO: replace with your own name.
yourname = "<Your Name>"

# Plot both training-loss curves on the same 1-based iteration axis.
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(Logic_loss) + 1), Logic_loss, label="Logistic Loss Curve")
plt.plot(range(1, len(mlp_model_loss_curve) + 1), mlp_model_loss_curve, label="MLP Loss Curve")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.title(yourname + ": Training Loss Over Iterations")
plt.grid()
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# ROC comparison on the validation split: one curve per model, AUC shown in the legend.
fpr_logistic, tpr_logistic, _ = roc_curve(y_val, y_val_prob_logistic)
fpr_mlp, tpr_mlp, _ = roc_curve(y_val, y_val_prob_mlp)

roc_series = (
    (fpr_logistic, tpr_logistic, f"Logistic Regression (AUC = {auc_val_logistic:.2f})"),
    (fpr_mlp, tpr_mlp, f"MLP Classifier (AUC = {auc_val_mlp:.2f})"),
)

plt.figure(figsize=(10, 6))
for false_pos_rate, true_pos_rate, curve_label in roc_series:
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
plt.title("ROC Curve Comparison")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid()
plt.legend(loc="lower right")
plt.show()

## Conclusion
- Logistic Regression is a simpler model but might perform comparably on well-processed datasets.
- MLP leverages neural network architecture, potentially achieving better results for complex data.
- AUC-ROC curves are helpful for comparing models' classification performance at different thresholds.

## Step 6: Visualization of Predictions and Loss

### 6.1 Actual vs Predicted Results

In [None]:
# Ground-truth validation labels plotted against their position in the validation set.
sample_positions = range(len(y_val))
plt.scatter(sample_positions, y_val, s=15, alpha=0.6, color='blue', label="Actual Values")
plt.xlabel("Sample Index")
plt.ylabel("Legendary (1) or Not (0)")
plt.title("Actual results")
plt.legend()
plt.show()

In [None]:

# Logistic-regression label predictions plotted against validation-set position.
sample_positions = range(len(y_val))
plt.scatter(
    sample_positions,
    y_val_pred_logistic,
    s=15,
    alpha=0.6,
    color='red',
    label="Predicted (Logistic Regression)",
)
plt.xlabel("Sample Index")
plt.ylabel("Legendary (1) or Not (0)")
plt.title("<Your Name> +  Predicted (Logistic Regression)")
plt.legend()
plt.show()

In [None]:
# MLP label predictions plotted against validation-set position.
plt.scatter(range(len(y_val)), y_val_pred_mlp, color='green', alpha=0.6, label="Predicted (MLP)", s=15)
plt.title("Predicted (MLP)")
plt.xlabel("Sample Index")
plt.ylabel("Legendary (1) or Not (0)")
plt.legend()
# Render the figure explicitly, like the sibling plotting cells, so the cell does not
# echo the Legend object's repr as its output.
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Boolean masks marking the validation samples each model misclassified.
errors_logistic = y_val != y_val_pred_logistic
errors_mlp = y_val != y_val_pred_mlp

# Positions (within the validation set) of those mistakes.
logistic_error_positions = np.where(errors_logistic)[0]
mlp_error_positions = np.where(errors_mlp)[0]

# One scatter layer per entry, drawn in this order:
# true labels at the logistic errors, logistic predictions there, MLP predictions at MLP errors.
error_layers = [
    (
        logistic_error_positions,
        y_val[errors_logistic],
        'blue',
        "Actual Values (Errors for Logistic Regression)",
    ),
    (
        logistic_error_positions,
        y_val_pred_logistic[errors_logistic],
        'red',
        "Predicted (Logistic Regression Errors)",
    ),
    (
        mlp_error_positions,
        y_val_pred_mlp[errors_mlp],
        'green',
        "Predicted (MLP Errors)",
    ),
]
for positions, values, point_color, layer_label in error_layers:
    plt.scatter(positions, values, color=point_color, alpha=0.6, label=layer_label, s=15)

# Axis labels, title and legend.
plt.title("<Your Name> +  Error Points for Logistic Regression and MLP")
plt.xlabel("Index")
plt.ylabel("Values")
plt.legend()
plt.show()


### Confusion Matrices

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Confusion matrix of the logistic-regression predictions on the validation split
# (rows are true labels, columns are predicted labels).
cm_logistic = confusion_matrix(y_val, y_val_pred_logistic)

plt.figure(figsize=(8, 6))
sns.heatmap(data=cm_logistic, cmap="Blues", annot=True, fmt="d")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("<Your Name> + Confusion Matrix: Logistic Regression")
plt.show()

In [None]:
# Confusion matrix of the MLP predictions on the validation split
# (rows are true labels, columns are predicted labels).
cm_mlp = confusion_matrix(y_val, y_val_pred_mlp)

plt.figure(figsize=(8, 6))
sns.heatmap(data=cm_mlp, cmap="Greens", annot=True, fmt="d")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("<Your Name> + Confusion Matrix: MLP Classifier")
plt.show()