<a href="https://colab.research.google.com/github/ShadaElewa/Pokemon-Classification/blob/main/Final_Pok%C3%A9mon_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call.
CHUNK_SIZE = 40960
# Comma-separated '<directory>:<url-quoted download URL>' entries; the
# download loop in this cell splits on ',' and ':' and unquotes the URL.
# NOTE(review): the URL embeds a dated signature (X-Goog-Date=20240515,
# X-Goog-Expires=259200), so it has presumably expired -- confirm before use.
DATA_SOURCE_MAPPING = 'pokemon:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F121%2F280%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240515%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240515T093244Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D25dda758a96d556b2cc04ae3fb4636fcf9c13354928a6ad8b2dc067cc2286081accdff918ad3afbf06e73d132a0d70e8eb55654e9e1142031a0da075bfc8b5572f3bb994e2fe0f493dc2940bef9adbfc14342e70b734c4fc36a81d8d55d74d9f48fc8dbb077e32823606bc7fe0347828f3ccc458eb32c267baf0c954fb8ec8e8c63a8646ecc5349c4c8b2e946607bf72e878414b15e8a3f5f30ef60d378803d617b8e8d70f986656f0806683672323f70267d9814adf6e800bd8ef74f93704fe1972d1972f4391d4ef44531f557ad2dcfe4905a6f48a226aa0031d4fa6254fb524d9ae25d9a1e07745443823acd2f45a9e8eaa36ea6b851e966209d4ad4734d5'

# Absolute directories that stand in for Kaggle's input/working locations.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape: unmount the input path, discarding any error output.
!umount /kaggle/input/ 2> /dev/null
# Remove any previous input directory, then (re)create both directories.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Link the directories as ../input and ../working relative to the current
# working directory, so relative paths such as '../input/pokemon/Pokemon.csv'
# resolve. An already-existing link is left in place.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<url-quoted URL>' entry into a temporary file
# and unpack it under KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining entries are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split only on the first ':' -- the directory name comes first and the
    # rest of the entry is the (quoted) URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length can be missing; without it the bar stays empty
            # instead of raising on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            # iter() with a sentinel stops at the first empty read (EOF).
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: tarfile.open() below reopens the
            # file by name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would
                # replace the module and break the next tar entry.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    # HTTPError is a subclass of OSError, so it must be handled first.
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# AI - Final Project
### Class 14
### Names:
- Shada Hazem Mohammed - 12200081
- Sama Mahmoud Abdelbaset - 12200353

***

## Legendary Pokémon Classification  

Given *data about various Pokémon*, let's try to predict if a given Pokémon is **legendary** or not.  
  
We will use a TensorFlow ANN to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

from sklearn.decomposition import PCA

In [None]:
# Read the dataset through the relative '../input' path.
data = pd.read_csv('../input/pokemon/Pokemon.csv')

In [None]:
# Display the loaded frame.
data

In [None]:
# Keep an unmodified copy; it is used later to look up misclassified rows.
data_raw = data.copy()

# Cleaning

In [None]:
# Summary of columns, dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Remove the '#', 'Name' and 'Type 2' columns from the working frame.
unused_columns = ['#', 'Name', 'Type 2']
data = data.drop(columns=unused_columns)

In [None]:
# Store the Legendary flag as integers. The builtin `int` is used because
# the `np.int` alias was removed in NumPy 1.24 and raises AttributeError.
data['Legendary'] = data['Legendary'].astype(int)

In [None]:
# Display the frame after cleaning.
data

# EDA

In [None]:
# Distinct values of the 'Type 1' column.
data['Type 1'].unique()

In [None]:
# Every column label except 'Type 1'.
numeric_columns = data.columns.drop('Type 1')

In [None]:
# Pairwise correlations of the columns in numeric_columns, drawn as an
# annotated heatmap on a fixed [-1, 1] colour scale.
correlation_matrix = data.loc[:, numeric_columns].corr()

heatmap_options = dict(annot=True, vmin=-1.0, vmax=1.0)
plt.figure(figsize=(18, 15))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.show()

In [None]:
# Overlay one filled density curve per stat column on a single axes.
plt.figure(figsize=(20, 10))
for column in ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']:
    # `fill` replaces the deprecated `shade` argument; the label feeds the
    # legend so the overlaid curves can be told apart.
    sns.kdeplot(data[column], fill=True, label=column)
# Each kdeplot call overwrites the x label with its own column name, so set
# a shared one after the loop.
plt.xlabel("Stat value")
plt.legend()
plt.show()

# Preprocessing

In [None]:
# Column dtypes before encoding.
data.dtypes

## Encoding

In [None]:
def onehot_encode(df, column, prefix):
    """Return a new frame with `column` replaced by indicator columns.

    One indicator column is produced per distinct value of `df[column]`,
    named with `prefix`; they are appended after the existing columns and
    the source column is removed. The input frame is not modified.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=prefix)
    widened = pd.concat([df, indicator_columns], axis=1)
    return widened.drop(columns=column)

In [None]:
# Replace 'Type 1' with indicator columns prefixed 't'.
data = onehot_encode(data, 'Type 1', 't')

In [None]:
# Display the encoded frame.
data

## Splitting and Scaling

In [None]:
# Separate the 'Legendary' target from the remaining feature columns.
target_column = 'Legendary'
X = data.drop(columns=target_column)
y = data[target_column]

In [None]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Training

In [None]:
# (rows, features) of the feature matrix; the feature count is the width the
# network's input layer has to accept.
X.shape

In [None]:
# Two hidden ReLU layers of 64 units feeding one sigmoid output unit.
# The input width is taken from the training matrix instead of a hard-coded
# 26, so the model keeps working if the feature columns change.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)


# Binary cross-entropy matches the single sigmoid output; accuracy and AUC
# are tracked as metrics.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)


batch_size = 32
epochs = 20

# 20% of the training rows are held out for validation. ReduceLROnPlateau is
# used with its default settings; verbose=0 silences per-epoch output.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.ReduceLROnPlateau()],
    verbose=0
)

# Results

In [None]:
# Training and validation loss per epoch. With a list passed as `y` the data
# is treated as wide-form: the x axis is named 'index' and the y axis
# 'value', so those are the keys `labels` must use ('x'/'y' are ignored).
fig_loss = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels={'index': "Epoch", 'value': "Loss"},
    title="Loss Over Time"
)

fig_loss.show()

In [None]:
# Zero-based position of the smallest recorded validation loss.
np.argmin(history.history['val_loss'])

In [None]:
# Training and validation AUC per epoch. With a list passed as `y` the data
# is treated as wide-form: the x axis is named 'index' and the y axis
# 'value', so those are the keys `labels` must use ('x'/'y' are ignored).
fig_auc = px.line(
    history.history,
    y=['auc', 'val_auc'],
    labels={'index': "Epoch", 'value': "AUC"},
    title="AUC Over Time"
)

fig_auc.show()

In [None]:
# Loss and the compiled metrics (accuracy, AUC) on the held-out test rows.
model.evaluate(X_test, y_test)

# Post-Training Analysis

In [None]:
# Threshold the predicted probabilities at 0.5, flatten to one value per
# test row and compare with the true labels. Despite its name,
# `predictions` is a boolean mask that is True where the model was WRONG.
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
predictions = (model.predict(X_test) >= 0.5).astype(int).ravel() != y_test
predictions

In [None]:
# Index labels of the test rows selected by the `predictions` mask
# (True marks a misclassified row).
mislabeled_indices = y_test[predictions].index

In [None]:
# Original, unprocessed rows for the misclassified samples.
data_raw.loc[mislabeled_indices, :]

In [None]:
X.shape

In [None]:
# Project the processed frame onto its first two principal components.
# NOTE(review): `data` still contains the 'Legendary' column at this point
# and is not standardised -- confirm that is intended for this plot.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(data)
data_reduced = pd.DataFrame(principal_components, columns=["PC1", "PC2"])

In [None]:
# Display the two-component projection.
data_reduced

In [None]:
# Index labels of all rows whose Legendary flag equals 1 ...
is_legendary = data['Legendary'] == 1
legendary_indices = data[is_legendary].index

# ... and the subset of those that are also in mislabeled_indices.
mislabeled_legendary_indices = np.intersect1d(mislabeled_indices, legendary_indices)

In [None]:
# Scatter the PCA projection in four layers; later layers are drawn on top
# of earlier ones, so highlighted subsets overpaint the grey background.
plt.figure(figsize=(20, 10))

scatter_layers = [
    (data_reduced.index, 'lightgray', 'Non-Legendary'),
    (legendary_indices, 'dimgray', 'Legendary'),
    (mislabeled_indices, 'orchid', 'Non-Legendary Misclassified'),
    (mislabeled_legendary_indices, 'mediumspringgreen', 'Legendary Misclassified'),
]
for row_labels, colour, legend_text in scatter_layers:
    plt.scatter(
        data_reduced.loc[row_labels, 'PC1'],
        data_reduced.loc[row_labels, 'PC2'],
        c=colour,
        label=legend_text,
    )

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.title("PCA Scatter Plot")
plt.show()