# Import Necessary Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation/future warnings raised by
# pandas, matplotlib or seaborn in later cells are hidden as well.
warnings.filterwarnings('ignore')
# Never elide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Import your Data

In [None]:
# Read the raw CSV and keep only the columns this notebook works with.
selected_columns = ['Pokemon', 'Type', 'Height', 'Weight', 'Attack Base']
df = pd.read_csv('./Datasets/pokemonDB_dataset.csv').loc[:, selected_columns]
df

In [None]:
# Print the dtype and non-null count of each selected column.
df.info()

# Let's clean up the Height and Weight columns!

In [None]:
# Work on a copy so the raw `df` loaded earlier is never modified and this cell
# gives the same result every time it is re-run.
df_hw = df.copy(deep=True)

# Keep only the text in front of the unit, i.e. everything before the first ' m' / ' kg'.
# The .str accessor passes missing values through instead of raising on them.
df_hw['Height'] = df_hw['Height'].str.split(' m').str[0]
df_hw['Weight'] = df_hw['Weight'].str.split(' kg').str[0]

# Put the unit in the column name now that it is no longer part of the values.
df_hw = df_hw.rename(columns={'Height': 'Height (m)', 'Weight': 'Weight (kg)'})
df_hw['Height (m)'] = df_hw['Height (m)'].astype('float64')

# You'd think the same cast would work for Weight, but it fails because
# some rows hold the placeholder '—' instead of a number:
# df_hw['Weight (kg)'] = df_hw['Weight (kg)'].astype('float64')

# errors='coerce' converts what it can and turns unparseable entries such as '—' into NaN.
df_hw['Weight (kg)'] = pd.to_numeric(df_hw['Weight (kg)'], errors='coerce')

# info() prints its report and returns None, so it must not be passed to display()
# (that would render a stray "None" above the table).
df_hw.info()
display(df_hw)

In [None]:
# Search for the rows that broke the plain float cast.
# 'Weight (kg)' is numeric after pd.to_numeric(..., errors='coerce'), so comparing it
# to '—' can no longer match anything; the unparseable weights are the NaN rows.
df_hw[df_hw['Weight (kg)'].isna()]

# How many types can a Pokémon have?

In [None]:
# Start the type analysis from its own copy so `df_hw` stays as it is.
df_type = df_hw.copy(deep=True)

# A value containing k commas lists k + 1 types.
comma_count = df_type['Type'].str.count(',')
df_type['Type_count'] = comma_count + 1
df_type['Type_count'].max()

In [None]:
# Split the comma-separated 'Type' string into a list of type names.
# Every piece is stripped on its own: stripping only the whole string before splitting
# would leave a leading space on the second type if the raw values look like "A, B".
# The strings are read from `df_hw` (the frame `df_type` was copied from) because
# `df_type` loses its 'Type' column in the selection below; reading it from there
# would make this cell fail when it is run a second time.
type_lists = df_hw['Type'].apply(lambda types: [name.strip() for name in types.split(',')])

df_type = df_type.assign(
    Type1=type_lists.apply(lambda names: names[0]),
    # Entries that list a single type get no secondary type.
    Type2=type_lists.apply(lambda names: names[1] if len(names) > 1 else None),
)

# Keep the analysis columns, with the two type columns right after the name.
df_type = df_type[['Pokemon', 'Type1', 'Type2', 'Height (m)', 'Weight (kg)', 'Attack Base']]
df_type

# Data Summary

In [None]:
# (number of rows, number of columns) of the cleaned frame.
df_type.shape

## Mean, Median, Mode?

In [None]:
# Show the cleaned frame again as a reference for the statistics computed below.
df_type

In [None]:
"""Mean for column Height"""

# Series.mean() ignores missing values by default.
df_type['Height (m)'].mean()  ## or you can also use numpy
# np.mean(df_type['Height (m)'])

In [None]:
"""Median for column Weight"""

# The pandas method skips NaN by default, so it would work here too:
# df_type['Weight (kg)'].median()

# np.median propagates NaN: the weights that were coerced from '—' are NaN,
# so the plain NumPy median of this column comes back as nan.
# np.median(df_type['Weight (kg)']) ## Why wont this work?
# np.nanmedian computes the median over the non-NaN entries only.
np.nanmedian(df_type['Weight (kg)'])

In [None]:
"""Mode, Mode, Mode"""

# mode() returns a Series rather than a scalar because several values can tie for most frequent.
df_type['Type1'].mode()

In [None]:
# Number of rows per primary type, most frequent first.
df_type['Type1'].value_counts()

## Or use df.describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_type.describe()

# Visualize!

## Bar Graph
### Using a Countplot

In [None]:
# Bar chart of how many rows fall under each primary type, with default colors and order.
fig, ax = plt.subplots(figsize=(8.5, 5))

sns.countplot(data=df_type, x='Type1', ax=ax)
# Tilt the category labels so neighbouring type names do not overlap.
plt.xticks(rotation=60)
ax.set_title('Count of Pokemon Types (Type 1)')
plt.show()

In [None]:
# Hex color used for the bar of each primary type.
type_colors = {
    'Grass': '#A8C256',
    'Psychic': '#C52184',
    'Dark': '#334139',
    'Bug': '#AFC97E',
    'Steel': '#E6DBD0',
    'Rock': '#503B31',
    'Normal': '#DCE1E9',
    'Fairy': '#F4AFB4',
    'Water': '#3066BE',
    'Dragon': '#28C2FF',
    'Electric': '#F7E9A1',
    'Fighting': '#9C1715',
    'Poison': '#B388EB',
    'Fire': '#D36135',
    'Ice': '#ABDDED',
    'Ground': '#FFAF47',
    'Ghost': '#BF9ACA',
    'Flying': '#F6F5F3'
}
# Neutral gray for any type that has no entry above; without a fallback, .get() would
# put None into the palette and the plot call would fail on it.
fallback_color = '#808080'

# Bars follow the order in which the types first appear in the data; the palette is
# built in that same order so each bar receives the color of its own type.
type_order = df_type['Type1'].unique()
type_pallete = [type_colors.get(t, fallback_color) for t in type_order]

fig, ax = plt.subplots(figsize=(13.5, 5))
sns.countplot(x=df_type['Type1'], order=type_order, palette=type_pallete, ax=ax)

# Horizontal guide lines, pushed behind the bars (by default they are drawn across them).
ax.grid(axis='y', linestyle='--', alpha=1)
ax.set_axisbelow(True)

# Write each bar's count centered just above the bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:,.0f}',
            ha='center', va='bottom', fontsize=8)
plt.xticks(rotation=60)
ax.set_title('Count of Pokemon Types (Type 1)')
plt.show()

## Heatmap

In [None]:
# Table of how often each (Type1, Type2) combination occurs.
cross_tab = pd.crosstab(index=df_type['Type1'], columns=df_type['Type2'])

# Draw the table as a heatmap with the integer count written in every cell.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cross_tab, annot=True, fmt='d', cmap='Reds', linewidths=0.5, ax=ax)
ax.set_title('Co-occurrence of Pokémon Types (Type1 vs. Type2)')
ax.set_xlabel('Type2')
ax.set_ylabel('Type1')
plt.show()

In [None]:
# Let's find them: rows whose primary type is Bug and whose secondary type is Dark.
# na=False makes rows with a missing type count as "no match", so the mask is strictly
# boolean instead of relying on how missing values behave inside `&`.
is_bug = df_type['Type1'].str.contains('Bug', na=False)
is_dark = df_type['Type2'].str.contains('Dark', na=False)
df_type[is_bug & is_dark]

## Scatter Plot

In [None]:
# One point per row: weight on the x-axis, height on the y-axis.
# The points are semi-transparent so dense regions remain readable.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_type, x='Weight (kg)', y='Height (m)', alpha=0.6, ax=ax)
ax.set_title('Height vs. Weight of Pokémon')
ax.set_xlabel('Pokemon Weight (kg)')
ax.set_ylabel('Pokemon Height (m)')

# # Add a regression line
# sns.regplot(x='Weight (kg)', y='Height (m)', data=df_type, scatter=False, color='red')
plt.show()

## Most Powerful Pokémon?

In [None]:
# Map every Pokémon name to its base attack (a repeated name keeps its last value).
name_attack_pairs = zip(df_type['Pokemon'], df_type['Attack Base'])
df_att = dict(name_attack_pairs)

# The 20 (name, attack) pairs with the highest attack, strongest first.
top20 = sorted(df_att.items(), key=lambda pair: pair[1], reverse=True)[:20]
top20

Here are the top 20 strongest Pokémon by base attack.

In [None]:
# Names of the 20 strongest entries computed in the previous cell.
top20_keys = [name for name, _attack in top20]
# One color per bar, in bar order: red for a top-20 name, gray for everything else.
top20_colors = ['red' if name in top20_keys else 'gray' for name in df_att]

# One bar per name in the mapping.
plt.bar(df_att.keys(), df_att.values(), color=top20_colors)
plt.xlabel('Pokemons')
plt.ylabel('Attack (Base)')
plt.title('Base Attack per Pokemon')

This right here? Ugly.

In [None]:
# The 20 rows with the highest base attack, strongest first.
top20_attack = df_type.sort_values('Attack Base', ascending=False).head(20)

# Horizontal bars keep the names readable along the y-axis.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=top20_attack,
    x='Attack Base',
    y='Pokemon',
    palette='viridis',
    edgecolor='black',
    ax=ax
)

# Print each bar's value just to the right of its end, vertically centered on the bar.
for patch in ax.patches:
    bar_length = patch.get_width()
    bar_middle = patch.get_y() + patch.get_height() / 2
    ax.text(
        bar_length + 2,
        bar_middle,
        f'{int(bar_length)}',
        ha='left',
        va='center',
        fontsize=10
    )

ax.set_title('Top 20 Strongest Pokemon by Base Attack', fontsize=10, weight='bold')
ax.set_xlabel('Base Attack')
ax.set_ylabel('Pokemon')
ax.grid(axis='x', linestyle='--', alpha=0.1)

fig.tight_layout()
plt.show()