In [158]:
import pandas as pd
# import json

In [159]:
# Read the card dataset (expected next to the notebook) and preview the first rows.
df = pd.read_csv('pokemon-tcg.csv')
df.head()

Unnamed: 0,id,stage,name,from,hp,type,ability,a1cost,a1name,a1damage,a1bonus,a1bonusDmg,a2cost,a2name,a2damage,a2bonus,a2bonusDmg,weakness,retreat,rarity
0,ga1,0,bulbasaur,,70,grass,,gx,vine whip,40.0,,,,,,,,fire,1,1d
1,ga2,1,ivysaur,bulbasaur,90,grass,,gxx,razor leaf,60.0,,,,,,,,fire,2,2d
2,ga3,2,venusaur,ivysaur,160,grass,,ggxx,mega drain,80.0,heal,30.0,,,,,,fire,3,3d
3,ga4,2,venusaur EX,ivysaur,190,grass,,gxx,razor leaf,60.0,,,ggxx,giant bloom,100.0,heal,30.0,fire,3,4d
4,ga5,0,caterpie,,50,grass,,x,find a friend,,,,,,,,,fire,1,1d


In [160]:
# Print the per-column dtype and non-null count to see which fields are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          565 non-null    object 
 1   stage       565 non-null    int64  
 2   name        565 non-null    object 
 3   from        262 non-null    object 
 4   hp          565 non-null    int64  
 5   type        565 non-null    object 
 6   ability     41 non-null     object 
 7   a1cost      565 non-null    object 
 8   a1name      565 non-null    object 
 9   a1damage    527 non-null    float64
 10  a1bonus     207 non-null    object 
 11  a1bonusDmg  146 non-null    float64
 12  a2cost      34 non-null     object 
 13  a2name      34 non-null     object 
 14  a2damage    30 non-null     float64
 15  a2bonus     14 non-null     object 
 16  a2bonusDmg  14 non-null     float64
 17  weakness    556 non-null    object 
 18  retreat     565 non-null    int64  
 19  rarity      565 non-null    o

In [161]:
# Fill colour per type name, passed to scale_fill_manual by the charts below.
weakness_colors = dict(
    water='#0000aa',
    psychic='#aa00aa',
    metal='lightblue',
    grass='#00aa00',
    fire='#aa0000',
    fighting='#964b00',
    electric='#fff700',
    dark='#222244',
    dragon='#c0c0c0',
)

In [162]:
# NOTE(review): this wildcard import is what brings ggplot, aes, labs and the geom_*/scale_*
# helpers used by the plotting cells into scope; an explicit import list would make that visible.
from lets_plot import *
# Presumably configures lets-plot to render figures as HTML in the notebook output;
# confirm against the lets-plot documentation.
LetsPlot.setup_html()

In [163]:
# Histogram of card HP in 20-point bins. Title and axis labels are set so the
# figure stands on its own, in line with the other charts in this notebook.
(
    ggplot(df, aes(x='hp'))
    + geom_histogram(binwidth=20)
    + labs(
        title='Distribution of HP',
        x='HP',
        y='Count'
    )
)

In [164]:
# Mean HP per card type (rounded to 2 decimals), sorted from highest to lowest.
type_health = (
    df.groupby('type')['hp']
    .mean()
    .round(2)
    .reset_index()
    .sort_values('hp', ascending=False)
)

# The stray bare `type_health` expression that used to sit here was dropped:
# only the last expression of a cell is displayed, so it produced no output.
(
    ggplot(type_health, aes(x='type', y='hp', fill='type'))
    + geom_bar(stat='identity')
    + scale_fill_manual(values=weakness_colors)
    + labs(
        title='Average HP by Type',
        x='Type',
        y='HP',
        fill='Type'
    )
)

In [165]:
# Mean HP per evolution stage (rounded to 2 decimals); the numeric stage codes
# 0/1/2 are relabelled on the x axis for display.
stage_health = df.groupby('stage')['hp'].mean().round(2).reset_index()

(
    ggplot(stage_health, aes(x='stage', y='hp'))
    + geom_bar(stat='identity')
    + scale_x_discrete(
        breaks=[0, 1, 2],
        labels=['Basic', 'Stage 1', 'Stage 2']
    )
    + labs(title='Average HP by Stage', x='Stage', y='HP')
)

In [166]:
# Bar chart of cards per evolution stage; stage codes 0/1/2 are relabelled for display.
stage_breaks = [0, 1, 2]
stage_labels = ['Basic', 'Stage 1', 'Stage 2']

(
    ggplot(df, aes(x='stage'))
    + geom_bar()
    + scale_x_discrete(breaks=stage_breaks, labels=stage_labels)
    + labs(title='Count of Cards per Stage', x='Stage', y='Count')
)

This chart shows how many cards exist at each evolution stage. The distribution makes sense: every higher-stage pokemon requires a lower-stage pokemon to evolve from, so for every stage 2 pokemon there are at least as many stage 1 and basic pokemon, and for every pokemon whose final evolution is stage 1 there are at least as many basic pokemon present.

In [167]:
# Number of cards sharing each pokemon name, most frequent names first.
pokemon_name_counts = (
    df.groupby('name')['id']
    .count()
    .rename('count')
    .reset_index()
    .sort_values('count', ascending=False)
)

# One bar per distinct `count` value: how many names have that many cards.
(
    ggplot(pokemon_name_counts, aes(x='count'))
    + geom_bar()
    + labs(
        title='Number of Pokemon with Duplicate Cards',
        x='Duplicates',
        y='# of Pokemon'
    )
)

In [168]:
# Total number of cards per weakness, largest first. Rows with a missing
# weakness are excluded because groupby drops NaN keys by default.
weakness_totals = (
    df.groupby('weakness')['id']
    .count()
    .rename('count')
    .reset_index()
    .sort_values('count', ascending=False)
)

(
    ggplot(weakness_totals, aes(x='weakness', y='count', fill='weakness'))
    + geom_bar(stat='identity')
    + scale_fill_manual(values=weakness_colors)
    + labs(
        title='Total # of Cards with each Weakness',
        x='Weakness',
        y='Count',
        fill='Weakness'
    )
)

This chart shows the total number of cards with each weakness. According to this chart, fighting-type cards cover the weakness of the largest number of cards overall, while metal-type has the fewest cards weak to it.

In [169]:
# Share (in %) of each weakness within every card type.
# `value_counts(normalize=True)` computes the within-group proportion directly,
# and the Series is named explicitly, so the result no longer depends on pandas
# labelling an unnamed column `0` after `reset_index()`.
weakness_percents = (
    df.groupby('type')['weakness']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
    .rename('percent')
    .reset_index()
    .sort_values('weakness', ascending=False)
)

(
    ggplot(weakness_percents, aes(x='type', y='percent'))
    + geom_bar(
        aes(fill='weakness'),
        stat='identity',
        size=2
    )
    + scale_fill_manual(values=weakness_colors)
    # NOTE(review): no layer in this plot maps the `color` aesthetic, so this
    # scale looks unused; kept to leave the rendered figure untouched.
    + scale_color_manual(values=weakness_colors)
    + labs(
        title='Pokemon Weakness by Type',
        x='Type',
        y='% Weakness',
        fill='Weakness',
    )
)

This chart breaks down, for each pokemon type, the percentage of its cards that have each weakness. The only type not present in this chart is *Dragon*, which has no weaknesses, and nothing is weak to dragon type. One notable feature of this chart is that every single type has two or more different weaknesses. Also notable is that every type shown has some percentage of its pokemon weak to electric-type moves; this is because nearly all bird-like pokemon have an electric-type weakness.

In [170]:
# For each weakness, count how many distinct card types contain at least one
# card with that weakness. `nunique` expresses this directly, instead of
# round-tripping through `value_counts()` and dropping a column whose name
# (`count`) depends on the pandas version.
weakness_counts = (
    df.groupby('weakness')['type']
    .nunique()
    .rename('count')
    .reset_index()
    .sort_values('count', ascending=False)
)

(
    ggplot(weakness_counts, aes(x='weakness', y='count', fill='weakness'))
    + geom_bar(stat='identity')
    + scale_fill_manual(values=weakness_colors)
    + labs(
        title='Counts of Types Containing each Weakness',
        x='Weakness',
        y='Count',
        fill='Weakness'
    )
)

This chart explains maximum type coverage: each bar counts how many pokemon types contain at least one card weak to the given type. Electric type has the best type coverage, since 9 different pokemon types contain a weakness to electric-type moves, followed by fighting, where 3 different types contain a fighting weakness. The worst types for overall coverage are dark, psychic, and water, each having only one type weak to them.

In [171]:
# Raw number of cards for every (type, weakness) pair, i.e. the counts behind the charts above.
df.groupby('type')['weakness'].value_counts()

type      weakness
dark      fighting    30
          grass       14
          electric     2
electric  fighting    49
          electric     4
fighting  grass       37
          psychic     26
          electric     3
fire      water       45
          electric     5
grass     fire        78
          electric     4
metal     fire        18
          electric     2
normal    fighting    55
          electric    21
psychic   dark        59
          metal       12
          electric     3
water     electric    78
          metal       11
Name: count, dtype: int64

In [172]:
# Cards per (type, weakness) pair.
type_weakness_counts = df.groupby('type')['weakness'].value_counts()

# For each type, idxmax returns the (type, weakness) index tuple of its most
# common weakness; element 1 of that tuple is the weakness name.
top_weakness_by_type = (
    type_weakness_counts
    .groupby(level=0)
    .idxmax()
    .map(lambda pair: pair[1])
    .rename('max weakness')
)

# How many types share each "most common weakness", largest first.
max_weaknesses = (
    top_weakness_by_type
    .reset_index()
    .groupby('max weakness')['type']
    .count()
    .rename('count')
    .reset_index()
    .sort_values('count', ascending=False)
)

# Restrict the palette to the weaknesses that actually appear in this chart.
present_weaknesses = set(max_weaknesses['max weakness'])
max_weakness_colors = {
    key: color
    for key, color in weakness_colors.items()
    if key in present_weaknesses
}

(
    ggplot(
        max_weaknesses,
        aes(x='max weakness', y='count', fill='max weakness')
    )
    + geom_bar(stat='identity')
    + scale_fill_manual(values=max_weakness_colors)
    # `count` is a numeric bar height, so it needs a continuous scale; a
    # discrete scale would treat each distinct count as a category.
    + scale_y_continuous(breaks=[0, 1, 2, 3])
    + labs(
        title='Total Types with a Given Max Weakness',
        subtitle='A count of how many types have a majority weakness to the given type',
        x='Max Weakness',
        y='Count'
    )
)

This chart explains maximum-coverage efficiency by type. Fighting-type pokemon have the best maximum coverage, covering three different types that have majority weakness to fighting type moves.

In [173]:
# One row per pokemon name (first non-null value of every column within the name group).
unique_pokemon = df.groupby('name').first().reset_index()

is_stage2 = unique_pokemon['stage'] == 2
is_stage1 = unique_pokemon['stage'] == 1
is_basic = unique_pokemon['stage'] == 0

# Names that some higher-stage pokemon evolves from.
stage2_from = unique_pokemon.loc[is_stage2, 'from'].unique()
stage1_from = unique_pokemon.loc[is_stage1, 'from'].unique()

# A pokemon is "final" when it is stage 2, or when no pokemon of the next stage lists it in `from`.
stage2_max_names = unique_pokemon.loc[is_stage2, 'name'].unique()
stage1_max_names = unique_pokemon.loc[
    is_stage1 & ~unique_pokemon['name'].isin(stage2_from), 'name'
].unique()
basic_max_names = unique_pokemon.loc[
    is_basic & ~unique_pokemon['name'].isin(stage1_from), 'name'
].unique()

max_names = set().union(stage2_max_names, stage1_max_names, basic_max_names)

unique_pokemon['is_final'] = unique_pokemon['name'].isin(max_names)

unique_pokemon[['name', 'stage', 'from', 'is_final']].head(10)

Unnamed: 0,name,stage,from,is_final
0,abomasnow,1,snover,True
1,abra,0,,False
2,aerodactyl,1,old amber,True
3,aerodactyl EX,1,old amber,True
4,aipom,0,,False
5,alakazam,2,kadabra,True
6,ambipom,1,aipom,True
7,arbok,1,ekans,True
8,arcanine,1,growlithe,True
9,arcanine EX,1,growlithe,True


In [174]:
# Unique pokemon per stage, split by whether the pokemon is a final evolution.
# Title and axis labels are set so the figure stands on its own, in line with
# the other charts in this notebook.
(
    ggplot(unique_pokemon, aes(x='stage', fill='is_final'))
    + geom_bar()
    + scale_x_discrete(breaks=[0, 1, 2], labels=['Basic', 'Stage 1', 'Stage 2'])
    + labs(
        title='Unique Pokemon per Stage',
        x='Stage',
        y='Count',
        fill='Final Evolution'
    )
)

In [175]:
# Rows whose name contains the 'EX' marker.
unique_ex = unique_pokemon[unique_pokemon['name'].str.contains('EX')]

unique_pokemon_names = set(unique_pokemon['name'].unique())

# Base name = the EX name with its trailing ' EX' removed. Stripping the suffix
# (rather than taking the first word) keeps multi-word names intact.
ex_base_names = unique_ex['name'].str.replace(r'\s+EX$', '', regex=True)

# An EX card "has a non-EX version" when stripping the suffix changed the name
# and the resulting base name exists among the unique pokemon names.
unique_ex = unique_ex.assign(
    has_non_ex=ex_base_names.ne(unique_ex['name']) & ex_base_names.isin(unique_pokemon_names)
)

# Per stage: number of EX pokemon and how many of them have a non-EX version.
# Both aggregates come from one groupby, so they are aligned by stage rather
# than by the positional index of two separately reset frames.
ex_stage_counts = (
    unique_ex.groupby('stage')
    .agg(count=('id', 'count'), has_non_ex=('has_non_ex', 'sum'))
    .reset_index()
)

(
    ggplot(ex_stage_counts, aes(x='stage', y='count'))
    + geom_bar(stat='identity')
    # Red overlay: the subset of EX pokemon that also have a non-EX version.
    + geom_bar(
        aes(y='has_non_ex'),
        stat='identity',
        fill='red'
    )
    + scale_x_discrete(breaks=[0, 1, 2], labels=['Basic', 'Stage 1', 'Stage 2'])
    + labs(
        title='Count of EX Pokemon by Stage',
        x='Stage',
        y='EX Count',
    )
)