In [36]:
import numpy as np
import pandas as pd

# Load the base Pokémon table.
df = pd.read_csv('data/pokemon.csv')

# Load the evolution pairs; the merges below rely on its `evolves_from` and
# `evolves_to` columns.
evolution_df = pd.read_csv('data/pokemon_evolutions.csv')

# Next evolution: match each Pokémon's name against `evolves_from`, which brings
# in the pair's `evolves_to`. The right-hand key column only repeats `name`
# (or is NaN when nothing matched), so it is dropped straight away.
# NOTE: a left merge emits one row per matching pair, so a name that occurs
# several times in the key column is repeated in the result.
# The deprecated `copy` keyword is intentionally not passed.
df = (
    df.merge(evolution_df, how='left', left_on='name', right_on='evolves_from')
      .drop(columns='evolves_from')
)

# Previous evolution: match the name against `evolves_to`, which brings in the
# pair's `evolves_from`. Both frames now carry an `evolves_to` column, so pandas
# suffixes them `_x` (left) / `_y` (right); keep the left one under its
# original name and discard the right-hand key copy.
df = (
    df.merge(evolution_df, how='left', left_on='name', right_on='evolves_to')
      .drop(columns='evolves_to_y')
      .rename(columns={'evolves_to_x': 'evolves_to'})
)
df


Unnamed: 0,attack,classfication,defense,height_m,hp,name,pokedex_number,speed,type1,type2,weight_kg,generation,is_legendary,evolves_to,evolves_from
0,49,Seed Pokémon,49,0.7,45,Bulbasaur,1,45,grass,poison,6.9,1,0,Ivysaur,
1,62,Seed Pokémon,63,1.0,60,Ivysaur,2,60,grass,poison,13.0,1,0,Venusaur,Bulbasaur
2,100,Seed Pokémon,123,2.0,80,Venusaur,3,80,grass,poison,100.0,1,0,,Ivysaur
3,52,Lizard Pokémon,43,0.6,39,Charmander,4,65,fire,,8.5,1,0,Charmeleon,
4,64,Flame Pokémon,58,1.1,58,Charmeleon,5,80,fire,,19.0,1,0,Charizard,Charmander
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,101,Launch Pokémon,103,9.2,97,Celesteela,797,61,steel,flying,999.9,7,1,,
863,181,Drawn Sword Pokémon,131,0.3,59,Kartana,798,109,grass,steel,0.1,7,1,,
864,101,Junkivore Pokémon,53,5.5,223,Guzzlord,799,43,dark,dragon,888.0,7,1,,
865,107,Prism Pokémon,101,2.4,97,Necrozma,800,79,psychic,,230.0,7,1,,


In [37]:
# For columns with many classes, we will get a dummy variable for each class
oh_cols = ['type1', 'type2', 'generation']  # Columns that need to be encoded with OH
# get_dummies removes the encoded source columns and appends the dummy columns
# after the remaining ones, so no join / de-duplication step is needed.
# The prefixes keep their trailing underscore on purpose: together with the
# default separator they produce the existing `type1__bug`-style column names.
# dtype=int keeps the dummies as 0/1 integers regardless of the pandas default.
df = pd.get_dummies(
    df,
    columns=oh_cols,
    prefix=['type1_', 'type2_', 'generation_'],
    dtype=int,
)

# Create binary columns by comparing numeric values with the column mean.
# Maps source column -> prefix of the generated `<prefix>_low` / `<prefix>_high` flags.
# A value equal to the mean, or a missing value, compares False both ways and
# therefore gets 0 in both flags.
mean_split_cols = {
    'attack': 'attack',
    'defense': 'defense',
    'height_m': 'height',
    'weight_kg': 'weight',
}
for source_col, flag_prefix in mean_split_cols.items():
    col_mean = df[source_col].mean()  # computed once per column
    df[flag_prefix + '_low'] = (df[source_col] < col_mean).astype(int)
    df[flag_prefix + '_high'] = (df[source_col] > col_mean).astype(int)

# Is first/last evolution: no predecessor / no successor recorded
df['first_evolution'] = df['evolves_from'].isnull().astype(int)
df['last_evolution'] = df['evolves_to'].isnull().astype(int)

# Drop non-binary columns (the one-hot source columns are already gone)
df = df.drop(columns=[
    'classfication', 'attack', 'defense', 'height_m', 'hp', 'speed',
    'weight_kg', 'evolves_from', 'evolves_to',
])

# Set default values for missing values.
# numeric_only=True restricts the means to numeric columns; without it the
# string column `name` triggers a warning on older pandas and a TypeError on
# pandas >= 2.0.
df = df.fillna(df.mean(numeric_only=True))  # TODO adjust mean here

print(df)

           name  pokedex_number  is_legendary  type1__bug  type1__dark  \
0     Bulbasaur               1             0           0            0   
1       Ivysaur               2             0           0            0   
2      Venusaur               3             0           0            0   
3    Charmander               4             0           0            0   
4    Charmeleon               5             0           0            0   
..          ...             ...           ...         ...          ...   
862  Celesteela             797             1           0            0   
863     Kartana             798             1           0            0   
864    Guzzlord             799             1           0            1   
865    Necrozma             800             1           0            0   
866    Magearna             801             1           0            0   

     type1__dragon  type1__electric  type1__fairy  type1__fighting  \
0                0                0      

  df.fillna(df.mean(), inplace=True) # TODO adjust mean here


In [38]:
# Persist the preprocessed frame as CSV; the row index is left out of the file.
df.to_csv(path_or_buf='data/pokemon_preprocessed.csv', index=False)

In [34]:
# Sanity check: mean of `weight_kg` over the raw, unprocessed table.
# The raw data is read into its own name so the preprocessed `df` is not
# overwritten when the notebook is run top to bottom.
raw_df = pd.read_csv('data/pokemon.csv')
raw_df['weight_kg'].mean()

61.378104993597944

In [16]:
# Sanity check: mean of `height_m` over the raw, unprocessed table.
# The raw data is read into its own name so the preprocessed `df` is not
# overwritten when the notebook is run top to bottom.
raw_df = pd.read_csv('data/pokemon.csv')
raw_df['height_m'].mean()

1.1638924455825865