In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
# 그림을 로딩없이 바로 볼 수 있게 하는 명령어
%matplotlib inline
# 한글 폰트 사용 시 글자를 선명하게 하는 명령어
%config InlineBackend.figure_format = 'retina'

## 포켓몬 데이터 불러오기

In [24]:
# 환경변수 설정
users_dir = os.getenv("PYTHON-TRAINING", "/default/path")
# 절대 경로 설정
csv_path = "/Users/mac/Documents/GitHub/python-training/notebooks/Pokemon.csv"
# 원본 데이터를 담은 변수는 고이 놔두는 습관 필요
original_data = pd.read_csv(csv_path)
# 데이터를 카피해서 사용
pokemon = original_data.copy()


(800, 13)


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [30]:
# 전설의 포켓몬 데이터셋 생성
legendary = pokemon[pokemon["Legendary"] == True].reset_index(drop=True)
display(legendary.shape)
display(legendary.head())

(65, 13)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
1,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
2,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
3,150,Mewtwo,Psychic,,680,106,110,90,154,90,130,1,True
4,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True


In [32]:
# 일반 포켓몬 데이터셋 생성
ordinary = pokemon[pokemon["Legendary"] == False].reset_index(drop=True)
display(ordinary.shape)
display(ordinary.head())

(735, 13)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [37]:
pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    int64 
 12  Legendary   800 non-null    bool  
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [38]:
pokemon.describe()

Unnamed: 0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,362.81375,435.1025,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,208.343798,119.96304,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,1.0,180.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,184.75,330.0,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,364.5,450.0,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,539.25,515.0,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,721.0,780.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0


## 결측치와 전체 칼럼
데이터셋 하나하나 뜯어보기

### 범주형
* Name : 포켓몬 이름. 포켓몬 각각의 이름으로 저장되고, 800개의 포켓몬의 이름 데이터는 모두 다름
* Type 1: 첫 번째 속성, 속성을 하나만 가지는 경우 Type 1에 입력
* Type 2: 두 번째 속성, 속성을 하나만 가지는 포켓몬의 경우 Type2는 NaN을 가짐

### 숫자형
* '#' : 포켓만 ID number 성별은 다르지만 같은 포켓몬인 경우 등은 같은 #을 가짐
* Total: 전체 6가지 스탯의 총합
* HP: 포켓몬의 체력
* Attack: 물리 공격력
* Defense: 물리 공격에 대한 방어력
* Sp.Atk: 특수 공격력
* Sp.Def: 특수 공격에 대한 방어력
* Speed: 포켓몬 매치에 대해 어떤 포켓몬이 먼저 공격할지 결정 (더 높은 포켓몬이 먼저 공격)
* Generation: 포켓몬의 세대. 현대 6세대까지 있음
* Legendary: 전설의 포켓몬 여부 (bool)

In [36]:
# 빈 데이터 확인하기 - 결측치 확인
pokemon.isnull().sum()

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [111]:
# 전체 columns 출력해서 이해해보기
cols = pokemon.columns
display(cols)
display(len(cols))

Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')

13

In [89]:
# 전체 rows 출력해서 이해해보기
for col in pokemon.columns:
    print(f"{col} : {pokemon[col].nunique()} , {pokemon[col].unique()}")

# : 721 , [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 

In [110]:
# id_num 중복값 갯수 확인
id_num = pokemon.duplicated(subset='#', keep='first')
print(sum(id_num)) # True 갯수 확인
print(sum(~id_num)) # False 갯수 확인

# id_num 중복값 고유 이름 확인
duplicate_id_num = pokemon.loc[pokemon.duplicated(subset='#', keep='first'), 'Name']
print(duplicate_id_num.unique())

# 중복값 고유 이름 포함 데이터프레임 출력
duplicate_rows = pokemon[pokemon.duplicated(subset='#', keep='first')]
display(duplicate_rows[cols]) 

79
721
['VenusaurMega Venusaur' 'CharizardMega Charizard X'
 'CharizardMega Charizard Y' 'BlastoiseMega Blastoise'
 'BeedrillMega Beedrill' 'PidgeotMega Pidgeot' 'AlakazamMega Alakazam'
 'SlowbroMega Slowbro' 'GengarMega Gengar' 'KangaskhanMega Kangaskhan'
 'PinsirMega Pinsir' 'GyaradosMega Gyarados' 'AerodactylMega Aerodactyl'
 'MewtwoMega Mewtwo X' 'MewtwoMega Mewtwo Y' 'AmpharosMega Ampharos'
 'SteelixMega Steelix' 'ScizorMega Scizor' 'HeracrossMega Heracross'
 'HoundoomMega Houndoom' 'TyranitarMega Tyranitar' 'SceptileMega Sceptile'
 'BlazikenMega Blaziken' 'SwampertMega Swampert' 'GardevoirMega Gardevoir'
 'SableyeMega Sableye' 'MawileMega Mawile' 'AggronMega Aggron'
 'MedichamMega Medicham' 'ManectricMega Manectric' 'SharpedoMega Sharpedo'
 'CameruptMega Camerupt' 'AltariaMega Altaria' 'BanetteMega Banette'
 'AbsolMega Absol' 'GlalieMega Glalie' 'SalamenceMega Salamence'
 'MetagrossMega Metagross' 'LatiasMega Latias' 'LatiosMega Latios'
 'KyogrePrimal Kyogre' 'GroudonPrimal Groud

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
12,9,BlastoiseMega Blastoise,Water,,630,79,103,120,135,115,78,1,False
19,15,BeedrillMega Beedrill,Bug,Poison,495,65,150,40,15,80,145,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,711,GourgeistSmall Size,Ghost,Grass,494,55,85,122,58,75,99,6,False
786,711,GourgeistLarge Size,Ghost,Grass,494,75,95,122,58,75,69,6,False
787,711,GourgeistSuper Size,Ghost,Grass,494,85,100,122,58,75,54,6,False
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True


## ID와 이름 살펴보기

In [118]:
# ID number 갯수 확인
# set는 중복 허용 안됨
display(len(set(pokemon['#'])))

# ID number가 6인 포켓몬 확인
display(pokemon[pokemon["#"] == 6])

721

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False


In [119]:
# Name 갯수 확인
len(set(pokemon['Name']))

800

## 결측치 파악

In [120]:
# Type 1 & Type 2 : 포켓몬 속성
pokemon.loc[[6, 10]]

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
10,8,Wartortle,Water,,405,59,63,80,65,80,58,1,False


In [124]:
# 속성 갯수 확인
len(list(set(pokemon["Type 1"]))), len(list(set(pokemon["Type 2"]))) 

(18, 19)

In [126]:
# 파이썬 차집합 함수(set difference)
set(pokemon["Type 2"]) - set(pokemon["Type 1"])

{nan}

In [127]:
types = list(set(pokemon["Type 1"]))
display(types)
display(len(types))

['Bug',
 'Fire',
 'Ghost',
 'Fairy',
 'Water',
 'Dark',
 'Psychic',
 'Fighting',
 'Flying',
 'Steel',
 'Grass',
 'Rock',
 'Dragon',
 'Ground',
 'Electric',
 'Ice',
 'Poison',
 'Normal']

18

In [128]:
# 결측치 확인
# 속성 1만 가지고 있는 포켓몬 갯수 확인
pokemon["Type 2"].isna().sum()

np.int64(386)

## 데이터셋 시각화