# **Dataset**
[Pokemon with stats](https://www.kaggle.com/datasets/abcsds/pokemon)

# **Import Libraries**

In [105]:
import pandas as pd
import numpy as np

# Data Split
from sklearn.model_selection import train_test_split

# Data Transforming
from sklearn.preprocessing import MinMaxScaler # Normalization
from sklearn.preprocessing import StandardScaler # Standardization

# Data Cleaning
from sklearn.impute import SimpleImputer

# Encoding
from sklearn.preprocessing import OneHotEncoder # One Hot Encoding

# **Read the 'Pokemon.csv' File**

In [106]:
# Load the Pokemon stats CSV; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='Pokemon.csv')

In [107]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [108]:
# Summarise each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    int64 
 12  Legendary   800 non-null    bool  
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


# **Data Split**
Split Ratio
1. Training Set 70%
2. Testing Set 30%

In [109]:
# Features: every column except 'Name' (the original column order is kept).
data = df.drop(columns=['Name'])
# Label: the 'Name' column, kept as a one-column DataFrame.
label = df[['Name']]
# 70/30 train/test split. A fixed random_state makes the shuffle reproducible,
# so a fresh "Restart & Run All" yields the same rows in each split.
data_train, data_test, label_train, label_test = train_test_split(
    data, label, test_size=0.3, random_state=42
)

In [110]:
# Report the row counts of the full dataset and of each split, one per line.
split_report = (
    f'Jumlah Data pada Dataset: {len(df)} data',
    f'Jumlah Data Train\t: {len(data_train)} data',
    f'Jumlah Label Train\t: {len(label_train)} data',
    f'Jumlah Data Test\t: {len(data_test)} data',
    f'Jumlah Label Test\t: {len(label_test)} data',
)
print('\n'.join(split_report))

Jumlah Data pada Dataset: 800 data
Jumlah Data Train	: 560 data
Jumlah Label Train	: 560 data
Jumlah Data Test	: 240 data
Jumlah Label Test	: 240 data


# **Data Transforming - Normalization**

In [111]:
# Work on an independent (deep) copy so the raw frame stays untouched.
normalized_dataset = df.copy(deep=True)

In [112]:
# Min-max scaler with the default target range spelled out explicitly.
minMaxScaler = MinMaxScaler(feature_range=(0, 1))

In [113]:
# Fit the scaler on the 'HP' column and rescale it. The input is read from `df`
# rather than from `normalized_dataset`: this cell rebinds `normalized_dataset`
# to the scaler's array output, so indexing it by column name on a second run
# would fail. `normalized_dataset` starts as a copy of `df`, so the values are
# the same.
normalized_dataset = minMaxScaler.fit_transform(df[['HP']])

In [114]:
# Wrap the scaled values in a DataFrame whose single column is labelled 'HP'.
normalized_dataset = pd.DataFrame(normalized_dataset, columns=['HP'])

In [115]:
# Show the first five normalised HP values.
normalized_dataset.head(n=5)

Unnamed: 0,HP
0,0.173228
1,0.232283
2,0.311024
3,0.311024
4,0.149606


# **Data Transforming - Standardization**

In [116]:
# Work on an independent (deep) copy so the raw frame stays untouched.
standardized_dataset = df.copy(deep=True)

In [117]:
# Keep only the integer feature columns: drop the text columns ('Name',
# 'Type 1', 'Type 2') and the boolean 'Legendary' flag. Building the frame from
# `df` without inplace mutation means re-running this cell does not raise
# KeyError for columns that were already removed.
standardized_dataset = df.drop(columns=['Name', 'Type 1', 'Type 2', 'Legendary'])

In [118]:
# Standard scaler with its default centring and scaling switches spelled out.
standardScaler = StandardScaler(with_mean=True, with_std=True)

In [119]:
# Standardise every remaining column and rebuild a DataFrame from the result.
# Column labels and the index are taken from the input frame instead of a
# hard-coded list, so they cannot drift out of sync with the columns actually
# passed to the scaler.
standardized_dataset = pd.DataFrame(
    standardScaler.fit_transform(standardized_dataset),
    columns=standardized_dataset.columns,
    index=standardized_dataset.index,
)

In [120]:
# Show the per-column standard deviation. pandas uses the sample estimator
# (ddof=1) by default, while StandardScaler scales by the population standard
# deviation, so the values are sqrt(n / (n - 1)) -- marginally above 1 -- rather
# than exactly 1.
standardized_dataset.std(ddof=1)

#             1.000626
Total         1.000626
HP            1.000626
Attack        1.000626
Defense       1.000626
Sp. Atk       1.000626
Sp. Def       1.000626
Speed         1.000626
Generation    1.000626
dtype: float64

# **Data Cleaning - Null Value**

In [121]:
# Count missing values per column.
df.isnull().sum()

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [122]:
# Work on an independent (deep) copy so the raw frame stays untouched.
df_no_null = df.copy(deep=True)

In [123]:
# Imputer that fills NaN entries with the column's mode; the mode strategy also
# works for non-numeric (object) columns.
imputer_modus = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [124]:
# Replace missing 'Type 2' entries with that column's most frequent value.
# fit_transform returns a 2-D (n_rows, 1) array, so flatten it to one dimension
# before assigning it back to the single column.
df_no_null['Type 2'] = imputer_modus.fit_transform(df_no_null[['Type 2']]).ravel()

In [125]:
# Re-count missing values per column to confirm the imputation left none.
df_no_null.isnull().sum()

#             0
Name          0
Type 1        0
Type 2        0
Total         0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
Legendary     0
dtype: int64

# **Data Cleaning - Duplicate Value**

In [126]:
# Build a demo frame containing 100 known duplicates by re-appending the first
# 100 rows. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. concat returns a new frame, so `df` is
# not modified and no explicit copy() is needed.
df_no_duplicate = pd.concat([df, df.head(100)], ignore_index=True)

In [127]:
# Count rows that repeat an earlier row (the first occurrence is not flagged).
df_no_duplicate.duplicated(keep='first').sum()

100

In [128]:
# Remove repeated rows, keeping each first occurrence, and rebind the name to
# the de-duplicated frame.
df_no_duplicate = df_no_duplicate.drop_duplicates()

In [129]:
# Re-count repeated rows to confirm none remain.
df_no_duplicate.duplicated(keep='first').sum()

0

# **Change the Data Type of a Numeric Attribute**

In [130]:
df.dtypes # inspect the current dtype of every column before converting any of them

#              int64
Name          object
Type 1        object
Type 2        object
Total          int64
HP             int64
Attack         int64
Defense        int64
Sp. Atk        int64
Sp. Def        int64
Speed          int64
Generation     int64
Legendary       bool
dtype: object

In [131]:
# Cast 'Attack' to float64 and preview the first five values. The converted
# Series is only displayed, not assigned back, so `df` keeps its original dtype.
df['Attack'].astype(np.float64).head(n=5)

0     49.0
1     62.0
2     82.0
3    100.0
4     52.0
Name: Attack, dtype: float64

# **Encoding - One Hot Encoding**

In [132]:
# Work on an independent (deep) copy so the raw frame stays untouched.
df_encode = df.copy(deep=True)

In [133]:
# One-hot encode each type column separately.
# - dtype='uint8' pins 0/1 integer indicators; pandas >= 2.0 otherwise returns
#   booleans, which would no longer match the 0/1 table shown further down.
# - The input is read from `df` (of which `df_encode` is a copy) so this cell
#   can be re-run after the type columns have been dropped from `df_encode`.
temp_dataset = pd.get_dummies(df[['Type 1']], dtype='uint8')   # 'Type 1_*' indicator columns
temp_dataset2 = pd.get_dummies(df[['Type 2']], dtype='uint8')  # 'Type 2_*' indicator columns

In [134]:
# Re-wrap both dummy tables in pd.DataFrame. pd.get_dummies already returns a
# DataFrame, so this only creates new wrapper objects with the same content.
temp_dataset, temp_dataset2 = pd.DataFrame(temp_dataset), pd.DataFrame(temp_dataset2)

In [135]:
# Remove the raw 'Type 1' / 'Type 2' columns that the one-hot columns replace.
# The frame is rebuilt from `df` (of which `df_encode` is a copy) instead of
# being mutated in place, so re-running this cell does not raise KeyError for
# columns that were already dropped.
df_encode = df.drop(columns=['Type 1', 'Type 2'])

In [136]:
# Attach both one-hot tables to the remaining columns, aligning on the row index.
df_encode = df_encode.join(
    other=[temp_dataset, temp_dataset2],
    how='left',
)

In [137]:
# Preview the first five rows of the encoded dataset.
df_encode.head(n=5)

Unnamed: 0,#,Name,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,...,Type 2_Ghost,Type 2_Grass,Type 2_Ground,Type 2_Ice,Type 2_Normal,Type 2_Poison,Type 2_Psychic,Type 2_Rock,Type 2_Steel,Type 2_Water
0,1,Bulbasaur,318,45,49,49,65,65,45,1,...,0,0,0,0,0,1,0,0,0,0
1,2,Ivysaur,405,60,62,63,80,80,60,1,...,0,0,0,0,0,1,0,0,0,0
2,3,Venusaur,525,80,82,83,100,100,80,1,...,0,0,0,0,0,1,0,0,0,0
3,3,VenusaurMega Venusaur,625,80,100,123,122,120,80,1,...,0,0,0,0,0,1,0,0,0,0
4,4,Charmander,309,39,52,43,60,50,65,1,...,0,0,0,0,0,0,0,0,0,0


