### Import packages

In [1]:
import numpy as np
import pandas as pd

### Read the data and delete rows with missing data

In [2]:
# Load the Pokemon table, using the first CSV column ('#') as the index.
# A forward slash keeps the path portable across operating systems and avoids
# the invalid '\p' escape sequence a backslash creates in a normal string.
df = pd.read_csv('datasets/pokemon.csv', index_col=0)

# Discard every row that contains at least one missing value.
df = df.dropna()
df

Unnamed: 0_level_0,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
6,Charizard,Fire,Flying,78,84,78,109,85,100,1,False
...,...,...,...,...,...,...,...,...,...,...,...
719,Diancie,Rock,Fairy,50,100,150,100,150,50,6,True
719,DiancieMega Diancie,Rock,Fairy,50,160,110,160,110,110,6,True
720,HoopaHoopa Confined,Psychic,Ghost,80,110,60,150,130,70,6,True
720,HoopaHoopa Unbound,Psychic,Dark,80,160,60,170,130,80,6,True


Get unique values in the 'Type 1' column and also print the number of unique values.

In [3]:
# Collect the distinct primary types once, then report them and their count.
type_1_categories = df['Type 1'].unique()
print(type_1_categories)
print(len(type_1_categories))

['Grass' 'Fire' 'Bug' 'Normal' 'Poison' 'Water' 'Rock' 'Electric' 'Ghost'
 'Ground' 'Psychic' 'Ice' 'Dragon' 'Fairy' 'Dark' 'Steel' 'Fighting'
 'Flying']
18


### Encode using OneHotEncoder

In [4]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

# The single column is reshaped into an (n, 1) array so the encoder
# receives two-dimensional input with exactly one feature.
type_1_values = df['Type 1'].values.reshape(-1, 1)
encoded_col = encoder.fit_transform(type_1_values)

In [5]:
# Report the container type and the dimensions of the encoded result,
# one per line.
print(type(encoded_col), f'{encoded_col.shape=}', sep='\n')

<class 'scipy.sparse._csr.csr_matrix'>
encoded_col.shape=(414, 18)


Notice that the result of encoding is a sparse matrix.

### Get the category types of the encoder.

In [6]:
# List the feature names of the fitted encoder; called without arguments,
# every name carries the default 'x0_' prefix.
encoder.get_feature_names_out()

array(['x0_Bug', 'x0_Dark', 'x0_Dragon', 'x0_Electric', 'x0_Fairy',
       'x0_Fighting', 'x0_Fire', 'x0_Flying', 'x0_Ghost', 'x0_Grass',
       'x0_Ground', 'x0_Ice', 'x0_Normal', 'x0_Poison', 'x0_Psychic',
       'x0_Rock', 'x0_Steel', 'x0_Water'], dtype=object)

Notice that the category names are prefixed with 'x0_'. This prefix can be replaced with a custom string.

In [7]:
# Passing a one-element list replaces the default prefix with that string ('enc').
encoder.get_feature_names_out(['enc'])

array(['enc_Bug', 'enc_Dark', 'enc_Dragon', 'enc_Electric', 'enc_Fairy',
       'enc_Fighting', 'enc_Fire', 'enc_Flying', 'enc_Ghost', 'enc_Grass',
       'enc_Ground', 'enc_Ice', 'enc_Normal', 'enc_Poison', 'enc_Psychic',
       'enc_Rock', 'enc_Steel', 'enc_Water'], dtype=object)

In [8]:
# Same call with a different prefix ('xyz'), to show the string is arbitrary.
encoder.get_feature_names_out(['xyz'])

array(['xyz_Bug', 'xyz_Dark', 'xyz_Dragon', 'xyz_Electric', 'xyz_Fairy',
       'xyz_Fighting', 'xyz_Fire', 'xyz_Flying', 'xyz_Ghost', 'xyz_Grass',
       'xyz_Ground', 'xyz_Ice', 'xyz_Normal', 'xyz_Poison', 'xyz_Psychic',
       'xyz_Rock', 'xyz_Steel', 'xyz_Water'], dtype=object)

We want the custom string to be the name of the column in the original data, i.e., "Type 1". We will use an underscore in place of the space character.

In [9]:
# Use the source column's name (with the space replaced by an underscore) as the prefix.
encoder.get_feature_names_out(['Type_1'])

array(['Type_1_Bug', 'Type_1_Dark', 'Type_1_Dragon', 'Type_1_Electric',
       'Type_1_Fairy', 'Type_1_Fighting', 'Type_1_Fire', 'Type_1_Flying',
       'Type_1_Ghost', 'Type_1_Grass', 'Type_1_Ground', 'Type_1_Ice',
       'Type_1_Normal', 'Type_1_Poison', 'Type_1_Psychic', 'Type_1_Rock',
       'Type_1_Steel', 'Type_1_Water'], dtype=object)

To turn the original dataframe into an encoded dataframe, we first have to delete the original column (Type 1) and then add the encoded columns to the remaining data. We also have to convert the sparse matrix created by OneHotEncoder to a dense array first so that we can concatenate the two.

In [10]:
# Delete the original column now that its one-hot encoding exists.
# Rebinding df (instead of inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns=['Type 1'], errors='ignore')

In [11]:
# convert sparse matrix to dense array
encoded_col_dense = encoded_col.toarray()

# get column names
new_cols = encoder.get_feature_names_out(['Type_1'])

# convert the encoded matrix to dataframe (it gets a fresh 0..n-1 index)
enc_mat_df = pd.DataFrame(encoded_col_dense, columns=new_cols)

# concatenate this dataframe with original data
# pd.concat(axis=1) aligns rows by index label, so the original data must also
# be on a 0..n-1 index. The reset is done inline, without rebinding df, so that
# re-running this cell does not add an extra 'index' column each time.
df_with_encoding = pd.concat([df.reset_index(), enc_mat_df], axis=1)
df_with_encoding

Unnamed: 0,#,Name,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,...,Type_1_Ghost,Type_1_Grass,Type_1_Ground,Type_1_Ice,Type_1_Normal,Type_1_Poison,Type_1_Psychic,Type_1_Rock,Type_1_Steel,Type_1_Water
0,1,Bulbasaur,Poison,45,49,49,65,65,45,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Ivysaur,Poison,60,62,63,80,80,60,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Venusaur,Poison,80,82,83,100,100,80,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,VenusaurMega Venusaur,Poison,80,100,123,122,120,80,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,Charizard,Flying,78,84,78,109,85,100,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409,719,Diancie,Fairy,50,100,150,100,150,50,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
410,719,DiancieMega Diancie,Fairy,50,160,110,160,110,110,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
411,720,HoopaHoopa Confined,Ghost,80,110,60,150,130,70,6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
412,720,HoopaHoopa Unbound,Dark,80,160,60,170,130,80,6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


This is how we get the new dataframe with the encoded data. We can write this as a function as follows.

In [1]:
def do_one_hot_encoding(df, column_name):
    """Return a copy of ``df`` with ``column_name`` replaced by one-hot columns.

    The column is encoded with scikit-learn's ``OneHotEncoder``; the resulting
    indicator columns are named ``"<column_name>_<category>"`` and appended to
    the right of the remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is left unmodified.
    column_name : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    pandas.DataFrame
        New dataframe on a fresh 0..n-1 index. The previous index of ``df`` is
        kept as a regular column (the effect of ``reset_index()``),
        ``column_name`` is removed, and one indicator column per category
        is added.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # import OneHotEncoder (kept local so the function is self-contained)
    from sklearn.preprocessing import OneHotEncoder

    # define encoder
    encoder = OneHotEncoder()

    # fit transform; the column is reshaped to (n, 1) so the encoder receives
    # two-dimensional input with a single feature
    encoded_col = encoder.fit_transform(df[column_name]
                                        .to_numpy()
                                        .reshape(-1, 1))

    # convert sparse matrix to dense array
    encoded_col_dense = encoded_col.toarray()

    # get column names, prefixed with the name of the encoded column
    new_cols = encoder.get_feature_names_out([column_name])

    # convert the encoded matrix to dataframe (fresh 0..n-1 index)
    enc_mat_df = pd.DataFrame(encoded_col_dense, columns=new_cols)

    # delete the column that was actually encoded (not a hard-coded name) and
    # do it on a copy so the caller's dataframe is not mutated; the index is
    # reset so that the row labels line up with enc_mat_df during concat
    remaining = df.drop(columns=[column_name]).reset_index()

    # concatenate the remaining data with the encoded columns
    df_with_encoding = pd.concat([remaining, enc_mat_df], axis=1)

    return df_with_encoding


# import pandas
import pandas as pd

# read data and delete rows with missing values
# (forward slash: portable across operating systems, and it avoids the
# invalid '\p' escape sequence a backslash creates in a normal string)
column_name = 'Type 1'
df = pd.read_csv('datasets/pokemon.csv', index_col=0).dropna()

# OneHotEncoder
df_encoded = do_one_hot_encoding(df, column_name)

# display result
df_encoded.head()

Unnamed: 0,#,Name,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,...,Type 1_Ghost,Type 1_Grass,Type 1_Ground,Type 1_Ice,Type 1_Normal,Type 1_Poison,Type 1_Psychic,Type 1_Rock,Type 1_Steel,Type 1_Water
0,1,Bulbasaur,Poison,45,49,49,65,65,45,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Ivysaur,Poison,60,62,63,80,80,60,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Venusaur,Poison,80,82,83,100,100,80,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,VenusaurMega Venusaur,Poison,80,100,123,122,120,80,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,Charizard,Flying,78,84,78,109,85,100,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
