In [1]:
import numpy as np
import pandas as pd

In [2]:
df  = pd.read_csv('Customers.csv')

In [3]:
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [4]:
df.isnull().any()

age         False
income      False
gender      False
m_status    False
buys        False
dtype: bool

In [5]:
df.isnull(). sum()

age         0
income      0
gender      0
m_status    0
buys        0
dtype: int64

# Manual Encoding

In [6]:
df.income.unique ()

array(['high', 'medium', 'low'], dtype=object)

In [7]:
df ['income'] = df ['income'].replace (['high', 'medium', 'low'], [2,1,0])

In [8]:
df['income'].head(7)

0    2
1    2
2    2
3    1
4    0
5    0
6    0
Name: income, dtype: int64

# Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder           

In [10]:
lb = LabelEncoder()

In [11]:
df ['gender'] = lb.fit_transform (df ['gender'])

In [12]:
df ['gender'].head ()

0    1
1    1
2    1
3    1
4    0
Name: gender, dtype: int32

In [13]:
df.head ()

Unnamed: 0,age,income,gender,m_status,buys
0,25,2,1,single,no
1,25,2,1,married,no
2,35,2,1,single,yes
3,35,1,1,single,yes
4,30,0,0,single,yes


# Loop

In [14]:
for column in  df.columns:
    if df [column].dtype == np.number:
        continue
    df [column] = LabelEncoder ().fit_transform (df [column])
    

  if df [column].dtype == np.number:


In [15]:
df.head ()

Unnamed: 0,age,income,gender,m_status,buys
0,1,2,1,1,0
1,1,2,1,0,0
2,4,2,1,1,1
3,4,1,1,1,1
4,2,0,0,1,1


# One-hot Encoding

In [18]:
df  = pd.read_csv('Customers.csv')

In [20]:
df.head ()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [25]:
dummy_variable = pd.get_dummies (df ['income'], drop_first = True)

In [27]:
dummy_variable.head()

Unnamed: 0,low,medium
0,0,0
1,0,0
2,0,0
3,0,1
4,1,0


In [30]:
new_df = df.drop (['income'], axis = 1)

In [32]:
new_df.head ()

Unnamed: 0,age,gender,m_status,buys
0,25,male,single,no
1,25,male,married,no
2,35,male,single,yes
3,35,male,single,yes
4,30,female,single,yes


In [41]:
df = pd.concat ([new_df, dummy_variable] , axis = 1)

In [42]:
df.head ()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25,male,single,no,0,0
1,25,male,married,no,0,0
2,35,male,single,yes,0,0
3,35,male,single,yes,0,1
4,30,female,single,yes,1,0


# Ordinal Encoding

In [44]:
df = pd.read_csv ('Customers.csv')

In [46]:
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [50]:
df ['income'].unique()

array(['high', 'medium', 'low'], dtype=object)

In [51]:
income_list = ['high', 'medium', 'low']

In [52]:
from sklearn.preprocessing import OrdinalEncoder

In [54]:
ordinal = OrdinalEncoder (categories = [income_list])

In [55]:
encoded_values = ordinal.fit_transform (df [['income']])

In [56]:
encoded_values

array([[0.],
       [0.],
       [0.],
       [1.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [2.]])

In [61]:
newIncome = pd.DataFrame (encoded_values, columns = ['income'])

In [68]:
newIncome

Unnamed: 0,income
0,0.0
1,0.0
2,0.0
3,1.0
4,2.0
5,2.0
6,2.0
7,1.0
8,2.0
9,1.0


In [64]:
df ['income'] = newIncome

In [71]:
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,0.0,male,single,no
1,25,0.0,male,married,no
2,35,0.0,male,single,yes
3,35,1.0,male,single,yes
4,30,2.0,female,single,yes


# Concatenation

In [72]:
df = pd.read_csv ('Customers.csv')

In [73]:
df.income.unique ()

array(['high', 'medium', 'low'], dtype=object)

In [74]:
incomeList = ['high', 'medium', 'low']

In [75]:
from sklearn.preprocessing import OrdinalEncoder

In [76]:
ordinal = OrdinalEncoder (categories = [income_list])

In [78]:
encodedValues = ordinal.fit_transform (df [['income']])

In [79]:
encodedValues

array([[0.],
       [0.],
       [0.],
       [1.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [2.]])

In [80]:
NewIncome = pd.DataFrame (encodedValues, columns = ['income'])

In [82]:
NewIncome.head ()

Unnamed: 0,income
0,0.0
1,0.0
2,0.0
3,1.0
4,2.0


In [84]:
drop_income = df.drop (['income'], axis = 1)

In [85]:
drop_income

Unnamed: 0,age,gender,m_status,buys
0,25,male,single,no
1,25,male,married,no
2,35,male,single,yes
3,35,male,single,yes
4,30,female,single,yes
5,32,female,single,no
6,22,female,married,yes
7,22,male,married,no
8,25,female,single,yes
9,35,female,married,yes


In [88]:
df = pd.concat ([drop_income, NewIncome], axis = 1)

In [89]:
df

Unnamed: 0,age,gender,m_status,buys,income
0,25,male,single,no,0.0
1,25,male,married,no,0.0
2,35,male,single,yes,0.0
3,35,male,single,yes,1.0
4,30,female,single,yes,2.0
5,32,female,single,no,2.0
6,22,female,married,yes,2.0
7,22,male,married,no,1.0
8,25,female,single,yes,2.0
9,35,female,married,yes,1.0


# Hashing Encoding

In [94]:
df.head()

Unnamed: 0,age,gender,m_status,buys,income
0,25,male,single,no,0.0
1,25,male,married,no,0.0
2,35,male,single,yes,0.0
3,35,male,single,yes,1.0
4,30,female,single,yes,2.0


In [98]:
import category_encoders as ce

In [102]:
hashing_en = ce.HashingEncoder (cols = ['buys'], n_components =2)

In [103]:
fit_hashing = hashing_en.fit_transform (df)

  elif pd.api.types.is_categorical(cols):


In [104]:
fit_hashing

Unnamed: 0,col_0,col_1,age,gender,m_status,income
0,0,1,25,male,single,0.0
1,0,1,25,male,married,0.0
2,1,0,35,male,single,0.0
3,1,0,35,male,single,1.0
4,1,0,30,female,single,2.0
5,0,1,32,female,single,2.0
6,1,0,22,female,married,2.0
7,0,1,22,male,married,1.0
8,1,0,25,female,single,2.0
9,1,0,35,female,married,1.0
