In [16]:
import pandas as pd
import numpy as np

##

Count or frequency encoding

If we have categorical variables containing many multiple labels or high cardinality,then by using one hot encoding, we will expand the feature space dramatically.

One approach that is to replace each label of the categorical variable by the count, this is the amount of times each label appears in the dataset. Or the frequency, this is the percentage of observations within that category. The 2 are equivalent.


In [17]:
df.shape

(4209, 2)

In [19]:
df = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2'])
df.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


In [20]:
df['X1'].unique()
df.X2.unique()

array(['at', 'av', 'n', 'e', 'as', 'aq', 'r', 'ai', 'ak', 'm', 'a', 'k',
       'ae', 's', 'f', 'd', 'ag', 'ay', 'ac', 'ap', 'g', 'i', 'aw', 'y',
       'b', 'ao', 'al', 'h', 'x', 'au', 't', 'an', 'z', 'ah', 'p', 'am',
       'j', 'q', 'af', 'l', 'aa', 'c', 'o', 'ar'], dtype=object)

In [21]:
for col in df:
    print(df[col].unique())

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']


In [22]:
len(df.X1.unique()) #27
len(df.X2.unique()) #44

44

In [23]:
# let's have a look at how many labels each variable has

for col in df.columns:
    print(col, ': ', len(df[col].unique()), ' labels')


X1 :  27  labels
X2 :  44  labels


In [24]:
# let's examine how many columns we will obtain after one hot encoding these variables
pd.get_dummies(df, drop_first=True).shape

(4209, 69)

##
We can observe that from with just 2 categorical features we are getting 69 features with the help of one hot encoding.


In [9]:
df.X2.value_counts().to_dict()

# storedin dict cause we can remap teh labels

{'as': 1659,
 'ae': 496,
 'ai': 415,
 'm': 367,
 'ak': 265,
 'r': 153,
 'n': 137,
 's': 94,
 'f': 87,
 'e': 81,
 'aq': 63,
 'ay': 54,
 'a': 47,
 't': 29,
 'k': 25,
 'i': 25,
 'b': 21,
 'ao': 20,
 'ag': 19,
 'z': 19,
 'd': 18,
 'ac': 13,
 'g': 12,
 'ap': 11,
 'y': 11,
 'x': 10,
 'aw': 8,
 'at': 6,
 'h': 6,
 'al': 5,
 'an': 5,
 'q': 5,
 'av': 4,
 'ah': 4,
 'p': 4,
 'au': 3,
 'am': 1,
 'j': 1,
 'af': 1,
 'l': 1,
 'aa': 1,
 'c': 1,
 'o': 1,
 'ar': 1}

In [10]:
df_freq = df.X2.value_counts().to_dict()

In [11]:
df.X2 = df.X2.map(df_freq)
df.head()


# here wat we did was we replaced our cate value with its count
# advatage is that we dont have to craete many other col so no increa in dim and simple to impe
# disadvantage is that if 2 cate has same value count then it is hard for teh machien to predict

Unnamed: 0,X1,X2
0,v,6
1,t,4
2,w,137
3,t,137
4,v,137


##
One Hot Encoding - variables with many categories


In [25]:
dD = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2'])
dD.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


In [26]:
for col in df.columns:
    print(col, ': ', len(df[col].unique()), ' labels')

X1 :  27  labels
X2 :  44  labels


In [28]:
# let's examine how many columns we will obtain after one hot encoding these variables
pd.get_dummies(df, drop_first=True).shape

# We can observe that from with just 2 categorical features we are getting 69 features with the help of one hot encoding.

(4209, 69)

##

KDD Cup Orange Challenge
What can we do instead?

http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf In the winning solution of the KDD 2009 cup: "Winning the KDD Cup Orange Challenge with Ensemble

The Team suggested using 10 most frequent labels convert them into dummy variables using onehotencoding

How can we do that in python?

In [29]:
# let's find the top 10 most frequent categories for the variable X2

dD.X2.value_counts().sort_values(ascending=False).head(20)

X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: count, dtype: int64

In [30]:
# let's make a list with the most frequent categories of the variable

top_10_labels = [y for y in dD.X2.value_counts().sort_values(ascending=False).head(10).index]
top_10_labels

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [31]:
#  get whole set of dummy variables, for all the categorical variables

def one_hot_encoding_top_x(dd, variable, top_x_labels):
    # function to create the dummy variables for the most frequent labels
    # we can vary the number of most frequent labels that we encode
    
    for label in top_x_labels:
        dD[variable+'_'+label] = np.where(dd[variable]==label, 1, 0)

In [32]:
# read the data again
dd = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2'])

# encode X2 into the 10 most frequent categories
one_hot_encoding_top_x(df, 'X2', top_10_labels)
dd.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


##
Ordinal numbering encoding or Label Encoding

[ORDINAL ENCODING]

In [33]:
import datetime


In [39]:
# create a variable with dates, and from that extract the weekday
# I create a list of dates with 20 days difference from today
# and then transform it into a datafame

df_base = datetime.datetime.today()
df_date_list = [df_base - datetime.timedelta(days=x) for x in range(0, 20)]
df = pd.DataFrame(df_date_list)
df.columns = ['day']
df

Unnamed: 0,day
0,2023-06-22 17:57:52.165787
1,2023-06-21 17:57:52.165787
2,2023-06-20 17:57:52.165787
3,2023-06-19 17:57:52.165787
4,2023-06-18 17:57:52.165787
5,2023-06-17 17:57:52.165787
6,2023-06-16 17:57:52.165787
7,2023-06-15 17:57:52.165787
8,2023-06-14 17:57:52.165787
9,2023-06-13 17:57:52.165787


In [42]:
# extract the week day name

df['day_of_week'] = df['day'].dt.day_name()
df.head()

Unnamed: 0,day,day_of_week
0,2023-06-22 17:57:52.165787,Thursday
1,2023-06-21 17:57:52.165787,Wednesday
2,2023-06-20 17:57:52.165787,Tuesday
3,2023-06-19 17:57:52.165787,Monday
4,2023-06-18 17:57:52.165787,Sunday


In [43]:
# Engineer categorical variable by ordinal number replacement

weekday_map = {'Monday':1,
               'Tuesday':2,
               'Wednesday':3,
               'Thursday':4,
               'Friday':5,
               'Saturday':6,
               'Sunday':7
}

df['day_ordinal'] = df.day_of_week.map(weekday_map)
df.head(20)


Unnamed: 0,day,day_of_week,day_ordinal
0,2023-06-22 17:57:52.165787,Thursday,4
1,2023-06-21 17:57:52.165787,Wednesday,3
2,2023-06-20 17:57:52.165787,Tuesday,2
3,2023-06-19 17:57:52.165787,Monday,1
4,2023-06-18 17:57:52.165787,Sunday,7
5,2023-06-17 17:57:52.165787,Saturday,6
6,2023-06-16 17:57:52.165787,Friday,5
7,2023-06-15 17:57:52.165787,Thursday,4
8,2023-06-14 17:57:52.165787,Wednesday,3
9,2023-06-13 17:57:52.165787,Tuesday,2
