In [None]:
import pandas as pd
import numpy as np

# Load the Mercedes-Benz data for demonstration, keeping only the
# categorical variables X1..X6
data = pd.read_csv(
    '/content/mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
data.head()

In [None]:
# Report the cardinality (number of distinct labels) of every variable.
# dropna=False keeps a missing value counted as a label, as unique() does.
for col in data.columns:
    print(col, ': ', data[col].nunique(dropna=False), ' labels')

In [None]:
# Check how many columns we would obtain after one-hot encoding these variables;
# drop_first=True omits the dummy of the first level of each variable
pd.get_dummies(data, drop_first=True).shape

We can see that from just 6 initial categorical variables, we ended up with 117 columns.
What can we do instead? One option is to one-hot encode only the most frequent categories of each variable.


In [None]:
# Show the 20 most frequent categories of X2 together with their counts
# (the 10 most frequent ones are picked out in the next step)
data['X2'].value_counts().sort_values(ascending=False).head(20)

In [None]:
# Collect the labels of the 10 most frequent categories of X2 into a list
top_10 = list(
    data['X2'].value_counts().sort_values(ascending=False).head(10).index
)
top_10

In [None]:
# Create one 0/1 indicator column per label in top_10; each new column is
# named after the label itself
for label in top_10:
    data[label] = np.where(data.X2 == label, 1, 0)

# Show X2 side by side with its new indicator columns
data[['X2', *top_10]].head(40)

In [None]:
# get whole set of dummy variables, for all categorical variables
def one_hot_encoding(df, variable, top_x_labels):
    """Add one binary indicator column to ``df`` per label in ``top_x_labels``.

    For every label a new column named ``<variable>_<label>`` is created. It
    holds 1 on the rows where ``df[variable]`` equals the label and 0
    everywhere else. Labels that are not listed get no column of their own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is read from and modified in place.
    variable : str
        Name of the column of ``df`` to encode.
    top_x_labels : iterable
        Labels to encode; vary its length to encode more or fewer labels.

    Returns
    -------
    None
        The new columns are added directly to ``df``.
    """
    for label in top_x_labels:
        # Compare against the frame that was passed in, not a notebook-level
        # global, so the function works on any DataFrame. str() lets
        # non-string labels (e.g. integers) be used in the column name.
        df[variable + '_' + str(label)] = np.where(df[variable] == label, 1, 0)

# Start again from a fresh copy of the raw data, so the frame no longer holds
# the indicator columns created in the earlier cell
data = pd.read_csv(
    '/content/mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Encode the most frequent X2 labels held in top_10 and preview the result
one_hot_encoding(data, 'X2', top_10)
data.head()

In [None]:
# Repeat for X1: collect its 10 most frequent labels, then encode them
top_10 = list(
    data['X1'].value_counts().sort_values(ascending=False).head(10).index
)
one_hot_encoding(data, 'X1', top_10)
data.head()

### One-hot encoding of the most frequent categories
#### Advantages
- Straightforward to implement
- Does not require hours of variable exploration
- Does not massively expand the feature space (number of columns in the dataset)

#### Disadvantages
- Does not add any information that makes the variable more predictive
- Does not keep the information of the ignored labels


# Count / Frequency Encoding

If we have categorical variables with many labels (high cardinality), one-hot encoding will expand the feature space drastically.
- - -
One approach that is heavily used in Kaggle competitions is to replace each label of the categorical variable with its count, i.e. the number of times the label appears in the dataset, or with its frequency, i.e. the fraction of observations that carry the label.


In [None]:
# Count / frequency encoding is used when the data has high cardinality, i.e. a large number of labels in the feature
import pandas as pd
import numpy as np

# Load only the two categorical columns used in this section
data = pd.read_csv(
    '/content/mercedesbenz.csv',
    usecols=['X1', 'X2'],
)
data.head()


In [None]:
# (rows, columns) of the two-column dataset loaded above
data.shape

## One hot encoding

In [None]:
# (rows, columns) after one-hot encoding the dataset with default settings,
# to show how much the feature space grows
pd.get_dummies(data).shape

In [None]:
# Number of distinct values in X1 (a missing value would count as one)
data['X1'].nunique(dropna=False)

In [None]:
len(data['X2'].unique())

In [None]:
# Print the number of distinct labels of each column.
# dropna=False keeps a missing value counted as a label, as unique() does.
for col in data.columns:
    print(col, ': ', data[col].nunique(dropna=False), 'labels')


In [None]:
# Count how often each label occurs in X2 and display the result as a
# {label: count} dictionary
data['X2'].value_counts().to_dict()

In [None]:
# To replace each X2 label by its count, first store the {label: count}
# dictionary that will drive the replacement
df_frequency_map = data['X2'].value_counts().to_dict()


In [None]:
# Swap every X2 label for its count using the mapping built above.
# NOTE: X2 is overwritten in place, so run this cell only once per load;
# a second run would look the counts up in the label map and find no match.
data['X2'] = data['X2'].map(df_frequency_map)
data.head()

# Handling missing categorical values


In [None]:
# Toy single-column frame whose last entry is missing
data = pd.DataFrame(
    ['square', 'square', 'oval', 'circle', np.nan],
    columns=['Shape'],
)
data

In [None]:
# Impute with SimpleImputer's 'most_frequent' strategy: each missing value
# is replaced by the most frequent value of its column
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
# fit on `data` and get the imputed values back in a single step
imputer.fit_transform(data)

In [None]:
# Impute with SimpleImputer's 'constant' strategy: every missing value is
# replaced by the given fill_value, here the literal string 'missing'
imputer = SimpleImputer(strategy='constant', fill_value='missing')
imputer.fit_transform(data)