### Install the required package

In [5]:
pip install category-encoders

Defaulting to user installation because normal site-packages is not writeable
Collecting category-encoders
  Using cached category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category-encoders)
  Using cached statsmodels-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.1 (from category-encoders)
  Using cached patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Using cached category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
Using cached patsy-0.5.6-py2.py3-none-any.whl (233 kB)
Downloading statsmodels-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 960.3 kB/s eta 0:00:00
Installing collected packages: patsy, statsmodels, category-encoders
Successfully installed category-encoders-2.6.3 patsy-0.5.6 statsmodels-0.14.1
Note: you may need to restart the kernel to use updated packages.

# **Feature Encoding**

### **Introduction**
>In machine learning, feature encoding is the process of converting categorical or non-numeric data into a numerical format that can be used by machine learning algorithms. Features are the input variables used to make predictions, and these features can be of different types, such as numerical or categorical. Since many machine learning algorithms require numerical input, feature encoding is crucial for handling categorical data.

### **Importance of feature encoding in machine learning**


> - **Algorithm Compatibility**: Many machine learning algorithms, such as linear regression or support vector machines, work with numerical input. Feature encoding ensures that these algorithms can handle categorical data effectively.

> - **Improved Performance**: A well-chosen encoding can lead to better model performance. When the encoding reflects the structure of the categories (for example, preserving the order of ordinal values), models often capture underlying patterns in the data more accurately.

> - **Consistency in Data Representation**: Feature encoding provides a consistent numerical representation of data, making it easier to compare, analyze, and process information.



## **Types of feature encoding**

![img](feature_encoding.png)


## **1. One Hot Encoding**

In [1]:
# One-hot encoding: every distinct category gets its own indicator column that
# holds 1/True on the rows where the category occurs and 0/False everywhere else.
import pandas as pd

# Toy input: a single categorical column with three distinct colours.
data = dict(Color=['Red', 'Green', 'Blue', 'Red'])
df = pd.DataFrame(data)

# Expand 'Color' into one indicator column per distinct value
# (the new columns are named 'Color_<value>').
encoded_data = pd.get_dummies(df, columns=['Color'])

# Show the frame before and after encoding.
print(df)
print(encoded_data)

   Color
0    Red
1  Green
2   Blue
3    Red
   Color_Blue  Color_Green  Color_Red
0       False        False       True
1       False         True      False
2        True        False      False
3       False        False       True


## **2. Label Encoding**

In [2]:
# Label encoding: each distinct category is replaced by its own integer code.
from sklearn.preprocessing import LabelEncoder

# Toy input containing repeated categories.
data = dict(Animal=['Dog', 'Cat', 'Bird', 'Dog', 'Bird'])
df = pd.DataFrame(data)
print(df)

# Learn the set of labels first, then translate the column into integer codes.
# As the printed result shows, the codes follow the sorted label order
# (Bird=0, Cat=1, Dog=2) rather than the order of first appearance.
label_encoder = LabelEncoder()
label_encoder.fit(df['Animal'])
df['Animal_encoded'] = label_encoder.transform(df['Animal'])
print(df)

  Animal
0    Dog
1    Cat
2   Bird
3    Dog
4   Bird
  Animal  Animal_encoded
0    Dog               2
1    Cat               1
2   Bird               0
3    Dog               2
4   Bird               0


## **3. Ordinal Encoding**

In [3]:
# Ordinal encoding: categories are replaced by numbers that follow their order or rank.
from sklearn.preprocessing import OrdinalEncoder

# Toy input whose categories have a natural ranking.
data = dict(Size=['Small', 'Medium', 'Large', 'Medium'])
df = pd.DataFrame(data)
print(df)

# The explicit category list fixes the ranking: Small -> 0, Medium -> 1, Large -> 2.
ordinal_encoder = OrdinalEncoder(categories=[['Small', 'Medium', 'Large']])

# The encoder is given a one-column frame (double brackets), not a Series.
# The codes come back as floats, as the printed result shows.
ordinal_encoder.fit(df[['Size']])
df['Size_encoded'] = ordinal_encoder.transform(df[['Size']])
print(df)

     Size
0   Small
1  Medium
2   Large
3  Medium
     Size  Size_encoded
0   Small           0.0
1  Medium           1.0
2   Large           2.0
3  Medium           1.0


## **4. Target Encoding**

In [3]:
import pandas as pd

# Sample records, one row per person: [city, years of experience, yearly salary in thousands].
# NOTE: 'San Franscisco' is misspelled in the sample data; it is kept as-is so the
# recorded outputs below still match.
data = [
    ['Salt Lake City', 10, 120],
    ['Seattle', 5, 120],
    ['San Franscisco', 5, 140],
    ['Seattle', 3, 100],
    ['Seattle', 1, 70],
    ['San Franscisco', 2, 100],
    ['Salt Lake City', 1, 60],
    ['San Franscisco', 2, 110],
    ['Seattle', 4, 100],
    ['Salt Lake City', 2, 70],
]

# 'City' is the categorical feature; the salary column is used as the target
# by the encoding cell that follows.
dataframe = pd.DataFrame(
    data=data,
    columns=['City', 'Years OF Exp', 'Yearly Salary in Thousands'],
)
dataframe

Unnamed: 0,City,Years OF Exp,Yearly Salary in Thousands
0,Salt Lake City,10,120
1,Seattle,5,120
2,San Franscisco,5,140
3,Seattle,3,100
4,Seattle,1,70
5,San Franscisco,2,100
6,Salt Lake City,1,60
7,San Franscisco,2,110
8,Seattle,4,100
9,Salt Lake City,2,70


In [7]:
import category_encoders as ce

# Target encoding: replace each city with a number derived from the salaries
# observed for that city.
# NOTE(review): the values in the recorded output are not the plain per-city means
# (e.g. Salt Lake City's mean salary is ~83.3 but it is encoded as ~96.6), so the
# encoder's default settings evidently pull each category toward the overall mean
# -- confirm against the TargetEncoder documentation.
tenc = ce.TargetEncoder()
df_city = tenc.fit_transform(
    dataframe['City'],
    dataframe['Yearly Salary in Thousands'],
)

# Re-attach the remaining columns (everything except the raw 'City' text) by index.
df_new = df_city.join(dataframe.drop(columns='City'))
df_new

Unnamed: 0,City,Years OF Exp,Yearly Salary in Thousands
0,96.580044,10,120
1,98.748028,5,120
2,101.728886,5,140
3,98.748028,3,100
4,98.748028,1,70
5,101.728886,2,100
6,96.580044,1,60
7,101.728886,2,110
8,98.748028,4,100
9,96.580044,2,70


## **5. Frequency Encoding**

In [9]:
# Frequency encoding: each category is replaced by the number of rows it occurs in.
import seaborn as sns

# Load the Titanic sample and keep only rows with no missing value in any column.
titanic = sns.load_dataset('titanic').dropna()

# Count how many of the remaining rows belong to each embarkation port ...
value_counts = titanic['embarked'].value_counts().to_dict()

# ... and store that count next to every row as a new column.
titanic = titanic.assign(embarked_encoded=titanic['embarked'].map(value_counts))

print(value_counts)
titanic.head()

{'S': 115, 'C': 65, 'Q': 2}


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,embarked_encoded
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,65
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,115
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,115
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False,115
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True,115


## **6. Hash Encoding**

In [11]:
import category_encoders as ce
import pandas as pd

# Build the sample frame.
# NOTE: 'Februay' is misspelled in the sample data; it is kept as-is because the
# hash bucket (and therefore the recorded output) depends on the exact string.
data = pd.DataFrame({
    'Month': [
        'January', 'April', 'March', 'April', 'Februay',
        'June', 'July', 'June', 'September',
    ],
})
print(data)

# Hash encoding: map every month into one of 6 output columns (col_0 .. col_5).
# Different categories can land in the same column -- in the recorded output
# 'January' and 'March' share col_4, and 'April' and 'Februay' share col_3.
encoder = ce.HashingEncoder(cols='Month', n_components=6)
encoder.fit_transform(data)

       Month
0    January
1      April
2      March
3      April
4    Februay
5       June
6       July
7       June
8  September


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
0,0,0,0,0,1,0
1,0,0,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,1,0,0
4,0,0,0,1,0,0
5,0,1,0,0,0,0
6,1,0,0,0,0,0
7,0,1,0,0,0,0
8,0,0,0,0,1,0
