# Data Encoding
Is part of data cleaning technique. Transformation of data is done here

TYPES:
*   Binary
*   One hot
*   Label
*   Target
*   DUMMY



### One Hot Encoding
One-hot-encoding is a powerful technique to treat categorical data, but it can lead to increased dimensionality, sparsity, and overfitting.


In [None]:
import pandas as pd
from sklearn import preprocessing

# Sample dataset
data = {'vehicle': ['car', 'truck', 'motorcycle', 'car', 'motorcycle']}
df_vehicles = pd.DataFrame(data)

# One-hot encoding
onehot_encoder = preprocessing.OneHotEncoder()
encoded = onehot_encoder.fit_transform(df_vehicles[['vehicle']]).toarray()

# Convert the encoded data into a DataFrame
df_encoded = pd.DataFrame(encoded, columns=onehot_encoder.get_feature_names_out(['vehicle']))

# Concatenate the original and encoded DataFrames
df_result = pd.concat([df_vehicles, df_encoded], axis=1)

print(df_result)


      vehicle  vehicle_car  vehicle_motorcycle  vehicle_truck
0         car          1.0                 0.0            0.0
1       truck          0.0                 0.0            1.0
2  motorcycle          0.0                 1.0            0.0
3         car          1.0                 0.0            0.0
4  motorcycle          0.0                 1.0            0.0


### Label Encoding
Label encoding is a technique used in machine learning and data analysis to convert categorical variables into numerical format.


In [None]:
import pandas as pd
from sklearn import preprocessing

# New sample data
data = {'color': ['red', 'blue', 'green', 'blue', 'red']}
df = pd.DataFrame(data)

# Label encoding
label_encoder = preprocessing.LabelEncoder()
df['color'] = label_encoder.fit_transform(df['color'])

print(df)
print(df['color'].unique())


   color
0      2
1      0
2      1
3      0
4      2
[2 0 1]


### Target Encoding
A target encoding is any kind of encoding that replaces a feature's categories with some number derived from the target.

In [None]:
import pandas as pd
from sklearn import preprocessing

# New sample data
data = {'fruit': ['apple', 'banana', 'orange', 'apple', 'banana']}
df = pd.DataFrame(data)

# Target encoding
target_encoder = preprocessing.OrdinalEncoder()
target_encoded = target_encoder.fit_transform(df['fruit'].values.reshape(-1, 1))

# Convert the encoded data into a DataFrame
target_df = pd.DataFrame(target_encoded, columns=['target_encoded'])

print(target_df)


   target_encoded
0             0.0
1             1.0
2             2.0
3             0.0
4             1.0


### Dummy Encoding
Dummy coding scheme is similar to one-hot encoding. This categorical data encoding method transforms the categorical variable into a set of binary variables (also known as dummy variables).


In [None]:
import pandas as pd

# New sample data
data = {'vehicle': ['car', 'truck', 'motorcycle', 'car', 'motorcycle']}
df = pd.DataFrame(data)

# Dummy encoding
dummy_encoded = pd.get_dummies(df['vehicle'], prefix='vehicle')

print(dummy_encoded)


   vehicle_car  vehicle_motorcycle  vehicle_truck
0         True               False          False
1        False               False           True
2        False                True          False
3         True               False          False
4        False                True          False


DATA ENCODING IN CSV FILE

In [1]:


from google.colab import files
uploaded = files.upload()

!pip install category_encoders


import pandas as pd

df = pd.read_csv('evdata.csv')



Saving evdata.csv to evdata.csv
Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


BINARY CODING

In [2]:
import category_encoders as ce

# Apply binary encoding to the 'region' column
binary_encoder = ce.BinaryEncoder(cols=['region'])
df_binary_encoded = binary_encoder.fit_transform(df['region'])

print(df_binary_encoded.head())


   region_0  region_1  region_2  region_3  region_4  region_5
0         0         0         0         0         0         1
1         0         0         0         0         0         1
2         0         0         0         0         0         1
3         0         0         0         0         0         1
4         0         0         0         0         0         1


ONE HOT ENCODING

In [3]:

one_hot_encoded = pd.get_dummies(df['region'], prefix='region')

print(one_hot_encoded.head())


   region_Australia  region_Austria  region_Belgium  region_Brazil  \
0              True           False           False          False   
1              True           False           False          False   
2              True           False           False          False   
3              True           False           False          False   
4              True           False           False          False   

   region_Bulgaria  region_Canada  region_Chile  region_China  \
0            False          False         False         False   
1            False          False         False         False   
2            False          False         False         False   
3            False          False         False         False   
4            False          False         False         False   

   region_Colombia  region_Costa Rica  ...  region_South Africa  region_Spain  \
0            False              False  ...                False         False   
1            False        

LABEL ENCODING

In [4]:
from sklearn.preprocessing import LabelEncoder

# Apply label encoding to the 'region' column
label_encoder = LabelEncoder()
df['region_encoded'] = label_encoder.fit_transform(df['region'])

print(df[['region', 'region_encoded']].head())


      region  region_encoded
0  Australia               0
1  Australia               0
2  Australia               0
3  Australia               0
4  Australia               0


TARGET ENCODING

In [5]:
# Calculate the mean of 'value' for each 'region'
target_mean = df.groupby('region')['value'].mean()

# Map the mean values to the corresponding 'region' entries
df['region_target_encoded'] = df['region'].map(target_mean)

print(df[['region', 'region_target_encoded']].head())


      region  region_target_encoded
0  Australia            6294.884095
1  Australia            6294.884095
2  Australia            6294.884095
3  Australia            6294.884095
4  Australia            6294.884095


DUMMY ENCODING

In [6]:
# Apply dummy encoding to the 'region' column, dropping the first category
dummy_encoded = pd.get_dummies(df['region'], drop_first=True, prefix='region')

print(dummy_encoded.head())


   region_Austria  region_Belgium  region_Brazil  region_Bulgaria  \
0           False           False          False            False   
1           False           False          False            False   
2           False           False          False            False   
3           False           False          False            False   
4           False           False          False            False   

   region_Canada  region_Chile  region_China  region_Colombia  \
0          False         False         False            False   
1          False         False         False            False   
2          False         False         False            False   
3          False         False         False            False   
4          False         False         False            False   

   region_Costa Rica  region_Croatia  ...  region_South Africa  region_Spain  \
0              False           False  ...                False         False   
1              False           Fal