# Types of encodings
1. one-hot encoding
2. Binary encoding
3. Frequecy encoding
4. Hash encoding
5. Dummy encoding

In [3]:
import pandas, sklearn, numpy

In [4]:
data = pandas.read_csv('data/7Aug(2).csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [6]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [7]:
print("X.info: \n")
X.info()

print("\n")
print("y.info: \n")
y.info()

X.info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  10 non-null     object 
 1   Age      9 non-null      float64
 2   Salary   9 non-null      float64
dtypes: float64(2), object(1)
memory usage: 372.0+ bytes


y.info: 

<class 'pandas.core.series.Series'>
RangeIndex: 10 entries, 0 to 9
Series name: Purchased
Non-Null Count  Dtype 
--------------  ----- 
10 non-null     object
dtypes: object(1)
memory usage: 212.0+ bytes


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(f"X_train.shape: {X_train.shape}")
print(f"X_train.shape: {X_test.shape}")
print(f"X_train.shape: {y_train.shape}")
print(f"X_train.shape: {y_test.shape}")

X_train.shape: (7, 3)
X_train.shape: (3, 3)
X_train.shape: (7,)
X_train.shape: (3,)


## Handling of missing data



In [20]:
from sklearn.impute import SimpleImputer
numericdataset=data.select_dtypes(include=['number'])

imp_mean = SimpleImputer(missing_values=pandas.NA, strategy='mean')

# print(f"fit method: {}")
fit_temp = imp_mean.fit(numericdataset)
X_train_temp = imp_mean.transform(numericdataset)

X_train_temp
print(fit_temp)

SimpleImputer(missing_values=<NA>)


## Encoding of categorical data

There are two ways of encoding used depending on types of variable
- Independent variables(one hot)
- Dependent variable (label encoding)

### Independent variable

- using compose package of sklearn library
- use reference of class column tranformer

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Define the ColumnTransformer with a list of transformers
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough'
)

# Fit the transformer to the training data


X_train_temp = numpy.array(ct.fit_transform(data))

print(f"passthrough: \n{X_train_temp}")


ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='drop'
)

# Fit the transformer to the training data


# Transform the training data and convert it to a numpy array
X_train_temp = numpy.array(ct.fit_transform(data))

X_train_temp

print(f"drop: \n{X_train_temp}")

passthrough: 
[[1.0 0.0 0.0 44.0 72000.0 'No']
 [0.0 0.0 1.0 27.0 48000.0 'Yes']
 [0.0 1.0 0.0 30.0 54000.0 'No']
 [0.0 0.0 1.0 38.0 61000.0 'No']
 [0.0 1.0 0.0 40.0 nan 'Yes']
 [1.0 0.0 0.0 35.0 58000.0 'Yes']
 [0.0 0.0 1.0 nan 52000.0 'No']
 [1.0 0.0 0.0 48.0 79000.0 'Yes']
 [0.0 1.0 0.0 50.0 83000.0 'No']
 [1.0 0.0 0.0 37.0 67000.0 'Yes']]
drop: 
[[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]
