In [45]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [46]:
data = pd.read_csv("modified_titanic.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71,C85,C
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53,C123,S
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51,E46,S
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16,G6,S
4,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26,C103,S


np.int64(0)

In [30]:
data2 = pd.read_csv("ELectricity_clean_data.csv")
data2.head()

Unnamed: 0,year,month,stateDescription,sectorName,customers,price,revenue,sales
0,2001,1,Wyoming,all sectors,,4.31,48.1284,1116.17208
1,2001,1,Wyoming,commercial,,5.13,12.67978,247.08691
2,2001,1,Wyoming,industrial,,3.26,19.60858,602.30484
3,2001,1,Wyoming,other,,4.75,0.76868,16.17442
4,2001,1,Wyoming,residential,,6.01,15.07136,250.60591


# Label Encoding

Label Encoding is a technique that is used to convert categorical columns into numerical ones so that they can be fitted by machine learning models which only take numerical data. It is an important pre-processing step in a machine-learning project.

![image.png](attachment:e43b9b1b-e486-4692-83be-d1bc46273cb4.png)

## Nominal Data

In [11]:
data["Male"] = data.Sex == "male"
data["Male"] = data["Male"].astype(int)

data["Female"] = data.Sex == "female"
data["Female"] = data["Female"].astype(int)

In [13]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Male,Female
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71,C85,C,0,1
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53,C123,S,0,1
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51,E46,S,1,0
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16,G6,S,0,1
4,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26,C103,S,0,1


### One-Hot Encoding

In [20]:
# pandas
pd.get_dummies(data, columns=['Sex'], dtype=int)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_female,Sex_male
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71,C85,C,1,0
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53,C123,S,1,0
2,7,0,1,"McCarthy, Mr. Timothy J",54.0,0,0,17463,51,E46,S,0,1
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",4.0,1,1,PP 9549,16,G6,S,1,0
4,12,1,1,"Bonnell, Miss. Elizabeth",58.0,0,0,113783,26,C103,S,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",47.0,1,1,11751,52,D35,S,1,0
198,873,0,1,"Carlsson, Mr. Frans Olof",33.0,0,0,695,5,B51 B53 B55,S,0,1
199,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",56.0,0,1,11767,83,C50,C,1,0
200,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30,B42,S,1,0


In [23]:
# sklearn
one_hot_encoder = OneHotEncoder(sparse_output=False)
encoded = one_hot_encoder.fit_transform(data['Sex'].values.reshape(-1, 1))

In [24]:
encoded

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [25]:
one_hot_encoder.categories_

[array(['female', 'male'], dtype=object)]

In [28]:
one_hot_encoder.inverse_transform(
    [[1,0],
     [0,1]]
)

array([['female'],
       ['male']], dtype=object)

### Ordinal Data

In [37]:
# pandas
labels, rule = data.Embarked.factorize(sort=True)

In [44]:
data['Embarked_encoded'] = labels
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71,C85,0
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53,C123,2
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51,E46,2
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16,G6,2
4,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26,C103,2


In [42]:
rule

Index(['C', 'Q', 'S'], dtype='object')

In [47]:
# sklearn
label_encoder = LabelEncoder()

encoded = label_encoder.fit_transform(data.Embarked)

In [48]:
encoded

array([0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0,
       2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 0, 1, 2, 0, 0,
       0, 0, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2,
       0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, 0,
       2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2,
       0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 0, 2, 0])

In [49]:
label_encoder.classes_

array(['C', 'Q', 'S'], dtype=object)

In [50]:
label_encoder.inverse_transform([0,2,1,1,0])

array(['C', 'S', 'Q', 'Q', 'C'], dtype=object)

In [51]:
label_encoder