In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd


## What is Data Encoding?
Data Encoding is an important pre-processing step in Machine Learning. It refers to the process of converting categorical or textual data into numerical format, so that it can be used as input for algorithms to process. The reason for encoding is that most machine learning algorithms work with numbers and not with text or categorical variables.

## One-hot Encoding:
 One-hot encoding creates binary columns for each category and indicates the presence of the category with a 1 or 0. Using the same "Color" example, this would create three columns: "Red," "Green," and "Blue."

 One-hot encoding converts each category in a categorical variable into a binary vector. It creates new binary columns for each category, representing the presence or absence of the category. Each category is mutually exclusive. For example, “Red” may be encoded as [1, 0, 0], “Green” as [0, 1, 0], and “Blue” as [0, 0, 1].

### One-Hot Encoding:
• One-Hot Encoding is the Most Common method for encoding Categorical variables.

• a Binary Column is created for each Unique Category in the variable.

• If a category is present in a sample, the corresponding column is set to 1, and all other columns are set to 0.

• For example, if a variable has three categories ‘A’, ‘B’ and ‘C’, three columns will be created and a sample with category ‘B’ will have the value [0,1,0].

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Datasets (1)/bikes.data")

In [4]:
df

Unnamed: 0,ticket,cost,month,location_from,location_to,duration,distance,assistance,energy_used,energy_collected
0,single,0.35,9,MICROTEKNIA,PUIJONLAAKSO,411.0,2150,1,19.0,2.7
1,single,1.20,5,SATAMA,KEILANKANTA,1411.0,7130,1,53.8,15.3
2,savonia,0.00,9,TASAVALLANKATU,NEULAMÄKI,1308.0,5420,1,43.0,9.9
3,savonia,0.00,10,TORI,KAUPPAKATU,1036.0,1180,1,6.5,2.1
4,single,0.30,9,TORI,TORI,319.0,1120,1,13.7,1.2
...,...,...,...,...,...,...,...,...,...,...
1769,savonia,0.00,10,KAUPPAKATU,TORI,836.0,960,1,8.0,2.7
1770,single,0.20,7,TORI,SATAMA,199.0,930,1,3.7,3.6
1771,season,0.00,7,TORI,TORI,61.0,0,1,0.0,0.0
1772,savonia,0.00,9,MICROTEKNIA,PUIJONLAAKSO,610.0,2460,1,36.5,6.9


In [5]:
# one hot encoding ticket variable

df_encoded = pd.get_dummies(df,columns = ["ticket"])
df_encoded

Unnamed: 0,cost,month,location_from,location_to,duration,distance,assistance,energy_used,energy_collected,ticket_savonia,ticket_season,ticket_single
0,0.35,9,MICROTEKNIA,PUIJONLAAKSO,411.0,2150,1,19.0,2.7,0,0,1
1,1.20,5,SATAMA,KEILANKANTA,1411.0,7130,1,53.8,15.3,0,0,1
2,0.00,9,TASAVALLANKATU,NEULAMÄKI,1308.0,5420,1,43.0,9.9,1,0,0
3,0.00,10,TORI,KAUPPAKATU,1036.0,1180,1,6.5,2.1,1,0,0
4,0.30,9,TORI,TORI,319.0,1120,1,13.7,1.2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1769,0.00,10,KAUPPAKATU,TORI,836.0,960,1,8.0,2.7,1,0,0
1770,0.20,7,TORI,SATAMA,199.0,930,1,3.7,3.6,0,0,1
1771,0.00,7,TORI,TORI,61.0,0,1,0.0,0.0,0,1,0
1772,0.00,9,MICROTEKNIA,PUIJONLAAKSO,610.0,2460,1,36.5,6.9,1,0,0


In [6]:
# concatenate the one hot encoding with the original dataframe

df1 = pd.concat([df,df_encoded],axis = 1)
df1

Unnamed: 0,ticket,cost,month,location_from,location_to,duration,distance,assistance,energy_used,energy_collected,...,location_from.1,location_to.1,duration.1,distance.1,assistance.1,energy_used.1,energy_collected.1,ticket_savonia,ticket_season,ticket_single
0,single,0.35,9,MICROTEKNIA,PUIJONLAAKSO,411.0,2150,1,19.0,2.7,...,MICROTEKNIA,PUIJONLAAKSO,411.0,2150,1,19.0,2.7,0,0,1
1,single,1.20,5,SATAMA,KEILANKANTA,1411.0,7130,1,53.8,15.3,...,SATAMA,KEILANKANTA,1411.0,7130,1,53.8,15.3,0,0,1
2,savonia,0.00,9,TASAVALLANKATU,NEULAMÄKI,1308.0,5420,1,43.0,9.9,...,TASAVALLANKATU,NEULAMÄKI,1308.0,5420,1,43.0,9.9,1,0,0
3,savonia,0.00,10,TORI,KAUPPAKATU,1036.0,1180,1,6.5,2.1,...,TORI,KAUPPAKATU,1036.0,1180,1,6.5,2.1,1,0,0
4,single,0.30,9,TORI,TORI,319.0,1120,1,13.7,1.2,...,TORI,TORI,319.0,1120,1,13.7,1.2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1769,savonia,0.00,10,KAUPPAKATU,TORI,836.0,960,1,8.0,2.7,...,KAUPPAKATU,TORI,836.0,960,1,8.0,2.7,1,0,0
1770,single,0.20,7,TORI,SATAMA,199.0,930,1,3.7,3.6,...,TORI,SATAMA,199.0,930,1,3.7,3.6,0,0,1
1771,season,0.00,7,TORI,TORI,61.0,0,1,0.0,0.0,...,TORI,TORI,61.0,0,1,0.0,0.0,0,1,0
1772,savonia,0.00,9,MICROTEKNIA,PUIJONLAAKSO,610.0,2460,1,36.5,6.9,...,MICROTEKNIA,PUIJONLAAKSO,610.0,2460,1,36.5,6.9,1,0,0


In [7]:
df1.columns

Index(['ticket', 'cost', 'month', 'location_from', 'location_to', 'duration',
       'distance', 'assistance', 'energy_used', 'energy_collected', 'cost',
       'month', 'location_from', 'location_to', 'duration', 'distance',
       'assistance', 'energy_used', 'energy_collected', 'ticket_savonia',
       'ticket_season', 'ticket_single'],
      dtype='object')

In [8]:
df1.index

RangeIndex(start=0, stop=1774, step=1)

In [9]:
df.dtypes

ticket               object
cost                float64
month                 int64
location_from        object
location_to          object
duration            float64
distance              int64
assistance            int64
energy_used         float64
energy_collected    float64
dtype: object

In [10]:
df.shape

(1774, 10)

In [11]:
df.size

17740

In [12]:
df.axes

[RangeIndex(start=0, stop=1774, step=1),
 Index(['ticket', 'cost', 'month', 'location_from', 'location_to', 'duration',
        'distance', 'assistance', 'energy_used', 'energy_collected'],
       dtype='object')]

In [13]:
df.describe()

Unnamed: 0,cost,month,duration,distance,assistance,energy_used,energy_collected
count,1774.0,1774.0,1774.0,1774.0,1774.0,1774.0,1774.0
mean,0.491234,7.233371,671.323563,2460.067644,0.916009,17.399493,5.63929
std,4.127344,1.720351,1141.708627,2352.529305,0.277452,17.194463,6.379593
min,0.0,4.0,2.0,-3380.0,0.0,0.0,0.0
25%,0.0,6.0,245.25,910.0,1.0,1.425,0.6
50%,0.05,7.0,546.5,2030.0,1.0,14.35,3.9
75%,0.55,9.0,877.75,3605.0,1.0,26.9,8.325
max,100.0,10.0,25614.0,20770.0,1.0,144.9,56.4


In [14]:
df["ticket"].value_counts()

single     967
season     553
savonia    254
Name: ticket, dtype: int64

In [15]:
df.info

<bound method DataFrame.info of        ticket  cost  month   location_from   location_to  duration  distance  \
0      single  0.35      9     MICROTEKNIA  PUIJONLAAKSO     411.0      2150   
1      single  1.20      5          SATAMA   KEILANKANTA    1411.0      7130   
2     savonia  0.00      9  TASAVALLANKATU     NEULAMÄKI    1308.0      5420   
3     savonia  0.00     10            TORI    KAUPPAKATU    1036.0      1180   
4      single  0.30      9            TORI          TORI     319.0      1120   
...       ...   ...    ...             ...           ...       ...       ...   
1769  savonia  0.00     10      KAUPPAKATU          TORI     836.0       960   
1770   single  0.20      7            TORI        SATAMA     199.0       930   
1771   season  0.00      7            TORI          TORI      61.0         0   
1772  savonia  0.00      9     MICROTEKNIA  PUIJONLAAKSO     610.0      2460   
1773   season  0.00      8    PUIJONLAAKSO    KAUPPAKATU     478.0      2250   

      a

In [16]:
from sklearn.preprocessing import OneHotEncoder



In [33]:
encoder = OneHotEncoder()

In [36]:
location_from_encoded = encoder.fit_transform(df[["location_from"]]).toarray()
print(location_from_encoded)

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]


In [37]:
from sklearn.preprocessing import LabelEncoder

In [39]:
le = LabelEncoder()
df["Location_from_encoded"]= le.fit_transform(df["location_from"])
df["Location_from_encoded"]

0    0
1    1
2    2
3    3
4    3
Name: Location_from_encoded, dtype: int64

In [46]:
education = {"Name":["Ann","Chan","Juho"], "Degree":["MSc.","B.Sc","MBA"]}
df_education = pd.DataFrame(education)

In [47]:
df_education

Unnamed: 0,Name,Degree
0,Ann,MSc.
1,Chan,B.Sc
2,Juho,MBA


In [53]:
edu_mapping = {"MSc.":1,"B.Sc":2,"MBA":3}
df_education["Education_numeric"] = df_education["Degree"].map(edu_mapping)
df_education

Unnamed: 0,Name,Degree,Education_numeric
0,Ann,MSc.,1
1,Chan,B.Sc,2
2,Juho,MBA,3


##Ordinal Encoding
Ordinal Encoding:
• Ordinal Encoding is used when the categories in a variable have a Natural Ordering.

• In this method, the categories are assigned a numerical value based on their order, such as 1, 2, 3, etc.

For example, if a variable has categories ‘Low’, ‘Medium’ and ‘High’, they can be assigned the values 1, 2, and 3, respectively.



https://medium.com/aiskunks/categorical-data-encoding-techniques-d6296697a40f

In [57]:
#Ordinal Encoding
# creating a dataframe with a categorical variable
persons_quality = {

"Name": ["Lisa","Juho","Ann"] ,

"Education_Quality": ["low","medium","high"]

}

df_quality = pd.DataFrame(persons_quality)
df_quality

Unnamed: 0,Name,Education_Quality
0,Lisa,low
1,Juho,medium
2,Ann,high


In [None]:
map_quality = {"low":1,"medium":2,"high":3}

In [61]:
df_quality["Ordinal_Quality"] = df_quality ["Education_Quality"].map(map_quality)
df_quality

Unnamed: 0,Name,Education_Quality,Ordinal_Quality
0,Lisa,low,1
1,Juho,medium,2
2,Ann,high,3


In [71]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories = [["low","medium","high"]])
df_quality["Education_Encoded"] = ordinal_encoder.fit_transform(df_quality[["Education_Quality"]])
df_quality

Unnamed: 0,Name,Education_Quality,Ordinal_Quality,Education_Encoded
0,Lisa,low,1,0.0
1,Juho,medium,2,1.0
2,Ann,high,3,2.0


## Binary Encoding: Binary Encoding:
• Binary Encoding is similar to One-Hot Encoding, but instead of creating a separate column for each category, the categories are represented as binary digits.

For example, if a variable has four categories ‘A’, ‘B’, ‘C’ and ‘D’, they can be represented as 0001, 0010, 0100 and 1000, respectively.

In [62]:
animals = {"animal": ["cat","dog","rabbit"]}
df_animals = pd.DataFrame(animals)
df_animals

Unnamed: 0,animal
0,cat
1,dog
2,rabbit


In [69]:
# Binary Encoding:

import pandas as pd

# create a sample dataframe with a categorical variable
df = pd.DataFrame({'animal': ['cat', 'dog', 'bird', 'cat']})
print(f"Before Encoding the Data:\n\n{df}\n")

# perform binary encoding on the 'animal' column
animal_map = {'cat': 0, 'dog': 1, 'bird': 2}
df['animal'] = df['animal'].map(animal_map)



Before Encoding the Data:

  animal
0    cat
1    dog
2   bird
3    cat

After Encoding the Data:

  animal
0      0
1      1
2     10
3      0

