# Encoding

Die meisten Machine-Learning-Algorithmen sind nicht in der Lage, mit kategorialen Variablen umzugehen, es sei denn, diese werden in numerische Werte umgewandelt.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy patient table: 'sex' and 'sports activity' are categorical (string)
# columns, 'age' and 'BMI' are numeric.
patients = pd.DataFrame(
    {
        "sex": ["male", "male", "female", "male", "female", "unknown", "unknown"],
        "age": [35, 46, 42, 15, 36, 42, 42],
        "BMI": [20, 20, 19.5, 19, 28, 21, 21],
        "sports activity": ["medium", "medium", "high", "low", "low", "high", "high"],
    }
)
patients

Unnamed: 0,sex,age,BMI,sports activity
0,male,35,20.0,medium
1,male,46,20.0,medium
2,female,42,19.5,high
3,male,15,19.0,low
4,female,36,28.0,low
5,unknown,42,21.0,high
6,unknown,42,21.0,high


In [3]:
# One-hot encode the 'sex' column: one indicator column per category.
# The prefix 'sex_' is joined to each category with the default separator '_',
# which is why the resulting column names contain a double underscore.
# NOTE(review): recent pandas versions appear to return boolean dummy columns
# by default; passing dtype=int should give the 0/1 values shown in the
# recorded output - confirm against the installed pandas version.
pd.get_dummies(data=patients['sex'], prefix='sex_')

Unnamed: 0,sex__female,sex__male,sex__unknown
0,0,1,0
1,0,1,0
2,1,0,0
3,0,1,0
4,1,0,0
5,0,0,1
6,0,0,1


In [4]:
# Same encoding, but drop_first=True removes the indicator column of the first
# category ('female'); that category is then represented by all remaining
# indicator columns being 0.
pd.get_dummies(data=patients['sex'], prefix='sex_', drop_first=True)

Unnamed: 0,sex__male,sex__unknown
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0
5,0,1
6,0,1


In [5]:
# Encode the whole frame in one call: the string columns are replaced by
# indicator columns named '<column>_<category>', the numeric columns are kept
# unchanged. drop_first=True drops the first category of each encoded column.
result1 = pd.get_dummies(data=patients, drop_first=True)
result1

Unnamed: 0,age,BMI,sex_male,sex_unknown,sports activity_low,sports activity_medium
0,35,20.0,1,0,0,1
1,46,20.0,1,0,0,1
2,42,19.5,0,0,0,0
3,15,19.0,1,0,1,0
4,36,28.0,0,0,1,0
5,42,21.0,0,1,0,0
6,42,21.0,0,1,0,0


In [8]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [6]:
# Display the frame once more; the categorical columns still hold strings here.
patients

Unnamed: 0,sex,age,BMI,sports activity
0,male,35,20.0,medium
1,male,46,20.0,medium
2,female,42,19.5,high
3,male,15,19.0,low
4,female,36,28.0,low
5,unknown,42,21.0,high
6,unknown,42,21.0,high


In [9]:
# Replace each categorical column by integer codes, in place.
#
# A separate LabelEncoder is fitted per column and kept in `label_encoders`,
# so the mapping of every column stays available afterwards (for example via
# `label_encoders['sex'].inverse_transform(...)` or `.classes_`). Re-fitting a
# single shared encoder would discard the mapping of the first column.
#
# NOTE: scikit-learn documents LabelEncoder as a tool for target labels (y);
# for input features OrdinalEncoder is the documented equivalent.
label_encoders = {}
for column in ('sex', 'sports activity'):
    # After the loop `label` refers to the encoder fitted on 'sports activity'.
    label = LabelEncoder()
    patients[column] = label.fit_transform(patients[column])
    label_encoders[column] = label
patients


Unnamed: 0,sex,age,BMI,sports activity
0,1,35,20.0,2
1,1,46,20.0,2
2,0,42,19.5,0
3,1,15,19.0,1
4,0,36,28.0,1
5,2,42,21.0,0
6,2,42,21.0,0


In [10]:
# Rebuild the original frame with string categories, because the previous
# cell overwrote 'sex' and 'sports activity' with integer codes.
patients = pd.DataFrame(
    {
        "sex": ["male", "male", "female", "male", "female", "unknown", "unknown"],
        "age": [35, 46, 42, 15, 36, 42, 42],
        "BMI": [20, 20, 19.5, 19, 28, 21, 21],
        "sports activity": ["medium", "medium", "high", "low", "low", "high", "high"],
    }
)
patients

Unnamed: 0,sex,age,BMI,sports activity
0,male,35,20.0,medium
1,male,46,20.0,medium
2,female,42,19.5,high
3,male,15,19.0,low
4,female,36,28.0,low
5,unknown,42,21.0,high
6,unknown,42,21.0,high


In [11]:
# Encode 'sports activity' as numeric codes. The double brackets select a
# one-column DataFrame, i.e. 2-D input for the encoder.
# Without an explicit `categories` argument the codes follow the sorted
# (alphabetical) category order: high=0.0, low=1.0, medium=2.0. To encode the
# natural order instead, pass categories=[['low', 'medium', 'high']].
ordinal = OrdinalEncoder()
activity_codes = ordinal.fit_transform(patients[['sports activity']])
patients['sports activity'] = activity_codes
patients


Unnamed: 0,sex,age,BMI,sports activity
0,male,35,20.0,2.0
1,male,46,20.0,2.0
2,female,42,19.5,0.0
3,male,15,19.0,1.0
4,female,36,28.0,1.0
5,unknown,42,21.0,0.0
6,unknown,42,21.0,0.0
