In [1]:
import pandas as pd
import seaborn as sns

# Kategorien vorbereiten

Hier geht es darum, die Kategorien so zu bearbeiten, dass sie als Zahlen verarbeitet werden können.

In [12]:
# Load seaborn's "exercise" example dataset and preview its first ten rows.
df_ex = sns.load_dataset("exercise")
df_ex.head(n=10)

Unnamed: 0.1,Unnamed: 0,id,diet,pulse,time,kind
0,0,1,low fat,85,1 min,rest
1,1,1,low fat,85,15 min,rest
2,2,1,low fat,88,30 min,rest
3,3,2,low fat,90,1 min,rest
4,4,2,low fat,92,15 min,rest
5,5,2,low fat,93,30 min,rest
6,6,3,low fat,97,1 min,rest
7,7,3,low fat,97,15 min,rest
8,8,3,low fat,94,30 min,rest
9,9,4,low fat,80,1 min,rest


In [3]:
# Number of rows per distinct value of the `diet` column.
df_ex["diet"].value_counts()

no fat     45
low fat    45
Name: diet, dtype: int64

In [4]:
# Number of rows per distinct value of the `time` column.
df_ex["time"].value_counts()

1 min     30
15 min    30
30 min    30
Name: time, dtype: int64

In [5]:
# Number of rows per distinct value of the `kind` column.
df_ex["kind"].value_counts()

rest       30
walking    30
running    30
Name: kind, dtype: int64

### Ansatz 1: Werte ersetzen

In [6]:
# Hand-picked numeric value for every category label.
diet_codes = {"no fat": 0, "low fat": 1}
time_minutes = {"1 min": 1, "15 min": 15, "30 min": 30}

# Build a new frame (df_ex itself is left untouched) in which `diet` and `time`
# are genuine integer columns.
# Why map + astype instead of DataFrame.replace: when the source columns are
# pandas categoricals, replace() only renames the categories, so the columns
# stay categorical (and pandas >= 2.2 warns that this usage is deprecated).
# map() followed by astype("int64") produces real integers for both plain
# string columns and categorical columns.
# A label missing from the mapping becomes NaN and makes astype raise, so an
# unexpected category is reported instead of passing through silently.
df_ex_replace = df_ex.assign(
    diet=df_ex["diet"].map(diet_codes).astype("int64"),
    time=df_ex["time"].map(time_minutes).astype("int64"),
)
df_ex_replace

Unnamed: 0.1,Unnamed: 0,id,diet,pulse,time,kind
0,0,1,1,85,1,rest
1,1,1,1,85,15,rest
2,2,1,1,88,30,rest
3,3,2,1,90,1,rest
4,4,2,1,92,15,rest
...,...,...,...,...,...,...
85,85,29,0,135,15,running
86,86,29,0,130,30,running
87,87,30,0,99,1,running
88,88,30,0,111,15,running


### Ansatz 2: `Categorical`

`Categorical` erfasst die verschiedenen Kategorien einer Spalte und weist jeder Kategorie eine ganze Zahl (ihren Code) zu.

In [13]:
# Derive a separate frame whose `diet` and `time` columns use the pandas
# `category` dtype; astype returns a new object, so df_ex is not modified.
categorical_columns = ["diet", "time"]
df_ex_cat = df_ex.astype({column: "category" for column in categorical_columns})

# Confirm the dtype of the two converted columns.
df_ex_cat.dtypes[categorical_columns]

diet    category
time    category
dtype: object

In [14]:
# Store the integer code of each category in a companion column next to the
# original categorical column.
for source, target in (("diet", "diet_encoded"), ("time", "time_encoded")):
    df_ex_cat[target] = df_ex_cat[source].cat.codes
df_ex_cat

Unnamed: 0.1,Unnamed: 0,id,diet,pulse,time,kind,diet_encoded,time_encoded
0,0,1,low fat,85,1 min,rest,1,0
1,1,1,low fat,85,15 min,rest,1,1
2,2,1,low fat,88,30 min,rest,1,2
3,3,2,low fat,90,1 min,rest,1,0
4,4,2,low fat,92,15 min,rest,1,1
...,...,...,...,...,...,...,...,...
85,85,29,no fat,135,15 min,running,0,1
86,86,29,no fat,130,30 min,running,0,2
87,87,30,no fat,99,1 min,running,0,0
88,88,30,no fat,111,15 min,running,0,1


Betrachte die Werte von `time_encoded`: Bei Ansatz 1 stand in der Spalte `time` 1, 15 oder 30, jetzt steht dort 0, 1 oder 2. Die Kategorien bleiben unterscheidbar, aber Ansatz 1 behält die tatsächlichen Minutenwerte und damit auch die Abstände zwischen den Kategorien bei.

### Ansatz 3: One-hot encoding

Diese Variante erstellt für jeden möglichen Wert einer Kategorie eine eigene Spalte, die 1 enthält, wenn die Zeile diesen Wert hat, und sonst 0.

In [15]:
# One-hot encode `diet` and `time`: each distinct value becomes its own
# indicator column, and the two original columns are dropped from the result.
# dtype is pinned to uint8 (the default before pandas 2.0) because newer pandas
# versions return boolean True/False columns by default; pinning keeps the
# indicators numeric 0/1 on every pandas version.
onehot_columns = ["diet", "time"]
df_ex_onehot = pd.get_dummies(df_ex, columns=onehot_columns, dtype="uint8")
df_ex_onehot

Unnamed: 0.1,Unnamed: 0,id,pulse,kind,diet_no fat,diet_low fat,time_1 min,time_15 min,time_30 min
0,0,1,85,rest,0,1,1,0,0
1,1,1,85,rest,0,1,0,1,0
2,2,1,88,rest,0,1,0,0,1
3,3,2,90,rest,0,1,1,0,0
4,4,2,92,rest,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
85,85,29,135,running,1,0,0,1,0
86,86,29,130,running,1,0,0,0,1
87,87,30,99,running,1,0,1,0,0
88,88,30,111,running,1,0,0,1,0
