# statsmodels.tools.tools.categorical

公式ドキュメントは[こちら](https://www.statsmodels.org/stable/generated/statsmodels.tools.tools.categorical.html)

## 紹介する機能

- DataFrameのcategorical変数について、valueに応じたdummy変数を作成する
- `pandas.get_dummies`と`sklearn.preprocessing.OneHotEncoder`でも同様のdummy変数を作成する

### Dummy変数とOne-hot encoding

Dummy変数とOne-hot encodingは、どちらもcategorical変数の各valueを0/1の列で表現する方法であり、ここでは同じものとして扱う

### Import

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder
import string
import sys

### 環境

In [2]:
# Record the Python interpreter version used to produce the outputs below.
print(sys.version)

3.7.4 (default, Sep  7 2019, 18:45:40) 
[Clang 9.0.0 (clang-900.0.39.2)]


## statsmodels.tools.tools.categorical
### syntax

`statsmodels.tools.tools.categorical(data, col=None, dictnames=False, drop=False)`


- data: DataFrame or array
- col: dataとしてDataFrame/structured arrayを渡した場合、対象のcolumn nameを指定する必要がある
- drop: Boolean, 元のcategorical variableを返り値に保持するかしないか（Trueの場合は保持しない）

In [3]:
# Five consecutive 5-letter slices of the lowercase alphabet: 'abcde' .. 'uvwxy'.
string_var = [string.ascii_lowercase[start:start + 5] for start in range(0, 25, 5)]
string_var

['abcde', 'fghij', 'klmno', 'pqrst', 'uvwxy']

In [4]:
# Repeat every label five times and sort so that identical labels are adjacent,
# then convert the list to a NumPy array.
string_var = np.asarray(sorted(string_var * 5))
string_var

array(['abcde', 'abcde', 'abcde', 'abcde', 'abcde', 'fghij', 'fghij',
       'fghij', 'fghij', 'fghij', 'klmno', 'klmno', 'klmno', 'klmno',
       'klmno', 'pqrst', 'pqrst', 'pqrst', 'pqrst', 'pqrst', 'uvwxy',
       'uvwxy', 'uvwxy', 'uvwxy', 'uvwxy'], dtype='<U5')

In [5]:
# Encode the string labels as indicator columns. With drop=True the result
# shown below contains only the dummy columns, one per distinct label.
# NOTE(review): categorical() is reported as deprecated in newer statsmodels
# releases — confirm against the installed version before reusing this.
design = sm.tools.categorical(string_var, drop=True)
design

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

### numerical version

In [6]:
# Numeric labels: 10, 12, ..., 58 divided by 10 and floored gives the values
# 1.0 through 5.0, five occurrences of each.
instr = np.floor(np.arange(10,60, step=2)/10)
# Same encoding as for the string labels, applied to a float array.
design = sm.tools.categorical(instr, drop=True)
design

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

### structured array

- `a5`: 長さ5のbyte string（ASCII）を表すdtype
- structured arrayへの対応は今後supportされなくなるので覚えなくても良い

In [7]:
# 25 x 2 standard-normal draws; one column per float field filled in later.
num = np.random.randn(25,2)
# Zero-initialised (25, 1) structured array with three float32 fields and one
# 5-byte string field ('a5').
struct_ar = np.zeros((25,1),
                      dtype=[('var1', 'f4'),('var2', 'f4'),
                             ('instrument','f4'),('str_instr','a5')])
#struct_ar.dtype

In [8]:
# [:, None] reshapes each length-25 vector into a (25, 1) column so it matches
# the shape struct_ar was allocated with.
struct_ar['var1'] = num[:,0][:,None]
struct_ar['var2'] = num[:,1][:,None]
struct_ar['instrument'] = instr[:,None]
struct_ar['str_instr'] = string_var[:,None]
# col selects the field to encode; with drop=False the output below keeps the
# original fields and appends the dummy columns after them.
design = sm.tools.categorical(struct_ar, col='instrument', drop=False)
design






array([(-0.78529596,  2.2280085 , 1., b'abcde', 1., 0., 0., 0., 0.),
       (-0.706078  ,  1.75135   , 1., b'abcde', 1., 0., 0., 0., 0.),
       ( 0.16763344, -0.9354832 , 1., b'abcde', 1., 0., 0., 0., 0.),
       (-0.81468207, -0.23331353, 1., b'abcde', 1., 0., 0., 0., 0.),
       (-0.3638719 ,  1.2785684 , 1., b'abcde', 1., 0., 0., 0., 0.),
       ( 0.52294135,  0.85142905, 2., b'fghij', 0., 1., 0., 0., 0.),
       (-0.66725355, -0.7138105 , 2., b'fghij', 0., 1., 0., 0., 0.),
       (-0.65153354, -1.131404  , 2., b'fghij', 0., 1., 0., 0., 0.),
       (-1.0979321 ,  0.34632787, 2., b'fghij', 0., 1., 0., 0., 0.),
       (-0.2588755 , -1.1170781 , 2., b'fghij', 0., 1., 0., 0., 0.),
       ( 1.2956228 ,  0.88263744, 3., b'klmno', 0., 0., 1., 0., 0.),
       (-0.21507294, -1.2486111 , 3., b'klmno', 0., 0., 1., 0., 0.),
       ( 2.4116175 , -1.5807852 , 3., b'klmno', 0., 0., 1., 0., 0.),
       (-0.09945375, -0.38486436, 3., b'klmno', 0., 0., 1., 0., 0.),
       ( 0.33993727, -1.3418326 , 

## sklearn.preprocessing.OneHotEncoder
### syntax
```
sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error')
```



In [9]:
# handle_unknown='ignore': values not seen during fit are encoded as all
# zeros instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
X = [['Male', 1], ['Female', 3], ['Female', 2]]
# NOTE(review): fit() receives np.array(X), so the group labels are learned as
# the strings '1', '2', '3' (see categories_ below), while the later cells pass
# the raw list X to transform(). The group columns in those outputs are all
# zero, presumably because the int labels no longer match the string
# categories — confirm, and consider transforming np.array(X) instead.
enc.fit(np.array(X))
enc.categories_

[array(['Female', 'Male'], dtype='<U6'), array(['1', '2', '3'], dtype='<U6')]

In [10]:
# transform() returns a sparse matrix here (see the repr below).
enc.transform(X)

<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [11]:
# Convert the sparse result to a dense array to inspect the encoded values.
enc.transform(X).toarray()

array([[0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [12]:
# Column labels: the two raw inputs followed by the encoder's dummy names.
# NOTE(review): newer scikit-learn releases replace get_feature_names with
# get_feature_names_out — confirm against the installed version.
col = ['gender', 'group'] + list(enc.get_feature_names(['gender', 'group']))
# Place the raw values and their encoded columns side by side in one table.
pd.DataFrame(
    np.hstack([np.array(X), enc.transform(X).toarray()]),
    columns=col,
)

Unnamed: 0,gender,group,gender_Female,gender_Male,group_1,group_2,group_3
0,Male,1,0.0,1.0,0.0,0.0,0.0
1,Female,3,1.0,0.0,0.0,0.0,0.0
2,Female,2,1.0,0.0,0.0,0.0,0.0


## pandas.get_dummies

- こっちの方が使い勝手が良い

In [13]:
# Small example Series in which the label 'a' appears twice.
s = pd.Series(['a', 'b', 'c', 'a'])
s

0    a
1    b
2    c
3    a
dtype: object

In [14]:
# One indicator column per distinct value in s.
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


### NaNの処理

In [15]:
# With the default settings the NaN entry gets no column of its own; its row
# is all zeros in the output below.
s1 = ['a', 'b', np.nan]
pd.get_dummies(s1)

Unnamed: 0,a,b
0,1,0
1,0,1
2,0,0


In [16]:
# dummy_na=True adds a separate indicator column for the missing value.
pd.get_dummies(s1, dummy_na=True)

Unnamed: 0,a,b,NaN
0,1,0,0
1,0,1,0
2,0,0,1


### Drop First

In [17]:
# drop_first=True omits the indicator for the first level ('a'), leaving only
# the 'b' and 'c' columns.
pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)

Unnamed: 0,b,c
0,0,0
1,1,0
2,0,1
3,0,0
4,0,0


### DataFrameとDummy

- Convert categorical variable into dummy/indicator variables.
- numericなcolumnはdummy化されず、そのまま残る

In [18]:
# Two string columns (A, B) and one integer column (C).
df = pd.DataFrame(dict(A=list('aba'), B=list('bac'), C=[1, 2, 3]))
# One prefix per string column; the integer column C is passed through as is.
pd.get_dummies(df, prefix=['col1', 'col2'])

Unnamed: 0,C,col1_a,col1_b,col2_a,col2_b,col2_c
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


numericなcolumnも、一度category型へconvertすればdummy化することが可能

In [19]:
# Cast the integer column to the category dtype so that get_dummies encodes
# it too; a third prefix is supplied for the newly encoded column.
df['C'] = df['C'].astype('category')
pd.get_dummies(df, prefix=['col1', 'col2', 'col_3'])

Unnamed: 0,col1_a,col1_b,col2_a,col2_b,col2_c,col_3_1,col_3_2,col_3_3
0,1,0,0,1,0,1,0,0
1,0,1,1,0,0,0,1,0
2,1,0,0,0,1,0,0,1
