### patsy를 이용한 데이터 프레임 변환 (수식으로 디자인 행렬 만들기)

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import missingno as msno
import koreanize_matplotlib

In [4]:
from patsy import demo_data
# Build a small demo DataFrame from patsy's sample-data helper; the displayed
# output shows six numeric columns (x1..x6) and five rows.
df = pd.DataFrame(demo_data('x1','x2','x3','x4','x5', 'x6'))
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,1.764052,-0.977278,0.144044,0.333674,-2.55299,-1.454366
1,0.400157,0.950088,1.454274,1.494079,0.653619,0.045759
2,0.978738,-0.151357,0.761038,-0.205158,0.864436,-0.187184
3,2.240893,-0.103219,0.121675,0.313068,-0.742165,1.532779
4,1.867558,0.410599,0.443863,-0.854096,2.269755,1.469359


In [7]:
from patsy import dmatrix
# Build a design matrix from columns x1, x2 and x3. The trailing "+0" in the
# formula suppresses the Intercept column that patsy would otherwise add.
dmatrix('x1 + x2 + x3 +0', data=df)

DesignMatrix with shape (5, 3)
       x1        x2       x3
  1.76405  -0.97728  0.14404
  0.40016   0.95009  1.45427
  0.97874  -0.15136  0.76104
  2.24089  -0.10322  0.12168
  1.86756   0.41060  0.44386
  Terms:
    'x1' (column 0)
    'x2' (column 1)
    'x3' (column 2)

In [8]:
# Without "+0" in the formula, patsy prepends an Intercept column of ones.
dmatrix('x1 + x2 + x3', data=df)

DesignMatrix with shape (5, 4)
  Intercept       x1        x2       x3
          1  1.76405  -0.97728  0.14404
          1  0.40016   0.95009  1.45427
          1  0.97874  -0.15136  0.76104
          1  2.24089  -0.10322  0.12168
          1  1.86756   0.41060  0.44386
  Terms:
    'Intercept' (column 0)
    'x1' (column 1)
    'x2' (column 2)
    'x3' (column 3)

In [9]:
# A term may be an arbitrary Python expression (here NumPy calls); np.abs
# avoids taking the log of a negative number.
dmatrix('x1 + np.log(np.abs(x2))', data=df)

DesignMatrix with shape (5, 3)
  Intercept       x1  np.log(np.abs(x2))
          1  1.76405            -0.02298
          1  0.40016            -0.05120
          1  0.97874            -1.88811
          1  2.24089            -2.27090
          1  1.86756            -0.89014
  Terms:
    'Intercept' (column 0)
    'x1' (column 1)
    'np.log(np.abs(x2))' (column 2)

In [20]:
# Function version: scale the input data by a factor of 5.
def five_times(x):
    """Return ``x`` multiplied by 5."""
    scaled = 5 * x
    return scaled


# The formula can call five_times by name because patsy evaluates terms in
# the calling namespace.
dmatrix('five_times(x1) + x1',data=df)

DesignMatrix with shape (5, 3)
  Intercept  five_times(x1)       x1
          1         8.82026  1.76405
          1         2.00079  0.40016
          1         4.89369  0.97874
          1        11.20447  2.24089
          1         9.33779  1.86756
  Terms:
    'Intercept' (column 0)
    'five_times(x1)' (column 1)
    'x1' (column 2)

In [21]:
# Multiply by 5 using a lambda bound to the variable name `a`.
a = lambda x: 5*x

# Any callable visible by name in the calling scope can be used in a formula;
# the resulting column is labelled with the expression text, 'a(x1)'.
dmatrix('a(x1) + x1',data=df)

DesignMatrix with shape (5, 3)
  Intercept     a(x1)       x1
          1   8.82026  1.76405
          1   2.00079  0.40016
          1   4.89369  0.97874
          1  11.20447  2.24089
          1   9.33779  1.86756
  Terms:
    'Intercept' (column 0)
    'a(x1)' (column 1)
    'x1' (column 2)

In [22]:
# "x1:x2" is the interaction term: the element-wise product of x1 and x2.
dmatrix('x1 + x2 + x1:x2 + 0', data=df)

DesignMatrix with shape (5, 3)
       x1        x2     x1:x2
  1.76405  -0.97728  -1.72397
  0.40016   0.95009   0.38018
  0.97874  -0.15136  -0.14814
  2.24089  -0.10322  -0.23130
  1.86756   0.41060   0.76682
  Terms:
    'x1' (column 0)
    'x2' (column 1)
    'x1:x2' (column 2)

In [24]:
# NOTE: "x1*x2" is not just the product. In a patsy formula it is shorthand
# for "x1 + x2 + x1:x2" (both main effects plus their interaction). The
# repeated x1 and x2 terms are de-duplicated, which is why the output matches
# the formula "x1 + x2 + x1:x2 + 0".
dmatrix('x1 + x2 + x1*x2 + 0', data=df)

DesignMatrix with shape (5, 3)
       x1        x2     x1:x2
  1.76405  -0.97728  -1.72397
  0.40016   0.95009   0.38018
  0.97874  -0.15136  -0.14814
  2.24089  -0.10322  -0.23130
  1.86756   0.41060   0.76682
  Terms:
    'x1' (column 0)
    'x2' (column 1)
    'x1:x2' (column 2)

In [26]:
# Intended to express the arithmetic sum x3 + x4, but inside a formula "+"
# only adds terms, so x3 and x4 come out as two separate columns.
dmatrix('x1 + x2 + x3 +x4 + 0', data=df) 

DesignMatrix with shape (5, 4)
       x1        x2       x3        x4
  1.76405  -0.97728  0.14404   0.33367
  0.40016   0.95009  1.45427   1.49408
  0.97874  -0.15136  0.76104  -0.20516
  2.24089  -0.10322  0.12168   0.31307
  1.86756   0.41060  0.44386  -0.85410
  Terms:
    'x1' (column 0)
    'x2' (column 1)
    'x3' (column 2)
    'x4' (column 3)

In [27]:
# To add columns arithmetically, wrap the expression in I(...): the contents
# are then evaluated as ordinary Python, giving a single column x1 + x2.
dmatrix('x1 + x2 + I(x1+x2)+0', data = df)

DesignMatrix with shape (5, 3)
       x1        x2  I(x1 + x2)
  1.76405  -0.97728     0.78677
  0.40016   0.95009     1.35025
  0.97874  -0.15136     0.82738
  2.24089  -0.10322     2.13767
  1.86756   0.41060     2.27816
  Terms:
    'x1' (column 0)
    'x2' (column 1)
    'I(x1 + x2)' (column 2)

In [31]:
# center(x1) subtracts the column mean so the result has mean 0.
# Here the mean of x1 is about 1.45 (e.g. 1.76405 - 1.45028 = 0.31377).
dmatrix('x1 + center(x1)+0',data=df)

DesignMatrix with shape (5, 2)
       x1  center(x1)
  1.76405     0.31377
  0.40016    -1.05012
  0.97874    -0.47154
  2.24089     0.79061
  1.86756     0.41728
  Terms:
    'x1' (column 0)
    'center(x1)' (column 1)

In [30]:
# standardize(x1): shift to mean 0 and rescale to standard deviation 1.
# NOTE(review): the values shown are consistent with a population standard
# deviation (ddof=0), which is patsy's documented default — confirm if the
# sample standard deviation is wanted instead.
dmatrix('x1 + standardize(x1)', data=df)

DesignMatrix with shape (5, 3)
  Intercept       x1  standardize(x1)
          1  1.76405          0.47060
          1  0.40016         -1.57500
          1  0.97874         -0.70723
          1  2.24089          1.18578
          1  1.86756          0.62584
  Terms:
    'Intercept' (column 0)
    'x1' (column 1)
    'standardize(x1)' (column 2)

In [34]:
# xs holds the standardized x1 column (no "+0", so the default Intercept
# column is included as well).
xs = dmatrix('standardize(x1)', data=df)

# design_info.factor_infos maps each formula factor to its FactorInfo, which
# records the factor type, its number of columns and the stored transform
# state used to build the matrix.
xs.design_info.factor_infos

{EvalFactor('standardize(x1)'): FactorInfo(factor=EvalFactor('standardize(x1)'),
            type='numerical',
            state=<factor state>,
            num_columns=1)}