In [50]:
from sklearn import linear_model
import numpy as np
import pandas as pd

In [51]:
data = {
    'x1': [1, 4, 5, 8, 10],
    'x2': [3, 5, 7, 9, 10],
    'x3': [7, 8, 5, 7, 4],
    'y': [30, 40, 45, 60, 61]
}
data = pd.DataFrame(data)
data

Unnamed: 0,x1,x2,x3,y
0,1,3,7,30
1,4,5,8,40
2,5,7,5,45
3,8,9,7,60
4,10,10,4,61


In [52]:
linear_regression = linear_model.LinearRegression()
linear_regression.fit(X = data[['x1', 'x2', 'x3']], y = data['y'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [53]:
linear_regression.coef_

array([0.82258065, 4.09677419, 1.41935484])

In [54]:
linear_regression.intercept_

5.935483870967772

In [55]:
prediction = linear_regression.predict(X = data[['x1', 'x2', 'x3']])

In [56]:
prediction

array([28.98387097, 41.06451613, 45.82258065, 59.32258065, 60.80645161])

In [57]:
data['y']

0    30
1    40
2    45
3    60
4    61
Name: y, dtype: int64

In [58]:
linear_regression.score(X = data[['x1', 'x2', 'x3']], y = data['y']) # R^2값을 보여줌

0.9952763020975957

In [59]:
residual = prediction - data['y']; residual

0   -1.016129
1    1.064516
2    0.822581
3   -0.677419
4   -0.193548
Name: y, dtype: float64

In [60]:
MSE = (residual**2).sum()/5; MSE

0.6677419354838732

In [61]:
RMSE = ((residual**2).sum()**0.5) / 5; RMSE

0.3654427275193401

# x2와 x3를 이용해서 x1을 예측할 수 있을까?

In [62]:
linear_regression = linear_model.LinearRegression()
linear_regression.fit(X = data[['x2', 'x3']], y = data['x1'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [63]:
linear_regression.coef_

array([1.2427907 , 0.09860465])

In [64]:
linear_regression.intercept_

-3.4623255813953495

In [65]:
linear_regression.predict(X= data[['x2', 'x3']])

array([0.95627907, 3.54046512, 5.73023256, 8.41302326, 9.36      ])

In [66]:
linear_regression.score(X = data[['x2', 'x3']], y = data['x1'])

0.9730383815466063

In [67]:
# 더 쉽게 vif 구하는 법
from sklearn.linear_model import LinearRegression

def sklearn_vif(X, y):
  # X와 y에 대해서 선형회귀 함수를 구하고 r_square 리턴
  r_square = LinearRegression().fit(X, y).score(X,y)
  #VIF를 계산
  vif = 1./(1. - r_square)
  return vif

In [68]:
# x1의 vif를 계산
sklearn_vif(data[['x2', 'x3']], data['x1'])

37.08976157082763

In [69]:
# x2의 vif를 계산
sklearn_vif(data[['x1', 'x3']], data['x2'])

39.83842917251046

In [70]:
# x3의 vif를 계산
sklearn_vif(data[['x1', 'x2']], data['x3'])
# 관계가 없음

1.726788218793828

vif가 10 초과면 연관 있음
1. vif가 가장 높은 열 삭제
2. 너머지 열의 vif 계산

1, 2를 vif가 10 이상인 열이 없을 때 까지 반복

In [71]:
data = data[['x1', 'x3', 'y']]
sklearn_vif(data[['x3']], data['x1'])

1.5339491916859123

In [72]:
sklearn_vif(data[['x1']], data['x3'])

1.5339491916859125

In [73]:
linear_regression = linear_model.LinearRegression()
linear_regression.fit(X=data[['x1', 'x3']], y = data['y'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [74]:
linear_regression.score(X=data[['x1', 'x3']], y = data['y'])

0.9757257443691176

In [75]:
linear_regression.predict(X=data[['x1', 'x3']])

array([29.61778291, 42.48845266, 43.70323326, 57.4965358 , 62.69399538])

In [76]:
residual = linear_regression.predict(X = data[['x1', 'x3']]) - data['y']; residual

0   -0.382217
1    2.488453
2   -1.296767
3   -2.503464
4    1.693995
Name: y, dtype: float64

In [77]:
MSE = (residual**2).sum()/5; MSE

3.4314087759815264

In [78]:
RMSE = ((residual**2).sum()**0.5)/5; RMSE

0.8284212426032456

# 데이터들을 비율로 수정

In [81]:
df = pd.DataFrame({
    '집값':[185000, 190000, 210000, 220000, 225000],
    '아파트연식':[12, 15, 10, 5, 3]
}); df

Unnamed: 0,집값,아파트연식
0,185000,12
1,190000,15
2,210000,10
3,220000,5
4,225000,3


In [82]:
df.min()

집값       185000
아파트연식         3
dtype: int64

In [83]:
df-df.min()

Unnamed: 0,집값,아파트연식
0,0,9
1,5000,12
2,25000,7
3,35000,2
4,40000,0


In [85]:
df.max()

집값       225000
아파트연식        15
dtype: int64

In [84]:
df.max() - df.min()

집값       40000
아파트연식       12
dtype: int64

In [86]:
(df-df.min()) / (df.max()-df.min()) # 이게 MinMaxScaler의 원리!

Unnamed: 0,집값,아파트연식
0,0.0,0.75
1,0.125,1.0
2,0.625,0.583333
3,0.875,0.166667
4,1.0,0.0


In [87]:
df

Unnamed: 0,집값,아파트연식
0,185000,12
1,190000,15
2,210000,10
3,220000,5
4,225000,3


In [89]:
from sklearn.preprocessing import MinMaxScaler

In [90]:
scaler = MinMaxScaler()

In [91]:
scaler.fit(df)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [92]:
scaler.transform(df)

array([[0.        , 0.75      ],
       [0.125     , 1.        ],
       [0.625     , 0.58333333],
       [0.875     , 0.16666667],
       [1.        , 0.        ]])