In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import  LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes

import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')
#print('>>>X shape {}'.format(data['data'].shape))
#print('>>>Y shape {}'.format(data['target'].shape))
#print(data['DESCR'])

In [7]:
Y = data['Y']
X = data.drop(columns=['Y'])
X = pd.get_dummies(X, columns=['SEX'])

In [8]:
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
#print(">>> # of Trained data: {}".format(len(train_idx)))
#print(">>> # of Validd data: {}".format(len(valid_idx)))

In [9]:
scaler = MinMaxScaler().fit(X.iloc[train_idx])
X_scal = scaler.transform(X)
X_scal = pd.DataFrame(X_scal, columns=X.columns)

In [10]:
import numpy as np
import pandas as pd
import scipy.stats
from sklearn import metrics

def sse(clf, X, y):
    y_hat = clf.predict(X)
    return np.sum((y_hat - y) ** 2) / X.shape[0]

def adjr_r2_score(clf, X, y):
    n, p = X.shape
    r_squared = metrics.r2_score(y, clf.predict(X))
    return 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))

def coef_se(clf, X, y):
    n = X.shape[0]
    # np.matrix 대신 np.array와 hstack 사용 권장
    X1 = np.column_stack((np.ones(n), X))
    # linalg.inv와 MSE를 이용한 표준오차 계산
    mse = metrics.mean_squared_error(y, clf.predict(X))
    # xtx_inv = np.linalg.inv(X1.T @ X1)
    var_beta = mse * np.linalg.inv(np.dot(X1.T, X1))
    return np.sqrt(np.diagonal(var_beta))

def coef_tval(clf, X, y):
    se = coef_se(clf, X, y)
    t_intercept = clf.intercept_ / se[0]
    t_coefs = clf.coef_ / se[1:]
    return np.append(t_intercept, t_coefs)

def coef_pval(clf, X, y):
    n, p = X.shape
    t = coef_tval(clf, X, y)
    # 자유도는 n - p - 1 사용
    return 2 * (1 - scipy.stats.t.cdf(np.abs(t), n - p - 1))

def summary(clf, X, y, xlabels=None):
    n, p = X.shape
    if xlabels is None:
        xlabels = [f'x{i}' for i in range(1, p + 1)]
    elif len(xlabels) != p:
        raise AssertionError(f"Dimension of xlabels {len(xlabels)} doesn't match X {p}")

    # 데이터 프레임 구성
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    
    # 통계값 계산 (중복 연산을 줄이기 위해 변수에 담아 사용)
    se = coef_se(clf, X, y)
    t_val = coef_tval(clf, X, y)
    p_val = coef_pval(clf, X, y)
    estimates = np.append(clf.intercept_, clf.coef_)

    coef_df['Estimate'] = np.round(estimates, 6)
    coef_df['Std. Error'] = np.round(se, 6)
    coef_df['t value'] = np.round(t_val, 6)
    coef_df['p value'] = np.round(p_val, 6)
    
    print('Coefficients:')
    print(coef_df.to_string(index=True))

In [12]:
penalty = [0.0000001, 0.00000005]

for a in penalty:
    model = Lasso(alpha=a).fit(X_scal.iloc[train_idx], Y.iloc[train_idx] )
    score = model.score(X_scal.iloc[valid_idx], Y.iloc[valid_idx])
    pred_y = model.predict(X_scal.iloc[valid_idx])
    mse = mean_squared_error(Y.iloc[valid_idx], pred_y)
    print("Alpha:{0: .7f}".format(a))

Alpha: 0.0000001
Alpha: 0.0000000
