## 1. OLS 

In [3]:
from sklearn import linear_model, datasets
import numpy as np

In [2]:
# Ordinary least squares estimator, constructed with default settings.
ols = linear_model.LinearRegression()

In [4]:
# return_X_y=True makes the loader hand back the feature matrix and the target
# as a pair, which is unpacked directly here.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

In [5]:
# Fit on the full diabetes data; the returned estimator is shown as the cell output.
ols.fit(X=diabetes_X, y=diabetes_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Display the fitted coefficients (one weight per feature).
ols.coef_

array([ -10.01219782, -239.81908937,  519.83978679,  324.39042769,
       -792.18416163,  476.74583782,  101.04457032,  177.06417623,
        751.27932109,   67.62538639])

In [8]:
# Predict for the first sample only; slicing (rather than indexing) keeps the input 2-D.
y_pred = ols.predict(X=diabetes_X[0:1])
y_pred

array([206.11706979])

## 2. Ridge $L_2$-norm

$$ \min_{w} ||Xw-y||_2^2 + \alpha ||w||_2^2 $$

In [12]:
# Ridge estimator; alpha is the weight of the squared L2 penalty in the objective above.
ridge = linear_model.Ridge(alpha=0.5)

In [13]:
# Fit the ridge model on the same data used for OLS so the coefficients are comparable.
ridge.fit(X=diabetes_X, y=diabetes_y)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [15]:
# Compare with the OLS coefficients above: several are much smaller in magnitude here.
ridge.coef_

array([  20.13735735, -131.24260575,  383.48178285,  244.83787184,
        -15.18705619,  -58.34479839, -174.84279842,  121.98505458,
        328.49970219,  110.88603567])

In [17]:
# Ridge prediction for the first sample, for comparison with the OLS prediction.
ridge_pred = ridge.predict(X=diabetes_X[0:1])
ridge_pred

array([189.82193827])

#### Choose the best tuning parameter  $\alpha$

In [30]:
# Candidate penalty strengths for cross-validation: 0.1, 0.6, ..., 9.6 (step 0.5).
a = np.arange(0.1, 10.0, 0.5)

In [31]:
# Ridge with built-in cross-validation over the candidate alphas in `a`.
ridge_CV = linear_model.RidgeCV(alphas=a)

In [32]:
# Fitting runs the cross-validated search over the alpha grid.
ridge_CV.fit(X=diabetes_X, y=diabetes_y)

RidgeCV(alphas=array([0.1, 0.6, 1.1, 1.6, 2.1, 2.6, 3.1, 3.6, 4.1, 4.6, 5.1, 5.6, 6.1,
       6.6, 7.1, 7.6, 8.1, 8.6, 9.1, 9.6]),
        cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
        scoring=None, store_cv_values=False)

In [36]:
# Penalty strength selected by cross-validation from the candidate grid `a`.
# NOTE(review): the displayed result is the smallest candidate, so the optimum may lie
# below the grid — consider extending the grid toward smaller values.
ridge_CV.alpha_

0.1

## 3. Lasso $L_1$-norm 

$$\min_{w} \frac{1}{2N} ||Xw - y||_2^2 + \alpha ||w||_1$$

In [39]:
# L1-penalised regression; fit() returns the estimator, so construction and fitting chain.
lasso = linear_model.Lasso(alpha=0.1).fit(X=diabetes_X, y=diabetes_y)
lasso.coef_  # some coefficients are driven exactly to zero

array([  -0.        , -155.36288234,  517.18201661,  275.08235083,
        -52.54026923,   -0.        , -210.15975349,    0.        ,
        483.91440913,   33.67282148])

## 4. Elastic Nets

In [48]:
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNetCV

# Synthetic two-feature regression problem; the fixed seed keeps the data reproducible.
X, y = make_regression(n_features=2, random_state=0)

# Elastic net whose penalty strength is chosen by cross-validation (cv=10).
regr = ElasticNetCV(cv=10, random_state=0).fit(X, y)

# Report, one per line: the selected alpha, the fitted intercept, and the
# prediction at the origin (which coincides with the intercept).
print(regr.alpha_, regr.intercept_, regr.predict([[0, 0]]), sep="\n")

0.1994727942696716
0.3988829654276791
[0.39888297]


## 5. Bayesian Regression 

## 6. Logistic Regression

In [None]:
# Reference signature, as pasted from the scikit-learn documentation. It is kept
# as a comment because it is not valid Python and would raise a SyntaxError when
# the notebook is run top to bottom:
#
#   class sklearn.linear_model.LogisticRegression(penalty='l2',
#             dual=False, tol=0.0001, C=1.0, fit_intercept=True,
#             intercept_scaling=1, class_weight=None,
#             random_state=None, solver='liblinear', max_iter=100,
#             multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
#
# Runnable equivalent: only the parameters discussed in the notes below are
# spelled out; every other argument keeps the library default.
# NOTE(review): defaults differ between scikit-learn releases — confirm against
# the installed version before relying on the omitted arguments.
log_reg = linear_model.LogisticRegression(
    penalty='l2',
    dual=False,
    C=1.0,
    solver='liblinear',
)

dual=False : 选择对偶形式或原始形式求解。对偶形式（dual=True）只适用于正则化项为l2且solver为‘liblinear’的情况；当样本数大于特征数时，建议使用dual=False（默认值）。

C=1.0 : C为正则化系数λ的倒数，必须为正数，默认为1。和SVM中的C一样，值越小，代表正则化越强。

solver='liblinear' : solver参数决定了我们对逻辑回归损失函数的优化方法，有四种算法可以选择。

    - liblinear：使用了开源的liblinear库实现，内部使用了坐标轴下降法来迭代优化损失函数。
    - lbfgs：拟牛顿法的一种，利用损失函数二阶导数矩阵（即海森矩阵）的近似来迭代优化损失函数。
    - newton-cg：也是牛顿法家族的一种，利用损失函数二阶导数矩阵即海森矩阵来迭代优化损失函数
    - sag：随机平均梯度下降，是梯度下降法的变种，和普通梯度下降法的区别是每次迭代仅仅用一部分的样本来计算梯度，适合于样本数据多的时候
