In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor, Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 线性回归-梯度下降

In [9]:
"""
线性回归直接预测房子价格
:return: None
"""
# 获取数据
lb = load_boston()

# 分割数据集到训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25, random_state=1)
print(x_train.shape)
print('-'*50)

# 进行标准化处理（？）目标值处理？
# 特征值和目标值是都必须进行标准化处理, 实例化两个标准化API
std_x = StandardScaler()

x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# 目标值进行了标准化
std_y = StandardScaler()

y_train = std_y.fit_transform(y_train.reshape(-1, 1))  # 目标值是1维的，这里需要传进去2维的（不然会报错）
y_test = std_y.transform(y_test.reshape(-1, 1))
# y_train

(379, 13)
--------------------------------------------------



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [10]:
# 梯度下降去进行房价预测,适合数据量大的情况
# eta0：学习率的初始值，然后依照learning_rate参数进行调整
# learning_rate='optimal',alpha会影响学习率的值，由alpha来算学习率的值eta
# learning_rate其他参数：constant, invscaling, adaptive
sgd = SGDRegressor(eta0=0.008, penalty='l1', alpha=0.005)

# 训练
sgd.fit(x_train, y_train)

print('梯度下降的回归系数', sgd.coef_)

y_sgd_predict= std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))  # 实际想要的预测值
y_predict = sgd.predict(x_test)  # 可以理解为中间值（标准化所导致的结果）

print("梯度下降测试集里面每个房子的预测价格：", y_sgd_predict)
print("梯度下降的均方误差：", mean_squared_error(y_test, y_predict))
print("梯度下降的 原始房价量纲下的 均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

梯度下降的回归系数 [-8.62617279e-02  7.56301002e-02 -2.13600812e-02  8.03294819e-02
 -1.62212710e-01  2.67674115e-01 -3.88068948e-04 -2.34792854e-01
  9.24161772e-02 -2.04656204e-02 -2.16070203e-01  6.45379155e-02
 -4.17080897e-01]
梯度下降测试集里面每个房子的预测价格： [[30.06878008]
 [28.03564563]
 [18.2126237 ]
 [22.36802435]
 [18.64656047]
 [20.89019439]
 [29.86841938]
 [18.68845724]
 [23.82621647]
 [26.79536392]
 [26.36166509]
 [29.2015343 ]
 [21.56848569]
 [25.52331269]
 [22.94749063]
 [19.67705078]
 [17.3061031 ]
 [37.67184741]
 [29.54333854]
 [10.16397073]
 [20.87063889]
 [17.75173605]
 [25.3256753 ]
 [25.06966501]
 [30.13405974]
 [11.11780749]
 [14.58270442]
 [19.30903763]
 [35.51728648]
 [14.23157016]
 [23.7809595 ]
 [14.73633987]
 [40.14287256]
 [18.33715077]
 [24.01893959]
 [20.97845519]
 [17.92519425]
 [27.99059691]
 [ 8.44284556]
 [19.59542355]
 [26.26260032]
 [21.97487711]
 [28.50906763]
 [15.74006161]
 [18.78359248]
 [15.41977365]
 [39.67751151]
 [17.87569655]
 [25.81747467]
 [20.95283479]
 [24.89

  y = column_or_1d(y, warn=True)


# 岭回归

In [11]:
# 岭回归去进行房价预测
rd = Ridge(alpha=0.05)

rd.fit(x_train, y_train)

print(rd.coef_)

# 预测测试集的房子价格
y_rd_predict = std_y.inverse_transform(rd.predict(x_test))
y_predict = rd.predict(x_test)

# print("岭回归里面每个房子的预测价格：", y_rd_predict)
print("岭回归的均方误差：", mean_squared_error(y_test, y_predict))
print("岭回归的 原始房价量纲下的 均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))

[[-0.12019408  0.15027489  0.02932631  0.07472724 -0.28019156  0.22179958
   0.0218258  -0.35250679  0.29879635 -0.20224632 -0.23906031  0.06305591
  -0.45246484]]
岭回归的均方误差： 0.27588055100713926
岭回归的 原始房价量纲下的 均方误差： 21.897473825960407


# 逻辑回归

In [12]:
"""
逻辑回归做二分类进行癌症预测（根据细胞的属性特征）
:return: None
"""
# 构造列标签名字
column = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
          'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
          'Mitoses', 'Class']

# 读取数据
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=column)

print(data)

     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
694              776715                3                        1   
695              841769                2                        1   
696              888820                5                       10   
697              897471                4                        8   
698              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        

In [18]:
# 缺失值进行处理
data = data.replace(to_replace='?', value=np.nan)
# 哪一行有空值，就直接删除对应的样本
data = data.dropna()
print('-' * 50)
print(data)

# 进行数据的分割
x_train, x_test, y_train, y_test = train_test_split(data[column[1:10]], data[column[10]], test_size=0.25,
                                                    random_state=1)  # 第0列没用

# 进行标准化处理
std = StandardScaler()

x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)

# 逻辑回归预测
# C：正则化力度
# 默认solver = 'lbfgs'  这个有点类似于之后深度学习的optimizer优化器？
# 逻辑回归需要调整的超参数是什么？
lg = LogisticRegression(C=0.8, solver='newton-cg')

# 训练
lg.fit(x_train, y_train)
# 逻辑回归的权重参数
print(lg.coef_)

# 预测
y_predict = lg.predict(x_test)
print(y_predict)

# 评估
print("准确率：", lg.score(x_test, y_test))  # 可以算这个参数，毕竟逻辑回归是用回归的方法解决分类的问题
print(lg.predict_proba(x_test))  # 得出对应分类的概率

# 为什么还要看下召回率，labels和target_names对应（改名操作）
# macro avg 平均值  weighted avg 加权平均值
print("召回率：", classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"]))
# AUC计算要求是二分类，不需要是0和1
print("AUC指标：", roc_auc_score(y_test, y_predict))


--------------------------------------------------
     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
694              776715                3                        1   
695              841769                2                        1   
696              888820                5                       10   
697              897471                4                        8   
698              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1          