In [11]:
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
import joblib
import pandas as pd
import numpy as np

In [16]:
"""
线性回归直接预测房子价格
:return: None
"""
# 手动加载波士顿房价数据集
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# 将数据分为特征矩阵和目标变量
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # 前12列为特征值
target = raw_df.values[1::2, 2]  # 第13列为目标值（房价）

# 将数据转化为DataFrame并添加列名，模拟load_boston的效果
feature_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"
]
data_df = pd.DataFrame(data, columns=feature_names)
target_df = pd.Series(target, name="MEDV")

# 将数据和目标封装在一起，以模拟load_boston的效果
class Bunch:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

lb = Bunch(data=data_df, target=target_df, feature_names=feature_names, DESCR="Boston housing dataset")


print("获取特征值")
print(lb.data)
print("目标值")
print(lb.target)
print(lb.DESCR)
print(lb.feature_names)
print('-' * 50)
# 分割数据集到训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25, random_state=1)
#
print(x_train.shape)
#
# # 进行标准化处理(?) 目标值处理？
# # 特征值和目标值是都必须进行标准化处理, 实例化两个标准化API
std_x = StandardScaler()
#
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# 目标值进行了标准化
std_y = StandardScaler()
y_train = std_y.fit_transform(y_train.values.reshape(-1, 1))  # 使用 .values 将 Series 转为 numpy 数组
y_test = std_y.transform(y_test.values.reshape(-1, 1))
y_train

获取特征值
        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  
0       15.3  396.90   4.98  
1       17.8  396.90   9.14  
2  

array([[-0.19582006],
       [ 0.0847902 ],
       [ 0.63478631],
       [-0.26316652],
       [ 0.0847902 ],
       [ 0.298054  ],
       [-1.41928078],
       [ 0.17458549],
       [ 1.54396354],
       [-1.0039776 ],
       [-1.08254847],
       [ 1.95926673],
       [-1.34070991],
       [-0.19582006],
       [-0.51010354],
       [-0.31928857],
       [ 0.15213666],
       [ 0.69090836],
       [ 0.21948313],
       [ 0.16336107],
       [-0.36418621],
       [ 0.67968395],
       [-0.20704447],
       [ 1.28580211],
       [-0.88050909],
       [-0.21826888],
       [-0.71214293],
       [-1.01520201],
       [ 0.9490698 ],
       [ 0.57866426],
       [ 0.23070754],
       [ 0.23070754],
       [ 0.30927841],
       [ 2.40824314],
       [-0.04990272],
       [ 0.43274692],
       [-0.91418232],
       [-0.17337123],
       [-0.25194211],
       [ 0.0847902 ],
       [-1.03765083],
       [-0.68969411],
       [ 0.27560518],
       [-0.2407177 ],
       [ 0.01744374],
       [-0

In [32]:
# # estimator预测
# # # 正规方程求解方式预测结果，正规方程进行线性回归
lr = LinearRegression()
# #
lr.fit(x_train, y_train)
#
print('回归系数', lr.coef_)  #回归系数可以看特征与目标之间的相关性
#
y_predict = lr.predict(x_test)
# 预测测试集的房子价格，通过inverse得到真正的房子价格
y_lr_predict = std_y.inverse_transform(y_predict)
# 保存训练好的模型
joblib.dump(lr, "./model/test.pkl")
print("正规方程测试集里面每个房子的预测价格：", y_lr_predict)
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

回归系数 [[-0.12026411  0.15044778  0.02951803  0.07470354 -0.28043353  0.22170939
   0.02190624 -0.35275513  0.29939558 -0.2028089  -0.23911894  0.06305081
  -0.45259462]]
正规方程测试集里面每个房子的预测价格： [[32.37816533]
 [27.95684437]
 [18.07213891]
 [21.63166556]
 [18.93029508]
 [19.96277202]
 [32.2834674 ]
 [18.06715668]
 [24.72989076]
 [26.85359369]
 [27.23326816]
 [28.57021239]
 [21.18778302]
 [26.94393815]
 [23.37892579]
 [20.89176865]
 [17.11746934]
 [37.73997945]
 [30.51980066]
 [ 8.44489436]
 [20.86557977]
 [16.21989418]
 [25.13605925]
 [24.77658813]
 [31.40497629]
 [11.02741407]
 [13.82097563]
 [16.80208261]
 [35.94637198]
 [14.7155729 ]
 [21.23939821]
 [14.15079469]
 [42.72492585]
 [17.83887162]
 [21.84610225]
 [20.40178099]
 [17.50287927]
 [27.00093206]
 [ 9.80760408]
 [20.00288662]
 [24.27066782]
 [21.06719021]
 [29.47089776]
 [16.48482565]
 [19.38852695]
 [14.54778282]
 [39.39838319]
 [18.09810655]
 [26.22164983]
 [20.60676525]
 [25.09994066]
 [24.48366723]
 [25.02297948]
 [26.84986898]
 

# 2 加载保存的模型

In [33]:
model = joblib.load("./model/test.pkl")
# # 因为目标值进行了标准化，一定要把预测后的值逆向转换回来
y_predict = model.predict(x_test)

#
print("保存的模型预测的结果：", y_predict)
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

print("正规方程inverse后的均方误差：", mean_squared_error(std_y.inverse_transform(y_test),
                                               std_y.inverse_transform(y_predict)))

保存的模型预测的结果： [[ 1.12620955]
 [ 0.62994234]
 [-0.47955756]
 [-0.08002168]
 [-0.38323459]
 [-0.26734514]
 [ 1.11558027]
 [-0.48011678]
 [ 0.26773583]
 [ 0.50610896]
 [ 0.54872518]
 [ 0.69878929]
 [-0.12984488]
 [ 0.51624959]
 [ 0.11609798]
 [-0.16307075]
 [-0.58671359]
 [ 1.72804157]
 [ 0.91761907]
 [-1.56015899]
 [-0.16601029]
 [-0.68746111]
 [ 0.31332585]
 [ 0.27297733]
 [ 1.01697482]
 [-1.27028638]
 [-0.95672557]
 [-0.62211389]
 [ 1.5267197 ]
 [-0.8563123 ]
 [-0.12405138]
 [-0.91970532]
 [ 2.28757241]
 [-0.50574043]
 [-0.05595243]
 [-0.21806897]
 [-0.54345359]
 [ 0.52264682]
 [-1.40720286]
 [-0.26284251]
 [ 0.21619076]
 [-0.14338071]
 [ 0.79988591]
 [-0.65772411]
 [-0.33180076]
 [-0.87514574]
 [ 1.91418761]
 [-0.47664284]
 [ 0.43517699]
 [-0.1950607 ]
 [ 0.30927175]
 [ 0.24009869]
 [ 0.30063331]
 [ 0.50569088]
 [-1.94512422]
 [ 0.20018782]
 [-1.30384514]
 [ 0.50366068]
 [-0.6220835 ]
 [ 1.47453167]
 [-0.31823582]
 [ 0.57109939]
 [-0.64702253]
 [-0.35840699]
 [-1.27347275]
 [ 1.08939349

# 3 梯度下降

In [36]:
# 梯度下降去进行房价预测,数据量大要用这个
# 默认可以去调 eta0 = 0.008，会改变learning_rate
# learning_rate='optimal',alpha会影响学习率的值，由alpha来算学习率
sgd = SGDRegressor(eta0=0.008, penalty='l1', alpha=0.005)
# # 训练
sgd.fit(x_train, y_train)
#
print('梯度下降的回归系数', sgd.coef_)
#
# 预测测试集的房子价格
y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
y_predict = sgd.predict(x_test)
print("梯度下降测试集里面每个房子的预测价格：", y_sgd_predict)
print("梯度下降的均方误差：", mean_squared_error(y_test, y_predict))
print("梯度下降的原始房价量纲均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

梯度下降的回归系数 [-0.09335758  0.0869953  -0.02094664  0.07869651 -0.19052879  0.26103409
  0.         -0.25008855  0.09996821 -0.03080633 -0.22498524  0.06606935
 -0.43283037]
梯度下降测试集里面每个房子的预测价格： [[30.53935688]
 [28.36578798]
 [18.26853377]
 [22.63354512]
 [18.28859074]
 [20.68297313]
 [30.5534369 ]
 [18.60085541]
 [23.86683453]
 [27.06227482]
 [26.34905925]
 [29.51453698]
 [21.64365496]
 [26.17110144]
 [23.07074565]
 [19.48763308]
 [16.76331074]
 [37.84119236]
 [30.23979388]
 [ 9.50501661]
 [20.95319036]
 [17.24661524]
 [25.39915931]
 [25.22113492]
 [30.84497116]
 [10.50662475]
 [14.35622708]
 [19.28747629]
 [35.73152497]
 [13.81103319]
 [23.950391  ]
 [14.75306047]
 [40.97344601]
 [17.79869064]
 [24.14257844]
 [20.94712441]
 [17.37461128]
 [28.1379273 ]
 [ 8.11612834]
 [19.37483214]
 [26.44497638]
 [21.97669715]
 [28.77324328]
 [15.40549589]
 [18.48845292]
 [14.71562573]
 [39.79006928]
 [17.47550413]
 [25.93038255]
 [20.94396557]
 [24.66233764]
 [24.64518164]
 [25.75820326]
 [26.60242669]


  y = column_or_1d(y, warn=True)


#4 岭回归

In [38]:
# # # 岭回归去进行房价预测
rd = Ridge(alpha=0.05)

rd.fit(x_train, y_train)

print(rd.coef_)
# # 预测测试集的房子价格
y_rd_predict = std_y.inverse_transform(rd.predict(x_test))
y_predict = rd.predict(x_test)
print("岭回归里面每个房子的预测价格：", y_rd_predict)
print("岭回归的均方误差：", mean_squared_error(y_test, y_predict))
print("岭回归的均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))

[[-0.12019408  0.15027489  0.02932631  0.07472724 -0.28019156  0.22179958
   0.0218258  -0.35250679  0.29879635 -0.20224632 -0.23906031  0.06305591
  -0.45246484]]
岭回归里面每个房子的预测价格： [[32.37243478]
 [27.95765983]
 [18.07267415]
 [21.63438995]
 [18.92855656]
 [19.96443553]
 [32.27804137]
 [18.06862972]
 [24.72766116]
 [26.85407942]
 [27.23035657]
 [28.57229906]
 [21.18968684]
 [26.94188671]
 [23.37787012]
 [20.88817673]
 [17.11756142]
 [37.73879777]
 [30.51888617]
 [ 8.44898103]
 [20.86558956]
 [16.2225651 ]
 [25.13613174]
 [24.77720348]
 [31.40296023]
 [11.02734449]
 [13.82324405]
 [16.8088679 ]
 [35.94460943]
 [14.71413735]
 [21.24632593]
 [14.15273196]
 [42.71858578]
 [17.83913075]
 [21.8528479 ]
 [20.40400766]
 [17.50305591]
 [27.00302308]
 [ 9.80457598]
 [20.00141854]
 [24.2770577 ]
 [21.06882862]
 [29.46824032]
 [16.48305607]
 [19.38663962]
 [14.54936099]
 [39.39782042]
 [18.09691469]
 [26.21996707]
 [20.60807936]
 [25.09870476]
 [24.48356503]
 [25.02409394]
 [26.84836627]
 [ 5.02384

# 5 逻辑回归

In [40]:
"""
逻辑回归做二分类进行癌症预测（根据细胞的属性特征）
:return: NOne
"""
# 构造列标签名字
column = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
          'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
          'Mitoses', 'Class']

# 读取数据
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=column)

print(data)

# 缺失值进行处理
data = data.replace(to_replace='?', value=np.nan)
#直接删除，哪一行有空值，就删除对应的样本
data = data.dropna()
print('-' * 50)
print(data)
# 进行数据的分割
x_train, x_test, y_train, y_test = train_test_split(data[column[1:10]], data[column[10]], test_size=0.25,
                                                    random_state=1)

# 进行标准化处理
std = StandardScaler()

x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
#
# # 逻辑回归预测
# C正则化力度
# solver = 'liblinear'
lg = LogisticRegression(C=0.8, solver='newton-cg')
#
lg.fit(x_train, y_train)
# 逻辑回归的权重参数，了解
print(lg.coef_)

y_predict = lg.predict(x_test)
print(y_predict)
print("准确率：", lg.score(x_test, y_test))
print(lg.predict_proba(x_test))  #得出对应分类的概率
# 为什么还要看下召回率，labels和target_names对应
# macro avg 平均值  weighted avg 加权平均值
print("召回率：", classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"]))
#AUC计算要求是二分类，不需要是0和1
print("AUC指标：", roc_auc_score(y_test, y_predict))
print(x_train.shape)
print(x_test.shape)


     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
694              776715                3                        1   
695              841769                2                        1   
696              888820                5                       10   
697              897471                4                        8   
698              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        