In [58]:
#..........................案例1:
#.......................... 理解树回归
#...............对于非线性问题，可以考虑切分后再进行回归，这就是树回归
#.......CART（回归分类树），可用于分类和回归
# 之前决策树采取的ID3算法，ID3切法，一旦切了，这个特征之后不起作用，并且不能处理连续型
# CART采用的二元切分
import numpy as np
import pandas as pd
# 切分数据函数
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of ``dataSet`` in two by thresholding a single column.

    Args:
        dataSet: 2-D NumPy matrix/array, indexed as ``dataSet[:, feature]``.
        feature: Index of the column compared against ``value``.
        value: Split threshold.

    Returns:
        ``(mat0, mat1)`` -- the rows whose ``feature`` entry is strictly
        greater than ``value``, then the rows whose entry is less than or
        equal to ``value``.
    """
    column = dataSet[:,feature]
    rowsAbove = np.nonzero(column > value)[0]
    rowsNotAbove = np.nonzero(column <= value)[0]
    return dataSet[rowsAbove,:], dataSet[rowsNotAbove,:]
# Toy check of binSplitDataSet on a 4x4 identity matrix.
# np.asmatrix is used instead of np.mat: np.mat was removed from NumPy's main
# namespace in NumPy 2.0, while np.asmatrix is the same function and works on
# both 1.x and 2.x.
test=np.asmatrix(np.eye(4))
print('矩阵：\n',test)
# Binary split on column 1 with threshold 0.5:
# m1 holds the rows with column 1 > 0.5, m2 holds the remaining rows.
m1,m2=binSplitDataSet(test,1,0.5)
print('切分矩阵：\n',m1,'\n',m2)
# 叶节点
def regLeaf(dataSet):
    """Return the value stored in a regression-tree leaf: the mean of the last column."""
    targets = dataSet[:,-1]
    return np.mean(targets)
# 计算误差
def regErr(dataSet):
    """Return the total squared error of the last column around its mean.

    Computed as the variance of the last column multiplied by the number of rows.
    """
    rowCount = np.shape(dataSet)[0]
    return np.var(dataSet[:,-1]) * rowCount
# 遍历所有的特征及其可能的取值来找到使误差最小化的切分阈值
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Search every feature/threshold pair for the binary split with the lowest error.

    Args:
        dataSet: NumPy matrix whose last column is the target; all other
            columns are candidate split features.
        leafType: Callable building the leaf value from a data set.
        errType: Callable returning the error of a data set.
        ops: ``(tolS, tolN)`` -- the minimum error reduction required to
            accept a split, and the minimum number of rows allowed on each
            side of a split.

    Returns:
        ``(featureIndex, splitValue)`` for the best split found, or
        ``(None, leafType(dataSet))`` when one of the stop conditions is hit.
    """
    minErrDrop = ops[0]
    minRows = ops[1]
    # Stop condition 1: all target values are identical, nothing to split.
    targetValues = dataSet[:,-1].T.tolist()[0]
    if len(set(targetValues)) == 1:
        return None, leafType(dataSet)
    numCols = np.shape(dataSet)[1]
    # Error of the unsplit data; a split must beat this by at least minErrDrop.
    baseErr = errType(dataSet)
    lowestErr, chosenFeat, chosenVal = np.inf, 0, 0
    for col in range(numCols - 1):
        # Every distinct value observed in this column is a candidate threshold.
        for threshold in set(dataSet[:,col].flatten().A[0]):
            above, rest = binSplitDataSet(dataSet, col, threshold)
            # Only consider splits that leave enough rows on both sides.
            if np.shape(above)[0] >= minRows and np.shape(rest)[0] >= minRows:
                splitErr = errType(above) + errType(rest)
                if splitErr < lowestErr:
                    lowestErr, chosenFeat, chosenVal = splitErr, col, threshold
    # Stop condition 2: the best split does not reduce the error enough.
    if (baseErr - lowestErr) < minErrDrop:
        return None, leafType(dataSet)
    # Stop condition 3: the chosen split leaves too few rows on one side.
    above, rest = binSplitDataSet(dataSet, chosenFeat, chosenVal)
    if np.shape(above)[0] < minRows or np.shape(rest)[0] < minRows:
        return None, leafType(dataSet)
    return chosenFeat, chosenVal
# 创建回归树
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a CART tree as nested dicts.

    Args:
        dataSet: NumPy matrix (last column is the target) so that row
            filtering by index works.
        leafType: Callable building the leaf value from a data set.
        errType: Callable returning the error of a data set.
        ops: ``(tolS, tolN)`` stop parameters forwarded to ``chooseBestSplit``.

    Returns:
        Either a leaf value (whatever ``leafType`` produced) or a dict with
        keys ``'spInd'`` (split feature index), ``'spVal'`` (split threshold),
        ``'left'`` (subtree for rows with feature > threshold) and
        ``'right'`` (subtree for the remaining rows).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # chooseBestSplit signals a stop condition with feat=None; use an identity
    # check so a valid feature index of 0 can never be confused with "no split".
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
# Load the example data (read without a header row) and fit a regression tree
# with the default ops.
# NOTE(review): the path below is absolute and machine-specific; the cell only
# runs where this exact file exists.
dta=pd.read_table(r'C:\Users\Wudey\Desktop\machinelearninginaction\Ch09\ex00.txt',header=None)
# np.asmatrix replaces np.mat, which was removed from NumPy's main namespace in 2.0.
print('树回归结果：\n',createTree(np.asmatrix(dta)))

矩阵：
 [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
切分矩阵：
 [[0. 1. 0. 0.]] 
 [[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
树回归结果：
 {'spInd': 0, 'spVal': 0.48813, 'left': 1.0180967672413792, 'right': -0.04465028571428572}


In [59]:
#..........................案例2:
#.......................... 剪枝：通过降低决策树的复杂度来避免过拟合的过程称为剪枝（pruning）
#............预剪枝：在训练过程中剪枝，chooseBestSplit中的ops其实就是预剪枝。
# Rebuild the tree with a smaller error-drop tolerance (ops[0]=0.2).
# np.asmatrix replaces np.mat, which was removed from NumPy's main namespace in 2.0.
print('树回归结果：\n',createTree(np.asmatrix(dta),ops=(0.2,4)))
# This result contains more tree nodes than the default-ops tree.
# With ops[0]=0 the error-drop threshold no longer stops any split that reduces
# the error, so the tree grows much larger. Since the right tolerance is not
# known in advance, pre-pruning by tuning ops alone does not work well.
#............后剪枝：指定参数使得结果比较复杂，方便剪枝。
# 后剪枝需要同时考虑训练结果和测试集，可以合并的则合并（总误差方差减少）
# 判断是否决策树
def isTree(obj):
    """Return True when ``obj`` is an internal tree node (a dict), False for a leaf.

    ``isinstance`` is used instead of comparing the type name to the string
    ``'dict'`` so that dict subclasses are recognised as nodes and unrelated
    classes that merely happen to be named ``dict`` are not.
    """
    return isinstance(obj, dict)
# 塌陷处理：递归地把子树替换为其左右分支的均值，并返回该均值（prune在没有测试数据时调用）
def getMean(tree):
    """Collapse ``tree`` to a single number and return it.

    Each child that is itself a tree is first replaced, in place, by its own
    collapsed mean (right child first, then left). The result is the plain
    average of the two resulting child values.
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0
# 后剪枝
def prune(tree, testData):
    """Post-prune ``tree`` against ``testData``, merging leaf pairs when that lowers test error.

    Args:
        tree: Tree dict as produced by ``createTree``; its children are
            replaced in place while pruning.
        testData: NumPy matrix of held-out rows, last column is the target.

    Returns:
        The (possibly modified) tree, or a single merged leaf value.
    """
    # No test rows reach this node: collapse the whole subtree to its mean.
    if np.shape(testData)[0] == 0:
        return getMean(tree)
    # If either branch is a subtree, route the test rows down and prune it first.
    if isTree(tree['right']) or isTree(tree['left']):
        upperRows, lowerRows = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], upperRows)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], lowerRows)
    # A merge is only attempted once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree
    upperRows, lowerRows = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    # Squared test error when the two leaves are kept separate ...
    leftErr = sum(np.power(upperRows[:,-1] - tree['left'],2))
    rightErr = sum(np.power(lowerRows[:,-1] - tree['right'],2))
    errorNoMerge = leftErr + rightErr
    # ... versus when they are replaced by the average of the two leaf values.
    treeMean = (tree['left']+tree['right'])/2.0
    errorMerge = sum(np.power(testData[:,-1] - treeMean,2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree
# Grow a larger tree (ops[0]=0.2), then post-prune it.
# np.asmatrix replaces np.mat, which was removed from NumPy's main namespace in 2.0.
tree=createTree(np.asmatrix(dta),ops=(0.2,4))
# NOTE(review): the "test" set passed to prune is the first 30 rows of the same
# data the tree was trained on, not held-out data. prune also replaces the
# children of `tree` in place.
print('树回归结果(后剪枝)：\n',prune(tree,np.asmatrix(dta.iloc[0:30,:].values)))

树回归结果：
 {'spInd': 0, 'spVal': 0.48813, 'left': {'spInd': 0, 'spVal': 0.620599, 'left': 0.9852403058823527, 'right': 1.1081870645161291}, 'right': -0.04465028571428572}
树回归结果(后剪枝)：
 {'spInd': 0, 'spVal': 0.48813, 'left': {'spInd': 0, 'spVal': 0.620599, 'left': 0.9852403058823527, 'right': 1.1081870645161291}, 'right': -0.04465028571428572}


In [60]:
#..........................案例3:
#.......................... 树回归（2）：分段线性回归
# 线性回归
def linearSolve(dataSet):   #helper function used in two places
    """Fit ordinary least squares to ``dataSet`` via the normal equations.

    Args:
        dataSet: NumPy matrix with m rows and n columns; the last column is
            the target, the first n-1 columns are the features.

    Returns:
        ``(ws, X, Y)`` -- the coefficient column vector (intercept first),
        the m x n design matrix (a column of ones followed by the features),
        and the target column.

    Raises:
        NameError: if ``X.T * X`` has a determinant of exactly 0.0 and so
            cannot be inverted.
    """
    m,n = np.shape(dataSet)
    # Design matrix: column 0 stays 1 (intercept), columns 1..n-1 get the features.
    # np.asmatrix is used because np.mat was removed from the main namespace in NumPy 2.0.
    X = np.asmatrix(np.ones((m,n)))
    X[:,1:n] = dataSet[:,0:n-1]
    Y = dataSet[:,-1]
    xTx = X.T*X
    # NOTE(review): exact comparison with 0.0 only catches perfectly singular
    # matrices; nearly singular ones pass through and are inverted.
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws,X,Y
# 回归系数
def modelLeaf(dataSet):
    """Return the linear-model coefficients fitted to ``dataSet`` (leaf of a model tree)."""
    return linearSolve(dataSet)[0]
# 回归误差最小化
def modelErr(dataSet):
    """Return the sum of squared residuals of a linear fit to ``dataSet``."""
    coeffs, design, target = linearSolve(dataSet)
    residuals = target - design * coeffs
    # Builtin sum over the rows of the residual column.
    return sum(np.power(residuals,2))
# Build a model tree: linear-model leaves (modelLeaf) scored by residual error
# (modelErr), with ops=(1,2).
# np.asmatrix replaces np.mat, which was removed from NumPy's main namespace in 2.0.
print('树回归：\n',createTree(np.asmatrix(dta),modelLeaf,modelErr,(1,2)))

树回归：
 {'spInd': 0, 'spVal': 0.48813, 'left': matrix([[ 1.21832215],
        [-0.27389304]]), 'right': matrix([[ 0.00779999],
        [-0.22210371]])}


In [74]:
#..........................案例4:
#.......................... 比较模型结果
def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf: the prediction is the leaf value itself.

    ``inDat`` is accepted only so the signature matches ``modelTreeEval``;
    it is not used.
    """
    prediction = float(model)
    return prediction
def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf: apply the leaf's linear model to one sample.

    Args:
        model: Coefficient column vector with the intercept first, as
            returned by ``modelLeaf``.
        inDat: A single sample as a 1 x n matrix of feature values.

    Returns:
        The prediction as a Python float.
    """
    n = np.shape(inDat)[1]
    # Prepend the constant 1 for the intercept term.
    # np.asmatrix is used because np.mat was removed from the main namespace in NumPy 2.0.
    X = np.asmatrix(np.ones((1,n+1)))
    X[:,1:n+1]=inDat
    # X*model is a 1x1 matrix; pull out the element explicitly because calling
    # float() on an array with ndim > 0 is deprecated since NumPy 1.25.
    return float((X*model)[0,0])
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target for one sample by walking ``tree`` down to a leaf.

    Args:
        tree: Tree dict as produced by ``createTree``, or a bare leaf value.
        inData: One sample's feature values (1 x n matrix, 1-D array or scalar).
        modelEval: Callable ``(leaf, inData) -> float`` used to evaluate the leaf.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample falls into.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten the sample so the split feature is picked by position.
    # Indexing a 1 x n matrix with a single integer selects a ROW, so the
    # previous ``inData[tree['spInd']]`` only worked for single-feature data.
    featureValue = np.asarray(inData).ravel()[tree['spInd']]
    # 'left' holds the rows with feature > threshold, 'right' holds the rest.
    branch = tree['left'] if featureValue > tree['spVal'] else tree['right']
    # The recursive call evaluates the leaf when ``branch`` is not a tree.
    return treeForeCast(branch, inData, modelEval)
def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target for every sample in ``testData``.

    Args:
        tree: Tree dict as produced by ``createTree``.
        testData: Indexable collection of samples; ``testData[i]`` is the
            feature value(s) of sample i.
        modelEval: Leaf evaluator forwarded to ``treeForeCast``.

    Returns:
        An m x 1 matrix of predictions, one row per sample.
    """
    numSamples = len(testData)
    # np.asmatrix is used because np.mat was removed from the main namespace in NumPy 2.0.
    yHat = np.asmatrix(np.zeros((numSamples,1)))
    for row in range(numSamples):
        sample = np.asmatrix(testData[row])
        yHat[row,0] = treeForeCast(tree, sample, modelEval)
    return yHat
# Compare a regression tree and a model tree on the bike-speed-vs-IQ data.
# NOTE(review): the path below is absolute and machine-specific; the cell only
# runs where this exact file exists.
dta=pd.read_table(r'C:\Users\Wudey\Desktop\machinelearninginaction\Ch09\bikeSpeedVsIq.txt',header=None)
# First 200 rows for training, the remaining rows for testing.
# NOTE(review): this rebinds `test`, which earlier held the 4x4 identity matrix.
train,test=dta.iloc[0:200,:].values,dta.iloc[200:,:].values
# np.asmatrix replaces np.mat, which was removed from NumPy's main namespace in 2.0.
mytree1=createTree(np.asmatrix(train),ops=(1,20))
yhat=createForeCast(mytree1,test[:,0])
# NOTE(review): np.corrcoef prints the 2x2 correlation-coefficient matrix (R),
# while the label says R-squared.
print('R方（二分回归）：\n',np.corrcoef(yhat.flatten().A[0],test[:,1]))
mytree2=createTree(np.asmatrix(train),modelLeaf,modelErr,ops=(1,20))
yhat=createForeCast(mytree2,test[:,0],modelTreeEval)
print('R方（分段回归）：\n',np.corrcoef(yhat.flatten().A[0],test[:,1]))
# NOTE(review): the linear-regression figure is a hard-coded string, not
# computed in this notebook.
print('R方（线性回归）：\n','0.943468')

R方（二分回归）：
 [[1.         0.96408523]
 [0.96408523 1.        ]]
R方（分段回归）：
 [[1.         0.97604122]
 [0.97604122 1.        ]]
R方（线性回归）：
 0.943468
