## 数据分析常用算法

### cross validation

将数据集分为 train data、development (validation) data、test data，或（train data、test data）

train data 用于模型的训练，其他用于模型的评估
模型在训练集和测试集上的结果可以反映拟合情况

#### train_test_split

In [35]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset and keep its feature matrix and label vector.
iris = load_iris()
data, target = iris.data, iris.target

# Hold out 10% of the samples as the test split, then report both split sizes.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.1,
)
print(len(X_train), len(X_test))

135 15


#### 分类问题一般用StratifiedKFold

In [51]:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold

# Shuffled 2-fold StratifiedKFold split of iris; print the splitter and the
# train/test indices of every fold.
iris = load_iris()
data, target = iris.data, iris.target

skf = StratifiedKFold(
    n_splits=2,
    shuffle=True,
    random_state=1,
)
print(skf.get_n_splits(data, target))
print(skf)

for train_idx, test_idx in skf.split(data, target):
    print("TRAIN:", train_idx, "TEST:", test_idx)
    # Index the arrays with the fold's indices to materialize the split.
    X_train, y_train = data[train_idx], target[train_idx]
    X_test, y_test = data[test_idx], target[test_idx]

2
StratifiedKFold(n_splits=2, random_state=1, shuffle=True)
TRAIN: [  0   1   4   5   6   7   8   9  10  11  12  14  15  16  18  20  25  28
  30  33  34  37  43  44  47  50  53  54  56  57  58  59  62  65  66  68
  71  72  73  74  75  76  82  84  85  91  94  95  98  99 100 102 104 106
 107 108 110 111 112 113 116 118 120 121 125 127 128 130 139 140 141 144
 147 148 149] TEST: [  2   3  13  17  19  21  22  23  24  26  27  29  31  32  35  36  38  39
  40  41  42  45  46  48  49  51  52  55  60  61  63  64  67  69  70  77
  78  79  80  81  83  86  87  88  89  90  92  93  96  97 101 103 105 109
 114 115 117 119 122 123 124 126 129 131 132 133 134 135 136 137 138 142
 143 145 146]
TRAIN: [  2   3  13  17  19  21  22  23  24  26  27  29  31  32  35  36  38  39
  40  41  42  45  46  48  49  51  52  55  60  61  63  64  67  69  70  77
  78  79  80  81  83  86  87  88  89  90  92  93  96  97 101 103 105 109
 114 115 117 119 122 123 124 126 129 131 132 133 134 135 136 137 138 142
 143 145 146] TE

#### 回归问题一般用KFold

In [53]:
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold

# Shuffled 2-fold KFold split of iris; print the splitter and the train/test
# indices of every fold.
iris = load_iris()
data, target = iris.data, iris.target

kf = KFold(
    n_splits=2,
    shuffle=True,
    random_state=1,
)
print(kf.get_n_splits(data, target))
print(kf)

for train_idx, test_idx in kf.split(data, target):
    print("TRAIN:", train_idx, "TEST:", test_idx)
    # Index the arrays with the fold's indices to materialize the split.
    X_train, y_train = data[train_idx], target[train_idx]
    X_test, y_test = data[test_idx], target[test_idx]

2
KFold(n_splits=2, random_state=1, shuffle=True)
TRAIN: [  0   1   3   7   8   9  10  13  15  20  21  22  23  24  25  26  27  30
  32  34  37  38  41  43  47  49  50  52  57  60  61  63  64  67  68  70
  71  72  74  76  79  80  81  82  83  86  87  88  89  93  96  97 100 101
 105 106 109 111 115 116 121 124 129 130 133 134 136 137 140 142 143 145
 147 148 149] TEST: [  2   4   5   6  11  12  14  16  17  18  19  28  29  31  33  35  36  39
  40  42  44  45  46  48  51  53  54  55  56  58  59  62  65  66  69  73
  75  77  78  84  85  90  91  92  94  95  98  99 102 103 104 107 108 110
 112 113 114 117 118 119 120 122 123 125 126 127 128 131 132 135 138 139
 141 144 146]
TRAIN: [  2   4   5   6  11  12  14  16  17  18  19  28  29  31  33  35  36  39
  40  42  44  45  46  48  51  53  54  55  56  58  59  62  65  66  69  73
  75  77  78  84  85  90  91  92  94  95  98  99 102 103 104 107 108 110
 112 113 114 117 118 119 120 122 123 125 126 127 128 131 132 135 138 139
 141 144 146] TEST: [  0  

### 特征构造

sklearn.preprocessing LabelEncoder , OneHotEncoder ...

### 组合数据

如果数据是稠密的可以用numpy的hstack，如果数据是稀疏的可以用scipy sparse的hstack

In [58]:
# numpy.hstack()
#   Joins arrays along the second axis, i.e. np.concatenate(tup, axis=1),
#   EXCEPT for 1-D inputs, which are joined along their only axis (axis=0).
# numpy.vstack()
#   Treats each 1-D input of length N as a (1, N) row and then joins along
#   the first axis, i.e. np.concatenate(tup, axis=0) on the promoted arrays.

import numpy as np

a = np.array([1, 2, 3])
b = np.array([2, 3, 4])

# Both inputs are 1-D, so hstack produces one flat array of length 6.
c = np.hstack((a, b))
print('hstack', c)
# vstack stacks the same inputs as the two rows of a (2, 3) array.
c = np.vstack((a, b))
print('vstack', c)

hstack [1 2 3 2 3 4]
vstack [[1 2 3]
 [2 3 4]]


In [79]:
import numpy as np
from scipy import sparse

# Two random row vectors, both explicitly shaped (1, 12) so the horizontal
# stack has an unambiguous layout.
a = np.random.randn(12).reshape(1, -1)
b = np.random.randn(12).reshape(1, -1)

# sparse.hstack is intended for sparse blocks: wrap each dense array in a CSR
# matrix first instead of relying on scipy's implicit handling of raw
# ndarrays (NOTE: equally-shaped dense 2-D inputs may be rejected by scipy
# when it tries to interpret the tuple as a 2-D grid of blocks).
c = sparse.hstack((sparse.csr_matrix(a), sparse.csr_matrix(b)))

### 常用组合算法

**随机森林: RandomForestClassifier，RandomForestRegressor**

**ExtraTree: ExtraTreesClassifier, ExtraTreesRegressor** (随机森林的改进)

**GBDT: GradientBoostingClassifier, GradientBoostingRegressor**

**XGB: XGBClassifier, XGBRegressor** (GBDT的改进)

In [2]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

'''
iris = load_iris()
data = iris.data
target = iris.target
'''
# Synthetic classification data: 10000 samples, 10 features, 100 blob centers,
# with 30% of the samples held out for scoring. Both steps are seeded.
data, target = make_blobs(n_samples=10000, n_features=10, centers=100,random_state=0)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=0)

# Decision tree (single-tree baseline)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print('descisiontree score', clf.score(X_test, y_test))

# Random forest
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print('randomforest score', clf.score(X_test, y_test))

# Extra-trees: must use ExtraTreesClassifier here; the previous version
# instantiated RandomForestClassifier again, so the "extratree" score was
# just a second random-forest run.
clf = ExtraTreesClassifier()
clf.fit(X_train, y_train)
print('extratree score', clf.score(X_test, y_test))

'''
# GDBT
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
print('gdbt score', clf.score(X_test, y_test))
'''
'''
import xgboost as xgb
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
print('xgb score', clf.score(X_test, y_test))
'''

descisiontree score 0.978333333333
randomforest score 1.0
extratree score 1.0


"\nimport xgboost as xgb\nclf = xgb.XGBClassifier()\nclf.fit(X_train, y_train)\nprint('xgb score', clf.score(X_test, y_test))\n"

### 降维分解
PCA LDA SVD

### pipeline

In [23]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import validation_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Iris features and labels, with 30% held out as the final test split.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Pipeline steps: standardize -> PCA down to 2 components -> linear SVM.
pipl = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LinearSVC()),
])

# Validation curve over the C parameter of the 'clf' step, 10-fold CV.
train_scores, test_scores = validation_curve(
    estimator=pipl,
    X=X_train,
    y=y_train,
    param_name='clf__C',
    param_range=[0.001, 0.01, 0.1, 1, 10, 100],
    cv=10,
)
print(train_scores)
print(test_scores)

# Grid search over C with the same pipeline, scored by accuracy (10-fold CV).
param_grid = [
    {
        'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    },
]
gs = GridSearchCV(
    estimator=pipl,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=1,
)
gs = gs.fit(X_train, y_train)
print('best_score> ', gs.best_score_)
print('best_params> ', gs.best_params_)

# Fit the best pipeline on the training split and score it on the held-out split.
clf = gs.best_estimator_
clf.fit(X_train, y_train)
accu = clf.score(X_test, y_test)
print('accuracy', accu)

[[ 0.86021505  0.87096774  0.86170213  0.82978723  0.82105263  0.82105263
   0.81052632  0.82105263  0.82105263  0.82291667]
 [ 0.84946237  0.87096774  0.85106383  0.85106383  0.83157895  0.85263158
   0.83157895  0.84210526  0.84210526  0.84375   ]
 [ 0.86021505  0.87096774  0.86170213  0.86170213  0.85263158  0.87368421
   0.86315789  0.87368421  0.86315789  0.85416667]
 [ 0.90322581  0.89247312  0.88297872  0.90425532  0.88421053  0.89473684
   0.89473684  0.89473684  0.89473684  0.89583333]
 [ 0.91397849  0.93548387  0.89361702  0.92553191  0.90526316  0.90526316
   0.90526316  0.91578947  0.91578947  0.90625   ]
 [ 0.89247312  0.91397849  0.89361702  0.93617021  0.91578947  0.91578947
   0.90526316  0.93684211  0.92631579  0.90625   ]]
[[ 0.83333333  0.75        0.90909091  0.81818182  0.8         0.8         0.9
   0.8         0.8         0.77777778]
 [ 0.83333333  0.75        0.90909091  0.90909091  1.          0.8         0.9
   0.8         0.8         0.88888889]
 [ 0.83333333