In [2]:
import pandas as pd
import numpy as np

##### 2.1.1.1 线性分类器
##### 良性/恶性肿瘤预测

In [3]:
columns_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 
                'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
                   names=columns_names)
# 替换缺失值
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna(how='any')
data.shape

(683, 11)

In [38]:
# 使用sklearn.cross_valiation里的train_test_split模块分割数据
# 随机采样25%的数据用于测试，75%用于训练
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[columns_names[1:10]], data[columns_names[10]],
                                                    test_size=0.25, random_state=33)
y_train.value_counts()

2    344
4    168
Name: Class, dtype: int64

In [39]:
y_test.value_counts()

2    100
4     71
Name: Class, dtype: int64

In [40]:
# 使用线性分类模型对良性/恶性肿瘤预测分类
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [41]:
# 标准化数据，保证每个维度的特征数据方差为1，均值为0.使预测结果不会被某些维度过大的特征值而主导。
# fit_transform方法是fit和transform的结合，fit_transform(X_train) 意思是找出X_train的@和@，并应用在X_train上。
# 这时对于X_test，我们就可以直接使用transform方法。因为此时StandardScaler已经保存了X_train的@和@。
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [42]:
# 初始化LogisticRegression与SGDClassifier
lr = LogisticRegression(solver='liblinear')
sgdc = SGDClassifier()
# 利用lr训练
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
# 利用sgdc训练
sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)

In [43]:
lr_y_predict

array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 2, 2, 4,
       4, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4,
       2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4,
       2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2,
       4, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 2, 4, 4,
       2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4,
       2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 2, 4, 2, 4, 4])

In [44]:
sgdc_y_predict

array([4, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 4,
       4, 2, 2, 4, 2, 2, 4, 2, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 2, 4,
       2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4,
       2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2,
       4, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 4,
       2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4,
       2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 2, 4, 2, 4, 4])

In [45]:
# 使用线性分类模型对良性/恶性肿瘤的性能分析
from sklearn.metrics import classification_report

In [46]:
# 利用lr
# 测试lr模型在测试集上的准确性结果
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
# 获得LR的召回率、准确率和F1指标
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))

# 利用sgdc
print('Accuracy of SGDC Classifier:', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))

Accuracy of LR Classifier: 0.9883040935672515
              precision    recall  f1-score   support

      Benign       0.99      0.99      0.99       100
   Malignant       0.99      0.99      0.99        71

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171

Accuracy of SGDC Classifier: 0.9298245614035088
              precision    recall  f1-score   support

      Benign       0.90      0.99      0.94       100
   Malignant       0.98      0.85      0.91        71

    accuracy                           0.93       171
   macro avg       0.94      0.92      0.93       171
weighted avg       0.93      0.93      0.93       171



##### 2.1.1.2 支持向量机（分类）
##### 使用支持向量机分类器处理Scikit-learn内部集成的手写体数字图片数据集

In [49]:
# 导入手写体数字加载器,获得数码图像数据,每幅图片由8*8=64的像素矩阵表示
from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape

(1797, 64)

In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=33)

X_train.shape

(1347, 64)

In [58]:
X_test.shape

(450, 64)

In [64]:
# 导入标准化模块和基于线性假设的支持向量机分类器LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lsvc = LinearSVC(max_iter=10000) # 报错ConvergenceWarning，模型未收敛，增加迭代次数
lsvc.fit(X_train, y_train)
y_predict = lsvc.predict(X_test)

In [66]:
# 评估
from sklearn.metrics import classification_report
print('Accuracy of LinearSVC is ', lsvc.score(X_test, y_test))
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))

Accuracy of LinearSVC is  0.9511111111111111
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        35
           1       0.95      0.98      0.96        54
           2       0.98      1.00      0.99        44
           3       0.93      0.93      0.93        46
           4       0.97      1.00      0.99        35
           5       0.94      0.94      0.94        48
           6       0.96      0.98      0.97        51
           7       0.92      1.00      0.96        35
           8       0.98      0.83      0.90        58
           9       0.95      0.91      0.93        44

    accuracy                           0.95       450
   macro avg       0.95      0.96      0.95       450
weighted avg       0.95      0.95      0.95       450



##### 2.1.1.3 朴素贝叶斯
##### 使用经典的20类新闻文本做文本分类

In [70]:
# 获取新闻数据
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
print(len(news.data))
print(news.data[0])

18846
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [92]:
# 分割数据集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target,test_size=0.25, random_state=33)

In [93]:
# 使用朴素贝叶斯进行类别预测
# 从feature_extraction.text里导入用于文本特征向量转化模块，只考虑词汇在文本出现的频率
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_predict = mnb.predict(X_test)

In [97]:
# 性能评估
from sklearn.metrics import classification_report
print('The Accuracy of NB is ', mnb.score(X_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))

The Accuracy of NB is  0.8397707979626485
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.

##### 2.1.1.4 K近邻（分类）
##### 利用K近邻对生物物种进行分类，鸢尾Iris数据集

In [108]:
# 导入iris数据加载器，读取数据
# 特征是4列花的形状长度宽度，最后一列是类别（150条，共3类）
from sklearn.datasets import load_iris
iris = load_iris()
iris.data.shape

(150, 4)

In [109]:
# 数据说明
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [101]:
# 分割数据集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=33)

In [111]:
X_train.shape

(112, 4)

In [112]:
X_train

array([[5. , 2.3, 3.3, 1. ],
       [4.9, 3.1, 1.5, 0.2],
       [6.3, 2.3, 4.4, 1.3],
       [5.8, 2.6, 4. , 1.2],
       [6.2, 2.9, 4.3, 1.3],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.4, 1.4, 0.3],
       [5.1, 2.5, 3. , 1.1],
       [4.8, 3.4, 1.6, 0.2],
       [7.9, 3.8, 6.4, 2. ],
       [5.1, 3.4, 1.5, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.5, 3. , 5.5, 1.8],
       [5.4, 3.9, 1.7, 0.4],
       [7. , 3.2, 4.7, 1.4],
       [5.8, 2.8, 5.1, 2.4],
       [7.7, 2.6, 6.9, 2.3],
       [5.5, 2.5, 4. , 1.3],
       [5.9, 3.2, 4.8, 1.8],
       [4.9, 3.6, 1.4, 0.1],
       [4.5, 2.3, 1.3, 0.3],
       [6.3, 2.8, 5.1, 1.5],
       [4.4, 2.9, 1.4, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [7.2, 3. , 5.8, 1.6],
       [6. , 3.4, 4.5, 1.6],
       [6.2, 2.2, 4.5, 1.5],
       [7.4, 2.8, 6.1, 1.9],
       [6.8, 3. , 5.5, 2.1],
       [6.4, 2.8, 5.6, 2.2],
       [5.7, 2.5, 5. , 2. ],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.5, 0.3],
       [6.8, 2

In [113]:
# 数据标准化，使用K近邻进行类别预测
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
y_predict = knc.predict(X_test)

In [114]:
# 性能评估
print('The Accuracy of KNC is ', knc.score(X_test, y_test))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict, target_names=iris.target_names))

The Accuracy of KNC is  0.8947368421052632
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         8
  versicolor       0.73      1.00      0.85        11
   virginica       1.00      0.79      0.88        19

    accuracy                           0.89        38
   macro avg       0.91      0.93      0.91        38
weighted avg       0.92      0.89      0.90        38



##### 2.1.1.5 决策树
##### 使用决策树预测泰坦尼克号乘客的生还情况

In [120]:
# 导入数据
import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
titanic

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0000,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male
5,6,1st,1,"Anderson, Mr Harry",47.0000,Southampton,"New York, NY",E-12,,3,male
6,7,1st,1,"Andrews, Miss Kornelia Theodosia",63.0000,Southampton,"Hudson, NY",D-7,13502 L77,10,female
7,8,1st,0,"Andrews, Mr Thomas, jr",39.0000,Southampton,"Belfast, NI",A-36,,,male
8,9,1st,1,"Appleton, Mrs Edward Dale (Charlotte Lamson)",58.0000,Southampton,"Bayside, Queens, NY",C-101,,2,female
9,10,1st,0,"Artagaveytia, Mr Ramon",71.0000,Cherbourg,"Montevideo, Uruguay",,,(22),male


In [123]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 112.9+ KB


In [124]:
# 基于背景知识做特征选择
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       633 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB


In [125]:
# 基于info，进行数据预处理
# 1)age数据列只有633条数据，需要补全
# 2）pclass和sex是类别型，需要转化为数值特征 0/1 表示

# age补充，使用中位数或平均数，对模型偏离造成的影响较小
X['age'].fillna(X['age'].mean(), inplace=True)

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       1313 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [128]:
# 数据分割
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [129]:
# 进行特征转换
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)

# 转换特征后，所有类别型特征都单独剥离出来，独成一列特征，数值型的则保持不变
X_train = vec.fit_transform(X_train.to_dict(orient='record'))
print(vec.feature_names_)

['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']


In [137]:
# 特征提取后，对应的属性下面为1，其余为0
X_train

array([[31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [31.19418104,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       ...,
       [12.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ],
       [18.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ]])

In [138]:
# 测试集也要特征转换
X_test = vec.transform(X_test.to_dict(orient='record'))

In [141]:
# 利用决策树进行预测
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_predict = dtc.predict(X_test)

In [147]:
# 性能评估
from sklearn.metrics import classification_report
print('The Accuracy of dtc is ', dtc.score(X_test, y_test))

print(classification_report(y_test, y_predict, target_names=['died', 'survived']))

The Accuracy of dtc is  0.7811550151975684
              precision    recall  f1-score   support

        died       0.78      0.91      0.84       202
    survived       0.80      0.58      0.67       127

    accuracy                           0.78       329
   macro avg       0.79      0.74      0.75       329
weighted avg       0.78      0.78      0.77       329



##### 2.1.1.6 集成模型（分类）
##### 利用单一决策树、随机森林分类以及梯度上升决策树对泰坦尼克进行预测

In [145]:
import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
X = titanic[['age', 'pclass', 'sex']]
y = titanic['survived']
X['age'].fillna(X['age'].mean(), inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient='record'))
X_test = vec.transform(X_test.to_dict(orient='record'))

# 单一决策树
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_predict = dtc.predict(X_test)

# 随机森林分类器
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)

# 梯度提升决策树
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_predict = gbc.predict(X_test)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [150]:
# 性能评估
from sklearn.metrics import classification_report
print('The accuracy of dtc is ', dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_predict))

print('The accuracy of rfc is ', rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_predict))

print('The accuracy of gbc is ', gbc.score(X_test, y_test))
print(classification_report(y_test, gbc_y_predict))


The accuracy of dtc is  0.7811550151975684
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       202
           1       0.80      0.58      0.67       127

    accuracy                           0.78       329
   macro avg       0.79      0.74      0.75       329
weighted avg       0.78      0.78      0.77       329

The accuracy of rfc is  0.78419452887538
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       202
           1       0.80      0.59      0.68       127

    accuracy                           0.78       329
   macro avg       0.79      0.75      0.76       329
weighted avg       0.79      0.78      0.78       329

The accuracy of gbc is  0.790273556231003
              precision    recall  f1-score   support

           0       0.78      0.92      0.84       202
           1       0.82      0.58      0.68       127

    accuracy                           0.79       329
  