In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(37) # seed NumPy's global RNG so the random numbers are the same on every run

In [0]:
# Prepare the dataset: download the UCI "Adult" data file.
# The file has no header row, so pandas labels the columns 0..14.
dataset_path='https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df=pd.read_csv(dataset_path,header=None)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info() # the data loaded without problems
# The raw dataset holds 32561 samples, each with 14 features and one label.
# print(df.head())
# Snapshot of the values as a NumPy array, taken before any cleaning is applied.
raw_set=df.values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
0     32561 non-null int64
1     32561 non-null object
2     32561 non-null int64
3     32561 non-null object
4     32561 non-null int64
5     32561 non-null object
6     32561 non-null object
7     32561 non-null object
8     32561 non-null object
9     32561 non-null object
10    32561 non-null int64
11    32561 non-null int64
12    32561 non-null int64
13    32561 non-null object
14    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [0]:
# Print basic information about every column of a dataset.
def print_col_info(dataset):
    '''Print the distinct values of every column of a 2-D array.

    For each column index ``i`` the function prints:
    1. the sorted distinct values found in ``dataset[:, i]``
    2. how many distinct values there are

    Parameters
    ----------
    dataset : numpy.ndarray
        Two-dimensional array; columns are read as ``dataset[:, i]``.

    Returns
    -------
    None
        All results are written to standard output.
    '''
    col_num=dataset.shape[1]
    for i in range(col_num):
        print('\ncol-{} info: '.format(i))
        # tolist() hands NumPy plain Python objects so the dtype is re-inferred
        # (an object-dtype column of ints prints like a native int array).
        # np.unique sorts and de-duplicates in one step and, unlike a Python
        # set, does not count repeated NaN entries as separate values.
        temp=np.unique(dataset[:,i].tolist())
        print('values: {}'.format(temp))
        print('values num: {}'.format(temp.shape[0]))

# Inspect the raw, not-yet-cleaned values column by column.
print_col_info(raw_set)


col-0 info: 
values: [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
 90]
values num: 73

col-1 info: 
values: [' ?' ' Federal-gov' ' Local-gov' ' Never-worked' ' Private'
 ' Self-emp-inc' ' Self-emp-not-inc' ' State-gov' ' Without-pay']
values num: 9

col-2 info: 
values: [  12285   13769   14878 ... 1366120 1455435 1484705]
values num: 21648

col-3 info: 
values: [' 10th' ' 11th' ' 12th' ' 1st-4th' ' 5th-6th' ' 7th-8th' ' 9th'
 ' Assoc-acdm' ' Assoc-voc' ' Bachelors' ' Doctorate' ' HS-grad'
 ' Masters' ' Preschool' ' Prof-school' ' Some-college']
values num: 16

col-4 info: 
values: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
values num: 16

col-5 info: 
values: [' Divorced' ' Married-AF-spouse' ' Married-civ-spouse'
 ' Married-spouse-absent' ' Never-married' ' Separated' ' Widowed']
values num: 7

col-6 info

In [0]:
# Data processing step 1: strip the whitespace around the string values.
# Positions of the string-valued columns.
str_cols=[1,3,5,6,7,8,9,13,14]
for col in str_cols:
    # Use the vectorised .str accessor rather than mapping a Python lambda
    # over every element.
    df.iloc[:,col]=df.iloc[:,col].str.strip()

# print_col_info(df.values) # check: the leading spaces are gone from every string column

In [0]:
# Data processing step 2: drop the samples that have missing values.
# Missing entries appear as the string "?"; turn them into NaN markers and
# then discard every row that contains one.
df=df.replace("?",np.nan).dropna()
# print(df.shape) # (30162, 15)

In [0]:
# Data processing step 3: encode the string columns as integer codes.
from sklearn import preprocessing
label_encoder=[] # one entry per column: its fitted LabelEncoder, or None for a numeric column
encoded_set = np.empty(df.shape)
for col in range(df.shape[1]):
    encoder=None
    column=df.iloc[:,col]
    if pd.api.types.is_numeric_dtype(column): # numeric data: copy through unchanged
        encoded_set[:,col]=column
    else: # string data: map every distinct string to an integer code
        # Testing "not numeric" instead of "dtype == object" keeps text columns
        # on this branch even when pandas stores them in a dedicated string
        # dtype; otherwise they would be assigned raw into the float array.
        encoder=preprocessing.LabelEncoder()
        encoded_set[:,col]=encoder.fit_transform(column)
    label_encoder.append(encoder)

print_col_info(encoded_set) # every column is numeric now


col-0 info: 
values: [17. 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34.
 35. 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52.
 53. 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68. 69. 70.
 71. 72. 73. 74. 75. 76. 77. 78. 79. 80. 81. 82. 83. 84. 85. 86. 88. 90.]
values num: 72

col-1 info: 
values: [0. 1. 2. 3. 4. 5. 6.]
values num: 7

col-2 info: 
values: [  13769.   14878.   18827. ... 1366120. 1455435. 1484705.]
values num: 20263

col-3 info: 
values: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15.]
values num: 16

col-4 info: 
values: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.]
values num: 16

col-5 info: 
values: [0. 1. 2. 3. 4. 5. 6.]
values num: 7

col-6 info: 
values: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13.]
values num: 14

col-7 info: 
values: [0. 1. 2. 3. 4. 5.]
values num: 6

col-8 info: 
values: [0. 1. 2. 3. 4.]
values num: 5

col-9 info: 
values: [0. 1.]
values num: 

In [0]:
# Data processing step 4: rescale some of the columns to the range [-1, 1].
# print(encoded_set.dtype) # float64, as expected
# NOTE(review): each scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell -- confirm that this is intended.

cols=[2,10,11]
data_scalers=[] # keeps the fitted scaler of each column in `cols`, in the same order
for col in cols:
    data_scaler=preprocessing.MinMaxScaler(feature_range=(-1,1))
    column_2d=encoded_set[:,col].reshape(-1,1) # single feature as an (n_samples, 1) array
    scaled=data_scaler.fit_transform(column_2d)
    encoded_set[:,col]=np.ravel(scaled) # flatten back to 1-D before writing the column
    data_scalers.append(data_scaler)

# print_col_info(encoded_set) # the values have changed, as intended

In [0]:
# The last column is the label; every column before it is a feature.
dataset_X=encoded_set[:,:-1]
dataset_y=encoded_set[:,-1]
# Data processing step 5: split the dataset into a train set and a test set
# (30% of the samples go to the test set; the split is seeded).
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y=train_test_split(
    dataset_X,dataset_y,test_size=0.3,random_state=42)

# print(dataset_X.shape) # (30162, 14)
# print(dataset_y.shape) # (30162,)
# print(train_X.shape) # (21113, 14)
# print(train_y.shape) # (21113,)
# print(test_X.shape) # (9049, 14)

In [0]:
# Build the Gaussian naive Bayes classifier and fit it on the train set.
from sklearn.naive_bayes import GaussianNB
gaussianNB=GaussianNB()
gaussianNB.fit(train_X,train_y)

# 2 Cross-validate on the test set only.
# NOTE: cross-validation fits fresh clones of the estimator on folds of the
# data it is given, so these scores describe models trained on parts of the
# test set, not the model fitted on the train set above.
from sklearn.model_selection import cross_val_score, cross_validate
num_validations=5
# A single cross_validate call scores all four metrics on the same folds,
# instead of repeating the whole cross-validation once per metric.
cv_scores=cross_validate(gaussianNB,test_X,test_y,
                         scoring=['accuracy','precision_weighted',
                                  'recall_weighted','f1_weighted'],
                         cv=num_validations)
accuracy=cv_scores['test_accuracy']
print('准确率：{:.2f}%'.format(accuracy.mean()*100))
precision=cv_scores['test_precision_weighted']
print('精确度：{:.2f}%'.format(precision.mean()*100))
recall=cv_scores['test_recall_weighted']
print('召回率：{:.2f}%'.format(recall.mean()*100))
f1=cv_scores['test_f1_weighted']
print('F1  值：{:.2f}%'.format(f1.mean()*100))

# 3 Print the performance report of the model fitted on the train set.
from sklearn.metrics import confusion_matrix
y_pred=gaussianNB.predict(test_X)
confusion_mat = confusion_matrix(test_y, y_pred)
print(confusion_mat) # show what the confusion matrix looks like

from sklearn.metrics import classification_report
# Let sklearn print precision, recall and F1 for each class.
target_names = ['<=50K', '>50K']
print(classification_report(test_y, y_pred, target_names=target_names))

准确率：79.84%
精确度：78.45%
召回率：79.84%
F1  值：77.21%
[[6420  347]
 [1518  764]]
              precision    recall  f1-score   support

       <=50K       0.81      0.95      0.87      6767
        >50K       0.69      0.33      0.45      2282

    accuracy                           0.79      9049
   macro avg       0.75      0.64      0.66      9049
weighted avg       0.78      0.79      0.77      9049

