In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

import pandas as pd

In [2]:
# Load the dataset: read the training CSV into a DataFrame.
datas = pd.read_csv('./data/train.csv')

In [3]:
# Separate the target column from the feature columns.
labels = datas['label']
train_datas = datas.drop(columns=['label'])
# NOTE(review): in the printed frame `level_0` and `index` simply mirror the
# row number, so they look like row identifiers rather than features, yet
# they remain in the feature frame -- confirm whether that is intended.

print(train_datas)

      level_0  index         a      b    c   d
0           0      0  0.669335   95.0  NaN  MT
1           1      1  1.047217   -1.0    I   Z
2           2      2  1.661245  -43.0  III   F
3           3      3  0.377449   44.0   IV   Z
4           4      4  0.507757   81.0  VII  MT
5           5      5  7.071860    NaN   IV  FI
6           6      6  0.403623   25.0   IV   Z
7           7      7  2.827981  -58.0   IV   T
8           8      8  6.521888    NaN   IV   Z
9           9      9  0.994477   41.0   VI   F
10         10     10  1.537824  -25.0   IV  FI
11         11     11  0.648144   89.0   IV   T
12         12     12  0.844189  100.0   IV   T
13         13     13  2.159246   71.0    I   O
14         14     14  2.494773  -46.0  III   F
15         15     15  2.144090   59.0    I   T
16         16     16  1.935107   73.0   IV   Z
17         17     17  1.032773   37.0    V  FI
18         18     18  1.168332   92.0  VII   O
19         19     19  1.221787    1.0  VII  MT
20         20

In [4]:
# Check data completeness: count the missing values in columns a, b, c and d
# and print the four counts on one line, in that order.
na_counts = train_datas[['a', 'b', 'c', 'd']].isna().sum()
print(*na_counts)

0 200 51 0


因为 `b` 列的缺失值较多（共 200 个），所以直接舍弃该列。

In [5]:
# Remove column `b` from the feature frame, then show the result.
train_datas = train_datas.drop(columns=['b'])
print(train_datas)

      level_0  index         a    c   d
0           0      0  0.669335  NaN  MT
1           1      1  1.047217    I   Z
2           2      2  1.661245  III   F
3           3      3  0.377449   IV   Z
4           4      4  0.507757  VII  MT
5           5      5  7.071860   IV  FI
6           6      6  0.403623   IV   Z
7           7      7  2.827981   IV   T
8           8      8  6.521888   IV   Z
9           9      9  0.994477   VI   F
10         10     10  1.537824   IV  FI
11         11     11  0.648144   IV   T
12         12     12  0.844189   IV   T
13         13     13  2.159246    I   O
14         14     14  2.494773  III   F
15         15     15  2.144090    I   T
16         16     16  1.935107   IV   Z
17         17     17  1.032773    V  FI
18         18     18  1.168332  VII   O
19         19     19  1.221787  VII  MT
20         20     20  2.059488   II   F
21         21     21  1.056337  VII  MT
22         22     22  0.757278   IV   Z
23         23     23  4.306444   VI  MT


查看 `c`、`d` 两列中出现过的字符串取值（类别）。

In [6]:
# Print the set of distinct values found in each of the columns `c` and `d`.
for col in ('c', 'd'):
    print(col + ': ', set(train_datas[col]))

c:  {nan, 'VII', 'XI', 'X', 'I', 'V', 'IX', 'II', 'IV', 'VI', 'III'}
d:  {'O', 'MO', 'MT', 'FI', 'TH', 'Z', 'F', 'T'}


In [7]:
# Replace the missing entries of column `c` with the integer 0.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the train_datas['c'] selection: that chained
# in-place call acts on a temporary object under pandas Copy-on-Write and
# would leave train_datas itself unchanged.
train_datas['c'] = train_datas['c'].fillna(0)
print(train_datas['c'])

0         0
1         I
2       III
3        IV
4       VII
5        IV
6        IV
7        IV
8        IV
9        VI
10       IV
11       IV
12       IV
13        I
14      III
15        I
16       IV
17        V
18      VII
19      VII
20       II
21      VII
22       IV
23       VI
24      VII
25      VII
26      VII
27       VI
28        V
29        X
       ... 
995      IV
996       I
997       X
998       V
999       X
1000      X
1001      V
1002      I
1003      0
1004      V
1005    VII
1006     IX
1007      X
1008     II
1009    VII
1010      X
1011     IV
1012     IV
1013      I
1014     IV
1015     IX
1016    VII
1017      0
1018    VII
1019    VII
1020     VI
1021     IV
1022      I
1023      V
1024      X
Name: c, Length: 1025, dtype: object


将 `c`、`d` 两列中的字符串类别映射为整数编码。

In [8]:
# Integer code for every Roman-numeral string that occurs in column `c`.
# The codes are arbitrary identifiers, not the numeric value of the numeral
# (e.g. 'XI' -> 1, 'I' -> 2). Values absent from the mapping are left as-is.
c_codes = {
    'XI': 1, 'I': 2, 'III': 3, 'X': 4, 'VI': 5,
    'IX': 6, 'VII': 7, 'V': 8, 'IV': 9, 'II': 10,
}
# The outer key restricts the replacement to column `c`; the frame is
# modified in place.
train_datas.replace({'c': c_codes}, inplace=True)


In [9]:
# Show the current state of the feature frame.
print(train_datas)

      level_0  index         a   c   d
0           0      0  0.669335   0  MT
1           1      1  1.047217   2   Z
2           2      2  1.661245   3   F
3           3      3  0.377449   9   Z
4           4      4  0.507757   7  MT
5           5      5  7.071860   9  FI
6           6      6  0.403623   9   Z
7           7      7  2.827981   9   T
8           8      8  6.521888   9   Z
9           9      9  0.994477   5   F
10         10     10  1.537824   9  FI
11         11     11  0.648144   9   T
12         12     12  0.844189   9   T
13         13     13  2.159246   2   O
14         14     14  2.494773   3   F
15         15     15  2.144090   2   T
16         16     16  1.935107   9   Z
17         17     17  1.032773   8  FI
18         18     18  1.168332   7   O
19         19     19  1.221787   7  MT
20         20     20  2.059488  10   F
21         21     21  1.056337   7  MT
22         22     22  0.757278   9   Z
23         23     23  4.306444   5  MT
24         24     24  7.4

In [10]:
# Integer code for every string that occurs in column `d`; the codes are
# arbitrary identifiers. Values absent from the mapping are left as-is.
d_codes = {
    'TH': 1, 'F': 2, 'Z': 3, 'T': 4,
    'MO': 5, 'MT': 6, 'O': 7, 'FI': 8,
}
# The outer key restricts the replacement to column `d`; the frame is
# modified in place.
train_datas.replace({'d': d_codes}, inplace=True)

In [11]:
# Show the current state of the feature frame.
print(train_datas)

      level_0  index         a   c  d
0           0      0  0.669335   0  6
1           1      1  1.047217   2  3
2           2      2  1.661245   3  2
3           3      3  0.377449   9  3
4           4      4  0.507757   7  6
5           5      5  7.071860   9  8
6           6      6  0.403623   9  3
7           7      7  2.827981   9  4
8           8      8  6.521888   9  3
9           9      9  0.994477   5  2
10         10     10  1.537824   9  8
11         11     11  0.648144   9  4
12         12     12  0.844189   9  4
13         13     13  2.159246   2  7
14         14     14  2.494773   3  2
15         15     15  2.144090   2  4
16         16     16  1.935107   9  3
17         17     17  1.032773   8  8
18         18     18  1.168332   7  7
19         19     19  1.221787   7  6
20         20     20  2.059488  10  2
21         21     21  1.056337   7  6
22         22     22  0.757278   9  3
23         23     23  4.306444   5  6
24         24     24  7.445429   7  6
25         2

将带标签的数据按约 67% / 33% 的比例划分为训练集和验证集（验证集仅用于本地评估，不同于后面加载的 `test.csv`）。

In [12]:
# Hold out 33% of the rows (features and labels split consistently) for
# evaluation; the fixed random_state makes the shuffle reproducible.
train_, test_, train_labels, test_labels = train_test_split(
    train_datas,
    labels,
    test_size=0.33,
    random_state=42,
)

In [13]:
# Create a Gaussian naive Bayes classifier with its default settings.
gnb = GaussianNB()


训练模型

In [14]:
# Fit the classifier on the training part of the split. As the last
# expression of the cell, the value returned by fit() is displayed.
gnb.fit(train_, train_labels)

GaussianNB(priors=None)

利用模型预测

In [15]:
# Predict a class for every row of the held-out part of the split and
# print the predictions.
preds = gnb.predict(test_)
print(preds)

[0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1
 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0
 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0
 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0
 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 1
 0 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1
 0 1 1 1 0 0]


In [16]:
# Accuracy of the predictions on the held-out rows, compared against their
# true labels.
holdout_accuracy = accuracy_score(y_true=test_labels, y_pred=preds)
print(holdout_accuracy)

0.522123893805


加载测试数据，并做与训练数据相同的预处理（删除 `b` 列、填充 `c` 列缺失值、对 `c`、`d` 两列做整数编码）。

In [17]:
# Load the test set and apply the preprocessing steps used on the training
# features: drop `b`, fill missing `c` with 0, and integer-encode `c` and `d`
# with the same code tables.
td = pd.read_csv('./data/test.csv')
td = td.drop('b', axis=1)
# Assign the filled column back rather than calling fillna(inplace=True) on
# the td['c'] selection: that chained in-place call acts on a temporary
# object under pandas Copy-on-Write and would leave td itself unchanged.
td['c'] = td['c'].fillna(0)
td = td.replace({'c': {'XI': 1, 'I': 2, 'III': 3, 'X': 4, 'VI': 5,
                       'IX': 6, 'VII': 7, 'V': 8, 'IV': 9, 'II': 10}})
td = td.replace({'d': {'TH': 1, 'F': 2, 'Z': 3, 'T': 4,
                       'MO': 5, 'MT': 6, 'O': 7, 'FI': 8}})
print(td)

      level_0  index         a   c  d
0        1024   1024  1.092965   4  3
1        1025   1025  1.086392   5  2
2        1026   1026  2.582640   4  3
3        1027   1027  1.044559   9  7
4        1028   1028  2.267531   8  5
5        1029   1029  2.099086   7  4
6        1030   1030  1.043413   7  6
7        1031   1031  2.797659   5  6
8        1032   1032  6.613535   9  8
9        1033   1033  0.471128   4  7
10       1034   1034  0.373194   9  8
11       1035   1035  2.181271   0  6
12       1036   1036  2.700579   9  3
13       1037   1037  4.148710   4  7
14       1038   1038  2.705647   5  1
15       1039   1039  0.811107   8  2
16       1040   1040  1.049189   7  6
17       1041   1041  7.072700   4  7
18       1042   1042  0.835750   2  7
19       1043   1043  2.317238   0  6
20       1044   1044  7.181690   9  3
21       1045   1045  0.609248   4  3
22       1046   1046  7.494870   0  6
23       1047   1047  4.071359   8  5
24       1048   1048  1.036212   5  6
25       104

In [18]:
# Predict a class for every row of the preprocessed test frame and print
# the predictions.
my_preds = gnb.predict(td)
print(my_preds)

[0 0 0 ..., 0 0 0]


In [19]:
# Build the result table: the `index` value of each test row next to its
# predicted class. The prediction column is named 'label' (previously
# misspelled 'lable'), matching the target column name of the training data.
d = {'index': td['index'], 'label': my_preds}
df = pd.DataFrame(data=d)
print(df)

      index  lable
0      1024      0
1      1025      0
2      1026      0
3      1027      0
4      1028      0
5      1029      0
6      1030      0
7      1031      0
8      1032      0
9      1033      0
10     1034      0
11     1035      1
12     1036      0
13     1037      0
14     1038      0
15     1039      0
16     1040      0
17     1041      0
18     1042      0
19     1043      1
20     1044      0
21     1045      0
22     1046      1
23     1047      0
24     1048      0
25     1049      0
26     1050      0
27     1051      0
28     1052      0
29     1053      0
...     ...    ...
994    2018      0
995    2019      0
996    2020      0
997    2021      0
998    2022      0
999    2023      0
1000   2024      0
1001   2025      0
1002   2026      0
1003   2027      0
1004   2028      0
1005   2029      0
1006   2030      0
1007   2031      0
1008   2032      0
1009   2033      0
1010   2034      0
1011   2035      0
1012   2036      0
1013   2037      0
1014   2038 

In [20]:
# Write the predictions to disk. index=False stops pandas from emitting its
# positional row index as an extra unnamed first column; the frame already
# carries the row identifiers in its own 'index' column.
df.to_csv('./data/my_preds.csv', index=False)