In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training table first, then the test table, from the local data folder.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [3]:
# Convert a categorical column into numeric codes.
def trans_prognois(col):
    """Encode each value of ``col`` as an integer code.

    Codes are positions in ``col.value_counts().index``; with pandas'
    default descending sort the most frequent label gets 0, the next 1,
    and so on.

    Parameters
    ----------
    col : pandas.Series
        Column of labels to encode.

    Returns
    -------
    list of int
        One code per element of ``col``, in the original row order.

    Raises
    ------
    ValueError
        If an element has no code, e.g. a missing value, which
        ``value_counts`` leaves out of its index by default.
    """
    # Build the label -> code table once; a dict lookup per row replaces the
    # linear ``list.index`` scan that would otherwise be repeated for every row.
    code_of = {label: code for code, label in enumerate(col.value_counts().index)}
    try:
        return [code_of[label] for label in col]
    except KeyError as exc:
        # Keep the exception type ``list.index`` raises for unknown values.
        raise ValueError(f"{exc.args[0]!r} is not in list") from exc

In [4]:
# Select matplotlib's inline backend so figures render in the cell output.
# NOTE(review): no plotting call appears in the visible cells — confirm this is still needed.
%matplotlib inline

In [5]:
# Define X and y.
# y: integer codes for the prognosis labels.
y = trans_prognois(train_data['prognosis'])
# X: every column except the last one
# (presumably the last column is the `prognosis` target — confirm against the CSV).
X = train_data.iloc[:, slice(None, -1)]

In [6]:
def preprocess_data(df, sc=False):
    """Return a model-ready copy of ``df`` without its ``id`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain an ``id`` column. It is left unmodified.
    sc : bool, default False
        When true, standardise every remaining column with scikit-learn's
        ``StandardScaler``, fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        New frame with ``id`` removed (and scaled values when ``sc`` is true),
        keeping the index and the remaining column labels of ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``id`` column.
    """
    # drop() without inplace returns a new frame, so the caller's data is not
    # mutated and calling this again on the same input does not fail.
    features = df.drop(columns=['id'])

    if sc:
        from sklearn.preprocessing import StandardScaler

        # Separate name for the scaler so the flag argument is not overwritten.
        scaler = StandardScaler()
        # Rebuild the frame from the scaled array rather than assigning through
        # .iloc, which would push floats into the existing column dtypes.
        features = pd.DataFrame(
            scaler.fit_transform(features),
            index=features.index,
            columns=features.columns,
        )

    return features

In [7]:
# Pass a copy so X keeps its columns: preprocessing drops `id`, and handing over
# X itself would make this cell fail with KeyError when it is run a second time.
X_train = preprocess_data(X.copy())

In [8]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with probability estimates enabled
# and a fixed random seed.
svc_options = {'kernel': 'rbf', 'probability': True, 'random_state': 318}
svc = SVC(**svc_options)
svc.fit(X_train, y)

SVC(probability=True, random_state=318)

In [9]:
from sklearn.model_selection import cross_val_score
# Mean cross-validated score of the SVC (default splits and scoring).
svc_fold_scores = cross_val_score(svc, X_train, y)
print(svc_fold_scores.mean())

0.32259514534012584


In [10]:
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,CategoricalNB

# One instance of each naive Bayes variant, all with default settings.
mnb, cnb, bnb, ctnb = MultinomialNB(), ComplementNB(), BernoulliNB(), CategoricalNB()

# Mean cross-validated score of the Bernoulli variant.
bnb_fold_scores = cross_val_score(bnb, X_train, y)
print(bnb_fold_scores.mean())

0.31830985915492954


In [11]:
# Model ensembling: combine the SVC with two naive Bayes models in a voting classifier.
from sklearn.ensemble import VotingClassifier

ensemble_members = [('svc', svc), ('bnb', bnb), ('ctnb', ctnb)]
vc = VotingClassifier(ensemble_members)

# Mean cross-validated score of the ensemble.
vc_fold_scores = cross_val_score(vc, X_train, y)
print(vc_fold_scores.mean())

0.31830985915492954


In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the voting classifier: the voting rule and the
# per-estimator weights (one weight per ensemble member).
params = dict(
    voting=['hard', 'soft'],
    weights=[(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)],
)

# Exhaustive search over every voting/weights combination.
grid = GridSearchCV(vc, params)
grid.fit(X_train, y)

GridSearchCV(estimator=VotingClassifier(estimators=[('svc',
                                                     SVC(probability=True,
                                                         random_state=318)),
                                                    ('bnb', BernoulliNB()),
                                                    ('ctnb', CategoricalNB())]),
             param_grid={'voting': ['hard', 'soft'],
                         'weights': [(1, 1, 1), (2, 1, 1), (1, 2, 1),
                                     (1, 1, 2)]})

In [13]:
# Parameter combination the grid search selected as best.
grid.best_params_

{'voting': 'soft', 'weights': (2, 1, 1)}

In [14]:
# Mean cross-validated score of the best parameter combination.
grid.best_score_

0.3183198481670163

In [15]:
# Make predictions: build the test feature matrix by removing the id column.
X_new = test_data.drop('id', axis=1)
X_new

Unnamed: 0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,lymph_swells,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
301,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Build the predicted-probability matrix: one row per test sample, one column per class.
class_probabilities = grid.predict_proba(X_new)
# Column names come from value_counts() order, the same order trans_prognois
# uses to assign the integer codes the model was trained on.
class_labels = list(train_data['prognosis'].value_counts().index)
grid_preds = pd.DataFrame(class_probabilities, columns=class_labels)
grid_preds

Unnamed: 0,West_Nile_fever,Japanese_encephalitis,Tungiasis,Rift_Valley_fever,Chikungunya,Dengue,Yellow_Fever,Zika,Plague,Lyme_disease,Malaria
0,0.044592,0.075673,0.467777,0.178590,0.016351,0.120636,0.032522,0.036827,0.021202,0.001645,0.004185
1,0.032903,0.026541,0.012884,0.048053,0.592589,0.208271,0.014464,0.010720,0.025559,0.002947,0.025070
2,0.456074,0.203248,0.026341,0.062768,0.006880,0.022668,0.052873,0.051979,0.048855,0.029851,0.038463
3,0.041228,0.243734,0.110901,0.250620,0.014329,0.085473,0.143346,0.053010,0.025959,0.005705,0.025695
4,0.244376,0.138837,0.011182,0.045984,0.004272,0.033538,0.082659,0.161535,0.162686,0.024429,0.090503
...,...,...,...,...,...,...,...,...,...,...,...
298,0.072106,0.072512,0.026267,0.018595,0.003143,0.003811,0.371744,0.321870,0.077408,0.008931,0.023613
299,0.031737,0.012060,0.003227,0.009298,0.002485,0.003655,0.011812,0.017243,0.049007,0.362542,0.496934
300,0.091205,0.040017,0.013703,0.023892,0.002779,0.014038,0.023339,0.020567,0.046124,0.673927,0.050410
301,0.485117,0.104172,0.021895,0.114666,0.004466,0.047229,0.032827,0.099941,0.049563,0.027030,0.013094


In [17]:
def _three_most_likely(row):
    """Return the labels of the three largest values in ``row``, space-separated, largest first."""
    best_three = row.nlargest(3)
    return ' '.join(best_three.index.tolist())


# One space-separated string of the three most probable classes per test row.
top3_cols = grid_preds.apply(_three_most_likely, axis=1)
top3_cols

0                     Tungiasis Rift_Valley_fever Dengue
1                   Chikungunya Dengue Rift_Valley_fever
2      West_Nile_fever Japanese_encephalitis Rift_Val...
3      Rift_Valley_fever Japanese_encephalitis Yellow...
4                            West_Nile_fever Plague Zika
                             ...                        
298                             Yellow_Fever Zika Plague
299                          Malaria Lyme_disease Plague
300                 Lyme_disease West_Nile_fever Malaria
301    West_Nile_fever Rift_Valley_fever Japanese_enc...
302              Zika Yellow_Fever Japanese_encephalitis
Length: 303, dtype: object

In [18]:
# Re-read the test file for its id column, pair each id with its top-3 labels,
# and write the result as a CSV indexed by id.
t = pd.read_csv('data/test.csv')
submission = pd.DataFrame({'id': t['id'], 'prognosis': top3_cols})
submission = submission.set_index('id')
submission.to_csv('svc_naivebayes_ver5.0.csv')

- 0.39183