In [1]:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.utils import np_utils
from keras.models import Sequential,load_model,save_model
from keras.layers import Dense, Dropout, Activation,LeakyReLU
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras import backend as K
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
from scipy import sparse
import gc
import printTime as pt
import os
import time

Using TensorFlow backend.


In [2]:
pt.printTime()

2019-08-22 14:57:50


In [3]:
csr_trainData = sparse.load_npz(r'../trainTestData/trainData4000.npz')
print(csr_trainData.shape)

age_train = pd.read_csv(r'../data/age_train.csv',header=None)
label = age_train[1].values
print(label.shape)

(2010000, 4000)
(2010000,)


In [4]:
pt.printTime()

2019-08-22 14:57:54


In [6]:
seed = 7
np.random.seed(seed)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model_filePath = r'../model/NN_model_0_'
currK = 0
val_index_list, score = [], []
val_probability = np.zeros((2010000,7))

In [7]:
pt.printTime()
for train_index, val_index in kfold.split(csr_trainData,label):
    K.clear_session()
    trainData, trainLabel, valData, valLabel = csr_trainData[train_index,:], label[train_index], csr_trainData[val_index,:] , label[val_index] 
    trainLabel,valLabel = np_utils.to_categorical(trainLabel,num_classes=7),np_utils.to_categorical(valLabel,num_classes=7)
    print('----------------------------------------------------------------------------------------------------------------------------------')
    print(currK,'split Done!\n')
    
    # 全连接模型
    model = Sequential()
    model.add(Dense(4000, activation='tanh', input_shape=(csr_trainData.shape[1],)))
    model.add(Dense(2000, activation='relu'))
    model.add(Dense(1000, activation='sigmoid'))
    model.add(Dense(7, activation='softmax'))
    #损失函数使用交叉熵
    adam = Adam(lr=0.0003)
    model.compile(loss='categorical_crossentropy',
                  optimizer = adam,
                  metrics=['accuracy'])
    #模型训练
    batch_size = 1024
    epochs = 100
    early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=2)
    bestModel = ModelCheckpoint(model_filePath + str(currK) + r'.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    hist = model.fit(trainData, trainLabel,
                      batch_size=batch_size,
                      epochs=epochs,
                      verbose=1,
                      shuffle=True,
                      validation_data=(valData,valLabel),
                      callbacks=[early_stopping,bestModel],
                     ) 
    print('\n',currK,'train Done!')
    pt.printTime()
    
    K.clear_session()
    model = load_model(model_filePath + str(currK) + r'.h5')
    probability = model.predict(valData,batch_size=1024)
    val_probability[val_index,:] = probability
    
    score.append(np.max(hist.history['val_acc']))
    y_label = label[val_index]
    val_label = np.argmax(probability,axis=1) 
    print(currK,'val_acc:',accuracy_score(val_label,y_label),'\n\n')
    
    currK += 1
    K.clear_session()
    del trainData, valData, trainLabel,valLabel,model
    print('----------------------------------------------------------------------------------------------------------------------------------')
print('mean val_acc:', np.mean(score))
pt.printTime()

2019-08-22 14:57:54
----------------------------------------------------------------------------------------------------------------------------------
0 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping

 0 train Done!
2019-08-22 15:03:02
0 val_acc: 0.6173880597014926 


----------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------
1 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping

 1 train Done!
2019-08-22 15:08:16
1 val_acc: 0.6149452736318408 


----------------------------------------------------------------------------------------------------------------------------------
------------------------------

Epoch 00004: early stopping

 8 train Done!
2019-08-22 15:39:30
8 val_acc: 0.6149701492537314 


----------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------
9 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping

 9 train Done!
2019-08-22 15:44:39
9 val_acc: 0.618681592039801 


----------------------------------------------------------------------------------------------------------------------------------
mean val_acc: 0.615920895502935
2019-08-22 15:44:48


In [8]:
accuracy_score(np.argmax(val_probability,axis=1) ,label)

0.6159119402985075

In [9]:
del csr_trainData 
gc.collect()

46154

# 验证集

In [10]:
val_probability = pd.DataFrame(val_probability)
print(val_probability.shape)
print(val_probability.head())

(2010000, 7)
              0         1         2         3         4         5         6
0  1.189088e-08  0.002612  0.055671  0.103340  0.246336  0.523377  0.068664
1  2.392564e-08  0.560282  0.264407  0.048404  0.088724  0.025450  0.012733
2  5.367820e-08  0.004667  0.039767  0.200779  0.342570  0.323843  0.088374
3  3.582242e-07  0.057763  0.222979  0.355596  0.232047  0.086177  0.045438
4  8.326022e-09  0.092375  0.444669  0.330942  0.100234  0.026543  0.005238


In [11]:
val_probability.drop(labels=[0],axis=1,inplace=True)

In [12]:
val_probability.to_csv(r'../processed/val_probabilit_0.csv',header=None,index=False)

# 测试集

In [13]:
import os

In [14]:
model_file = r'../model/NN_model_0_'

In [15]:
csr_testData = sparse.load_npz(r'../trainTestData/testData4000.npz')
print(csr_testData.shape)

age_test = pd.read_csv(r'../data/age_test.csv',header=None,usecols=[0])

(502500, 4000)


In [16]:
pt.printTime()
proflag = True
model_Num = 0
for i in list(range(10)):
    model = load_model(model_file + str(i) + '.h5')
    if proflag==True:
        probability = model.predict(csr_testData,batch_size=1024,verbose=1)
        proflag = False
    else:
        probability += model.predict(csr_testData,batch_size=1024,verbose=1)
    model_Num += 1
    print(model_Num)
    K.clear_session()
    del model
pt.printTime()

2019-08-22 15:45:10
1
2
3
4
5
6
7
8
9
10
2019-08-22 15:47:28


In [18]:
probability /= model_Num
age = np.argmax(probability,axis=1)

In [19]:
age_test = pd.read_csv(r'../data/age_test.csv',header=None,usecols=[0])
age_test = age_test.values
type(age_test)

numpy.ndarray

In [20]:
print(probability.shape)
pro = np.column_stack((age_test,probability))
pro = pd.DataFrame(pro)
pro.drop(labels=[0,1],axis=1,inplace=True)
print(pro.shape)
pro.to_csv(r'../processed/test_probability_0.csv',index=False,header=False)

(502500, 7)
(502500, 6)


In [21]:
pt.printTime()

2019-08-22 15:47:33
