In [1]:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from keras.utils import np_utils
from keras.models import Sequential,load_model,save_model
from keras.layers import Dense, Dropout, Activation,LeakyReLU
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras import backend as K
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
from scipy import sparse
import gc
from time import strftime, localtime
import printTime as pt

Using TensorFlow backend.


In [3]:
# Print a timestamp before the training matrix is loaded, so the load
# duration can be read off the notebook output.
pt.printTime()

2019-08-21 16:58:04


In [4]:
# Load the training feature matrix from a SciPy sparse .npz file
# (presumably stored in CSR format, going by the variable name — confirm).
csr_trainData = sparse.load_npz(r'../trainTestData/trainData23100.npz')
# Run a garbage-collection pass after the load; as the last expression of the
# cell, the value returned by gc.collect() is shown as the cell output.
gc.collect()

15

In [5]:
# Print a second timestamp once the training matrix is in memory.
pt.printTime()

2019-08-21 16:58:51


In [6]:
# Read the training label file (no header row) and use column 1 as the target
# vector. Column 0 is presumably the sample id — confirm against the data.
age_train = pd.read_csv(r'../data/age_train.csv',header=None)
label = age_train[1].values
# Sanity check: number of labels read.
print(label.shape)

(2010000,)


In [7]:
import time

# Fixed seed; it is also passed to StratifiedKFold as random_state.
seed = 7
# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; the Keras/TensorFlow weight
# initialisation is presumably not covered by this seed — confirm if exact
# run-to-run reproducibility is required.
np.random.seed(seed)

In [8]:
# 10-fold stratified splitter, shuffled with the fixed seed so that the fold
# assignment is repeatable between runs.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [9]:
# Per-fold checkpoints are written to model_filePath + '<fold index>.h5'.
model_filePath = r'../model/NN_model23100_NN_'
# Index of the fold currently being trained.
currK = 0
val_index_list, score = [], []
# Out-of-fold class probabilities: one row per training sample, one column per
# class id (0-6). The row count is taken from the label vector instead of a
# hard-coded 2010000 so the buffer stays correct if the training set changes.
val_probability = np.zeros((label.shape[0],7))

In [10]:
import os
# Use the specified GPU: only device "0" is made visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
def _build_model(input_dim, num_classes=7):
    """Build and compile the fully connected classifier trained on each fold.

    Args:
        input_dim: Number of input features (columns of the feature matrix).
        num_classes: Width of the softmax output layer.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    # Fully connected model
    model = Sequential()
    model.add(Dense(3000, activation='tanh', input_shape=(input_dim,)))
    model.add(Dense(2000, activation='relu'))
    model.add(Dense(1000, activation='sigmoid'))
    model.add(Dense(num_classes, activation='softmax'))
    # Cross-entropy is used as the loss function
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0003),
                  metrics=['accuracy'])
    return model


# Stratified k-fold training. For every fold:
#   1. train a fresh network, checkpointing the epoch with the lowest val_loss;
#   2. reload that checkpoint and predict the held-out rows;
#   3. store those predictions in val_probability (out-of-fold probabilities)
#      and record the fold accuracy in score.
pt.printTime()
for train_index, val_index in kfold.split(csr_trainData,label):
    K.clear_session()
    trainData, valData = csr_trainData[train_index,:], csr_trainData[val_index,:]
    # One-hot encode the integer labels into 7 columns (class ids 0-6).
    trainLabel = np_utils.to_categorical(label[train_index],num_classes=7)
    valLabel = np_utils.to_categorical(label[val_index],num_classes=7)
    print('----------------------------------------------------------------------------------------------------------------------------------')
    print(currK,'split Done!\n')

    model = _build_model(csr_trainData.shape[1])

    # Model training
    batch_size = 10240
    epochs = 100
    fold_model_path = model_filePath + str(currK) + r'.h5'
    # Stop as soon as val_loss fails to improve for one epoch; only the best
    # epoch (by val_loss) is kept on disk.
    early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=2)
    bestModel = ModelCheckpoint(fold_model_path, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    hist = model.fit(trainData, trainLabel,
                     batch_size=batch_size,
                     epochs=epochs,
                     verbose=1,
                     shuffle=True,
                     validation_data=(valData,valLabel),
                     callbacks=[early_stopping,bestModel],
                     )
    print('\n',currK,'train Done!')
    pt.printTime()

    # Drop the last-epoch model and score the fold with the best checkpoint.
    K.clear_session()
    model = load_model(fold_model_path)
    probability = model.predict(valData,batch_size=1024)
    val_probability[val_index,:] = probability

    y_label = label[val_index]
    val_label = np.argmax(probability,axis=1)
    # The fold score is the accuracy of the reloaded checkpoint, i.e. of the
    # model that produced the stored out-of-fold probabilities. Taking
    # max(hist.history['val_acc']) instead could report an epoch other than
    # the checkpointed one, and that history key differs between Keras
    # versions ('val_acc' vs 'val_accuracy').
    fold_acc = accuracy_score(y_label,val_label)
    score.append(fold_acc)
    print(currK,'val_acc:',fold_acc,'\n\n')

    currK += 1
    K.clear_session()
    del trainData, valData, trainLabel,valLabel,model
    print('----------------------------------------------------------------------------------------------------------------------------------')
print('mean val_acc:', np.mean(score))
pt.printTime()

2019-08-21 16:58:52
----------------------------------------------------------------------------------------------------------------------------------
0 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping

 0 train Done!
2019-08-21 17:26:16
0 val_acc: 0.6428258706467662 


----------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------
1 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping

 1 train Done!
2019-08-21 17:59:51
1 val_acc: 0.6421940298507463 


-----------------------------------------------------------------------------------------------------------------------------

In [None]:
# Overall out-of-fold accuracy: the predicted class of each training row is
# the column with the highest stored probability.
oof_pred = np.argmax(val_probability,axis=1)
accuracy_score(label,oof_pred)

In [None]:
# Drop the reference to the training matrix so its memory can be reclaimed.
del csr_trainData

In [24]:
import gc 
# Force a garbage-collection pass; the value returned by gc.collect() is
# shown as the cell output.
gc.collect()

8

# 验证集

In [26]:
# Wrap the out-of-fold probability array in a DataFrame so that columns can
# be dropped by label and the table written to CSV.
val_probability = pd.DataFrame(val_probability)
# Quick look at the dimensions and the first rows.
print(val_probability.shape)
print(val_probability.head())

(2010000, 6)
          1         2         3         4         5         6
0  0.001006  0.016241  0.064233  0.182788  0.562305  0.173341
1  0.401959  0.411980  0.104222  0.061859  0.014824  0.005129
2  0.002896  0.061216  0.313714  0.304235  0.271872  0.045998
3  0.067550  0.295213  0.464418  0.107411  0.056571  0.008704
4  0.069203  0.627658  0.209653  0.058043  0.027779  0.007608


In [None]:
# Remove the class-0 probability column, keeping columns 1-6.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# idempotent: running it a second time is a no-op rather than a KeyError.
val_probability = val_probability.drop(columns=[0],errors='ignore')

In [None]:
# Write the out-of-fold probabilities to CSV without a header row or index column.
val_probability.to_csv(r'../processed/val_probability_23100.csv',header=None,index=False)

# 测试集

In [28]:
import os

In [29]:
# Path prefix of the per-fold checkpoints. It has to match the prefix the
# training loop saves under (model_filePath = '../model/NN_model23100_NN_');
# with the previous '../model/model23100_NN_' prefix, load_model() could not
# find the files this notebook writes.
model_file = r'../model/NN_model23100_NN_'

In [30]:
# Load the test feature matrix from a SciPy sparse .npz file
# (presumably stored in CSR format, going by the variable name — confirm).
csr_testData = sparse.load_npz(r'../trainTestData/testData23100.npz')
# Run a garbage-collection pass after the load; its return value is shown as
# the cell output.
gc.collect()

82

In [31]:
# Read only column 0 of the test file (no header row); presumably the sample
# ids — confirm against the data.
age_test = pd.read_csv(r'../data/age_test.csv',header=None,usecols=[0])

In [33]:
# Test-time ensembling: load each fold's checkpoint in turn, predict the test
# matrix, and accumulate the class probabilities. The sum is averaged in a
# later cell using model_Num.
pt.printTime()
# Running sum of the per-fold predictions; None until the first fold is done
# (replaces the former `proflag == True` sentinel).
probability = None
# Number of fold models accumulated so far.
model_Num = 0
for i in range(10):  # one checkpoint per fold, indices 0-9
    model = load_model(model_file + str(i) + '.h5')
    fold_probability = model.predict(csr_testData,batch_size=1024,verbose=1)
    if probability is None:
        probability = fold_probability
    else:
        probability += fold_probability
    model_Num += 1
    print(model_Num)
    # Clear the Keras session before the next checkpoint is loaded.
    K.clear_session()
    del model
pt.printTime()

2019-08-21 21:43:07
1
2
3
4
5
6
7
8
9
10
2019-08-21 21:57:42


In [34]:
# Display how many fold models were accumulated into the prediction sum.
model_Num

10

In [35]:
# Turn the summed fold predictions into their mean.
# NOTE(review): the division is in place, so re-running this cell divides the
# array again — run it exactly once after the accumulation loop.
probability /= model_Num
# Predicted class id per test row (index of the most probable column).
age = np.argmax(probability,axis=1)

In [36]:
# Re-read column 0 of the test file and keep it as a plain NumPy array.
age_test = pd.read_csv(r'../data/age_test.csv',header=None,usecols=[0]).values
# Show the resulting type as the cell output.
type(age_test)

numpy.ndarray

In [37]:
print(probability.shape)
# Put the age_test column in front of the averaged probabilities, then strip
# column 0 (the age_test column) and column 1 (class 0), which leaves the six
# probability columns for classes 1-6.
pro = pd.DataFrame(np.column_stack((age_test,probability)))
pro = pro.drop(columns=[0,1])
print(pro.shape)
# Write the table without a header row or index column.
pro.to_csv(r'../processed/test_probability_23100.csv',header=False,index=False)

(502500, 7)
(502500, 6)
