In [1]:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from keras.utils import np_utils
from keras.models import Sequential,load_model,save_model
from keras.layers import Dense, Dropout, Activation,LeakyReLU
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras import backend as K
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
from scipy import sparse
import gc
from time import strftime, localtime
import printTime as pt

Using TensorFlow backend.


In [3]:
# Print a timestamp (project-local printTime helper) before the training matrix is loaded.
pt.printTime()

2019-08-21 16:56:58


In [4]:
# Load the sparse training feature matrix from disk, then run a garbage-collection
# pass; the collect() return value is displayed as the cell output.
train_matrix_path = r'../trainTestData/trainData13100.npz'
csr_trainData = sparse.load_npz(train_matrix_path)
gc.collect()

15

In [5]:
# Print a timestamp once the training matrix has been loaded.
pt.printTime()

2019-08-21 16:57:40


In [6]:
# Read the headerless training label file and keep column 1 as a NumPy array.
age_train = pd.read_csv(r'../data/age_train.csv', header=None)
label = age_train.loc[:, 1].values
print(label.shape)

(2010000,)


In [7]:
import time

# Fixed seed used for NumPy's global RNG here and for the StratifiedKFold splitter below.
# NOTE(review): only NumPy is seeded in this cell; the Keras/TensorFlow backend is not,
# so weight initialisation may still differ between runs — confirm whether that is intended.
seed = 7
np.random.seed(seed)

In [8]:
# Stratified 10-fold splitter; shuffling is made repeatable through the fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [9]:
# Filename prefix for the per-fold checkpoints; the fold index and '.h5' are appended later.
model_filePath = r'../model/model13100_NN_'
# Index of the fold currently being trained.
currK = 0
# Per-fold bookkeeping lists (val_index_list is initialised here but not filled in this notebook).
val_index_list, score = [], []
# Out-of-fold class probabilities: one row per training label, 7 class columns.
# The row count is taken from `label` instead of a hard-coded 2010000 so the buffer
# always matches the data that was actually loaded.
val_probability = np.zeros((label.shape[0], 7))

In [10]:
import os
# Use the specified GPU: expose only CUDA device index 1 to this process.
# NOTE(review): this is set after Keras/TensorFlow were imported; it presumably only
# takes effect if no GPU session has been created yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [11]:
# Cross-validated training. For every fold: build a fresh fully connected network,
# train it with early stopping, reload the checkpoint with the lowest validation loss,
# store its out-of-fold probabilities and record its validation accuracy.
pt.printTime()
for train_index, val_index in kfold.split(csr_trainData,label):
    # Discard any graph left over from the previous fold before building a new model.
    K.clear_session()
    trainData, trainLabel, valData, valLabel = csr_trainData[train_index,:], label[train_index], csr_trainData[val_index,:] , label[val_index] 
    # One-hot encode the labels into 7 classes for the softmax output.
    trainLabel,valLabel = np_utils.to_categorical(trainLabel,num_classes=7),np_utils.to_categorical(valLabel,num_classes=7)
    print('----------------------------------------------------------------------------------------------------------------------------------')
    print(currK,'split Done!\n')
    
    # Fully connected model
    model = Sequential()
    model.add(Dense(3000, activation='tanh', input_shape=(csr_trainData.shape[1],)))
    model.add(Dense(2000, activation='relu'))
    model.add(Dense(1000, activation='sigmoid'))
    model.add(Dense(7, activation='softmax'))
    # Cross-entropy is used as the loss function
    adam = Adam(lr=0.0003)
    model.compile(loss='categorical_crossentropy',
                  optimizer = adam,
                  metrics=['accuracy'])
    # Model training
    batch_size = 10240
    epochs = 100
    # Stop as soon as val_loss fails to improve for one epoch; keep only the best checkpoint.
    early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=2)
    bestModel = ModelCheckpoint(model_filePath + str(currK) + r'.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    hist = model.fit(trainData, trainLabel,
                      batch_size=batch_size,
                      epochs=epochs,
                      verbose=1,
                      shuffle=True,
                      validation_data=(valData,valLabel),
                      callbacks=[early_stopping,bestModel],
                     ) 
    print('\n',currK,'train Done!')
    pt.printTime()
    
    # Reload the best checkpoint (the in-memory model is the last epoch, not the best one)
    # and use it for the out-of-fold predictions.
    K.clear_session()
    model = load_model(model_filePath + str(currK) + r'.h5')
    probability = model.predict(valData,batch_size=1024)
    val_probability[val_index,:] = probability
    
    # Score the fold with the restored checkpoint itself. Previously the maximum of
    # hist.history['val_acc'] was recorded, which can belong to a different epoch than
    # the saved model and depends on a history key that differs between Keras versions.
    y_label = label[val_index]
    val_label = np.argmax(probability,axis=1) 
    fold_acc = accuracy_score(y_label, val_label)
    score.append(fold_acc)
    print(currK,'val_acc:',fold_acc,'\n\n')
    
    currK += 1
    K.clear_session()
    # Release the per-fold slices and the model before the next fold.
    del trainData, valData, trainLabel,valLabel,model
    print('----------------------------------------------------------------------------------------------------------------------------------')
print('mean val_acc:', np.mean(score))
pt.printTime()

2019-08-21 16:57:41
----------------------------------------------------------------------------------------------------------------------------------
0 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping

 0 train Done!
2019-08-21 17:15:20
0 val_acc: 0.6394825870646766 


----------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------
1 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping

 1 train Done!
2019-08-21 17:36:56
1 val_acc: 0.6382636815920398 


-----------------------------------------------------------------------------------------------------------------------------

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping

 7 train Done!
2019-08-21 19:24:04
7 val_acc: 0.640049751243781 


----------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------
8 split Done!

Train on 1809000 samples, validate on 201000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping

 8 train Done!
2019-08-21 19:42:55
8 val_acc: 0.6401243781094528 


----------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------
9 split Done!

Train on 1809000 samples, v

In [12]:
# Overall out-of-fold accuracy: arg-max class of the stored fold predictions compared with the labels.
accuracy_score(np.argmax(val_probability,axis=1) ,label)

0.6396512437810945

In [13]:
# Drop the reference to the training matrix so its memory can be reclaimed.
del csr_trainData

In [14]:
import gc 
# Force a garbage-collection pass; the number returned by collect() is shown as the cell output.
gc.collect()

81634

# 验证集

In [15]:
# Wrap the out-of-fold probability array in a DataFrame (rebinding the same name)
# and print its shape and first rows as a sanity check.
val_probability = pd.DataFrame(val_probability)
print(val_probability.shape)
print(val_probability.head())

(2010000, 7)
          0         1         2         3         4         5         6
0  0.000173  0.000918  0.018802  0.098206  0.199636  0.526218  0.156049
1  0.000027  0.506544  0.360741  0.056775  0.053092  0.016939  0.005883
2  0.000097  0.002695  0.049279  0.454928  0.249025  0.170397  0.073578
3  0.000105  0.067980  0.288192  0.455700  0.119740  0.058952  0.009331
4  0.000066  0.060741  0.533396  0.257816  0.098826  0.035453  0.013703


In [16]:
# Remove the class-0 probability column in place, leaving columns 1-6.
# NOTE(review): presumably the labels start at 1 so class 0 never occurs — confirm.
# Re-running this cell raises KeyError because column 0 has already been removed.
val_probability.drop(labels=[0],axis=1,inplace=True)

In [17]:
# Save the out-of-fold probabilities as CSV without a header row or index column.
val_probability.to_csv(r'../processed/val_probability_13100.csv',header=None,index=False)

# 测试集

In [18]:
import os

In [19]:
# Checkpoint filename prefix; same value as model_filePath used when the fold models were saved.
model_file = r'../model/model13100_NN_'

In [20]:
# Load the sparse test feature matrix from disk, then run a garbage-collection
# pass; the collect() return value is displayed as the cell output.
test_matrix_path = r'../trainTestData/testData13100.npz'
csr_testData = sparse.load_npz(test_matrix_path)
gc.collect()

15

In [21]:
# Read only the first column of the headerless test file (presumably the sample ids — confirm).
age_test = pd.read_csv(r'../data/age_test.csv',header=None,usecols=[0])

In [None]:
# Sum the test-set class probabilities predicted by the ten saved fold models.
# `probability` holds the running sum and `model_Num` counts the models added so far.
pt.printTime()
proflag = True  # stays True until the first model's prediction has been stored
model_Num = 0
for i in range(10):
    model = load_model(model_file + str(i) + '.h5')
    if not proflag:
        # Later models: accumulate in place onto the running sum.
        probability += model.predict(csr_testData, batch_size=1024, verbose=1)
    else:
        # First model: its prediction array becomes the accumulator.
        probability = model.predict(csr_testData, batch_size=1024, verbose=1)
        proflag = False
    model_Num += 1
    print(model_Num)
    # Release the backend graph and the model object before loading the next checkpoint.
    K.clear_session()
    del model
pt.printTime()

2019-08-21 20:25:39
1
2
3
4

In [28]:
# Display how many models were accumulated into `probability`.
model_Num

10

In [None]:
# Turn the summed predictions into a mean (in place) and take the most likely class per row.
# Note: the division mutates `probability`, so running this cell twice divides twice.
probability /= model_Num
age = probability.argmax(axis=1)

In [None]:
# Re-read the first column of the headerless test file directly as a NumPy array
# and display the resulting type.
age_test = pd.read_csv(r'../data/age_test.csv', header=None, usecols=[0]).values
type(age_test)

In [None]:
# Stack the id column next to the averaged probabilities, then discard column 0 (the ids)
# and column 1 (the class-0 probability) and write the remaining columns to CSV.
print(probability.shape)
stacked = np.column_stack((age_test, probability))
pro = pd.DataFrame(stacked).drop(labels=[0, 1], axis=1)
print(pro.shape)
pro.to_csv(r'../processed/test_probability_13100.csv', index=False, header=False)