In [None]:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from keras.utils import np_utils
from keras.models import Sequential,load_model,save_model
from keras.layers import Dense, Dropout, Activation,LeakyReLU
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras import backend as K
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
from scipy import sparse
import gc
from time import strftime, localtime

In [None]:
def printTime():
    """Print the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp = strftime("%Y-%m-%d %H:%M:%S", localtime())
    print(timestamp)

In [None]:
# Print a timestamp before the data is loaded.
printTime()

In [None]:
# Load the saved sparse training feature matrix and store it as float32 CSR.
train_matrix_path = r'../trainTestData/trainData15112.npz'
csr_trainData = sparse.csr_matrix(sparse.load_npz(train_matrix_path), dtype=np.float32)
csr_trainData.shape

In [None]:
# Read the headerless label file; the column at position 1 is used as the target.
age_train = pd.read_csv(r'../data/age_train.csv', header=None)
label = age_train.loc[:, 1].values
print(label.shape)

In [None]:
import time

# Seed NumPy's global RNG; the same value is reused as random_state for the
# StratifiedKFold split.
seed = 7
np.random.seed(seed)

In [None]:
# 10-fold stratified split, shuffled with the fixed seed so the fold assignment is repeatable.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [None]:
# Filename prefix of the per-fold checkpoints; the fold index and '.h5' are appended.
model_filePath = r'../model/model15112_NN_'
# Index of the fold currently being trained.
currK = 0
val_index_list, score = [], []
# Out-of-fold class probabilities: one row per labelled sample, one column per
# softmax output. The row count is taken from the labels rather than hard-coded,
# so the buffer always lines up with `label` when the overall accuracy is computed.
val_probability = np.zeros((len(label),7))

In [None]:
printTime()

# Hyper-parameters are the same for every fold, so they are set once up front.
batch_size = 1024
epochs = 100

for train_index, val_index in kfold.split(csr_trainData,label):
    # Reset the Keras backend session so each fold builds its network from scratch.
    K.clear_session()
    trainData, valData = csr_trainData[train_index,:], csr_trainData[val_index,:]
    # One-hot encode the integer labels into 7 columns to match the softmax layer.
    trainLabel = np_utils.to_categorical(label[train_index],num_classes=7)
    valLabel = np_utils.to_categorical(label[val_index],num_classes=7)
    print('----------------------------------------------------------------------------------------------------------------------------------')
    print(currK,'split Done!\n')

    # Fully connected network: 4000 (tanh) -> 2000 (relu) -> 1000 (sigmoid) -> 7 (softmax).
    model = Sequential()
    model.add(Dense(4000, activation='tanh', input_shape=(csr_trainData.shape[1],)))
    model.add(Dense(2000, activation='relu'))
    model.add(Dense(1000, activation='sigmoid'))
    model.add(Dense(7, activation='softmax'))
    # Categorical cross-entropy loss, optimised with Adam.
    adam = Adam(lr=0.0003)
    model.compile(loss='categorical_crossentropy',
                  optimizer = adam,
                  metrics=['accuracy'])

    # Train with early stopping on val_loss (patience 5) and keep only the
    # checkpoint with the lowest val_loss for this fold.
    checkpoint_path = model_filePath + str(currK) + r'.h5'
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=2)
    bestModel = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    hist = model.fit(trainData, trainLabel,
                      batch_size=batch_size,
                      epochs=epochs,
                      verbose=1,
                      shuffle=True,
                      validation_data=(valData,valLabel),
                      callbacks=[early_stopping,bestModel],
                     )
    print('\n',currK,'train Done!')
    printTime()

    # Predict with the saved best checkpoint rather than the last-epoch weights.
    K.clear_session()
    model = load_model(checkpoint_path)
    probability = model.predict(valData,batch_size=batch_size)
    val_probability[val_index,:] = probability

    # Record the accuracy of the checkpoint that was actually saved. The previous
    # max-over-epochs history value could belong to a different epoch than the
    # saved model, and its history key name depends on the Keras version.
    y_label = label[val_index]
    val_label = np.argmax(probability,axis=1)
    fold_acc = accuracy_score(y_label,val_label)
    score.append(fold_acc)
    print(currK,'val_acc:',fold_acc,'\n\n')

    currK += 1
    K.clear_session()
    # Release the per-fold slices and the model before the next fold starts.
    del trainData, valData, trainLabel,valLabel,model
    print('----------------------------------------------------------------------------------------------------------------------------------')
print('mean val_acc:', np.mean(score))
printTime()

In [None]:
# Accuracy of the stitched-together out-of-fold predictions against all labels.
oof_pred = np.argmax(val_probability,axis=1)
accuracy_score(oof_pred,label)

In [None]:
# Drop the reference to the training matrix so its memory can be reclaimed.
del csr_trainData

In [None]:
import gc 
# Force a garbage-collection pass now that the training matrix has been deleted.
gc.collect()

# 验证集

In [None]:
# Wrap the out-of-fold probability array in a DataFrame and print its shape and first rows.
val_probability = pd.DataFrame(data=val_probability)
for summary in (val_probability.shape, val_probability.head()):
    print(summary)

In [None]:
# Remove column 0 (the probability of class index 0) before export.
# NOTE(review): presumably label 0 never occurs in the data — confirm.
val_probability = val_probability.drop(columns=[0])

In [None]:
# Save the out-of-fold probabilities without a header row or index column.
val_probability.to_csv(r'../processed/val_probability_15112.csv',header=None,index=False)

# 测试集

In [None]:
import os

In [None]:
# Filename prefix of the saved per-fold models; the fold index and '.h5' are appended when loading.
model_file = r'../model/model15112_NN_'

In [None]:
# Load the saved sparse test feature matrix and store it as float32 CSR.
test_matrix_path = r'../trainTestData/testData15112.npz'
csr_testData = sparse.csr_matrix(sparse.load_npz(test_matrix_path), dtype=np.float32)
csr_testData.shape

In [None]:
# Read only the first column of the headerless test file.
age_test = pd.read_csv(r'../data/age_test.csv',header=None,usecols=[0])

In [None]:
printTime()
# Sum the test-set predictions of the 10 saved fold models; the sum is averaged in a later cell.
proflag = True   # True until the first model's predictions initialise the running sum
model_Num = 0    # number of models accumulated so far
for fold_id in range(10):
    model = load_model(model_file + str(fold_id) + '.h5')
    fold_prob = model.predict(csr_testData,batch_size=1024,verbose=1)
    if proflag:
        probability = fold_prob
        proflag = False
    else:
        probability += fold_prob
    model_Num += 1
    print(model_Num)
    # Free the backend session and the model before loading the next one.
    K.clear_session()
    del model
printTime()

In [None]:
# Display how many fold models contributed to the summed test probabilities.
model_Num

In [None]:
# Turn the summed predictions into the ensemble average.
# NOTE: the division is in place, so re-running this cell without re-running the
# prediction loop divides the probabilities a second time.
probability /= model_Num
# Predicted class index = column with the highest averaged probability.
age = np.argmax(probability,axis=1)

In [None]:
# Read the first column of the headerless test file straight into a NumPy array.
age_test = pd.read_csv(r'../data/age_test.csv',header=None,usecols=[0]).values
type(age_test)

In [None]:
print(probability.shape)
# Prepend the age_test column to the averaged probabilities.
pro = np.column_stack((age_test,probability))
pro = pd.DataFrame(pro)
# Drop the first two stacked columns before export.
# NOTE(review): presumably column 0 is the id read from age_test.csv and column 1 is the
# probability of class index 0 (column 0 is likewise dropped from the validation file) — confirm.
pro.drop(labels=[0,1],axis=1,inplace=True)
print(pro.shape)
# Write the remaining probability columns without a header row or index column.
pro.to_csv(r'../processed/test_probability_15112.csv',index=False,header=False)