In [1]:
# 下载资料并做normalize，切为Training set和validation set
import numpy as np

np.random.seed(0)

x_train_fpath = './data/X_train'
y_train_fpath = './data/Y_train'
x_test_fpath  = './data/X_test'
output_fpath  = './output_{}.csv'

# 第一行是feature的名称，所以先执行next(f)跳过第一行的内容；第一个dimension是id，feature[1:]从第二个dimension开始读取
with open(x_train_fpath) as f:
    next(f)
    x_train = np.array([line.strip('\n').split(',')[1:]  for line in f], dtype = float)

with open(y_train_fpath) as f:
    next(f)
    y_train = np.array([line.strip('\n').split(',')[1]  for line in f], dtype = float)
    
with open(x_test_fpath) as f:
    next(f)
    x_test = np.array([line.strip('\n').split(',')[1:]   for line in f], dtype = float)
    
print('x_train :\n',x_train,x_train.shape,'\n')
print('y_train :\n',y_train,y_train.shape,'\n')
print('x_test :\n',x_test,x_test.shape)

x_train :
 [[33.  1.  0. ... 52.  0.  1.]
 [63.  1.  0. ... 52.  0.  1.]
 [71.  0.  0. ...  0.  0.  1.]
 ...
 [16.  0.  0. ...  8.  1.  0.]
 [48.  1.  0. ... 52.  0.  1.]
 [48.  0.  0. ...  0.  0.  1.]] (54256, 510) 

y_train :
 [1. 0. 0. ... 0. 0. 0.] (54256,) 

x_test :
 [[37.  1.  0. ... 52.  0.  1.]
 [48.  1.  0. ... 52.  0.  1.]
 [68.  0.  0. ...  0.  1.  0.]
 ...
 [38.  1.  0. ... 52.  0.  1.]
 [17.  0.  0. ... 40.  1.  0.]
 [22.  0.  0. ... 25.  1.  0.]] (27622, 510)


In [2]:
def _normalize(x, train = True, specified_column = None, x_mean = None, x_std = None):
    '''
    This function normalizes specific columns of x
    注意，testing data要跟training data的normalize方式一致，要用training data的mean和std，
    因此还需要input已知的x_mean和x_std
    '''
    # 如果没有指定列，那就穷举所有列，这里np.arange类似于range函数，只不过前者创造的对象是array类型
    if specified_column == None:
        specified_column = np.arange(x.shape[1])
    
    # train=True: for training data; train=False: for testing data，只计算training data的mean和std
    if train:
        # axis=0，对指定列求mean，注意np.mean返回的是一个列向量，因此需要用reshape(1,-1)转化成行向量
        x_mean = np.mean(x[:, specified_column], axis = 0).reshape(1, -1)
        # axis=0，对指定列求std
        x_std  = np.std(x[:, specified_column], axis = 0).reshape(1, -1)
     
    # 对指定列进行normalize，注意相减的两个向量行数不同但列数相同，相当于前者的每一行都减去x_mean这个行向量，除法同理
    # 分母加一个很小很小的数是为了避免标准差为0
    x[:, specified_column] = (x[:, specified_column] - x_mean) / (x_std + 1e-8)
    
    return x, x_mean, x_std

In [3]:
def _train_split(x, y, validation_ratio = 0.25):
    '''
    This function splits data into training set and validation set
    '''
    train_size = int(len(x) * (1 - validation_ratio))
    
    #return x,y of training set and validation set  
    # 如果返回值为x[:train_size, :]的话会报错，但这两种形式本质上是一样的，存疑
    return x[:train_size], y[:train_size], x[train_size:], y[train_size:]

In [4]:
# normalize training data and testing data
x_train, x_mean, x_std = _normalize(x_train, train = True)
x_test, _, _ = _normalize(x_test, train = False, x_mean = x_mean, x_std = x_std)

# split training data into training set and validation set
x_training_set, y_training_set, x_validation_set, y_validation_set = _train_split(x_train, y_train, validation_ratio = 0.1)

print('x_training_set : ', x_training_set.shape, '\n', x_training_set)
print('------------------------------------------------------------------------')
print('y_training_set : ', y_training_set.shape, '\n', y_training_set)
print('------------------------------------------------------------------------')
print('x_validation_set : ', x_validation_set.shape, '\n', x_validation_set)
print('------------------------------------------------------------------------')
print('y_validation_set : ', y_validation_set.shape, '\n', y_validation_set)

x_training_set :  (48830, 510) 
 [[-0.42755297  0.99959458 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]
 [ 1.19978056  0.99959458 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]
 [ 1.63373617 -1.00040555 -0.1822401  ... -1.45536172 -1.01485522
   1.01485522]
 ...
 [ 0.65733605 -1.00040555 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]
 [ 0.27762489  0.99959458 -0.1822401  ...  0.28450104 -1.01485522
   1.01485522]
 [ 0.16913599 -1.00040555 -0.1822401  ...  0.80645987  0.98536219
  -0.98536219]]
------------------------------------------------------------------------
y_training_set :  (48830,) 
 [1. 0. 0. ... 1. 0. 0.]
------------------------------------------------------------------------
x_validation_set :  (5426, 510) 
 [[-0.48179742  0.99959458 -0.1822401  ...  0.80645987  0.98536219
  -0.98536219]
 [-1.24121974 -1.00040555  5.48726602 ...  0.80645987 -1.01485522
   1.01485522]
 [-0.04784181  0.99959458 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]

In [5]:
from keras.utils import to_categorical

y_training_set = to_categorical(y_training_set)
y_validation_set = to_categorical(y_validation_set)
print(y_training_set, '\n\n', y_validation_set)

Using TensorFlow backend.


[[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]] 

 [[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [11]:
# keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.optimizers import SGD, Adam
from keras import regularizers

model = Sequential()
model.add(Dense(input_dim = len(x_training_set[0]), units = 50, activation = 'relu', kernel_regularizer=regularizers.l2(0.0001)))
model.add(Dropout(0.5))
# model.add(Dense(units = 250, activation = 'relu')) # activity_regularizer=regularizers.l1(0.0001)
# model.add(Dropout(0.5))
model.add(Dense(units = 50, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(units = 2, activation = 'softmax'))
model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer = 'adagrad', metrics = ['accuracy'])
model.fit(x_training_set, y_training_set, batch_size = 100, epochs = 20)

predict_training_set = model.evaluate(x_training_set, y_training_set)
predict_validation_set = model.evaluate(x_validation_set, y_validation_set)

print('Training set Acc :', predict_training_set[1])
print('Validation set Acc :', predict_validation_set[1])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 50)                25550     
_________________________________________________________________
dropout_5 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_6 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 102       
Total params: 28,202
Trainable params: 28,202
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch

In [7]:
y_train = to_categorical(y_train, 2)

In [8]:
model = Sequential()
model.add(Dense(input_dim = len(x_train[0]), units = 50, activation = 'relu', kernel_regularizer = regularizers.l2(0.0001)))
model.add(Dropout(0.5))
model.add(Dense(units = 50, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(units = 2, activation = 'softmax'))
model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer = 'adagrad', metrics = ['accuracy'])
model.fit(x_train, y_train, batch_size = 100, epochs = 20)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 50)                25550     
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 102       
Total params: 28,202
Trainable params: 28,202
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch

<keras.callbacks.History at 0x7fe32ea539e8>

In [9]:
y_test_predict = np.round(model.predict(x_test)[:, 1]).astype(int)
y_test_predict

array([0, 1, 0, ..., 1, 0, 0])

In [10]:
import csv
with open('predict_keras.csv', mode = 'w', newline = '') as f:
    csv_writer = csv.writer(f)
    header = ['id', 'label']
    print(header)
    csv_writer.writerow(header)
    for i in range(y_test_predict.shape[0]):
        row = [str(i), y_test_predict[i]]
        print(row)
        csv_writer.writerow(row)


['id', 'label']
['0', 0]
['1', 1]
['2', 0]
['3', 0]
['4', 0]
['5', 1]
['6', 0]
['7', 1]
['8', 0]
['9', 0]
['10', 0]
['11', 0]
['12', 1]
['13', 0]
['14', 0]
['15', 0]
['16', 0]
['17', 0]
['18', 0]
['19', 0]
['20', 1]
['21', 0]
['22', 0]
['23', 0]
['24', 0]
['25', 0]
['26', 0]
['27', 0]
['28', 0]
['29', 0]
['30', 0]
['31', 0]
['32', 0]
['33', 0]
['34', 0]
['35', 0]
['36', 0]
['37', 0]
['38', 1]
['39', 0]
['40', 0]
['41', 0]
['42', 1]
['43', 0]
['44', 0]
['45', 0]
['46', 0]
['47', 0]
['48', 0]
['49', 0]
['50', 0]
['51', 0]
['52', 0]
['53', 0]
['54', 0]
['55', 0]
['56', 0]
['57', 0]
['58', 0]
['59', 0]
['60', 0]
['61', 0]
['62', 0]
['63', 0]
['64', 0]
['65', 0]
['66', 0]
['67', 0]
['68', 1]
['69', 0]
['70', 0]
['71', 0]
['72', 0]
['73', 0]
['74', 0]
['75', 0]
['76', 0]
['77', 1]
['78', 1]
['79', 0]
['80', 0]
['81', 0]
['82', 0]
['83', 0]
['84', 0]
['85', 0]
['86', 0]
['87', 0]
['88', 0]
['89', 0]
['90', 0]
['91', 0]
['92', 0]
['93', 0]
['94', 1]
['95', 0]
['96', 0]
['97', 0]
['98', 1]
['99

['3185', 0]
['3186', 0]
['3187', 0]
['3188', 0]
['3189', 0]
['3190', 0]
['3191', 0]
['3192', 0]
['3193', 0]
['3194', 0]
['3195', 0]
['3196', 0]
['3197', 0]
['3198', 1]
['3199', 0]
['3200', 0]
['3201', 1]
['3202', 0]
['3203', 0]
['3204', 0]
['3205', 0]
['3206', 1]
['3207', 0]
['3208', 1]
['3209', 0]
['3210', 0]
['3211', 0]
['3212', 0]
['3213', 0]
['3214', 1]
['3215', 0]
['3216', 0]
['3217', 0]
['3218', 0]
['3219', 0]
['3220', 1]
['3221', 0]
['3222', 0]
['3223', 0]
['3224', 0]
['3225', 0]
['3226', 1]
['3227', 0]
['3228', 0]
['3229', 0]
['3230', 0]
['3231', 0]
['3232', 0]
['3233', 0]
['3234', 0]
['3235', 0]
['3236', 0]
['3237', 0]
['3238', 0]
['3239', 0]
['3240', 0]
['3241', 0]
['3242', 0]
['3243', 1]
['3244', 1]
['3245', 0]
['3246', 0]
['3247', 0]
['3248', 0]
['3249', 0]
['3250', 0]
['3251', 1]
['3252', 0]
['3253', 0]
['3254', 0]
['3255', 0]
['3256', 0]
['3257', 1]
['3258', 0]
['3259', 0]
['3260', 0]
['3261', 0]
['3262', 0]
['3263', 0]
['3264', 0]
['3265', 0]
['3266', 0]
['3267', 0]
['32

['5876', 1]
['5877', 0]
['5878', 0]
['5879', 0]
['5880', 0]
['5881', 1]
['5882', 0]
['5883', 0]
['5884', 0]
['5885', 0]
['5886', 0]
['5887', 0]
['5888', 1]
['5889', 0]
['5890', 0]
['5891', 1]
['5892', 0]
['5893', 0]
['5894', 0]
['5895', 1]
['5896', 0]
['5897', 0]
['5898', 0]
['5899', 0]
['5900', 0]
['5901', 0]
['5902', 0]
['5903', 0]
['5904', 0]
['5905', 1]
['5906', 0]
['5907', 0]
['5908', 0]
['5909', 1]
['5910', 0]
['5911', 0]
['5912', 0]
['5913', 0]
['5914', 0]
['5915', 1]
['5916', 0]
['5917', 0]
['5918', 0]
['5919', 0]
['5920', 1]
['5921', 0]
['5922', 0]
['5923', 0]
['5924', 0]
['5925', 0]
['5926', 0]
['5927', 0]
['5928', 1]
['5929', 0]
['5930', 0]
['5931', 0]
['5932', 0]
['5933', 0]
['5934', 0]
['5935', 0]
['5936', 0]
['5937', 0]
['5938', 1]
['5939', 0]
['5940', 0]
['5941', 1]
['5942', 0]
['5943', 0]
['5944', 0]
['5945', 0]
['5946', 0]
['5947', 0]
['5948', 0]
['5949', 0]
['5950', 1]
['5951', 0]
['5952', 1]
['5953', 0]
['5954', 0]
['5955', 0]
['5956', 0]
['5957', 0]
['5958', 0]
['59

['9328', 0]
['9329', 0]
['9330', 0]
['9331', 1]
['9332', 0]
['9333', 0]
['9334', 0]
['9335', 1]
['9336', 0]
['9337', 0]
['9338', 0]
['9339', 1]
['9340', 0]
['9341', 0]
['9342', 0]
['9343', 1]
['9344', 0]
['9345', 0]
['9346', 0]
['9347', 0]
['9348', 0]
['9349', 0]
['9350', 1]
['9351', 0]
['9352', 0]
['9353', 0]
['9354', 0]
['9355', 0]
['9356', 0]
['9357', 0]
['9358', 1]
['9359', 1]
['9360', 0]
['9361', 0]
['9362', 1]
['9363', 0]
['9364', 0]
['9365', 0]
['9366', 0]
['9367', 0]
['9368', 0]
['9369', 0]
['9370', 1]
['9371', 0]
['9372', 0]
['9373', 0]
['9374', 0]
['9375', 0]
['9376', 0]
['9377', 0]
['9378', 0]
['9379', 0]
['9380', 0]
['9381', 0]
['9382', 0]
['9383', 0]
['9384', 0]
['9385', 0]
['9386', 0]
['9387', 0]
['9388', 0]
['9389', 0]
['9390', 0]
['9391', 0]
['9392', 0]
['9393', 0]
['9394', 0]
['9395', 1]
['9396', 1]
['9397', 0]
['9398', 0]
['9399', 0]
['9400', 0]
['9401', 0]
['9402', 1]
['9403', 0]
['9404', 1]
['9405', 0]
['9406', 0]
['9407', 0]
['9408', 0]
['9409', 1]
['9410', 0]
['94

['12515', 0]
['12516', 0]
['12517', 0]
['12518', 0]
['12519', 0]
['12520', 1]
['12521', 0]
['12522', 0]
['12523', 1]
['12524', 0]
['12525', 0]
['12526', 0]
['12527', 1]
['12528', 0]
['12529', 0]
['12530', 0]
['12531', 0]
['12532', 1]
['12533', 0]
['12534', 0]
['12535', 0]
['12536', 0]
['12537', 0]
['12538', 0]
['12539', 0]
['12540', 0]
['12541', 0]
['12542', 0]
['12543', 0]
['12544', 0]
['12545', 1]
['12546', 0]
['12547', 0]
['12548', 0]
['12549', 0]
['12550', 0]
['12551', 0]
['12552', 1]
['12553', 0]
['12554', 0]
['12555', 1]
['12556', 0]
['12557', 0]
['12558', 0]
['12559', 0]
['12560', 0]
['12561', 0]
['12562', 0]
['12563', 0]
['12564', 1]
['12565', 0]
['12566', 0]
['12567', 0]
['12568', 0]
['12569', 0]
['12570', 0]
['12571', 0]
['12572', 0]
['12573', 1]
['12574', 0]
['12575', 0]
['12576', 0]
['12577', 1]
['12578', 0]
['12579', 0]
['12580', 0]
['12581', 1]
['12582', 0]
['12583', 0]
['12584', 0]
['12585', 0]
['12586', 0]
['12587', 0]
['12588', 1]
['12589', 0]
['12590', 0]
['12591', 0]

['15600', 0]
['15601', 0]
['15602', 0]
['15603', 0]
['15604', 0]
['15605', 0]
['15606', 1]
['15607', 0]
['15608', 0]
['15609', 0]
['15610', 0]
['15611', 1]
['15612', 0]
['15613', 0]
['15614', 0]
['15615', 1]
['15616', 0]
['15617', 0]
['15618', 0]
['15619', 0]
['15620', 0]
['15621', 0]
['15622', 0]
['15623', 0]
['15624', 1]
['15625', 0]
['15626', 0]
['15627', 0]
['15628', 0]
['15629', 0]
['15630', 0]
['15631', 0]
['15632', 0]
['15633', 0]
['15634', 0]
['15635', 1]
['15636', 0]
['15637', 0]
['15638', 0]
['15639', 0]
['15640', 0]
['15641', 1]
['15642', 0]
['15643', 0]
['15644', 0]
['15645', 0]
['15646', 0]
['15647', 0]
['15648', 1]
['15649', 0]
['15650', 1]
['15651', 0]
['15652', 0]
['15653', 1]
['15654', 0]
['15655', 0]
['15656', 1]
['15657', 0]
['15658', 1]
['15659', 0]
['15660', 0]
['15661', 0]
['15662', 0]
['15663', 0]
['15664', 0]
['15665', 0]
['15666', 0]
['15667', 1]
['15668', 0]
['15669', 0]
['15670', 0]
['15671', 0]
['15672', 0]
['15673', 0]
['15674', 0]
['15675', 0]
['15676', 0]

['18978', 0]
['18979', 0]
['18980', 0]
['18981', 0]
['18982', 0]
['18983', 0]
['18984', 1]
['18985', 0]
['18986', 0]
['18987', 0]
['18988', 0]
['18989', 0]
['18990', 0]
['18991', 0]
['18992', 0]
['18993', 0]
['18994', 0]
['18995', 0]
['18996', 0]
['18997', 0]
['18998', 0]
['18999', 0]
['19000', 0]
['19001', 1]
['19002', 0]
['19003', 0]
['19004', 0]
['19005', 0]
['19006', 0]
['19007', 0]
['19008', 0]
['19009', 0]
['19010', 1]
['19011', 0]
['19012', 0]
['19013', 0]
['19014', 0]
['19015', 0]
['19016', 0]
['19017', 0]
['19018', 0]
['19019', 1]
['19020', 0]
['19021', 0]
['19022', 1]
['19023', 0]
['19024', 0]
['19025', 0]
['19026', 1]
['19027', 0]
['19028', 0]
['19029', 0]
['19030', 0]
['19031', 0]
['19032', 0]
['19033', 0]
['19034', 1]
['19035', 0]
['19036', 1]
['19037', 0]
['19038', 0]
['19039', 0]
['19040', 0]
['19041', 0]
['19042', 0]
['19043', 0]
['19044', 0]
['19045', 0]
['19046', 0]
['19047', 0]
['19048', 0]
['19049', 0]
['19050', 0]
['19051', 0]
['19052', 1]
['19053', 0]
['19054', 0]

['22638', 0]
['22639', 0]
['22640', 0]
['22641', 0]
['22642', 0]
['22643', 0]
['22644', 0]
['22645', 1]
['22646', 0]
['22647', 1]
['22648', 0]
['22649', 0]
['22650', 1]
['22651', 0]
['22652', 0]
['22653', 0]
['22654', 0]
['22655', 0]
['22656', 0]
['22657', 0]
['22658', 0]
['22659', 0]
['22660', 1]
['22661', 0]
['22662', 0]
['22663', 0]
['22664', 0]
['22665', 1]
['22666', 0]
['22667', 0]
['22668', 0]
['22669', 0]
['22670', 0]
['22671', 0]
['22672', 0]
['22673', 0]
['22674', 1]
['22675', 0]
['22676', 1]
['22677', 0]
['22678', 0]
['22679', 1]
['22680', 0]
['22681', 0]
['22682', 0]
['22683', 0]
['22684', 0]
['22685', 0]
['22686', 0]
['22687', 0]
['22688', 0]
['22689', 1]
['22690', 0]
['22691', 0]
['22692', 0]
['22693', 0]
['22694', 0]
['22695', 0]
['22696', 0]
['22697', 0]
['22698', 1]
['22699', 0]
['22700', 0]
['22701', 0]
['22702', 1]
['22703', 0]
['22704', 1]
['22705', 0]
['22706', 1]
['22707', 0]
['22708', 0]
['22709', 0]
['22710', 0]
['22711', 0]
['22712', 1]
['22713', 0]
['22714', 0]

['25966', 0]
['25967', 0]
['25968', 0]
['25969', 0]
['25970', 0]
['25971', 0]
['25972', 0]
['25973', 0]
['25974', 0]
['25975', 0]
['25976', 1]
['25977', 0]
['25978', 1]
['25979', 0]
['25980', 0]
['25981', 0]
['25982', 0]
['25983', 1]
['25984', 0]
['25985', 0]
['25986', 0]
['25987', 0]
['25988', 0]
['25989', 0]
['25990', 0]
['25991', 0]
['25992', 0]
['25993', 0]
['25994', 0]
['25995', 1]
['25996', 0]
['25997', 0]
['25998', 0]
['25999', 1]
['26000', 0]
['26001', 0]
['26002', 0]
['26003', 0]
['26004', 0]
['26005', 0]
['26006', 0]
['26007', 0]
['26008', 1]
['26009', 0]
['26010', 0]
['26011', 1]
['26012', 0]
['26013', 0]
['26014', 0]
['26015', 1]
['26016', 1]
['26017', 1]
['26018', 0]
['26019', 0]
['26020', 0]
['26021', 0]
['26022', 0]
['26023', 0]
['26024', 0]
['26025', 0]
['26026', 0]
['26027', 0]
['26028', 0]
['26029', 1]
['26030', 0]
['26031', 0]
['26032', 0]
['26033', 0]
['26034', 0]
['26035', 0]
['26036', 0]
['26037', 1]
['26038', 0]
['26039', 0]
['26040', 0]
['26041', 0]
['26042', 0]