In [1]:
# Load the training features, training labels and test features from disk (normalization and the train/validation split happen in later cells)
import numpy as np

np.random.seed(0)

x_train_fpath = '../data/X_train'
y_train_fpath = '../data/Y_train'
x_test_fpath  = '../data/X_test'


def _read_columns(path, columns):
    '''
    Read a comma-separated text file into a float array.

    The first line is skipped (per the original notes it holds the feature
    names), and `columns` is applied to every remaining row after splitting on
    ','. An integer picks a single field (giving a 1-D array); a slice picks a
    range of fields (giving a 2-D array).
    '''
    with open(path) as f:
        next(f)  # skip the header line
        return np.array([line.strip('\n').split(',')[columns] for line in f], dtype = float)


# Field 0 of every row is the id, so features are taken from field 1 onwards
# and the label is field 1 itself.
x_train = _read_columns(x_train_fpath, slice(1, None))
y_train = _read_columns(y_train_fpath, 1)
x_test = _read_columns(x_test_fpath, slice(1, None))

print('x_train :\n',x_train,x_train.shape,'\n')
print('y_train :\n',y_train,y_train.shape,'\n')
print('x_test :\n',x_test,x_test.shape)

x_train :
 [[33.  1.  0. ... 52.  0.  1.]
 [63.  1.  0. ... 52.  0.  1.]
 [71.  0.  0. ...  0.  0.  1.]
 ...
 [16.  0.  0. ...  8.  1.  0.]
 [48.  1.  0. ... 52.  0.  1.]
 [48.  0.  0. ...  0.  0.  1.]] (54256, 510) 

y_train :
 [1. 0. 0. ... 0. 0. 0.] (54256,) 

x_test :
 [[37.  1.  0. ... 52.  0.  1.]
 [48.  1.  0. ... 52.  0.  1.]
 [68.  0.  0. ...  0.  1.  0.]
 ...
 [38.  1.  0. ... 52.  0.  1.]
 [17.  0.  0. ... 40.  1.  0.]
 [22.  0.  0. ... 25.  1.  0.]] (27622, 510)


In [2]:
def _normalize(x, train = True, specified_column = None, x_mean = None, x_std = None):
    '''
    This function normalizes specific columns of x
    注意，testing data要跟training data的normalize方式一致，要用training data的mean和std，
    因此还需要input已知的x_mean和x_std
    '''
    # 如果没有指定列，那就穷举所有列，这里np.arange类似于range函数，只不过前者创造的对象是array类型
    if specified_column == None:
        specified_column = np.arange(x.shape[1])
    
    # train=True: for training data; train=False: for testing data，只计算training data的mean和std
    if train:
        # axis=0，对指定列求mean，注意np.mean返回的是一个列向量，因此需要用reshape(1,-1)转化成行向量
        x_mean = np.mean(x[:, specified_column], axis = 0).reshape(1, -1)
        # axis=0，对指定列求std
        x_std  = np.std(x[:, specified_column], axis = 0).reshape(1, -1)
     
    # 对指定列进行normalize，注意相减的两个向量行数不同但列数相同，相当于前者的每一行都减去x_mean这个行向量，除法同理
    # 分母加一个很小很小的数是为了避免标准差为0
    x[:, specified_column] = (x[:, specified_column] - x_mean) / (x_std + 1e-8)
    
    return x, x_mean, x_std

In [3]:
def _train_split(x, y, validation_ratio = 0.25):
    '''
    This function splits data into training set and validation set
    '''
    train_size = int(len(x) * (1 - validation_ratio))
    
    #return x,y of training set and validation set  
    # 如果返回值为x[:train_size, :]的话会报错，但这两种形式本质上是一样的，存疑
    return x[:train_size], y[:train_size], x[train_size:], y[train_size:]

In [4]:
# Standardize the training data, then apply the training mean/std to the test data.
# NOTE: _normalize writes into the array it is given and x_train is rebound to
# that same array, so running this cell a second time normalizes the already
# normalized data again.
x_train, x_mean, x_std = _normalize(x_train, train = True)
x_test, _, _ = _normalize(x_test, train = False, x_mean = x_mean, x_std = x_std)

# Hold out the last 10% of the training rows as a validation set.
x_training_set, y_training_set, x_validation_set, y_validation_set = _train_split(x_train, y_train, validation_ratio = 0.1)

# Print each subset's shape and contents, with a divider line between subsets.
divider = '------------------------------------------------------------------------'
for position, (label, subset) in enumerate([
    ('x_training_set : ', x_training_set),
    ('y_training_set : ', y_training_set),
    ('x_validation_set : ', x_validation_set),
    ('y_validation_set : ', y_validation_set),
]):
    if position > 0:
        print(divider)
    print(label, subset.shape, '\n', subset)

x_training_set :  (48830, 510) 
 [[-0.42755297  0.99959458 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]
 [ 1.19978056  0.99959458 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]
 [ 1.63373617 -1.00040555 -0.1822401  ... -1.45536172 -1.01485522
   1.01485522]
 ...
 [ 0.65733605 -1.00040555 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]
 [ 0.27762489  0.99959458 -0.1822401  ...  0.28450104 -1.01485522
   1.01485522]
 [ 0.16913599 -1.00040555 -0.1822401  ...  0.80645987  0.98536219
  -0.98536219]]
------------------------------------------------------------------------
y_training_set :  (48830,) 
 [1. 0. 0. ... 1. 0. 0.]
------------------------------------------------------------------------
x_validation_set :  (5426, 510) 
 [[-0.48179742  0.99959458 -0.1822401  ...  0.80645987  0.98536219
  -0.98536219]
 [-1.24121974 -1.00040555  5.48726602 ...  0.80645987 -1.01485522
   1.01485522]
 [-0.04784181  0.99959458 -0.1822401  ...  0.80645987 -1.01485522
   1.01485522]

In [None]:
# Logistic regression: fit on the training set and report accuracy on the training and validation sets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# fit() returns the estimator itself, so construction and fitting can be chained.
lg = LogisticRegression(C = 1.0).fit(x_training_set, y_training_set)

# Predicted labels for both subsets. They are not used by the accuracy lines
# below (score() predicts internally) but are kept available under these names.
y_training_predict = lg.predict(x_training_set)
y_validation_predict = lg.predict(x_validation_set)

# Mean accuracy on the data the model was fitted on, then on the held-out data.
print('training_set  Acc：', lg.score(x_training_set, y_training_set))
print('validation_set Acc：', lg.score(x_validation_set, y_validation_set))