In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
x_train_path = '../data/X_train'
y_train_path = '../data/Y_train'
x_test_path  = '../data/X_test'

x_train = pd.read_csv(x_train_path)
y_train = pd.read_csv(y_train_path)
x_test  = pd.read_csv(x_test_path)

x_train.head()

Unnamed: 0,id,age,Private,Self-employed-incorporated,State government,Self-employed-not incorporated,Not in universe,Without pay,Federal government,Never worked,...,1.2,Not in universe.12,Yes.3,No.3,2.3,0.3,1.3,weeks worked in year,94,95
0,0,33,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,52,0,1
1,1,63,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,52,0,1
2,2,71,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,1
3,3,43,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,52,0,1
4,4,57,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,52,0,1


In [3]:
x_train = np.array(x_train)[:, 1:]
y_train = (np.array(y_train)[:, 1:]).flatten()
x_test  = np.array(x_test)[:, 1:]

# normalize
scaler = StandardScaler().fit(x_train)
# print(scaler.mean, scaler.std)
x_train = scaler.transform(x_train)
x_test  = scaler.transform(x_test)

print('x_train :\n',x_train,x_train.shape,'\n')
print('y_train :\n',y_train,y_train.shape,'\n')
print('x_test :\n',x_test,x_test.shape)

x_train :
 [[-0.42755297  0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [ 1.19978057  0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [ 1.63373618 -1.00040557 -0.18224011 ... -1.45536172 -1.01485524
   1.01485524]
 ...
 [-1.34970865 -1.00040557 -0.18224011 ... -1.10738917  0.98536221
  -0.98536221]
 [ 0.3861138   0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [ 0.3861138  -1.00040557 -0.18224011 ... -1.45536172 -1.01485524
   1.01485524]] (54256, 510) 

y_train :
 [1 0 0 ... 0 0 0] (54256,) 

x_test :
 [[-0.21057517  0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [ 0.3861138   0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [ 1.47100282 -1.00040557 -0.18224011 ... -1.45536172  0.98536221
  -0.98536221]
 ...
 [-0.15633072  0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [-1.29546419 -1.00040557 -0.18224011 ...  0.28450104  0.98536221
  -0.98536221]
 [-1.02424194 -1.00040

In [4]:
# 切分validation set
x_training_set, x_validation_set, y_training_set, y_validation_set = train_test_split(x_train, y_train, test_size = 0.1)

print('x_training_set : ', x_training_set.shape, '\n', x_training_set)
print('------------------------------------------------------------------------')
print('y_training_set : ', y_training_set.shape, '\n', y_training_set)
print('------------------------------------------------------------------------')
print('x_validation_set : ', x_validation_set.shape, '\n', x_validation_set)
print('------------------------------------------------------------------------')
print('y_validation_set : ', y_validation_set.shape, '\n', y_validation_set)

x_training_set :  (48830, 510) 
 [[ 0.6030916  -1.00040557 -0.18224011 ...  0.80645987  0.98536221
  -0.98536221]
 [ 0.6030916   0.9995946  -0.18224011 ...  0.80645987  0.98536221
  -0.98536221]
 [-0.80726413 -1.00040557 -0.18224011 ... -0.75941661 -1.01485524
   1.01485524]
 ...
 [ 0.44035825  0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [ 1.52524727  0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [-0.80726413 -1.00040557 -0.18224011 ... -1.45536172 -1.01485524
   1.01485524]]
------------------------------------------------------------------------
y_training_set :  (48830,) 
 [0 1 0 ... 0 1 0]
------------------------------------------------------------------------
x_validation_set :  (5426, 510) 
 [[-0.80726413  0.9995946  -0.18224011 ...  0.80645987 -1.01485524
   1.01485524]
 [ 0.00640264  0.9995946  -0.18224011 ... -0.32445093  0.98536221
  -0.98536221]
 [-0.64453078  0.9995946  -0.18224011 ...  0.45848731  0.98536221
  -0.98536221]
 ...


In [8]:
best_score = 0.0
for C in [0.01, 0.1, 1, 10, 100]:
    lr_clf = LogisticRegression(C = C)
    lr_clf.fit(x_training_set, y_training_set)
    score_training = lr_clf.score(x_training_set, y_training_set)
    score_validation = lr_clf.score(x_validation_set, y_validation_set)
    print('score_training_set: ', score_training, '\t', 'score_validation_set: ', score_validation)
    if score_validation > best_score:
        best_score = score_validation
        best_parameters = C

score_training_set:  0.8853573622772886 	 score_validation_set:  0.881865093991891


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


score_training_set:  0.8857259881220562 	 score_validation_set:  0.8822336896424622


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


score_training_set:  0.8858283841900471 	 score_validation_set:  0.8824179874677479


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


score_training_set:  0.8858283841900471 	 score_validation_set:  0.8826022852930335
score_training_set:  0.8858283841900471 	 score_validation_set:  0.8826022852930335


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
lr = LogisticRegression(C = best_parameters)
lr.fit(x_train, y_train)
score = lr.score(x_train, y_train)
print('score_train: ',score)
y_test_predict = lr.predict(x_test)
print('y_test_predict:\n',y_test_predict)

score_train:  0.8857269242111472
y_test_predict:
 [0 0 0 ... 1 0 0]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
