In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three input files from the working directory.
# The 'index' column of train.csv is renamed to 'id' right after reading.
train = pd.read_csv('train.csv').rename(columns={'index': 'id'})
test = pd.read_csv('test.csv')
# Column names are supplied explicitly for the sample submission file.
sample_submit = pd.read_csv('sample_submit.csv', names=['id', 'Y'])

In [3]:
# Stack train on top of test so both parts share one category -> code mapping.
data = pd.concat([train, test], sort=False)

# For now, simply integer-encode every categorical feature.
categorical_column = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for column in categorical_column:
    # Codes are assigned in order of first appearance within the column.
    code_by_category = {
        category: code
        for code, category in enumerate(data[column].unique())
    }
    data[column] = data[column].map(code_by_category)
data

Unnamed: 0,id,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,native-country,Y
0,0,22,0,132618,0,12,0,0,0,0,0,0,0.0
1,1,22,0,132655,1,9,1,1,1,0,1,1,1.0
2,2,23,1,132674,2,8,0,2,2,0,0,0,0.0
3,3,36,0,132642,1,13,1,3,1,0,1,0,1.0
4,4,49,0,132646,3,13,1,4,1,0,1,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6795,16995,31,0,132530,0,9,2,8,2,0,0,0,
6796,16996,31,0,132634,1,6,0,12,2,1,1,0,
6797,16997,31,0,132598,2,13,4,1,2,0,1,0,
6798,16998,23,0,132573,0,8,2,10,2,2,0,0,


In [4]:
# Remove the row identifier so it is not used as a feature.
# The frame is re-bound rather than mutated in place, and errors='ignore'
# keeps the cell re-runnable: the original in-place drop raised KeyError on
# a second execution because 'id' was already gone.
data = data.drop(columns='id', errors='ignore')

# `data` was built by concatenating train then test, so the first len(train)
# rows are the labelled rows and the remainder are the rows to predict.
n_train = len(train)
train = data[:n_train]
test = data[n_train:]

# Separate the target column 'Y' from the feature columns.
y_train = train['Y']
X_train = train.drop('Y', axis=1)
X_test = test.drop('Y', axis=1)
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,native-country
0,22,0,132618,0,12,0,0,0,0,0,0
1,22,0,132655,1,9,1,1,1,0,1,1
2,23,1,132674,2,8,0,2,2,0,0,0
3,36,0,132642,1,13,1,3,1,0,1,0
4,49,0,132646,3,13,1,4,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
10195,31,0,132650,1,9,2,9,5,0,0,1
10196,36,0,132590,2,9,1,4,3,0,1,1
10197,36,2,132498,1,13,1,10,1,2,1,0
10198,17,0,132614,0,9,0,10,2,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

# Split from the full labelled frame `train` instead of from X_train/y_train.
# This cell overwrites X_train/y_train with the 70% training part, so feeding
# those names back in made every re-run split the already-reduced set again.
# On a first run the inputs below are identical to the previous
# X_train/y_train (train without 'Y', and train['Y']).
features_all = train.drop('Y', axis=1)
labels_all = train['Y']

# 70/30 hold-out split, stratified on the label, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    features_all, labels_all,
    test_size=0.3, random_state=0, stratify=labels_all)

In [6]:
import lightgbm as lgb

# Wrap the training and validation parts as LightGBM datasets; the validation
# set references the training set, as in the original cell.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    'objective': 'binary'
}

# Early stopping and periodic logging are passed as callbacks. The
# `early_stopping_rounds=` and `verbose_eval=` keyword arguments of
# lgb.train() were deprecated in LightGBM 3.3 and removed in 4.0, so the
# keyword form fails on current releases. Requires LightGBM >= 3.3.
model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  num_boost_round=100000,
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=1000),
                      lgb.log_evaluation(period=10),
                  ])

# Predict on the validation features using the best iteration recorded on the model.
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)



[LightGBM] [Info] Number of positive: 1729, number of negative: 5411
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 368
[LightGBM] [Info] Number of data points in the train set: 7140, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242157 -> initscore=-1.140891
[LightGBM] [Info] Start training from score -1.140891
Training until validation scores don't improve for 1000 rounds
[10]	training's binary_logloss: 0.313204	valid_1's binary_logloss: 0.327298
[20]	training's binary_logloss: 0.255141	valid_1's binary_logloss: 0.281897
[30]	training's binary_logloss: 0.230633	valid_1's binary_logloss: 0.269241
[40]	training's binary_logloss: 0.214581	valid_1's binary_logloss: 0.265882
[50]	training's binary_logloss: 0.202819	valid_1's binary_logloss: 0.264681
[60]	training's binary_logloss: 0.192595	valid_1's binary_logloss: 0.264504
[70]	training's binary_logloss:

In [7]:
# Show the first ten entries of y_pred (the validation-set output of
# model.predict when the cells are run in order).
y_pred[:10]

array([0.0022551 , 0.0013183 , 0.0067186 , 0.91896269, 0.00392708,
       0.01103707, 0.23917377, 0.0028153 , 0.67000383, 0.12515806])

In [8]:
from sklearn.metrics import accuracy_score

# Turn the predicted values into 0/1 labels (strictly greater than 0.5 -> 1),
# then report the accuracy against the validation labels.
is_positive = y_pred > 0.5
y_pred = is_positive.astype(int)
accuracy_score(y_valid, y_pred)

0.8784313725490196

In [9]:
# Predict on the test features at the model's best iteration and apply the
# same 0.5 cut-off used for the validation predictions.
test_scores = model.predict(X_test, num_iteration=model.best_iteration)
y_pred = (test_scores > 0.5).astype(int)

In [10]:
# Put the test predictions into the 'Y' column of the submission frame.
sample_submit['Y'] = y_pred

# Write the file without a header row and without the index column.
sample_submit.to_csv('submit.csv', header=False, index=False)

# Preview what was written. A notebook only renders the last expression of a
# cell, so the original mid-cell `sample_submit` never displayed anything.
sample_submit