In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

In [2]:
# load training data
a_train = pd.read_csv('A_hhold_train.csv', index_col='id')
b_train = pd.read_csv('B_hhold_train.csv', index_col='id')
c_train = pd.read_csv('C_hhold_train.csv', index_col='id')

a_test = pd.read_csv('A_hhold_test.csv', index_col='id')
b_test = pd.read_csv('B_hhold_test.csv', index_col='id')
c_test = pd.read_csv('C_hhold_test.csv', index_col='id')

In [3]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df
    

def pre_process_data(df, enforce_cols=None):
    print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))
    

    # match test set and training set columns
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)

        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(-999, inplace=True)
    
    return df

In [4]:
print("Country A")
aX_train = pre_process_data(a_train.drop('poor', axis=1))
ay_train = np.ravel(a_train.poor)

print("\nCountry B")
bX_train = pre_process_data(b_train.drop('poor', axis=1))
by_train = np.ravel(b_train.poor)

print("\nCountry C")
cX_train = pre_process_data(c_train.drop('poor', axis=1))
cy_train = np.ravel(c_train.poor)

Country A
Input shape:	(8203, 344)
After standardization (8203, 344)
After converting categoricals:	(8203, 859)

Country B
Input shape:	(3255, 441)
After standardization (3255, 441)
After converting categoricals:	(3255, 1432)

Country C
Input shape:	(6469, 163)
After standardization (6469, 163)
After converting categoricals:	(6469, 795)


In [5]:
aX_train = aX_train.drop(['country_A'],axis=1)
bX_train = bX_train.drop(['country_B'],axis=1)
cX_train = cX_train.drop(['country_C'],axis=1)

In [6]:
aX_train.head()

Unnamed: 0_level_0,nEsgxvAq,OMtioXZZ,YFMZwKrU,TiwRslOh,wBXbHZmp_DkQlr,wBXbHZmp_JhtDR,SlDKnCuu_GUusz,SlDKnCuu_alLXR,KAJOWiiw_BIZns,KAJOWiiw_TuovO,...,JCDeZBXq_LPtkN,JCDeZBXq_UyAms,HGPWuGlV_WKNwg,HGPWuGlV_vkbkA,GDUPaBQs_qCEuA,GDUPaBQs_qQxrL,WuwrCsIY_AITFl,WuwrCsIY_GAZGl,AlDbXTlZ_aQeIm,AlDbXTlZ_cecIq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46107,-1.44716,0.325746,1.099716,-0.628045,0,1,1,0,0,1,...,1,0,0,1,0,1,1,0,1,0
82739,-0.414625,-0.503468,-0.01605,0.713467,0,1,1,0,0,1,...,0,1,0,1,0,1,1,0,0,1
9646,0.61791,-0.503468,-0.01605,-0.628045,0,1,1,0,1,0,...,0,1,0,1,0,1,1,0,0,1
10975,0.61791,-1.332682,-1.131816,0.713467,0,1,1,0,0,1,...,0,1,0,1,0,1,1,0,0,1
16463,0.61791,0.325746,-1.131816,-0.180874,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,1,0


In [7]:
aX_train.shape

(8203, 858)

In [8]:
import lightgbm as lg
from sklearn.cross_validation import StratifiedKFold   #用于交叉验证
from sklearn.grid_search import GridSearchCV   #用于自动调参  暴力搜索所有参数组合
from sklearn.model_selection import RandomizedSearchCV   #用于自动调参  在指定参数空间内抽样一部分候选参数



In [9]:
from sklearn.model_selection import KFold, train_test_split
AX_train, AX_valid, Ay_train, Ay_valid = train_test_split(aX_train, ay_train, test_size=0.2, random_state =1)
BX_train, BX_valid, By_train, By_valid = train_test_split(bX_train, by_train, test_size=0.2, random_state =1)
CX_train, CX_valid, Cy_train, Cy_valid = train_test_split(cX_train, cy_train, test_size=0.2, random_state =1)


In [10]:
clf = lg.LGBMClassifier

param_grid = {
        'silent': [False],
        'max_depth': [10, 11, 12, 13, 14, 15],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0, 13.0],
        'gamma': [0, 0.25, 0.5],
        'reg_lambda': [10.0, 50.0, 100.0, 200, 500.0],
        'n_estimators': [70, 100, 500, 1000]}



In [11]:
fit_params_A = {'eval_metric': 'logloss',
              'early_stopping_rounds': 10,
              'eval_set': [(AX_valid, Ay_valid)]}

model_A = RandomizedSearchCV(clf, param_grid, n_iter=20,
                            n_jobs=1, verbose=2, cv=2,
                            fit_params=fit_params_A,
                            #scoring='neg_log_loss', refit=False, random_state=42)
                            scoring='roc_auc',  random_state=42)

model_A.fit(aX_train, ay_train)

best_score = model_A.best_score_
best_params = model_A.best_params_
print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))



Fitting 2 folds for each of 20 candidates, totalling 40 fits


TypeError: get_params() missing 1 required positional argument: 'self'