In [0]:
def reduce_ser_mem_usage(ser):
  col_type = ser.dtype
  if col_type != object:
      c_min = ser.min()
      c_max = ser.max()
      if str(col_type)[:3] == 'int':
          if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
              ser = ser.astype(np.int8)
          elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
              ser = ser.astype(np.int16)
          elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
              ser = ser.astype(np.int32)
          elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
              ser = ser.astype(np.int64)  
      else:
          if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
              ser = ser.astype(np.float16)
          elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
              ser = ser.astype(np.float32)
          else:
              ser = ser.astype(np.float64)
  else:
      ser = ser.astype('category')

def reduce_df_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        reduce_ser_mem_usage(df[col])

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

In [0]:
def get_result(model, D_test, Y_test):
  preds = model.predict(D_test)
  best_preds = np.asarray([np.argmax(line) for line in preds])

  print("Precision = {}".format(precision_score(Y_test, best_preds, average='macro')))
  print("Recall = {}".format(recall_score(Y_test, best_preds, average='macro')))
  print("Accuracy = {}".format(accuracy_score(Y_test, best_preds)))

In [0]:
import xgboost as xgb
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score

import copy

In [0]:
dataset = datasets.load_digits()
X = pd.DataFrame(dataset.data)
y = pd.Series(dataset.target)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5)

In [0]:
def train_iters_search(params, D_train, D_valid, y_valid, eval_func, n_iters=10, maximize=True,max_worse_iters=1):
  best_model = None
  best_params = None

  best_local_model = None
  best_local_params = None

  if maximize:
    best_result = -np.inf
  else:
    best_result = np.inf

  params_mass = []
  params_loc_mass = []
  no_better_result = 0
  for i in range(n_iters):
    no_better_result += 1
    model = copy.deepcopy(best_local_model)
    if maximize:
      best_loc = -np.inf
    else:
      best_loc = np.inf

    for param in params:
      model2 = xgb.train(param, D_train, param['num_iter'], xgb_model=model)
      y_pred = model2.predict(D_valid)
      score = eval_func(y_valid, y_pred)
      if maximize:
        if score > best_loc:
          best_loc = score
          best_local_model = copy.deepcopy(model2)
          best_local_params = param
        if score > best_result:
          best_result = score
          best_model = copy.deepcopy(model2)
          best_params = param
          no_better_result = 0
      else:
        if score < best_loc:
          best_loc = score
          best_local_model = copy.deepcopy(model2)
          best_local_params = param
        if score < best_result:
          best_result = score
          best_model = copy.deepcopy(model2)
          best_params = param
          no_better_result = 0

    print('loc', best_loc)
    print('global', best_result)

    params_loc_mass.append(best_local_params)

    if no_better_result == 0:
      params_mass = copy.deepcopy(params_loc_mass)
      best_model = copy.deepcopy(best_local_model)
    
    if no_better_result >= max_worse_iters:
      return best_model, params_mass

  return best_model, params_mass

In [0]:
def train_iters(params_mass, D_train):
  model = None
  for param in params_mass:
    model = xgb.train(param, D_train, param['num_iter'], xgb_model=model)
  return model

In [0]:
params = [
    {
    'eta': 0.35, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':1,
    },
    {
    'eta': 0.4, 
    'max_depth': 4,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':1,
    },
    {
    'eta': 0.3, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':1,
    },

    {
    'eta': 0.35, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':5,
    },
    {
    'eta': 0.4, 
    'max_depth': 4,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':5,
    },
    {
    'eta': 0.3, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':5,
    },

    {
    'eta': 0.35, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':10,
    },
    {
    'eta': 0.4, 
    'max_depth': 4,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':10,
    },
    {
    'eta': 0.3, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':10,
    },

    {
    'eta': 0.35, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':20,
    },
    {
    'eta': 0.4, 
    'max_depth': 4,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':20,
    },
    {
    'eta': 0.3, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':20,
    },

    {
    'eta': 0.35, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':50,
    },
    {
    'eta': 0.4, 
    'max_depth': 4,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':50,
    },
    {
    'eta': 0.3, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 10,
    'num_iter':50,
    },
]

In [88]:
reduce_df_mem_usage(X_train)
reduce_ser_mem_usage(y_train)
D_train = xgb.DMatrix(X_train.copy(), label=y_train.copy())
D_test = xgb.DMatrix(X_test, label=y_test)
D_valid = xgb.DMatrix(X_valid, label=y_valid)

Memory usage of dataframe is 0.62 MB
Memory usage after optimization is: 0.62 MB
Decreased by 0.0%


In [89]:
for param in params:
  model = xgb.train(param, D_train, param['num_iter'])
  get_result(model, D_test, y_test)

Precision = 0.8260146227412781
Recall = 0.8295082333125811
Accuracy = 0.8333333333333334
Precision = 0.8918397834050008
Recall = 0.8933072080898169
Accuracy = 0.8925925925925926
Precision = 0.8936221229191841
Recall = 0.8880620345837735
Accuracy = 0.8888888888888888
Precision = 0.9224397071226884
Recall = 0.9258460999029022
Accuracy = 0.9222222222222223
Precision = 0.9331581262939957
Recall = 0.9369179009396401
Accuracy = 0.9333333333333333
Precision = 0.9278197696023784
Recall = 0.9326118808727506
Accuracy = 0.9296296296296296
Precision = 0.9526022293825893
Recall = 0.9551485109093806
Accuracy = 0.9518518518518518
Precision = 0.9549577203156664
Recall = 0.9593151775760471
Accuracy = 0.9555555555555556
Precision = 0.9355637565832471
Recall = 0.9417419175027872
Accuracy = 0.937037037037037
Precision = 0.9553881674852439
Recall = 0.9589946647555344
Accuracy = 0.9555555555555556
Precision = 0.9584253432724198
Recall = 0.96217232043319
Accuracy = 0.9592592592592593
Precision = 0.9536183943

In [0]:
eval_func = lambda x,y: precision_score(x, np.asarray([np.argmax(line) for line in y]), average='macro')

In [91]:
model, model_params = train_iters_search(params, D_train, D_valid, y_valid, eval_func, n_iters=1000, maximize=True,max_worse_iters=10)

loc 0.9769003570727708
global 0.9769003570727708
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158
loc 0.9782620946305158
global 0.9782620946305158


In [92]:
model_params

[{'eta': 0.35,
  'max_depth': 3,
  'num_class': 10,
  'num_iter': 50,
  'objective': 'multi:softprob'},
 {'eta': 0.35,
  'max_depth': 3,
  'num_class': 10,
  'num_iter': 10,
  'objective': 'multi:softprob'}]

In [93]:
model = train_iters(model_params, D_train)
get_result(model, D_test, y_test)

Precision = 0.9591388149858915
Recall = 0.9618518076126772
Accuracy = 0.9592592592592593
