# Libraries.

In [2]:
import pandas as pd 
import numpy as np 
import xgboost as xgb
from ml_metrics import quadratic_weighted_kappa
from scipy.optimize import fmin_powell



# The function below computes the quadratic weighted kappa metric after rounding and clipping the predictions.

In [3]:
def eval_wrapper(yhat, y):
    """Score predictions against labels with quadratic weighted kappa.

    The labels are converted to an integer array. The predictions are rounded
    to the nearest integer and clipped to the [min, max] range found in the
    labels before both are handed to ``quadratic_weighted_kappa``.

    Args:
        yhat: array-like of (possibly fractional) predictions.
        y: array-like of true labels.

    Returns:
        The value returned by ``quadratic_weighted_kappa(predictions, labels)``.
    """
    labels = np.array(y).astype(int)
    lowest, highest = np.min(labels), np.max(labels)
    rounded = np.round(np.array(yhat))
    ratings = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(ratings, labels)

# Defining the parameters to tune for the algorithm.

In [4]:
def get_params():
    """Return the XGBoost training parameters as a list of (name, value) pairs."""
    settings = {
        "objective": "reg:linear",
        "eta": 0.05,
        "min_child_weight": 360,
        "subsample": 0.85,
        "colsample_bytree": 0.3,
        "silent": 1,
        "max_depth": 7,
    }
    return list(settings.items())

In [5]:
def apply_offsets(data, offsets):
    """Shift each raw prediction by the offset of its bin, writing into row 1.

    A raw prediction belongs to bin ``j`` when ``int(prediction) == j``
    (``astype(int)`` truncates toward zero). For every bin that has an entry
    in ``offsets``, row 1 is set to ``row 0 + offsets[j]``. Entries of row 1
    whose bin has no offset are left untouched, as are all other rows.

    The number of bins is taken from ``len(offsets)`` rather than from a
    module-level constant, so the function does not depend on a variable
    defined in another cell.

    Args:
        data: 2-D NumPy array; row 0 holds the raw predictions and row 1
            receives the shifted predictions.
        offsets: sequence of per-bin offsets, indexed by bin number.

    Returns:
        The same ``data`` array, modified in place.
    """
    raw = data[0]
    # Row 0 is never written to, so the bin assignment can be computed once.
    bins = raw.astype(int)
    for j, offset in enumerate(offsets):
        in_bin = bins == j
        data[1, in_bin] = raw[in_bin] + offset
    return data

# Preparing the data for modelling.

In [6]:
# Notebook-wide settings shared by the cells below.
num_classes = 8                        # number of prediction bins iterated over in apply_offsets
missing_indicator = -1000              # value used by fillna and passed as `missing` to xgb.DMatrix
xgb_num_rounds = 720                   # number of boosting rounds passed to xgb.train
columns_to_drop = ["Id", "Response"]   # columns dropped before building the DMatrix inputs

# Loading the train and test datasets.

In [7]:
# Read the train and test CSV files (paths are relative to the working directory).
print("Load the data using pandas")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Last expression of the cell: shows the (rows, columns) of the train frame.
train.shape

Load the data using pandas


(59381, 128)

In [8]:
# Stack train and test so the feature engineering below is applied to both.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. The original row labels are kept (no ignore_index)
# and sort=False keeps the column order of `train`, matching what append did.
all_data = pd.concat([train, test], sort=False)
all_data.shape

(79146, 128)

In [9]:
# Derive two extra columns from Product_Info_2: its first character and its
# second character (missing where the value has no such position).
all_data['Product_Info_2_char'] = all_data['Product_Info_2'].str.get(0)
all_data['Product_Info_2_num'] = all_data['Product_Info_2'].str.get(1)


# Encoding the string (categorical) variables as integer codes and adding engineered features.

In [10]:
# Replace each string-valued column with the integer codes from pd.factorize.
for column in ('Product_Info_2', 'Product_Info_2_char', 'Product_Info_2_num'):
    codes, _uniques = pd.factorize(all_data[column])
    all_data[column] = codes

# Interaction feature: product of BMI and Ins_Age.
all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']

# Row-wise sum over every column whose name starts with 'Medical_Keyword_'.
is_med_keyword = all_data.columns.str.startswith('Medical_Keyword_')
med_keyword_columns = all_data.columns[is_med_keyword]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)

# Filling in the missing data.

In [11]:
# Replace every NaN in the combined frame with the sentinel value.
print('Eliminate missing values')
all_data = all_data.fillna(missing_indicator)

Eliminate missing values


In [12]:
# Fix the dtype on the label column: cast Response to int.
# NaNs were already replaced by missing_indicator in the fillna step, so no
# missing values remain to break the integer cast.
# NOTE(review): presumably the rows that came from test.csv had no Response and
# now carry missing_indicator — confirm against the data.
all_data['Response'] = all_data['Response'].astype(int)

# Split train and test

In [25]:

# Rows with a positive Response become the training frame; rows with a
# Response below 1 become the test frame. Copies are taken so that later
# changes do not write back into all_data.
response = all_data['Response']
train = all_data.loc[response > 0].copy()
test = all_data.loc[response < 1].copy()


# Convert the data to xgb data structure

In [14]:

# Feature frames: every column except the ones listed in columns_to_drop.
train_features = train.drop(columns_to_drop, axis=1)
test_features = test.drop(columns_to_drop, axis=1)

# Both matrices carry the Response column as label and treat
# missing_indicator as the missing-value marker.
xgtrain = xgb.DMatrix(train_features, label=train['Response'].values,
                      missing=missing_indicator)
xgtest = xgb.DMatrix(test_features, label=test['Response'].values,
                     missing=missing_indicator)

# The code outputs the parameters for xgboost

In [18]:

# Build the XGBoost parameter list once and print it so the run records
# exactly which settings were used for training.
plst = get_params()
print(plst)   

[('max_depth', 7), ('subsample', 0.85), ('objective', 'reg:linear'), ('min_child_weight', 360), ('silent', 1), ('eta', 0.05), ('colsample_bytree', 0.3)]


# Training the model

In [16]:

# Fit the booster on the training matrix for xgb_num_rounds boosting rounds.
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=xgb_num_rounds)


# Getting the train and test predictions

In [17]:

# Raw (continuous) predictions for both matrices.
train_preds = model.predict(xgtrain)
test_preds = model.predict(xgtest)

# Kappa of the rounded train predictions against the train labels.
train_score = eval_wrapper(train_preds, train['Response'])
print('Train score is:', train_score)

Train score is: 0.6516583139089522


# Applying the offsets

In [20]:
# Starting offsets, one per prediction bin (bin = raw prediction truncated to int).
offsets = np.array([-1.5, -2.6, -3.6, -1.2, -0.8, -0.1, 0.6, 3.6])

# Row layout: 0 = raw predictions, 1 = offset predictions (filled in by
# apply_offsets), 2 = labels.
train_labels = train['Response'].values
offset_preds = apply_offsets(np.vstack([train_preds, train_preds, train_labels]), offsets)

offset_train_score = eval_wrapper(offset_preds[1], train['Response'])
print('Offset Train score is:', offset_train_score)


Offset Train score is: 0.7027723116346819


# Optimizing the offsets, one prediction bin at a time, to maximize the train score.

In [21]:
def score_offset(data, bin_offset, sv, scorer=eval_wrapper):
    """Apply ``bin_offset`` to one bin of predictions and score the result.

    ``data`` is indexed along its first dimension as: 0 = raw predictions,
    1 = offset predictions, 2 = labels. Entries of row 1 whose raw prediction
    truncates to ``sv`` are overwritten in place with ``raw + bin_offset``;
    the other entries of row 1 keep whatever they held before the call.

    Args:
        data: array with the three rows described above (modified in place).
        bin_offset: value added to the raw predictions of the selected bin.
        sv: bin number to update.
        scorer: callable taking (predictions, labels); defaults to eval_wrapper.

    Returns:
        ``scorer(data[1], data[2])``.
    """
    raw_preds = data[0]
    in_bin = raw_preds.astype(int) == sv
    data[1, in_bin] = raw_preds[in_bin] + bin_offset
    return scorer(data[1], data[2])

# Tune the offsets one bin at a time, in the order given by opt_order.
# fmin_powell is already imported in the imports cell at the top of the notebook.
opt_order = [0, 1, 2, 3, 4, 5, 6, 7]
for j in opt_order:

    def train_offset(x, bin_index=j):
        """Objective for fmin_powell: negated score for offset ``x``, scaled by 100."""
        return -score_offset(offset_preds, x, bin_index) * 100

    best = fmin_powell(train_offset, offsets[j], disp=False)
    # The optimizer hands back an array; extract the single value explicitly
    # instead of relying on implicit array-to-scalar conversion.
    offsets[j] = float(np.ravel(best)[0])
    # score_offset writes every trial value into row 1 of offset_preds, and the
    # optimizer's last trial is not necessarily the optimum it returns.
    # Re-apply the chosen offset so the following bins are tuned against the
    # predictions that the final offsets actually produce.
    score_offset(offset_preds, offsets[j], j)

# Apply offsets to the  test dataset.

In [22]:

# Same three-row layout as used for train: raw predictions, offset
# predictions (filled in by apply_offsets), and the Response column.
test_labels = test['Response'].values
data = apply_offsets(np.vstack([test_preds, test_preds, test_labels]), offsets)

# Rounding off the predictions to the given risk interval.

In [23]:
# Clip the offset predictions to the range 1..num_classes, then round to the
# nearest integer. The upper bound uses the notebook's num_classes constant
# instead of repeating the literal 8.
final_test_preds = np.round(np.clip(data[1], 1, num_classes)).astype(int)

# Kaggle submission.

In [24]:
# Write the submission file: one row per test Id with its predicted Response,
# using Id as the index column of the CSV.
submission = {"Id": test['Id'].values, "Response": final_test_preds}
preds_out = pd.DataFrame(submission).set_index('Id')
preds_out.to_csv('xgbtl_submission.csv')