In [None]:
import gdown
!gdown https://drive.google.com/uc?id=1Ss7JIuTqTciLWLyjVYArr2yol9wPmVIU
!gdown https://drive.google.com/uc?id=1-Af8dJzdvWtsfAnGSEILpuK15JI6eEpq
!gdown https://drive.google.com/uc?id=1YuKCLvc2bdEcuF__AkXuud-qMQFNfixc

Downloading...
From: https://drive.google.com/uc?id=1Ss7JIuTqTciLWLyjVYArr2yol9wPmVIU
To: /content/test.csv
100% 6.62M/6.62M [00:00<00:00, 104MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-Af8dJzdvWtsfAnGSEILpuK15JI6eEpq
To: /content/final_model.sav
100% 3.55M/3.55M [00:00<00:00, 114MB/s]
Downloading...
From: https://drive.google.com/uc?id=1YuKCLvc2bdEcuF__AkXuud-qMQFNfixc
To: /content/train.csv
100% 20.0M/20.0M [00:00<00:00, 93.3MB/s]


In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb
from scipy.optimize import fmin_powell

# Utility functions

In [None]:
#metric computing function
def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return cohen_kappa_score(yhat, y,weights='quadratic')

#offsetting function    
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    # data has the format of pred=0, offset_pred=1, labels=2 in the first dim
    data[1, data[0].astype(int)==sv] = data[0, data[0].astype(int)==sv] + bin_offset
    score = scorer(data[1], data[2])
    return score

In [None]:
def final_fun_1(X):
    
    '''This function takes details about a potential customers as input and returns a prediction of the risk level of Customer.
       The details include: product info, family info, employment info, general health measurements , 
       medical history info, and medical keyword(yes/No).'''
    filename = 'final_model.sav'
    # Loading final model, train data 
    loaded_model = pickle.load(open(filename, 'rb'))
    train = pd.read_csv('train.csv')
    # test = pd.read_csv('/content/drive/MyDrive/PrudentialData/test.csv')

    columns_to_drop = ['Id', 'Response']
    xgb_num_rounds = 800
    num_classes = 8
    eta_list = [0.05] * 200 
    eta_list = eta_list + [0.02] * 500
    eta_list = eta_list + [0.01] * 100
    test= X.copy()
    # create any new variables    
    train['Product_Info_2_char'] = train.Product_Info_2.str[0]
    train['Product_Info_2_num'] = train.Product_Info_2.str[1]
    test['Product_Info_2_char'] = test.Product_Info_2.str[0]
    test['Product_Info_2_num'] = test.Product_Info_2.str[1]

    # factorize categorical variables
    train['Product_Info_2'] = pd.factorize(train['Product_Info_2'])[0]
    train['Product_Info_2_char'] = pd.factorize(train['Product_Info_2_char'])[0]
    train['Product_Info_2_num'] = pd.factorize(train['Product_Info_2_num'])[0]

    train['BMI_Age'] = train['BMI'] * train['Ins_Age']

    med_keyword_columns = train.columns[train.columns.str.startswith('Medical_Keyword_')]
    train['Med_Keywords_Count'] = train[med_keyword_columns].sum(axis=1)
    
    
    test['Product_Info_2'] = pd.factorize(test['Product_Info_2'])[0]
    test['Product_Info_2_char'] = pd.factorize(test['Product_Info_2_char'])[0]
    test['Product_Info_2_num'] = pd.factorize(test['Product_Info_2_num'])[0]

    test['BMI_Age'] = test['BMI'] * test['Ins_Age']

    med_keyword_columns = test.columns[test.columns.str.startswith('Medical_Keyword_')]
    test['Med_Keywords_Count'] = test[med_keyword_columns].sum(axis=1)

    print('Eliminate missing values')
    # Use -1 for any others
    train.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)

    # fix the dtype on the label column
    train['Response'] = train['Response'].astype(int)
    # test['Response'] = test['Response'].astype(int)
    xgtrain = xgb.DMatrix(train.drop(columns_to_drop, axis=1), label=train['Response'].values)
    xgtest = xgb.DMatrix(test.drop(['Id'], axis=1), label=None)
    
    train_preds = loaded_model.predict(xgtrain, ntree_limit=loaded_model.best_iteration)
    test_preds = loaded_model.predict(xgtest, ntree_limit=loaded_model.best_iteration)
    train_preds = np.clip(train_preds, -0.99, 8.99)
    test_preds = np.clip(test_preds, -0.99, 8.99)

    # train offsets 
    offsets = np.array([0.1, -1, -2, -1, -0.8, 0.02, 0.8, 1])
    data = np.vstack((train_preds, train_preds, train['Response'].values))
    for j in range(num_classes):
        data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j] 
    for j in range(num_classes):
        train_offset = lambda x: -apply_offset(data, x, j)
        offsets[j] = fmin_powell(train_offset, offsets[j])  

    # apply offsets to test
    data = np.vstack((test_preds, test_preds, np.zeros(test.shape[0],)))
    for j in range(num_classes):
        data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j] 

    final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)
    all_predictions= pd.DataFrame(final_test_preds, index= test['Id'], columns=['Response'])
    return all_predictions

In [None]:
def final_fun_2(X, Y):
    
    '''This fuction evaluates the predictions of model by comparing with actual values'''
    
    predictions = final_fun_1(X)
    y_pred= predictions['Response']
    
    print("Kappa score for data: ", eval_wrapper(y_pred, Y))


In [55]:
#testing function 1
test=pd.read_csv("test.csv")
%time final_fun_1(test)

Eliminate missing values
Optimization terminated successfully.
         Current function value: -0.754532
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.754532
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.754532
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.755011
         Iterations: 3
         Function evaluations: 61
Optimization terminated successfully.
         Current function value: -0.755022
         Iterations: 1
         Function evaluations: 21
Optimization terminated successfully.
         Current function value: -0.755039
         Iterations: 1
         Function evaluations: 23
Optimization terminated successfully.
         Current function value: -0.755496
         Iterations: 2
         Function evaluations: 87
Optimizat

Unnamed: 0_level_0,Response
Id,Unnamed: 1_level_1
1,3
3,7
4,7
9,7
12,8
...,...
79093,8
79099,8
79102,2
79125,1


In [57]:
#testing function 2
train=pd.read_csv("train.csv")
%time final_fun_2(train.iloc[1:10].drop('Response',1), train.iloc[1:10]['Response'])

Eliminate missing values
Optimization terminated successfully.
         Current function value: -0.754532
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.754532
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.754532
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.755011
         Iterations: 3
         Function evaluations: 61
Optimization terminated successfully.
         Current function value: -0.755022
         Iterations: 1
         Function evaluations: 21
Optimization terminated successfully.
         Current function value: -0.755039
         Iterations: 1
         Function evaluations: 23
Optimization terminated successfully.
         Current function value: -0.755496
         Iterations: 2
         Function evaluations: 87
Optimizat