In [1]:
import numpy as np, pandas as pd, seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn import metrics
from sklearn.metrics import accuracy_score

from xgboost import XGBRegressor

In [2]:
X = pd.read_csv('train.csv', index_col='id')
X_test = pd.read_csv('test.csv', index_col='id')

X.dropna(axis=0, subset=['Response'], inplace=True)
y = X.Response              
X.drop(['Response'], axis=1, inplace=True)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test[my_cols].copy()

X_train.head()

Unnamed: 0_level_0,Gender,Vehicle_Age,Vehicle_Damage,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
51121,Female,< 1 Year,No,21,1,28.0,1,50784.0,152.0,233
263586,Female,< 1 Year,Yes,25,1,21.0,1,29698.0,152.0,241
49073,Male,1-2 Year,No,55,1,45.0,1,32760.0,139.0,222
194215,Male,1-2 Year,Yes,54,1,28.0,0,49328.0,122.0,163
2731,Male,1-2 Year,Yes,50,1,28.0,0,2630.0,154.0,196


In [3]:
from sklearn.preprocessing import LabelEncoder

cols = ['Gender', 'Vehicle_Age','Vehicle_Damage', 'Region_Code']

# Apply label encoder 
label = LabelEncoder()
for col in cols:
    X_train[col] = label.fit_transform(X_train[col])
    X_valid[col] = label.transform(X_valid[col])
    X_test[col] = label.transform(X_test[col])
X_test.head()

Unnamed: 0_level_0,Gender,Vehicle_Age,Vehicle_Damage,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
381110,1,1,0,25,1,11,1,35786.0,152.0,53
381111,1,0,1,40,1,28,0,33762.0,7.0,111
381112,1,0,1,47,1,28,0,40050.0,124.0,199
381113,1,1,1,24,1,27,1,37356.0,152.0,187
381114,1,1,0,27,1,28,1,59097.0,152.0,297


In [4]:
X_train.drop(['Region_Code', 'Vintage'], axis=1, inplace=True)
X_valid.drop(['Region_Code', 'Vintage'], axis=1, inplace=True)
X_test.drop(['Region_Code', 'Vintage'], axis=1, inplace=True)

In [5]:
def scaler(data):
    data = data/max(data)
    return data

cols = ['Age', 'Annual_Premium','Policy_Sales_Channel']

for i in cols:
    X_train[i] = scaler(X_train[i])
    X_valid[i] = scaler(X_valid[i])
    X_test[i] = scaler(X_test[i])

In [6]:
X_test.head()

Unnamed: 0_level_0,Gender,Vehicle_Age,Vehicle_Damage,Age,Driving_License,Previously_Insured,Annual_Premium,Policy_Sales_Channel
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
381110,1,1,0,0.294118,1,1,0.075811,0.932515
381111,1,0,1,0.470588,1,0,0.071523,0.042945
381112,1,0,1,0.552941,1,0,0.084844,0.760736
381113,1,1,1,0.282353,1,1,0.079137,0.932515
381114,1,1,0,0.317647,1,1,0.125194,0.932515


In [7]:
xgb = XGBRegressor()
for i in range(1):
    params = {
        'objective':['binary:logistic'],
        'eval_metric': ['auc'], 
        'learning_rate': [0.03],
        #'max_depth': [i],
        #'silent': [0],
        #'subsample': np.arange(.5, 1, 0.1),
        #'colsample_bytree': np.arange(.5, 1, 0.1),
        'n_estimators': [1000]
    }

    xgb_grid = GridSearchCV(xgb,
                        params,
                        cv = 5,
                        n_jobs = 5,
                        verbose=True)

    xgb_grid.fit(X_train, y_train)

    #print(xgb_grid.best_score_)
    #print(xgb_grid.best_params_)

    y_pred_train = xgb_grid.predict(X_train)
    y_pred_test = xgb_grid.predict(X_valid)

    valid_score = metrics.roc_auc_score(y_train, y_pred_train)
    print(f"Validation AUC score: {valid_score:.8f}")

    valid_score = metrics.roc_auc_score(y_valid, y_pred_test)
    print(f"Validation AUC score: {valid_score:.8f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:  7.0min remaining: 10.5min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:  7.0min finished
  if getattr(data, 'base', None) is not None and \


Validation AUC score: 0.85799541
Validation AUC score: 0.85701944


In [9]:
preds_test = xgb_grid.predict(X_test)
# Save predictions in format used for competition scoring
output = pd.DataFrame({'id': X_test.index,
                       'Response': preds_test})
output.to_csv('sample_submission_iA3afxn.csv', index=False)

In [None]:
Validation AUC score: 0.85753859
Validation AUC score: 0.85635479