In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt

seed = 260681

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_y = train.QuoteConversion_Flag.values
train = train.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
test = test.drop('QuoteNumber', axis=1)

In [2]:
# Lets play with some dates
train['Date'] = pd.to_datetime(pd.Series(train['Original_Quote_Date']))
train = train.drop('Original_Quote_Date', axis=1)

test['Date'] = pd.to_datetime(pd.Series(test['Original_Quote_Date']))
test = test.drop('Original_Quote_Date', axis=1)

train['Year'] = train['Date'].apply(lambda x: int(str(x)[:4]))
train['Month'] = train['Date'].apply(lambda x: int(str(x)[5:7]))
train['weekday'] = train['Date'].dt.dayofweek


test['Year'] = test['Date'].apply(lambda x: int(str(x)[:4]))
test['Month'] = test['Date'].apply(lambda x: int(str(x)[5:7]))
test['weekday'] = test['Date'].dt.dayofweek

train = train.drop('Date', axis=1)
test = test.drop('Date', axis=1)

In [3]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
train = DataFrameImputer().fit_transform(train)
test = DataFrameImputer().fit_transform(test)

In [4]:
for f in train.columns:
    if train[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

In [None]:
# train, val_train, train_y, test_y = train_test_split(
#      train, train_y, test_size=0.10, random_state=42)

In [None]:
#current Best
clf = xgb.XGBClassifier(n_estimators=35,
                        objective='binary:logistic',
                        nthread=-1,
                        max_depth=15,
                        learning_rate=0.025,
                        silent=True,
                        subsample=0.86,
                        colsample_bytree=0.68)
print train.shape, train_y.shape, val_train.shape, train_y.shape
xgb_model = clf.fit(train, train_y, eval_metric="auc")


preds = clf.predict_proba(test)[:,1]
sample = pd.read_csv('sample_submission.csv')
sample.QuoteConversion_Flag = preds
sample.to_csv('xgb_benchmark.csv', index=False)

In [None]:
from sklearn import metrics
from sklearn import grid_search


param_grid = [
  {'scale_pos_weight': [1, 1.1]}
 ]

clf = xgb.XGBClassifier(n_estimators=35,
                        objective='binary:logistic',
                        nthread=-1,
                        scale_pos_weight=1,
                        max_depth=15,
                        learning_rate=0.025,
                        silent=True,
                        subsample=0.86,
                        colsample_bytree=0.68)


model = grid_search.GridSearchCV(clf, param_grid)

In [None]:
model.fit(train.values, train_y)

In [None]:
model.best_params_

In [None]:
preds = model.predict_proba(test)[:,1]
sample = pd.read_csv('sample_submission.csv')
sample.QuoteConversion_Flag = preds
sample.to_csv('xgb_benchmark.csv', index=False)

In [None]:
from sklearn.decomposition import PCA, KernelPCA
# kpca = KernelPCA(kernel="rbf", gamma=10,n_components=1)
# X_kpca = kpca.fit_transform(train.values)
#print kpca.explained_variance_ratio_
pca = PCA(n_components=1)
X_pca = pca.fit_transform(train.values)
print(pca.explained_variance_ratio_) 
# Plot results

plt.figure()
plt.plot(X_pca, train_y, "ro")
plt.title("Projection by PCA")
plt.xlabel("1st principal component")

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(train.values, train_y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [None]:
# Print the feature ranking
print("Feature ranking:")
columns = []
for f in range(min(train.shape[1], 20)):
    print("%s. feature %d (%f)" % (train.columns[indices[f]], indices[f], importances[indices[f]]))
    columns.append(train.columns[indices[f]])
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(min(train.shape[1], 10)), importances[indices[:10]],
       color="r", yerr=std[indices[:10]], align="center")
plt.xticks(range(min(train.shape[1], 10)), indices[:10])
plt.xlim([-1, min(train.shape[1], 10)])
plt.show()

In [None]:
X = train[columns]
X['pca'] = X_pca[:, 0]
clf = xgb.XGBClassifier(n_estimators=35,
                        objective='binary:logistic',
                        nthread=-1,
                        max_depth=15,
                        learning_rate=0.025,
                        silent=True,
                        subsample=0.86,
                        colsample_bytree=0.68)
xgb_model = clf.fit(X, train_y, eval_metric="auc")

test_pca = pca.transform(test)
X_test = test[columns]
X_test['pca'] = test_pca[:, 0]
preds = clf.predict_proba(X_test)[:,1]
sample = pd.read_csv('sample_submission.csv')
sample.QuoteConversion_Flag = preds
sample.to_csv('xgb_benchmark.csv', index=False)

In [5]:
#Create KFold Stackers
clfs = []
stacks = []
preds = []
full_clf = xgb.XGBClassifier(n_estimators=35,
                        objective='binary:logistic',
                        nthread=-1,
                        max_depth=15,
                        learning_rate=0.025,
                        silent=True,
                        subsample=0.86,
                        colsample_bytree=0.68)
full_xgb_model = full_clf.fit(train, train_y, eval_metric="auc")

full_preds = full_clf.predict_proba(test)[:,1]

In [None]:
folds = 2
from sklearn.cross_validation import KFold
from sklearn.svm import SVR
kf = KFold(train.shape[0], n_folds=folds)
for train_index, test_index in kf:
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = train_y[train_index], train_y[test_index]
    cl = xgb.XGBClassifier(n_estimators=35,
                        objective='binary:logistic',
                        nthread=-1,
                        max_depth=15,
                        learning_rate=0.025,
                        silent=True,
                        subsample=0.86,
                        colsample_bytree=0.68)
    cl.fit(X_train, y_train, eval_metric="auc")
    clfs.append(cl)
    pred = cl.predict_proba(X_test)[:,1]
    stack = SVR(C=1.0, epsilon=0.2)
    print pred.shape, y_test.shape
    stack.fit(pred.reshape((pred.shape[0], 1,)), y_test.reshape((y_test.shape[0], 1,)))
    print stack.score(pred.reshape((pred.shape[0], 1,)), y_test.reshape((y_test.shape[0], 1,)))
    stacks.append(stack)

In [7]:
#Average Preds
for cl, stack in zip(clfs, stacks):
    probs = cl.predict_proba(test)[:, 1]
    preds.append(stack.predict(probs.reshape(probs.shape[0], 1)))

In [10]:
last_preds = full_preds
for p in preds:
    last_preds = (last_preds + p)/2
print last_preds
sample = pd.read_csv('sample_submission.csv')
sample.QuoteConversion_Flag = last_preds
sample.to_csv('full_xgb_benchmark.csv', index=False)

[ 0.20118165  0.19132914  0.1919019  ...,  0.95768068  0.20138666
  0.21126332]
