In [140]:
import keras
import sklearn 
import numpy as np
import pandas as pd
import matplotlib as plot
from IPython.display import display

%matplotlib inline
# Read the train and test splits from CSV files in the working directory.
training_data, testing_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [141]:
# Summary statistics of the raw training frame, before any column is removed.
display(training_data.describe())

# Pull the regression target and the test-row identifiers out of the frames,
# then keep only the remaining columns as model features.
training_y, testing_ID = training_data["target"], testing_data["ID"]
training_data = training_data.drop(labels=["target", "ID"], axis="columns")
testing_data = testing_data.drop(labels=["ID"], axis="columns")

Unnamed: 0,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,...,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,5944923.0,14654.93,1390.895,26722.45,4530.164,26409.96,30708.11,16865.22,4669.208,2569407.0,...,467605.7,444623.9,805621.9,781296.6,143.529939,121380.9,35734.51,312374.1,92199.6,227910.0
std,8234312.0,389329.8,64283.02,569965.2,235912.4,1514730.0,577059.0,751275.6,187944.9,9610183.0,...,4068038.0,4428889.0,4513246.0,6839451.0,9584.318507,4720709.0,1614622.0,4318501.0,1635993.0,1811139.0
min,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2260000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40000000.0,20000000.0,4000000.0,20000000.0,14800000.0,100000000.0,20708000.0,40000000.0,10400000.0,319612000.0,...,76000000.0,123588000.0,130000000.0,144400000.0,640000.0,301312000.0,106420000.0,140000000.0,61768000.0,43200000.0


Step 1: Identify highly correlated columns and constant columns and drop them.

In [142]:
# Absolute pairwise correlation between all feature columns; the sign is
# dropped because only the strength of the relationship is used below.
correlation_matrix = np.abs(training_data.corr())
print(correlation_matrix.iloc[:5])

           48df886f9  0deb4b6a8  34b15f335  a8cb14b00  2f0771a37  30347e683  \
48df886f9   1.000000   0.000815   0.001461   0.000723   0.000656   0.002004   
0deb4b6a8   0.000815   1.000000   0.001015   0.000416   0.000377   0.001152   
34b15f335   0.001461   0.001015   1.000000   0.000901   0.000818   0.002496   
a8cb14b00   0.000723   0.000416   0.000901   1.000000   0.000335   0.001022   
2f0771a37   0.000656   0.000377   0.000818   0.000335   1.000000   0.000928   

           d08d1fbe3  6ee66e115  20aa07010  dc5a8f1d8    ...      3ecc09859  \
48df886f9   0.000845   0.002621   0.006629   0.003575    ...       0.004151   
0deb4b6a8   0.000486   0.000538   0.003995   0.002116    ...       0.002488   
34b15f335   0.001053   0.001165   0.017082   0.010173    ...       0.000338   
a8cb14b00   0.000431   0.000477   0.005135   0.001878    ...       0.002208   
2f0771a37   0.000391   0.000433   0.004663   0.001705    ...       0.002005   

           9281abeea  8675bec0b  3a13ed79a  f677d4

In [143]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal) so each column pair is inspected once and self-correlations
# are ignored. Cells outside the triangle become NaN.
# The builtin `bool` is used for the mask dtype: the `np.bool` alias was
# deprecated in NumPy 1.20 and later removed, so `.astype(np.bool)` raises
# AttributeError on current NumPy.
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# For each threshold, list the columns whose absolute correlation with at
# least one earlier column exceeds it. The comparison is vectorised over the
# whole frame instead of iterating every cell in Python; NaN cells compare as
# False, so masked entries never trigger a drop.
to_drop_85 = upper.columns[(upper > 0.85).any(axis=0).values].tolist()
to_drop_90 = upper.columns[(upper > 0.90).any(axis=0).values].tolist()
to_drop_95 = upper.columns[(upper > 0.95).any(axis=0).values].tolist()
print(len(to_drop_85))
print(len(to_drop_90))
print(len(to_drop_95))

413
262
139


In [144]:
# Remove the columns flagged at the 0.85 threshold from both frames, in place,
# so the training and testing data keep the same feature set.
training_data.drop(labels=to_drop_85, axis="columns", inplace=True)
testing_data.drop(labels=to_drop_85, axis="columns", inplace=True)

Step 2: Rescale features using MinMaxScaler

In [145]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the testing features.
scaler = MinMaxScaler().fit(training_data)
training_data, testing_data = (
    scaler.transform(features) for features in (training_data, testing_data)
)

Step 3: Split the training data into training and validation sets.

In [146]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    training_data,
    training_y,
    test_size=0.2,
    random_state=42,
)

Create RMSLE evaluation function

In [147]:
def evaluate(groundtruth, predictions):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(1 + groundtruth) - log(1 + predictions)) ** 2))``.

    Both inputs are converted to flat float arrays and compared by position,
    so a pandas Series with a non-default index or an ``(n, 1)`` column of
    predictions is handled the same way as a plain 1-D array.

    Args:
        groundtruth: Array-like of true target values.
        predictions: Array-like of predicted values, same number of elements.

    Returns:
        The RMSLE as a float.

    Raises:
        ValueError: If the inputs differ in size, or if any value is <= -1
            (``log(1 + x)`` is undefined there). Without this check a pandas
            mean would silently skip the resulting NaN rows and report an
            overly optimistic score.
    """
    actual = np.asarray(groundtruth, dtype=float).ravel()
    predicted = np.asarray(predictions, dtype=float).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            "groundtruth and predictions must have the same number of elements, "
            "got {} and {}".format(actual.size, predicted.size)
        )
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("RMSLE is undefined for values <= -1")

    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    log_error = np.log1p(actual) - np.log1p(predicted)
    score = np.sqrt(np.mean(np.square(log_error)))
    return score
    

In [175]:
import os

def create_submission(predictions, submission_num):
    """Write the predictions next to the test IDs as ``submission_<num>.csv``.

    The file is never overwritten: if it already exists, a message is printed
    and nothing is written. Reads the module-level ``testing_ID`` for the ID
    column and joins it with a single ``target`` column built from
    ``predictions``.
    """
    submission_path = "submission_{!s}.csv".format(submission_num)

    if os.path.exists(submission_path):
        print("Path already exists, please use another submission number.")
        return

    id_frame = pd.DataFrame(testing_ID)
    target_frame = pd.DataFrame(predictions, columns=["target"])
    id_frame.join(target_frame).to_csv(submission_path, index=False)

Set a baseline using XGBoost

In [148]:
import xgboost as xgb

# Booster settings for the baseline: deep trees (max_depth=25) with
# subsample and colsample_bytree both left at 1.
our_params = dict(
    eta=0.065,
    seed=0,
    subsample=1,
    colsample_bytree=1,
    objective='reg:linear',
    max_depth=25,
    min_child_weight=1,
)

# Train on the training split only; no number of boosting rounds is passed,
# so the library default is used.
xgdmat = xgb.DMatrix(X_train, label=y_train)
final_gb = xgb.train(our_params, xgdmat)

# Predict the held-out split and score it with the RMSLE helper.
tesdmat = xgb.DMatrix(X_test)
predictions = final_gb.predict(tesdmat)

print("RMSLE score")
print(evaluate(y_test, predictions))

RMSLE score
1.546027959606595


Retrain with all the training data for submission.

In [149]:
# Same booster settings as the baseline run.
our_params = dict(
    eta=0.065,
    seed=0,
    subsample=1,
    colsample_bytree=1,
    objective='reg:linear',
    max_depth=25,
    min_child_weight=1,
)

# Fit on every labelled row (no validation hold-out this time).
xgdmat = xgb.DMatrix(training_data, label=training_y)
final_gb = xgb.train(our_params, xgdmat)

# Predict the unlabelled test features; these are the values to submit.
finaldmat = xgb.DMatrix(testing_data)
final_predictions = final_gb.predict(finaldmat)

Support Vector Regression

In [113]:
from sklearn.svm import SVR
# Fit a support vector regressor on the training split.
# NOTE(review): no kernel is chosen here, and `degree` appears to apply only
# to the polynomial kernel, so it may have no effect — confirm.
svr = SVR(degree=3, C=0.01).fit(X_train, y_train)

# Score the held-out split with the RMSLE helper.
predictions = svr.predict(X_test)

print("RMSLE score")
print(evaluate(y_test, predictions))

RMSLE score
1.702122998005553


Use PCA for dimensionality reduction

In [123]:
from sklearn.decomposition import PCA

# Fit the projection on the training split only, so the validation split
# does not influence the components.
pca = PCA(n_components=1750, random_state=42)
pca.fit(X_train)

# Project both splits onto the fitted components.
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

# Running total of the variance share explained as components are added.
print(np.cumsum(pca.explained_variance_ratio_))

[0.07083221 0.112185   0.14347006 ... 0.99000177 0.99003366 0.99006525]


Use xgboost on the PCA-transformed data.

In [138]:
import xgboost as xgb

# Booster settings for the PCA run; max_depth is 10 here.
our_params = dict(
    eta=0.065,
    seed=0,
    subsample=1,
    colsample_bytree=1,
    objective='reg:linear',
    max_depth=10,
    min_child_weight=1,
)

# Train on the PCA-projected training split.
xgdmat = xgb.DMatrix(X_train_pca, label=y_train)
final_gb = xgb.train(our_params, xgdmat)

# Predict the PCA-projected validation split and score it with RMSLE.
tesdmat = xgb.DMatrix(X_test_pca)
predictions = final_gb.predict(tesdmat)

print("RMSLE score")
print(evaluate(y_test, predictions))

RMSLE score
1.58479800455529


DNN Regressor

In [None]:
# `itertools.islice` is used at the bottom of this cell but was never
# imported, which would raise NameError; import it here.
import itertools

import tensorflow as tf

# NOTE(review): `feature_cols`, `get_input_fn`, `training_set`, `test_set` and
# `prediction_set` are not defined anywhere in the visible notebook — they
# must exist before this cell can run. TODO confirm where they come from.
# NOTE(review): model_dir is an absolute path at the filesystem root —
# confirm it is writable, or make it relative to the project directory.
regressor = tf.estimator.DNNRegressor(
    feature_columns=feature_cols,
    hidden_units=[1000, 500, 200, 50],
    model_dir="/santander_models",
)

# Train for a fixed number of steps on the training set.
regressor.train(input_fn=get_input_fn(training_set), steps=5000)

# Single unshuffled pass over the evaluation set; report the loss.
ev = regressor.evaluate(input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
loss_score = ev["loss"]
print("Loss: {0:f}".format(loss_score))

# `predict` is consumed lazily; only the first six predictions are taken.
y = regressor.predict(input_fn=get_input_fn(prediction_set, num_epochs=1, shuffle=False))
predictions = [p["predictions"] for p in itertools.islice(y, 6)]
print("Predictions: {}".format(str(predictions)))