In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error as mse
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
import pickle

# Seaborn styling applied to every figure drawn after this cell.
sns.set(context="notebook", style="darkgrid")

# Seed NumPy's global random number generator with a fixed value.
rand_seed = 1009
np.random.seed(rand_seed)

# Presumably the default figure width / height; not used in the cells
# visible here -- TODO confirm where these are consumed.
xsize = 12.0
ysize = 8.0

# Install a blanket filter that silences every warning from here on.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Feature tables for the train and test splits. index_col=False tells pandas
# not to use the first CSV column as the index.
X_train, X_test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ("../data/X_train.csv", "../data/X_test.csv")
)

# Training targets, read as a plain numeric array.
y_train = np.genfromtxt("../data/y_train.txt")

# Row identifiers for both splits; genfromtxt parses them as floats, so they
# are cast to integers afterwards.
train_id, test_id = (
    np.genfromtxt(id_path).astype(int)
    for id_path in ("../data/train_id.txt", "../data/test_id.txt")
)

In [3]:
def log_rmse(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred``.

    Negative predictions are clipped to 0.0 before scoring. The clipping is
    done on a copy, so the caller's ``y_pred`` is left untouched, and plain
    sequences (lists, tuples) are accepted as well as NumPy arrays.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        Square root of ``mse(y_true, clipped_predictions)``, where ``mse`` is
        the squared-log-error metric imported at the top of the file.
    """
    # np.maximum builds a new array. In-place masking would overwrite the
    # caller's predictions as a hidden side effect of computing a score.
    clipped = np.maximum(np.asarray(y_pred), 0.0)
    return np.sqrt(mse(y_true, clipped))

# Wrap log_rmse as a scikit-learn scorer. greater_is_better=False marks it as
# a loss, so scores produced through this scorer are the negated error.
log_rmse_scorer = make_scorer(score_func=log_rmse, greater_is_better=False)

In [4]:
# Baseline: k-nearest-neighbours regressor with k=5, uniform weights and the
# Minkowski metric with p=2, evaluated with 4-fold cross-validation.
kneighbor_reg = KNeighborsRegressor(n_neighbors=5, weights="uniform", algorithm="auto",
                                    leaf_size=30, p=2, metric="minkowski",
                                    metric_params=None, n_jobs=None)
# error_score="raise" propagates any fitting error. It replaces the
# transitional "raise-deprecating" value, which later scikit-learn releases
# no longer accept.
scores = cross_val_score(kneighbor_reg, X_train, y_train, groups=None, scoring=log_rmse_scorer, cv=4,
                         n_jobs=None, verbose=0, pre_dispatch="2*n_jobs", error_score="raise")
# The scorer yields negated errors, so flip the sign before averaging.
print("RMSE: " + "%.5f" % np.mean(-scores))

RMSE: 0.21111


In [11]:
# Base estimator for the search; n_neighbors and weights are supplied by the
# parameter grid below.
kneighbor_reg = KNeighborsRegressor(algorithm="auto", leaf_size=30, metric="minkowski",
                                    metric_params=None, n_jobs=None, p=2)

# 8 neighbour counts (2..9) x 2 weighting schemes = 16 candidates.
param_grid = {
    "n_neighbors": np.arange(2, 10),
    "weights": ["uniform", "distance"],
}

# The `iid` argument is intentionally not passed: "warn" was its default, and
# the parameter was later removed from GridSearchCV, so passing it raises a
# TypeError on current scikit-learn releases.
# error_score=-100.0 gives a failed fit a score far below any real (negated)
# error, so a failing candidate cannot be selected as the best one.
clf = GridSearchCV(kneighbor_reg, param_grid, scoring=log_rmse_scorer, n_jobs=1, refit=True,
                   cv=4, verbose=1, pre_dispatch="2*n_jobs", error_score=-100.0, return_train_score=True)
clf.fit(X_train, y_train)

Fitting 4 folds for each of 16 candidates, totalling 64 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed:   15.7s finished


GridSearchCV(cv=4, error_score=-100.0,
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'n_neighbors': array([2, 3, 4, 5, 6, 7, 8, 9]), 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(log_rmse, greater_is_better=False), verbose=1)

In [12]:
# Show the parameter combination the grid search selected as best.
clf.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

In [13]:
# Error of the refitted best model on the data it was trained on.
# NOTE: when the search selects weights="distance", every training row is its
# own zero-distance neighbour, so this figure is ~0 by construction and says
# nothing about generalisation.
y_pred = clf.predict(X_train)
print("RMSE: " + "%.5f" % log_rmse(y_train, y_pred))

# Mean cross-validated error of the selected candidate -- the honest estimate.
# best_score_ is in scorer units (negated error), hence the sign flip.
print("CV RMSE: " + "%.5f" % -clf.best_score_)

RMSE: 0.00000


In [14]:
# Persist the refitted best estimator. The context manager closes the file
# even if pickling raises, which the manual open()/close() pair did not.
with open("../models/k_neighbors_regression.pkl", "wb") as file:
    pickle.dump(clf.best_estimator_, file)

In [15]:
# Pair each test identifier with the tuned model's prediction for that row
# and write the result as a two-column CSV without the DataFrame index.
submission_df = pd.DataFrame(
    {
        "Id": test_id,
        "SalePrice": clf.predict(X_test),
    }
)
submission_df.to_csv(
    "../submissions/k_neighbors_model2_26_12_2018.csv",
    index=False,
)