In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
#os.chdir("C:/Users/jiang/Documents/kaggle")
os.chdir("../data/")
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# 1. Grabbing the features

In [3]:
df = pd.read_json("train.json")

### Engineering naive numerical features
* num photos
* num description
* year, month, day

In [4]:
# raw data
df[["photos", "features", "description", "created"]].head(5)

Unnamed: 0,photos,features,description,created
10,[https://photos.renthop.com/2/7211212_1ed4542e...,[],A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,2016-06-24 07:54:24
10000,[https://photos.renthop.com/2/7150865_be3306c5...,"[Doorman, Elevator, Fitness Center, Cats Allow...",,2016-06-12 12:19:27
100004,[https://photos.renthop.com/2/6887163_de85c427...,"[Laundry In Building, Dishwasher, Hardwood Flo...","Top Top West Village location, beautiful Pre-w...",2016-04-17 03:26:41
100007,[https://photos.renthop.com/2/6888711_6e660cee...,"[Hardwood Floors, No Fee]",Building Amenities - Garage - Garden - fitness...,2016-04-18 02:22:02
100013,[https://photos.renthop.com/2/6934781_1fa4b41a...,[Pre-War],Beautifully renovated 3 bedroom flex 4 bedroom...,2016-04-28 01:32:41


In [5]:
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day
df["created_hour"] = df["created"].dt.hour

In [6]:
num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day", "created_hour"]
X = df[num_feats]

In [7]:
target_num_map = {'high':0, 'medium':1, 'low':2}
y = np.array(df['interest_level'].apply(lambda x: target_num_map[x]))

In [10]:
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.5)
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.5)

# 2. Run simple benchmarks

In [11]:
names = ["kNN", "LR", "Random Forest", "AdaBoost", "NB"]
classifiers = [
    KNeighborsClassifier(5),
    LogisticRegression(solver="newton-cg"),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(n_estimators=100),
    GaussianNB()
]
clf_dict = dict(zip(names, classifiers))

### Model stacking

In [21]:
eval_predictions = []
test_predictions = []
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    eval_predictions += [clf.predict_proba(X_val)]
    test_predictions += [clf.predict_proba(X_test)]
    print name, log_loss(y_val, clf.predict_proba(X_val))

eval_predictions = np.array(eval_predictions)
# converts a (5, N, 3) to a (N, 5, 3) to a (N, 15)
eval_predictions = eval_predictions.swapaxes(0, 1).reshape(eval_predictions.shape[1],-1)

test_predictions = np.array(test_predictions)
# converts a (5, N, 3) to a (N, 5, 3) to a (N, 15)
test_predictions = test_predictions.swapaxes(0, 1).reshape(test_predictions.shape[1],-1)

kNN 2.14065725786
LR 0.704128637234
Random Forest 0.405973354958
AdaBoost 1.08270961728
NB 1.57984678265


In [23]:
modelStacker = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 3))
modelStacker.fit(predictions, y_val)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 3), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [26]:
[coef.shape for coef in modelStacker.coefs_]

[(15, 5), (5, 3), (3, 3)]

In [25]:
modelStacker.predict_proba(test_predictions)

array([[  8.65274880e-05,   1.28239635e-02,   9.87089509e-01],
       [  4.18385655e-02,   4.67176551e-01,   4.90984884e-01],
       [  1.06949191e-02,   2.34401853e-01,   7.54903228e-01],
       ..., 
       [  3.84277685e-02,   4.49940026e-01,   5.11632206e-01],
       [  7.14044417e-03,   1.86713175e-01,   8.06146381e-01],
       [  1.32062508e-01,   6.74516763e-01,   1.93420729e-01]])

In [27]:
print "stacked model: "
print log_loss(y_test, modelStacker.predict_proba(test_predictions))

stacked model: 
0.686095724353


In [58]:
# basically shows that price and location are the most important feats
print zip(num_feats, clf_dict["Random Forest"].feature_importances_)
print zip(num_feats, clf_dict["AdaBoost"].feature_importances_)

[('bathrooms', 0.012964994348257692), ('bedrooms', 0.046595146765329318), ('latitude', 0.12674910378669293), ('longitude', 0.12597103628167378), ('price', 0.1765267381267098), ('num_photos', 0.080776097917591369), ('num_features', 0.080974005720733711), ('num_description_words', 0.12568365095133305), ('created_year', 0.0), ('created_month', 0.036180913869593601), ('created_day', 0.10163371170065742), ('created_hour', 0.085944600531427329)]
[('bathrooms', 0.01), ('bedrooms', 0.14000000000000001), ('latitude', 0.17000000000000001), ('longitude', 0.11), ('price', 0.32000000000000001), ('num_photos', 0.059999999999999998), ('num_features', 0.059999999999999998), ('num_description_words', 0.050000000000000003), ('created_year', 0.0), ('created_month', 0.0), ('created_day', 0.01), ('created_hour', 0.070000000000000007)]


### Thoughts
* kNN and NB are both not optimized to reduce logloss
    * kNN predicts discrete values...
    * NB is not calibrated (but I am too lazy to do something like Platt's scaling atm)
* LR did well
* RD did better than LR as expected
* Boosting prob both underperformed due to overfitting?
    * note: boosting prob has a lot more potential (but I am too versed in its hyperparam tuning)