In [35]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
#os.chdir("C:/Users/jiang/Documents/kaggle")
# os.chdir("../data/")
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# 1. Grabbing the features

In [12]:
df = pd.read_json("train.json")

### Engineering naive numerical features
* num photos
* num description
* year, month, day

In [22]:
# raw data
df[["photos", "features", "description", "created"]].head(5)

Unnamed: 0,photos,features,description,created
10,[https://photos.renthop.com/2/7211212_1ed4542e...,[],A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,2016-06-24 07:54:24
10000,[https://photos.renthop.com/2/7150865_be3306c5...,"[Doorman, Elevator, Fitness Center, Cats Allow...",,2016-06-12 12:19:27
100004,[https://photos.renthop.com/2/6887163_de85c427...,"[Laundry In Building, Dishwasher, Hardwood Flo...","Top Top West Village location, beautiful Pre-w...",2016-04-17 03:26:41
100007,[https://photos.renthop.com/2/6888711_6e660cee...,"[Hardwood Floors, No Fee]",Building Amenities - Garage - Garden - fitness...,2016-04-18 02:22:02
100013,[https://photos.renthop.com/2/6934781_1fa4b41a...,[Pre-War],Beautifully renovated 3 bedroom flex 4 bedroom...,2016-04-28 01:32:41


In [28]:
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day
df["created_hour"] = df["created"].dt.hour

In [29]:
num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day", "created_hour"]
X = df[num_feats]

In [31]:
target_num_map = {'high':0, 'medium':1, 'low':2}
y = np.array(df['interest_level'].apply(lambda x: target_num_map[x]))

In [32]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

# 2. Run simple benchmarks

In [50]:
names = ["kNN", "LR", "Random Forest", "AdaBoost", "NB"]
classifiers = [
    KNeighborsClassifier(5),
    LogisticRegression(solver="newton-cg"),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(n_estimators=100),
    GaussianNB()
]
clf_dict = dict(zip(names, classifiers))

In [51]:
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    print name, log_loss(y_val, clf.predict_proba(X_val))

kNN 3.74806197286
LR 0.709856631146
Random Forest 0.654073964075
AdaBoost 1.08357345798
NB 2.04172059919


In [58]:
# basically shows that price and location are the most important feats
print zip(num_feats, clf_dict["Random Forest"].feature_importances_)
print zip(num_feats, clf_dict["AdaBoost"].feature_importances_)

[('bathrooms', 0.012964994348257692), ('bedrooms', 0.046595146765329318), ('latitude', 0.12674910378669293), ('longitude', 0.12597103628167378), ('price', 0.1765267381267098), ('num_photos', 0.080776097917591369), ('num_features', 0.080974005720733711), ('num_description_words', 0.12568365095133305), ('created_year', 0.0), ('created_month', 0.036180913869593601), ('created_day', 0.10163371170065742), ('created_hour', 0.085944600531427329)]
[('bathrooms', 0.01), ('bedrooms', 0.14000000000000001), ('latitude', 0.17000000000000001), ('longitude', 0.11), ('price', 0.32000000000000001), ('num_photos', 0.059999999999999998), ('num_features', 0.059999999999999998), ('num_description_words', 0.050000000000000003), ('created_year', 0.0), ('created_month', 0.0), ('created_day', 0.01), ('created_hour', 0.070000000000000007)]


### Thoughts
* kNN and NB are both not optimized to reduce logloss
    * kNN predicts discrete values...
    * NB is not calibrated (but I am too lazy to do something like Platt's scaling atm)
* LR did well
* RD did better than LR as expected
* Boosting prob both underperformed due to overfitting?
    * note: boosting prob has a lot more potential (but I am too versed in its hyperparam tuning)