In [1]:
# Load deps

# general-purpose imports
import feather # may need to put this in your .bashrc: export MACOSX_DEPLOYMENT_TARGET=10.10
import numpy as np
import os
import pandas as pd
import requests
import matplotlib.pyplot as plt

# ML deps
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Imputer

# Image processing deps
from PIL import Image
from StringIO import StringIO

%matplotlib inline

# Control those annoying warnings
import warnings
warnings.filterwarnings('ignore')

# 1. Read in Data

In [2]:
# Training listings are read from a JSON file, path relative to the working directory
TRAIN_JSON_PATH = "train.json"
df = pd.read_json(TRAIN_JSON_PATH)

# 2. Split into training and testing

In [3]:
# Seed numpy's global RNG. train_test_split is called without random_state
# below, so it draws from this global state and the split is repeatable.
np.random.seed(12345)

# Grab target var, encoded as an integer class label
target_num_map = {'high':0, 'medium':1, 'low':2}
y = np.array(df['interest_level'].apply(lambda x: target_num_map[x]))

# Train-validation split.
# DataFrame.drop returns a new frame instead of modifying df, so the result
# must be captured; otherwise the target column stays among the features.
features_df = df.drop('interest_level', axis=1)
X_train, X_val, y_train, y_val = train_test_split(features_df, y, test_size=0.30)

# 3. Preprocessing and data cleaning

In [4]:
# Define cleaning steps
def clean_data(x_df):
    """
    Placeholder for data cleaning: no cleaning steps are
    implemented yet, so the input is handed back unchanged.
    """
    return x_df

# Clean input data

# 4. General-Purpose feature engineering

In [5]:
def add_features(some_df, train_df):
    """
    Add general-purpose engineered features to a frame of listings.

    Parameters
    ----------
    some_df : pandas.DataFrame
        Listings to featurize. Must contain the columns 'bedrooms',
        'bathrooms', 'photos', 'features', 'description', 'created'
        and 'price'. It is not modified; a copy is featurized.
    train_df : pandas.DataFrame
        Frame whose 'price' column is the reference distribution for
        the price percentile (may be the same frame as some_df).

    Returns
    -------
    pandas.DataFrame
        Copy of some_df with 'created' parsed to datetimes and the
        columns 'bedrooms_cens', 'bathrooms_cens', 'num_photos',
        'num_features', 'num_description_words', 'created_day' and
        'price_perc' added.
    """
    # Work on a copy so the caller's frame (often a slice produced by
    # train_test_split) is never written to.
    out_df = some_df.copy()

    # Censor bedrooms: every value above 4 is replaced by 5
    out_df['bedrooms_cens'] = out_df['bedrooms']
    out_df.loc[out_df.bedrooms_cens > 4, 'bedrooms_cens'] = 5

    # Censor bathrooms the same way
    out_df['bathrooms_cens'] = out_df['bathrooms']
    out_df.loc[out_df.bathrooms_cens > 4, 'bathrooms_cens'] = 5

    # Count number of photos
    out_df["num_photos"] = out_df["photos"].apply(len)

    # Count number of features provided in listing
    out_df["num_features"] = out_df["features"].apply(len)

    # Count of words in description. Splits on single spaces, so an empty
    # description counts as 1 and consecutive spaces add empty "words".
    out_df["num_description_words"] = out_df["description"].apply(lambda x: len(x.split(" ")))

    # Day of the month
    out_df["created"] = pd.to_datetime(out_df["created"])
    out_df["created_day"] = out_df["created"].dt.day

    # Price as percentile: fraction of training prices that are <= this price.
    # Sorting the training prices once and binary-searching each price gives
    # the same counts as re-filtering train_df for every row, without the
    # quadratic cost.
    print("Creating price percentile feature...")
    n_rows = len(train_df["price"])
    train_prices = np.sort(train_df["price"].dropna().values)
    prices = out_df["price"].values
    n_at_or_below = np.searchsorted(train_prices, prices, side="right")
    # A missing price compares False against everything, i.e. a count of 0
    n_at_or_below = np.where(pd.isnull(prices), 0, n_at_or_below)
    out_df["price_perc"] = n_at_or_below / float(n_rows)

    return out_df

In [6]:
def get_image_features(photo_url_list, timeout=10):
    """
    Create one row of features for a collection of
    images.

    Every URL is downloaded and decoded as an RGB image, per-image
    statistics are computed, and each statistic is then averaged
    across the images.

    Parameters
    ----------
    photo_url_list : list of str
        URLs of the photos belonging to one listing.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up, so a
        stalled download cannot hang the whole run.

    Returns
    -------
    dict
        Keys 'mean_pixel_val', 'mean_red', 'mean_green', 'mean_blue',
        'std_red', 'std_green', 'std_blue' and 'img_resolution'. All
        values are NaN when photo_url_list is empty.

    Raises
    ------
    requests.exceptions.RequestException
        If a download times out, fails, or returns an HTTP error status.
    """
    feature_names = ['mean_pixel_val',
                     'mean_red', 'mean_green', 'mean_blue',
                     'std_red', 'std_green', 'std_blue',
                     'img_resolution']

    # No photos: every feature is missing
    if len(photo_url_list) == 0:
        return {name: float('nan') for name in feature_names}

    # One collector list per feature, filled with one value per image
    per_image = {name: [] for name in feature_names}

    # Process every photo in the album
    for url in photo_url_list:

        # Get photo (http://stackoverflow.com/questions/7391945/how-do-i-read-image-data-from-a-url-in-python)
        response = requests.get(url, timeout=timeout)
        # Fail on HTTP errors here rather than on an opaque image-decode error
        response.raise_for_status()

        # Force three channels so grayscale / palette / RGBA images can be
        # indexed by channel the same way as plain RGB ones.
        img = np.array(Image.open(StringIO(response.content)).convert('RGB'))

        # Mean pixel value (RGB scale)
        per_image['mean_pixel_val'].append(img.mean())

        # Mean and standard deviation by channel. The array is
        # (height, width, channel), so the channel is the LAST axis;
        # img[:, k] would select pixel column k instead.
        for channel_idx, channel in enumerate(['red', 'green', 'blue']):
            channel_vals = img[:, :, channel_idx]
            per_image['mean_' + channel].append(channel_vals.mean())
            per_image['std_' + channel].append(channel_vals.std())

        # resolution: number of array elements (height * width * channels)
        per_image['img_resolution'].append(img.size)

    # Summarize: average each per-image statistic over the album
    return {name: np.mean(np.array(vals)) for name, vals in per_image.items()}

In [7]:
# Names of the per-listing image features returned by get_image_features
img_feature_names = ['mean_pixel_val',
                     'mean_red', 'mean_green', 'mean_blue',
                     'std_red', 'std_green', 'std_blue',
                     'img_resolution']


def build_image_feature_frame(listings_df):
    """
    Compute image features for every listing in listings_df.

    Returns a DataFrame with one column per image feature that shares
    listings_df's index, so it can be concatenated column-wise onto
    listings_df without rows being realigned.
    """
    feature_dicts = listings_df['photos'].map(get_image_features)
    return pd.DataFrame(list(feature_dicts),
                        columns=img_feature_names,
                        index=listings_df.index)


# Features cached on disk
if os.path.isfile('x_train.feather') and os.path.isfile('x_val.feather'):
    X_train = feather.read_dataframe('x_train.feather')
    X_val = feather.read_dataframe('x_val.feather')
else:
    # Add these features to training and validation data
    X_train = add_features(X_train, X_train)
    X_val = add_features(X_val, X_train)

    # Add image features. X_train / X_val keep the shuffled index from the
    # train/validation split, and concat(axis=1) aligns on index, so the
    # image frames must carry that same index (see build_image_feature_frame).
    X_train = pd.concat([X_train, build_image_feature_frame(X_train)], axis = 1)
    X_val = pd.concat([X_val, build_image_feature_frame(X_val)], axis = 1)

    # Just keep the features that are serializable to feather (and usable in modeling!)
    # The image features are numeric, so they are kept too; otherwise the
    # downloads above would be thrown away before caching.
    feats_to_keep = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
                     "num_photos", "num_features", "num_description_words",
                     "price_perc", "bathrooms_cens", "bedrooms_cens"] + img_feature_names
    # Reset to a default index: the frames read back from the cache have one,
    # so both branches now yield the same shape of result. y_train / y_val are
    # positional numpy arrays, so row order is all that matters downstream.
    X_train = X_train[feats_to_keep].reset_index(drop=True)
    X_val = X_val[feats_to_keep].reset_index(drop=True)

    # Cache to disk
    feather.write_dataframe(X_train, 'x_train.feather')
    feather.write_dataframe(X_val, 'x_val.feather')

Creating price percentile feature...
Creating price percentile feature...


KeyboardInterrupt: 

# 4b. Image Features

In [None]:
# Try the image features out on the first five listings
smpl_df = df.head(5)

img_features = smpl_df['photos'].map(get_image_features)

# One row per sampled listing, one column per image feature.
# Built from the feature dicts directly instead of one comprehension per column.
sample_img_feature_names = ['mean_pixel_val',
                            'mean_red', 'mean_green', 'mean_blue',
                            'std_red', 'std_green', 'std_blue',
                            'img_resolution']
img_df = pd.DataFrame(list(img_features), columns=sample_img_feature_names)

# Same values under "2"-suffixed column names. Every column gets the suffix
# (previously 'std_blue' and 'img_resolution' were left unsuffixed, so they
# clashed with img_df's columns).
img_df2 = img_df.add_suffix('2')

In [None]:
# Numeric columns handed to the classifiers, in the order they are fed in
num_feats = [
    "bathrooms", "bedrooms", "latitude", "longitude", "price",
    "num_photos", "num_features", "num_description_words",
    "price_perc", "bathrooms_cens", "bedrooms_cens",
]

# Restrict both splits to exactly those columns
X_train = X_train.loc[:, num_feats]
X_val = X_val.loc[:, num_feats]

# 5. Fit Classifiers

In [None]:
# Candidate models, each paired with the label used when reporting results.
# The three random forests differ only in max_depth.
labelled_models = [
    ("kNN", KNeighborsClassifier(n_neighbors=5)),
    ("LR", LogisticRegression(penalty='l2', solver="newton-cg")),
    ("Random Forest (1)", RandomForestClassifier(max_depth=2, n_estimators=100)),
    ("Random Forest (2)", RandomForestClassifier(max_depth=3, n_estimators=100)),
    ("Random Forest (3)", RandomForestClassifier(max_depth=5, n_estimators=100)),
    ("AdaBoost", AdaBoostClassifier(n_estimators=100)),
    ("NB", GaussianNB()),
]

# Parallel lists plus a label -> model lookup, all derived from the pairs above
names, classifiers = map(list, zip(*labelled_models))
clf_dict = dict(labelled_models)

In [None]:
# Fit each candidate on the training split and report its validation log loss
for label, model in zip(names, classifiers):
    model.fit(X_train, y_train)
    val_probs = model.predict_proba(X_val)
    val_loss = log_loss(y_val, val_probs)
    print("%s %s" % (label, val_loss))

In [None]:
# basically shows that price and location are the most important feats
# clf_dict has no "Random Forest" key (the forests are registered as
# "Random Forest (1)" .. "Random Forest (3)"), so that lookup raised KeyError.
# Walk the registered names instead and report every fitted model that
# exposes feature importances, paired with the feature names.
for name in names:
    clf = clf_dict[name]
    if hasattr(clf, "feature_importances_"):
        print("%s %s" % (name, list(zip(num_feats, clf.feature_importances_))))

### Thoughts
* kNN and NB are both not optimized to reduce logloss
    * kNN predicts discrete values...
    * NB is not calibrated (but I am too lazy to do something like Platt's scaling atm)
* LR did well
* RF did better than LR as expected
* Boosting probably underperformed due to overfitting?
    * note: boosting probably has a lot more potential (but I am not well versed in its hyperparameter tuning)