In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle

import sklearn.cluster

import Levenshtein

from multiprocessing import Pool

In [2]:
# Load the pickled train / test frames.
train_df = pd.read_pickle('fin-dprep-train.pkl')
test_df = pd.read_pickle('fin-dprep-test.pkl')

# List of feature column names. The context manager closes the file handle
# instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced.
with open('fin-dprep-flist.pkl', 'rb') as flist_file:
    features_to_use = pickle.load(flist_file)

medium_price = pd.read_pickle('fin-medium-price.pkl')

# medium_price is joined through its index against each frame's listing_id
# column (presumably it supplies the predicted_price column used in the next
# cell -- confirm). The default inner join drops listings missing from it.
train_df = pd.merge(train_df, medium_price, left_on='listing_id', right_index=True)
test_df = pd.merge(test_df, medium_price, left_on='listing_id', right_index=True)

In [3]:
# Log-ratio between the listed price and the predicted price, added as a new
# column to both the train and the test frame.
for frame in (train_df, test_df):
    frame["predicted_price_diff"] = np.log(frame["price"]) - np.log(frame["predicted_price"])

In [4]:
class MeansProcessor:
    """Per-group target statistics keyed on one or more columns.

    ``fit`` records, for every group of ``key`` found in a frame, the number of
    rows and the mean / standard deviation (``np.std``, i.e. ddof=0) of the
    target column. ``predict`` writes those statistics onto another frame as
    ``<outkey>_count``, ``<outkey>_level`` and ``<outkey>_level_std``.
    """

    def __init__(self, key, outkey = None, tgt = 'interest_cat', min_count = 0):
        """
        Args:
            key: column name, or list of column names, to group by.
            outkey: prefix of the generated column names. Defaults to ``key``;
                it must be given explicitly when ``key`` is a list, because the
                prefix is concatenated with string suffixes.
            tgt: name of the numeric target column that is aggregated.
            min_count: groups with fewer rows than this get NaN mean / std
                (their count is still recorded). The default of 0 keeps every
                group, which matches the previous behaviour, whose
                ``len(group) < 0`` guard could never fire.
        """
        self.key = key
        self.outkey = key if outkey is None else outkey
        self.tgt = tgt
        self.min_count = min_count

        # Per-group lookup tables, filled by fit().
        self.count = {}
        self.means = {}
        self.std = {}
        # Mean of the target over the whole fitted frame; stored by fit() but
        # not used by predict().
        self.global_means = 0

        self.outkeys = [self.outkey + '_level', self.outkey + '_level_std']

    def fit(self, df):
        """Collect count, mean and std of the target for every group in ``df``."""
        self.global_means = df[self.tgt].mean()

        # Start from empty tables so that re-fitting on a different frame does
        # not keep groups that only existed in the previous one.
        self.count, self.means, self.std = {}, {}, {}

        for group_key, group in df.groupby(self.key, sort=False):
            n_rows = len(group)
            self.count[group_key] = n_rows

            if n_rows < self.min_count:
                self.means[group_key] = np.nan
                self.std[group_key] = np.nan
            else:
                target = group[self.tgt]
                self.means[group_key] = np.mean(target)
                self.std[group_key] = np.std(target)

    def predict(self, df):
        """Write the fitted statistics onto ``df`` (modified in place) and return it.

        Rows whose group was not seen during ``fit`` keep NaN in the level
        columns and 0 in the count column. Groups whose key equals 0 are
        skipped as well (presumably 0 marks a missing id -- confirm).
        """
        count_col = self.outkey + '_count'
        level_col = self.outkey + '_level'
        std_col = self.outkey + '_level_std'

        for col in self.outkeys:
            df[col] = np.nan
        df[count_col] = 0

        for group_key, group in df.groupby(self.key, sort=False):
            if group_key == 0:
                continue

            if group_key in self.means:
                rows = group.index
                df.loc[rows, count_col] = self.count[group_key]
                df.loc[rows, level_col] = self.means[group_key]
                df.loc[rows, std_col] = self.std[group_key]

        return df

    def get_features(self):
        """Return the names of the three columns that ``predict`` produces."""
        return self.outkeys.copy() + [self.outkey + '_count']

# The fold indices below are randomised with the same fixed seed as the
# original code, so the output of this version can be validated against it.

# Integer code for each interest level, and the encoded target for train_df.
target_num_map = dict(low=0, medium=1, high=2)
train_y = np.array(train_df['interest_level'].apply(target_num_map.__getitem__))

def proc_fold(fold):
    """Compute the grouped target statistics for a single CV fold.

    ``fold`` is a ``(train_index, test_index)`` pair of positional row indices
    into the global ``train_df``. Each MeansProcessor is fitted on the fold's
    training rows only and then applied to the fold's validation rows and to a
    copy of the global ``test_df``.

    Returns:
        ``(cv_train, cv_valid, cv_test)``: the training slice, the validation
        slice (interest_level / manager_id / building_id plus the generated
        columns) and the augmented copy of the test frame.
    """
    train_index, test_index = fold[0], fold[1]

    cv_train = train_df.iloc[train_index]
    cv_valid = train_df.iloc[test_index][['interest_level', 'manager_id', 'building_id']]
    cv_test = test_df.copy()

    # (grouping key, output-column prefix), processed in this order.
    processor_specs = (
        ('building_id', 'building_sort'),
        ('manager_id', 'manager_sort'),
        (['building_id', 'manager_id'], 'mb_comb'),
    )
    for group_key, prefix in processor_specs:
        processor = MeansProcessor(group_key, prefix)
        processor.fit(cv_train)
        cv_valid = processor.predict(cv_valid)
        cv_test = processor.predict(cv_test)

    return cv_train, cv_valid, cv_test

# Seeded, shuffled, stratified 5-fold split; the (train, validation) positional
# index pairs are materialised so they can be handed to worker processes.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
folds = [(k[0], k[1]) for k in kf.split(list(range(train_df.shape[0])), train_y)]

# Reuse the cached per-fold group features when the cache can be read;
# otherwise compute them in 5 worker processes and write the cache.
# `except Exception` (rather than a bare `except`) still falls back on a
# missing or unreadable cache but no longer swallows KeyboardInterrupt.
# NOTE: the cache is keyed by file name only -- delete it after changing the
# input data or proc_fold. pickle.load can execute arbitrary code, so only
# load a cache file you produced yourself.
try:
    with open('0420-model-groupfeatures.pkl', 'rb') as cache_file:
        rv = pickle.load(cache_file)
except Exception:
    with Pool(5) as pool:
        rv = pool.map(proc_fold, folds)

    with open('0420-model-groupfeatures.pkl', 'wb') as cache_file:
        pickle.dump(rv, cache_file)

# dummies to get feature id's
m_build = MeansProcessor('building_id', 'building_sort')
m_mgr = MeansProcessor('manager_id', 'manager_sort')
m_comb = MeansProcessor(['building_id', 'manager_id'], 'mb_comb')

group_features = m_build.get_features() + m_mgr.get_features() + m_comb.get_features()

# One augmented copy of the test frame per fold: the group features of the
# test rows as computed from that fold's training rows.
cv_test = []
for r in rv:
    cv_test.append(test_df.merge(r[2][group_features], left_index=True, right_index=True))

# Out-of-fold group features for every training row, joined back by index.
cv_allvalid = pd.concat([r[1] for r in rv])

# NOTE(review): later cells split train_df by row position with the same kf;
# that only matches the folds above if this merge keeps every row in its
# original order -- confirm.
train_df = train_df.merge(cv_allvalid[group_features], left_index=True, right_index=True)

In [5]:
# listing_id values of the development / validation rows of every fold.
fold_splits = list(kf.split(range(train_df.shape[0]), train_df.interest_cat))
train_ids = [train_df.iloc[dev_index].listing_id.values for dev_index, _ in fold_splits]
val_ids = [train_df.iloc[val_index].listing_id.values for _, val_index in fold_splits]
    

In [6]:
# Final model feature list: the saved base features, the grouped target
# statistics, and four extra columns. Removal of 'price', 'price_t' and
# 'price_per_room' is disabled, so whichever of them features_to_use contains
# stay in the list.
fl = features_to_use.copy() + group_features
fl.extend([
    'predicted_price',
    'predicted_price_diff',
    'manager_lazy_rate',
    'density_exp01',
])

In [10]:
def run3_to_stackdf(run):
    """Turn a ``run_cv`` result into listing_id-indexed frames for stacking.

    Args:
        run: ``(models, df_cv, apreds)`` where ``df_cv`` holds the out-of-fold
            predictions (columns low / medium / high / listing_id) and
            ``apreds`` is indexed as ``[fold, test_row, class]``.

    Returns:
        ``(df_allpreds3, df_fold)``: the out-of-fold train predictions together
        with the fold-averaged test predictions, sorted and indexed by
        listing_id; and a list with one listing_id-indexed frame of test
        predictions per fold.
    """
    class_cols = ['low', 'medium', 'high']

    # Take the raw values: assigning the Series itself would align test_df's
    # index against the fresh 0..n-1 index of the prediction frames and yield
    # NaN / misplaced ids whenever test_df is not indexed 0..n-1.
    test_ids = test_df.listing_id.values

    df_testpreds3 = pd.DataFrame(run[2].mean(axis=0))
    df_testpreds3.columns = class_cols
    df_testpreds3['listing_id'] = test_ids

    df_allpreds3 = (
        pd.concat([run[1][class_cols + ['listing_id']], df_testpreds3])
        .sort_values('listing_id')
        .set_index('listing_id')
    )

    df_fold = []
    for fold_preds in run[2]:
        fold_df = pd.DataFrame(fold_preds)
        fold_df['listing_id'] = test_ids
        df_fold.append(fold_df.sort_values('listing_id').set_index('listing_id'))

    return (df_allpreds3, df_fold)


In [11]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
    """Train a 3-class ``multi:softprob`` XGBoost model and predict ``test_X``.

    When ``test_y`` is given, the test set is added to the watchlist and
    training uses early stopping (50 rounds, progress printed every 10
    rounds); otherwise the model is trained for ``num_rounds`` rounds.
    ``feature_names`` is accepted but not used.

    Returns:
        ``(pred_test_y, model)``: the predictions for ``test_X`` and the
        trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.02,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    else:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=10)

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    return pred_test_y, model

In [12]:
def run_cv(train_df, cv_test, kf, features_to_use):
    """Cross-validate the 3-class XGBoost model and predict the test set per fold.

    Args:
        train_df: training frame containing ``features_to_use``,
            ``interest_level`` and ``listing_id``.
        cv_test: one test frame per fold, indexed by fold number.
        kf: splitter whose ``split(X, y)`` yields (dev, validation) positional
            index pairs.
        features_to_use: list of feature column names.

    Returns:
        ``(models, df_cv, apreds)``: the trained booster of each fold; the
        concatenated out-of-fold predictions (columns low, medium, high,
        listing_id, interest_tgt); and an array with each fold's test
        predictions stacked along the first axis.
    """
    class_cols = ["low", "medium", "high"]

    train_X = train_df[features_to_use]
    train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

    cv_preds = []
    cv_scores = []
    models = []
    test_preds = []

    # enumerate() supplies the fold number. Previously `fold` was set to 0 and
    # never advanced, so every model was scored against cv_test[0] instead of
    # the test frame that belongs to its own fold.
    for fold, (dev_index, val_index) in enumerate(kf.split(range(train_X.shape[0]), train_y)):
        dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        models.append(model)

        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)

        # Out-of-fold predictions, tagged with listing id and true class code.
        out_df = pd.DataFrame(preds)
        out_df.columns = class_cols
        out_df["listing_id"] = train_df.iloc[val_index].listing_id.values
        out_df['interest_tgt'] = val_y
        cv_preds.append(out_df)

        xgtest = xgb.DMatrix(cv_test[fold][features_to_use])
        test_preds.append(model.predict(xgtest, ntree_limit=model.best_ntree_limit))

    df_cv = pd.concat(cv_preds)
    print(log_loss(df_cv.interest_tgt, df_cv[class_cols]))

    apreds = np.array(test_preds)

    return models, df_cv, apreds

In [13]:
# Run the cross-validated 3-class model; rv3 is the (models, out-of-fold
# predictions, per-fold test predictions) tuple returned by run_cv.
rv3 = run_cv(train_df, cv_test, kf, fl) 

[0]	train-mlogloss:1.08457	test-mlogloss:1.08474
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[10]	train-mlogloss:0.96348	test-mlogloss:0.965683
[20]	train-mlogloss:0.872029	test-mlogloss:0.876469
[30]	train-mlogloss:0.801367	test-mlogloss:0.807939
[40]	train-mlogloss:0.745739	test-mlogloss:0.754607
[50]	train-mlogloss:0.701679	test-mlogloss:0.712797
[60]	train-mlogloss:0.666485	test-mlogloss:0.679807
[70]	train-mlogloss:0.637654	test-mlogloss:0.653089
[80]	train-mlogloss:0.614372	test-mlogloss:0.631793
[90]	train-mlogloss:0.595055	test-mlogloss:0.614436
[100]	train-mlogloss:0.579094	test-mlogloss:0.60046
[110]	train-mlogloss:0.565481	test-mlogloss:0.588773
[120]	train-mlogloss:0.553736	test-mlogloss:0.5789
[130]	train-mlogloss:0.543894	test-mlogloss:0.570894
[140]	train-mlogloss:0.535246	test-mlogloss:0.564114
[150]	train-mlogloss:0.527701	test-mlogloss:0.558442
[160]	train-mloglo

In [14]:
# Convert the classifier run into stacking frames and persist them. The
# context manager flushes and closes the output file deterministically.
dfs3 = run3_to_stackdf(rv3)
with open('modeloutput-xgb-clf.pkl', 'wb') as out_file:
    pickle.dump(dfs3, out_file)

In [16]:
def run_to_stackdf(run):
    """Turn a ``run_cv1`` result into listing_id-indexed frames for stacking.

    Args:
        run: ``(models, df_cv, apreds)`` where ``df_cv`` holds the out-of-fold
            predictions (columns level / listing_id) and ``apreds`` is indexed
            as ``[fold, test_row]``.

    Returns:
        ``(df_allpreds, df_fold)``: the out-of-fold train predictions together
        with the fold-averaged test predictions, sorted and indexed by
        listing_id; and a list with one listing_id-indexed frame of test
        predictions per fold.
    """
    df_testpreds = pd.DataFrame(run[2].mean(axis=0))
    df_testpreds.columns = ['level']
    # .values assigns by position; assigning the Series itself would align its
    # index against the fresh 0..n-1 index of the prediction frame and yield
    # NaN / misplaced ids whenever the test frame is not indexed 0..n-1.
    df_testpreds['listing_id'] = cv_test[0].listing_id.values

    df_allpreds = (
        pd.concat([run[1][['level', 'listing_id']], df_testpreds])
        .sort_values('listing_id')
        .set_index('listing_id')
    )

    fold_ids = test_df.listing_id.values
    df_fold = []
    for fold_preds in run[2]:
        fold_df = pd.DataFrame(fold_preds)
        fold_df['listing_id'] = fold_ids
        df_fold.append(fold_df.sort_values('listing_id').set_index('listing_id'))

    return (df_allpreds, df_fold)


In [17]:
def runXGB1(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
    """Train a ``reg:logistic`` XGBoost model (rmse metric) and predict ``test_X``.

    ``base_score`` is set to the mean of ``train_y``. When ``test_y`` is given,
    the test set is added to the watchlist and training uses early stopping
    (50 rounds, progress printed every 10 rounds); otherwise the model is
    trained for ``num_rounds`` rounds. ``feature_names`` is accepted but not
    used.

    Returns:
        ``(pred_test_y, model)``: the predictions for ``test_X`` and the
        trained booster.
    """
    param = {
        'objective': 'reg:logistic',
        'eta': 0.02,
        'max_depth': 6,
        'silent': 1,
        'num_class': 1,
        'eval_metric': "rmse",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
        'base_score': train_y.mean(),
    }
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    else:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=10)

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    return pred_test_y, model

In [18]:
# Regression target used for 'medium' listings; 'low' maps to 0 and 'high' to 1.
# NOTE(review): the rationale behind (.5 + 9/13) / 2 (~0.596) is not visible
# here -- confirm.
medium_regression_tgt = (.5 + (9/13)) / 2

def run_cv1(train_df, cv_test, kf, features_to_use):
    """Cross-validate the regression XGBoost model and predict the test set per fold.

    The three interest levels are mapped to the regression targets 0,
    ``medium_regression_tgt`` and 1.

    Args:
        train_df: training frame containing ``features_to_use``,
            ``interest_level`` and ``listing_id``.
        cv_test: one test frame per fold, indexed by fold number.
        kf: splitter whose ``split(X, y)`` yields (dev, validation) positional
            index pairs.
        features_to_use: list of feature column names.

    Returns:
        ``(models, df_cv, apreds)``: the trained booster of each fold; the
        concatenated out-of-fold predictions (columns level, listing_id,
        interest_tgt); and an array with each fold's test predictions stacked
        along the first axis.
    """
    train_X = train_df[features_to_use]
    train_y3 = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

    train_y = np.zeros_like(train_y3, dtype=np.float32)
    train_y[train_y3 == 1] = medium_regression_tgt
    train_y[train_y3 == 2] = 1

    cv_preds = []
    cv_scores = []
    models = []
    test_preds = []

    # Stratify on the integer class codes, not the float regression target:
    # both describe the same three classes, but a stratified splitter may
    # reject non-integer floats as a continuous target.
    # enumerate() supplies the fold number. Previously `fold` was set to 0 and
    # never advanced, so every model was scored against cv_test[0].
    for fold, (dev_index, val_index) in enumerate(kf.split(range(train_X.shape[0]), train_y3)):
        dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB1(dev_X, dev_y, val_X, val_y)
        models.append(model)

        cv_scores.append(model.best_score)
        print(cv_scores)

        # Out-of-fold predictions, tagged with listing id and regression target.
        out_df = pd.DataFrame(preds)
        out_df.columns = ["level"]
        out_df["listing_id"] = train_df.iloc[val_index].listing_id.values
        out_df['interest_tgt'] = val_y
        cv_preds.append(out_df)

        xgtest = xgb.DMatrix(cv_test[fold][features_to_use])
        test_preds.append(model.predict(xgtest, ntree_limit=model.best_ntree_limit))

    df_cv = pd.concat(cv_preds)
    # Overall out-of-fold RMSE.
    print(np.sqrt(sklearn.metrics.mean_squared_error(df_cv.interest_tgt, df_cv.level)))

    apreds = np.array(test_preds)

    return models, df_cv, apreds

In [19]:
# Run the cross-validated regression model; rv1 is the (models, out-of-fold
# predictions, per-fold test predictions) tuple returned by run_cv1.
rv1 = run_cv1(train_df, cv_test, kf, fl) 

[0]	train-rmse:0.334483	test-rmse:0.334523
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.313981	test-rmse:0.314755
[20]	train-rmse:0.298413	test-rmse:0.299804
[30]	train-rmse:0.286126	test-rmse:0.288265
[40]	train-rmse:0.27618	test-rmse:0.278983
[50]	train-rmse:0.268765	test-rmse:0.272163
[60]	train-rmse:0.263497	test-rmse:0.267494
[70]	train-rmse:0.258939	test-rmse:0.263398
[80]	train-rmse:0.255294	test-rmse:0.260178
[90]	train-rmse:0.252345	test-rmse:0.257657
[100]	train-rmse:0.249963	test-rmse:0.255729
[110]	train-rmse:0.247915	test-rmse:0.254029
[120]	train-rmse:0.246146	test-rmse:0.252676
[130]	train-rmse:0.244603	test-rmse:0.251512
[140]	train-rmse:0.243268	test-rmse:0.250504
[150]	train-rmse:0.24198	test-rmse:0.249626
[160]	train-rmse:0.240862	test-rmse:0.248845
[170]	train-rmse:0.239945	test-rmse:0.24829
[180]	train-rmse:0.239108	test-rmse:0.247768
[190]	train-rmse

In [20]:
# Convert the regression run into stacking frames and persist them. The
# context manager flushes and closes the output file deterministically.
dfs1 = run_to_stackdf(rv1)
with open('modeloutput-xgb-reg.pkl', 'wb') as out_file:
    pickle.dump(dfs1, out_file)