In [20]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb 
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [21]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Features and labels used to build the training ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is added to the watch
        list and training runs with ``early_stopping_rounds=20``; when
        omitted, training runs for exactly ``num_rounds`` rounds.
    feature_names : optional
        Accepted for interface compatibility; not used by this function.
    seed_val : int
        Value of the ``seed`` booster parameter.
    num_rounds : int
        Number of boosting rounds passed to ``xgb.train``.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the output of ``model.predict`` on the test
        matrix, and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': 'mlogloss',
        # Previously misspelled as 'min_child_weigh', so the setting was never
        # passed to XGBoost under its real parameter name.
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        # Labelled test data doubles as the evaluation set for early stopping.
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlst = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlst, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [22]:
# Directory holding the competition JSON files. Set the TWO_SIGMA_DATA_DIR
# environment variable to point somewhere else; the default is the original
# hard-coded location, so existing setups keep working unchanged.
data_path = os.environ.get(
    "TWO_SIGMA_DATA_DIR",
    "/Users/yangqingfeng/data/open_data/kaggle/two-sigma/",
)
# os.path.join copes with a directory given with or without a trailing slash.
train_file = os.path.join(data_path, "train.json")
test_file = os.path.join(data_path, "test.json")
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)

(49352, 15)
(74659, 14)


In [23]:
# Base listing columns used directly as model inputs.
# Later cells append further column names to this list.
features_to_use = [
    'bathrooms',
    'bedrooms',
    'latitude',
    'longitude',
    'price',
]

In [24]:
# Apply identical feature engineering to both frames so train and test
# always end up with the same derived columns.
for frame in (train_df, test_df):
    # count of photos #
    frame['num_photos'] = frame['photos'].apply(len)

    # count of "features" #
    frame['num_features'] = frame['features'].apply(len)

    # count of space-separated tokens in the description column #
    frame['num_description_words'] = frame['description'].apply(lambda x: len(x.split(" ")))

    # convert the created column to datetime so date parts can be extracted
    frame['created'] = pd.to_datetime(frame['created'])

    # year, month, day and hour of the creation timestamp #
    frame['created_year'] = frame['created'].dt.year
    frame['created_month'] = frame['created'].dt.month
    frame['created_day'] = frame['created'].dt.day
    frame['created_hour'] = frame['created'].dt.hour

# adding all these new features to use list #
derived_features = ["num_photos", "num_features", "num_description_words",
                    "created_year", "created_month", "created_day", "created_hour"]
# Only add names that are not already listed: re-running this cell must not
# duplicate entries, which would duplicate columns in the model matrix.
features_to_use.extend([name for name in derived_features if name not in features_to_use])



In [25]:
# Replace string-typed identifier/address columns with integer codes and
# register each encoded column as a model feature.
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for col in categorical:
    # Columns that are not object-typed are left as they are and not registered.
    if train_df[col].dtype != 'object':
        continue
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    # Fit on train and test values together so both frames can be transformed
    # with the same encoder.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)
    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)
    features_to_use.append(col)

In [26]:
def _join_feature_tokens(value):
    """Collapse a list of feature phrases into one space-separated string.

    Spaces inside each phrase are replaced with underscores so that a
    multi-word phrase stays a single token. A value that is already a string
    is returned unchanged, so re-running this cell does not re-process (and
    corrupt) a column that was converted on an earlier run.
    """
    if isinstance(value, str):
        return value
    return " ".join(phrase.replace(" ", "_") for phrase in value)


train_df['features'] = train_df['features'].apply(_join_feature_tokens)
test_df['features'] = test_df['features'].apply(_join_feature_tokens)
train_df['features'].head()

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object

In [28]:
# NOTE: despite its name, `tfidf` is a CountVectorizer (raw term counts),
# not a TF-IDF transformer. It is limited to 200 terms and drops English
# stop words.
tfidf = CountVectorizer(max_features=200, stop_words='english')
# The vocabulary is fitted on the training rows ...
tr_spare = tfidf.fit_transform(train_df['features'])
# ... and the test rows are encoded with that same fitted vocabulary.
te_spare = tfidf.transform(test_df['features'])

In [35]:
# Mapping from interest-level label to the integer class id used as target.
target_num_map = {'high': 0, 'medium': 1, 'low': 2}

# Place the selected DataFrame columns side by side with the vectorizer
# output, then convert to CSR so rows can be sliced by index later.
train_blocks = [train_df[features_to_use], tr_spare]
test_blocks = [test_df[features_to_use], te_spare]
train_X = sparse.hstack(train_blocks).tocsr()
test_X = sparse.hstack(test_blocks).tocsr()

train_y = np.array(
    train_df['interest_level'].apply(lambda level: target_num_map[level])
)
print(train_X.shape, test_X.shape)

(49352, 216) (74659, 216)


In [40]:
# Per-fold validation log-loss. Initialised here so the cell runs on a fresh
# kernel instead of relying on a `cv_scores` left over from an earlier session.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    dev_X, val_X = train_X[dev_index, :], train_X[val_index, :]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    # Passing val_y makes runXGB watch the validation fold and stop early.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    # Only the first of the five folds is evaluated; remove this break to
    # run the full cross-validation.
    break

[0]	train-mlogloss:1.04083	test-mlogloss:1.04189
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.989262	test-mlogloss:0.991126
[2]	train-mlogloss:0.943198	test-mlogloss:0.946199
[3]	train-mlogloss:0.906756	test-mlogloss:0.910371
[4]	train-mlogloss:0.876267	test-mlogloss:0.880702
[5]	train-mlogloss:0.84588	test-mlogloss:0.851287
[6]	train-mlogloss:0.820024	test-mlogloss:0.826144
[7]	train-mlogloss:0.796375	test-mlogloss:0.803423
[8]	train-mlogloss:0.775781	test-mlogloss:0.783616
[9]	train-mlogloss:0.757724	test-mlogloss:0.766164
[10]	train-mlogloss:0.741542	test-mlogloss:0.750651
[11]	train-mlogloss:0.728622	test-mlogloss:0.738387
[12]	train-mlogloss:0.715143	test-mlogloss:0.725583
[13]	train-mlogloss:0.70431	test-mlogloss:0.715483
[14]	train-mlogloss:0.693668	test-mlogloss:0.705751
[15]	train-mlogloss:0.684657	test-mlogloss:0.697585
[16]	train-mlogloss:0.675546	te

[155]	train-mlogloss:0.45534	test-mlogloss:0.561538
[156]	train-mlogloss:0.454548	test-mlogloss:0.561303
[157]	train-mlogloss:0.453966	test-mlogloss:0.561265
[158]	train-mlogloss:0.453185	test-mlogloss:0.561127
[159]	train-mlogloss:0.45273	test-mlogloss:0.561078
[160]	train-mlogloss:0.452184	test-mlogloss:0.560962
[161]	train-mlogloss:0.451701	test-mlogloss:0.560863
[162]	train-mlogloss:0.451049	test-mlogloss:0.56065
[163]	train-mlogloss:0.450509	test-mlogloss:0.560585
[164]	train-mlogloss:0.449969	test-mlogloss:0.560559
[165]	train-mlogloss:0.449312	test-mlogloss:0.560478
[166]	train-mlogloss:0.448838	test-mlogloss:0.560435
[167]	train-mlogloss:0.448325	test-mlogloss:0.560298
[168]	train-mlogloss:0.447602	test-mlogloss:0.560238
[169]	train-mlogloss:0.446949	test-mlogloss:0.560183
[170]	train-mlogloss:0.446123	test-mlogloss:0.560033
[171]	train-mlogloss:0.445499	test-mlogloss:0.559895
[172]	train-mlogloss:0.444682	test-mlogloss:0.55993
[173]	train-mlogloss:0.444139	test-mlogloss:0.5597

[311]	train-mlogloss:0.376064	test-mlogloss:0.552764
[312]	train-mlogloss:0.37577	test-mlogloss:0.552748
[313]	train-mlogloss:0.375404	test-mlogloss:0.552718
[314]	train-mlogloss:0.375078	test-mlogloss:0.552658
[315]	train-mlogloss:0.374569	test-mlogloss:0.552639
[316]	train-mlogloss:0.374074	test-mlogloss:0.55255
[317]	train-mlogloss:0.373552	test-mlogloss:0.55249
[318]	train-mlogloss:0.373138	test-mlogloss:0.552515
[319]	train-mlogloss:0.372849	test-mlogloss:0.552519
[320]	train-mlogloss:0.372492	test-mlogloss:0.552481
[321]	train-mlogloss:0.372004	test-mlogloss:0.552545
[322]	train-mlogloss:0.371691	test-mlogloss:0.552632
[323]	train-mlogloss:0.371354	test-mlogloss:0.552538
[324]	train-mlogloss:0.370886	test-mlogloss:0.552501
[325]	train-mlogloss:0.370575	test-mlogloss:0.552414
[326]	train-mlogloss:0.370066	test-mlogloss:0.5524
[327]	train-mlogloss:0.369841	test-mlogloss:0.552354
[328]	train-mlogloss:0.369375	test-mlogloss:0.55241
[329]	train-mlogloss:0.368993	test-mlogloss:0.552366

In [37]:
# Debug check: show the type of the column selection that is passed to
# sparse.hstack when building train_X.
type(train_df[features_to_use])

pandas.core.frame.DataFrame