In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import pickle
from dask.diagnostics import ProgressBar
import xgboost as xgb
import dask_xgboost as dxgb
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

In [2]:
import os
from dask.distributed import Client, progress
# Connect to the dask scheduler whose address is read from the
# DISTRIBUTED_ADDRESS environment variable.
# NOTE(review): when the variable is unset this passes None to Client --
# confirm that the resulting fallback is what is wanted here.
scheduler_address = os.environ.get("DISTRIBUTED_ADDRESS")
client = Client(scheduler_address)

In [3]:
# Show up to 100 columns when a DataFrame is rendered, so wide frames are not truncated.
pd.options.display.max_columns = 100

In [4]:
# Load the cleaned input frame from its pickle file.
# NOTE(review): unpickling executes arbitrary code -- only load this file if
# its origin is trusted.
CLEANED_PICKLE_PATH = '/appl/dirac/projects/attrsrch/greenfield/3quarter_cleaned.pkl'
df = pd.read_pickle(CLEANED_PICKLE_PATH)

In [5]:
# target0: three-way label from the difference prdw_appe - uad_appe.
#   0 -> difference within +/-0.1 (also the value left in place when the
#        comparison is False for both branches below)
#   1 -> difference above 0.1
#   2 -> difference below -0.1
appe_gap = df['prdw_appe'] - df['uad_appe']
df['target0'] = 0
df.loc[appe_gap > 0.1, 'target0'] = 1
df.loc[appe_gap < -0.1, 'target0'] = 2

In [6]:
# target1: three-way label comparing how far each ppe sits from its market ppe.
#   0 -> the two absolute distances differ by at most 0.1 (default)
#   1 -> |prdw_ppe - market_prdw_ppe| exceeds |uad_ppe - market_uad_ppe| by more than 0.1
#   2 -> it falls short of it by more than 0.1
prdw_market_dist = abs(df['prdw_ppe'] - df['market_prdw_ppe'])
uad_market_dist = abs(df['uad_ppe'] - df['market_uad_ppe'])
ppe_dist_gap = prdw_market_dist - uad_market_dist
df['target1'] = 0
df.loc[ppe_dist_gap > 0.1, 'target1'] = 1
df.loc[ppe_dist_gap < -0.1, 'target1'] = 2

In [7]:
# target2: same three-way scheme as the other class targets, but built from
# the appe columns measured against the absolute market ppe values.
#   0 -> the two absolute distances differ by at most 0.1 (default)
#   1 -> the prdw distance exceeds the uad distance by more than 0.1
#   2 -> the prdw distance falls short of the uad distance by more than 0.1
prdw_appe_dist = abs(df['prdw_appe'] - abs(df['market_prdw_ppe']))
uad_appe_dist = abs(df['uad_appe'] - abs(df['market_uad_ppe']))
appe_dist_gap = prdw_appe_dist - uad_appe_dist
df['target2'] = 0
df.loc[appe_dist_gap > 0.1, 'target2'] = 1
df.loc[appe_dist_gap < -0.1, 'target2'] = 2

In [8]:
# target3: continuous target -- the signed gap between the prdw and uad
# distances from the absolute market ppe (the same quantity target2 buckets).
prdw_dist_to_market = abs(df['prdw_appe'] - abs(df['market_prdw_ppe']))
uad_dist_to_market = abs(df['uad_appe'] - abs(df['market_uad_ppe']))
df['target3'] = prdw_dist_to_market - uad_dist_to_market

In [9]:
# Difference features: each new column is (left column) - (right column).
# Columns are added in the order listed here.
diff_specs = {
    'year_diff': ('tax_year', 'uad_year'),
    'yr_built_diff': ('yr_built_prdw', 'yr_built_uad'),
    'age_diff': ('age_prdw', 'age_uad'),
    'sf_diff': ('sf_prdw', 'sf_uad'),
    'lot_diff': ('lot_prdw', 'lot_uad'),
    'bath_diff': ('bath_prdw', 'bath_uad'),
}
for diff_col, (left_col, right_col) in diff_specs.items():
    df[diff_col] = df[left_col] - df[right_col]

In [10]:
# Print the frame summary (row count, per-column non-null counts and dtypes)
# to check the columns added above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522548 entries, 0 to 522547
Data columns (total 71 columns):
prop_id                     522548 non-null float32
zip2                        522548 non-null object
address2                    522548 non-null object
state2                      522548 non-null object
origval1                    522548 non-null float32
origdte1                    522548 non-null datetime64[ns]
refiflg1                    392551 non-null object
tract_id                    522548 non-null object
agency1                     522548 non-null object
condo                       522548 non-null float32
best_drve_vld_adr_typ_cd    522548 non-null object
MSA                         522548 non-null float32
MSANAME                     522548 non-null object
CNTY                        522548 non-null float32
uad                         522548 non-null float32
prdw                        522548 non-null float32
mtv                         439972 non-null float32
avm   

In [20]:
# Wrap the pandas frame as a single-partition dask DataFrame, persist it,
# and preview the first rows.
ddf = dd.from_pandas(df, npartitions=1)
ddf = ddf.persist()
ddf.head()

Unnamed: 0,prop_id,zip2,address2,state2,origval1,origdte1,refiflg1,tract_id,agency1,condo,best_drve_vld_adr_typ_cd,MSA,MSANAME,CNTY,uad,prdw,mtv,avm,mtv_n,mtv_origval1,mtv_origval2,mtv_origdat1,mtv_origdat2,mtv_refi1,mtv_refi2,code_prdw,sf_prdw,lot_prdw,bath_prdw,yr_built_prdw,rooms_prdw,garage_prdw,part_bath_prdw,bsmt_prdw,tax_year,age_prdw,sf_uad,bath_uad,lot_uad,bdrm_uad,rooms_uad,bsmt_uad,bsmf_uad,yr_built_uad,age_uad,uad_bookdat,garage_uad,cond_uad,qlty_uad,lctn_uad,vew_uad,uad_year,mkt_prdw,cnty,mkt_uad,prdw_ppe,uad_ppe,prdw_appe,uad_appe,market_prdw_ppe,market_uad_ppe,target0,target1,target2,target3,year_diff,yr_built_diff,age_diff,sf_diff,lot_diff,bath_diff
0,1783271.0,1001,108 RIVER RD,MA,230000.0,2018-07-31,N,25013813208,1,0.0,1,44140.0,"Springfield, MA",25013.0,229817.84375,227090.125,218544.109375,223958.671875,2.0,190000.0,224359.0,2015-05-26,2006-06-23,R,N,newchar4,1072.0,18971.0,1.0,1955.0,5.0,,1.0,,2017.0,64.0,1070.0,1.0,19166.400391,3.0,5.0,1070.0,860.0,1955.0,64.0,2018-07-02,1.0,3.0,4.0,1.0,8.0,2018,25013,25013.0,25013,-0.012652,-0.000792,0.012652,0.000792,0.005645,0.01895,0,0,0,-0.011151,-1.0,0.0,0.0,2.0,-195.400391,0.0
1,1913909.0,1001,116 HUNTERS GREENE CIR,MA,371500.0,2018-08-31,N,25013813205,10,0.0,1,44140.0,"Springfield, MA",25013.0,386818.28125,404168.375,326432.78125,370189.75,2.0,310000.0,243042.0,2012-02-28,2004-10-18,,R,newchar4,2594.0,43369.0,2.0,1990.0,8.0,,1.0,,2017.0,29.0,2610.0,2.5,43560.0,4.0,8.0,1490.0,0.0,1990.0,29.0,2018-08-16,2.0,3.0,3.0,1.0,8.0,2018,25013,25013.0,25013,0.087936,0.041234,0.087936,0.041234,0.005645,0.01895,0,0,0,0.060008,-1.0,0.0,0.0,-16.0,-191.0,-0.5
2,1155968.0,1001,16 VERNON ST,MA,210000.0,2018-07-30,N,25013813207,1,0.0,1,44140.0,"Springfield, MA",25013.0,192756.453125,204937.859375,177887.96875,188299.828125,1.0,151000.0,,2011-12-28,NaT,,,newchar4,960.0,17550.0,1.0,1960.0,4.0,,,,2017.0,59.0,960.0,1.0,17424.0,2.0,4.0,960.0,0.0,1960.0,59.0,2018-07-03,1.0,3.0,4.0,1.0,8.0,2018,25013,25013.0,25013,-0.024105,-0.082112,0.024105,0.082112,0.005645,0.01895,0,0,0,-0.044701,-1.0,0.0,0.0,0.0,126.0,0.0
3,1783440.0,1001,24 STEVENSON LN,MA,458000.0,2018-08-10,N,25013813209,2,0.0,1,44140.0,"Springfield, MA",25013.0,443239.03125,492219.0,540173.9375,480115.625,1.0,506950.0,,2008-07-23,NaT,,,newchar4,2955.0,25673.0,3.0,2008.0,7.0,,,,2017.0,11.0,2950.0,3.0,25700.400391,4.0,8.0,2080.0,0.0,2008.0,11.0,2016-05-22,2.0,3.0,3.0,1.0,8.0,2016,25013,25013.0,25013,0.074714,-0.032229,0.074714,0.032229,0.005645,0.01895,0,0,0,0.05579,1.0,0.0,0.0,5.0,-27.400391,0.0
4,1929405.0,1001,25 DEER RUN RD,MA,365000.0,2018-07-27,,25013813205,4,0.0,1,44140.0,"Springfield, MA",25013.0,314359.90625,352584.46875,374149.0625,335040.9375,2.0,260000.0,189000.0,2002-09-06,1997-08-29,,N,newchar4,2088.0,21261.0,2.0,1989.0,6.0,,1.0,,2017.0,30.0,1920.0,2.5,21344.400391,3.0,6.0,1150.0,810.0,1989.0,30.0,2018-07-10,2.0,4.0,4.0,1.0,8.0,2018,25013,25013.0,25013,-0.034015,-0.13874,0.034015,0.13874,0.005645,0.01895,2,2,0,-0.091419,-1.0,0.0,0.0,168.0,-83.400391,-0.5


In [21]:
# Feature columns used for modelling. 'tax_year' and 'year_diff' are
# intentionally left out (they were commented out of the selection).
cols = [
    # prdw attributes
    'yr_built_prdw', 'sf_prdw', 'lot_prdw', 'bath_prdw',
    # uad attributes
    'uad_year', 'yr_built_uad', 'sf_uad', 'lot_uad', 'bdrm_uad', 'bath_uad',
    # prdw - uad differences
    'yr_built_diff', 'sf_diff', 'lot_diff', 'bath_diff',
]

# Feature matrix and the four candidate target series, all views on ddf.
X = ddf[cols]
Y0, Y1, Y2, Y3 = (
    ddf[target_name]
    for target_name in ('target0', 'target1', 'target2', 'target3')
)

In [22]:
# 70/30 train/test split of the features and of the regression target (target3).
# The features are split from X (the `cols` subset), not from the full ddf:
# splitting ddf would hand the model every column, including target0..target3
# themselves and the object/datetime columns.
# NOTE(review): X and Y3 are split independently; the rows line up only because
# both calls use the same fractions, seed and partitioning -- confirm.
X_train, X_test = X.random_split([0.7, 0.3], random_state=1234)
Y_train, Y_test = Y3.random_split([0.7, 0.3], random_state=1234)

In [23]:
# Training settings for the regression run.
# Multi-class settings kept for reference (disabled):
#     'eval_metric': 'mlogloss',
#     'num_class': 3   # the number of classes that exist in this dataset
param = dict(
    max_depth=12,  # the maximum depth of each tree
    eta=0.1,  # the training step for each iteration
    gamma=2,
    verbosity=0,  # logging mode - quiet
    objective='reg:squarederror',
    # NOTE(review): looks like the round count is expected as a separate
    # training argument rather than as a key in this dict -- confirm.
    nround=500,
)
# n_round = 500

In [24]:
# Train the distributed XGBoost model on the training split.
# The number of boosting rounds is stored in `param` under 'nround', but the
# trainer takes it as the `num_boost_round` keyword, not as a booster
# parameter. It is therefore removed from the booster settings and passed
# explicitly; 10 is used if the key is absent.
# evallist  = [(dtrain,'train'), (dtest,'test')]
booster_params = {key: value for key, value in param.items() if key != 'nround'}
num_rounds = param.get('nround', 10)
bst = dxgb.train(client, booster_params, X_train, Y_train, num_boost_round=num_rounds)

ValueError: I/O operation on closed file