In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import time
# import seaborn as sns
from xgboost import plot_importance

In [3]:
# Load the raw training and test tables from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [4]:
# Restrict both splits to a single city so one model is fitted per city.
CITY_ID = 13

# .copy() detaches the filtered frames from the originals: later cells assign
# new columns into `train`, which must not be a view/slice of the raw frame.
train = train.loc[train.city == CITY_ID].copy()
test = test.loc[test.city == CITY_ID].copy()

# Row counts are reused later to split the combined feature frame back apart.
len_train = len(train)
len_test = len(test)
print(len_train, len_test)

10756 1767


In [5]:
def num_null(df):
    """Print the number of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Output is a header line followed by one ``<column> : <count>`` line per
    column that contains at least one null. Nothing is returned.
    """
    missing = df.isnull().sum()
    print('Show #missing in the columns:')
    # Iterate (label, count) pairs instead of indexing `missing[i]`: integer
    # keys on a Series are label lookups, so positional access is wrong for
    # integer-named columns and deprecated for string-named ones.
    for column, count in missing.items():
        if count:
            print(column, ':', count)
            

def metric(truth, pred):
    """Print and return the hit rate: share of predictions within 10% of truth.

    Parameters
    ----------
    truth : array-like
        Ground-truth values (must be non-zero).
    pred : array-like
        Predicted values, same length as ``truth``.

    Returns
    -------
    float
        Fraction of samples whose relative error ``|pred - truth| / |truth|``
        is at most 0.1. The same value is printed, as before.
    """
    truth = np.asarray(truth)
    pred = np.asarray(pred)
    # Divide by |truth| so a negative ground truth cannot yield a negative
    # "error" that would always count as a hit.
    rel_err = np.abs(pred - truth) / np.abs(truth)
    hit_rate = int(np.count_nonzero(rel_err <= 0.1)) / len(rel_err)
    print(hit_rate)
    return hit_rate
    

# Report per-column missing-value counts for the city-filtered training split.
num_null(train)

Show #missing in the columns:
parking_area : 10097
parking_price : 8632
txn_floor : 324
village_income_median : 116


In [6]:
# Regression target: log1p of price per unit of building area.
# Note: this overwrites `total_price` in place, so re-running the cell would
# apply the transform a second time.
train['total_price'] = train['total_price'].div(train['building_area']).pipe(np.log1p)

In [7]:
# Stack train features (target removed) on top of test so both splits go
# through identical preprocessing; rows are renumbered 0..n-1.
X = pd.concat(
    [train.drop(columns=['total_price']), test],
    ignore_index=True,
)

In [8]:
# Missing-value counts for the combined train + test feature frame.
num_null(X)

Show #missing in the columns:
parking_area : 11749
parking_price : 10018
txn_floor : 369
village_income_median : 135


In [9]:
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) globally
# for the rest of the notebook.
pd.set_option('mode.chained_assignment', None)

In [10]:
# Work on a copy so the un-imputed frame `X` stays available.
X_imp = X.copy()

# Assign the filled column back explicitly. `df[col].fillna(..., inplace=True)`
# is a chained in-place update on a temporary Series and does not write
# through to the frame under pandas Copy-on-Write.
X_imp['parking_area'] = X_imp['parking_area'].fillna(0)
X_imp['parking_price'] = X_imp['parking_price'].fillna(X_imp['parking_price'].median())

# Missing transaction floor -> middle of the building (total_floor / 2).
# Alternatives left commented out in the original: column median, or a -1 sentinel.
X_imp['txn_floor'] = X_imp['txn_floor'].fillna(X_imp['total_floor'] / 2)

In [11]:
# knn_impute(X_imp, 'village_income_median', ['lat', 'lon'])
# Fill missing village_income_median with the mean of the same village.
# Lookup table: one row per village, NaNs are skipped by the mean.
vimm = X_imp.groupby('village').agg({'village_income_median': 'mean'})

# Vectorised replacement for the per-row loop. A single .loc assignment avoids
# the chained `X_imp[col][i] = ...` write, which does not reach the frame under
# pandas Copy-on-Write. Villages with no known value simply stay NaN.
vim_isnan = X_imp['village_income_median'].isna()
X_imp.loc[vim_isnan, 'village_income_median'] = (
    X_imp.loc[vim_isnan, 'village'].map(vimm['village_income_median'])
)

In [12]:
# Re-check which columns still contain nulls after the village-level fill.
num_null(X_imp)

Show #missing in the columns:
village_income_median : 135


In [13]:
# Fallback fill: remaining gaps in village_income_median take the median of
# the surrounding town.
tvimm = X_imp.groupby('town').agg({'village_income_median': 'median'})

# Vectorised .loc assignment instead of a row loop with chained indexing,
# which is slow and does not write through under pandas Copy-on-Write.
vim_isnan = X_imp['village_income_median'].isna()
X_imp.loc[vim_isnan, 'village_income_median'] = (
    X_imp.loc[vim_isnan, 'town'].map(tvimm['village_income_median'])
)

# knn_impute(X_imp, 'village_income_median', ['lat', 'lon'])

In [14]:
# Final null check after the town-level fallback fill.
num_null(X_imp)

Show #missing in the columns:


In [15]:
# The two parking columns are removed from the feature set.
X_imp = X_imp.drop(columns=['parking_area', 'parking_price'])

In [16]:
# Snapshot of the (already target-transformed) training frame.
# NOTE(review): `concat_train` is not referenced by any other visible cell - confirm it is still needed.
concat_train = train.copy()

In [17]:
# One-hot encode each categorical column and drop its first level as the
# reference category. Columns are named '<column>_<level>' and appended at the
# end of the frame, in the order listed here.
# Replaces the copy-pasted per-column blocks with a single loop.
for col in ('building_material', 'city'):
    dummies = pd.get_dummies(X_imp[col], prefix=col, drop_first=True)
    X_imp = pd.concat([X_imp.drop(columns=col), dummies], axis=1)

In [18]:
# One-hot encode the remaining categorical columns and drop each one's first
# level as the reference category. Columns are named '<column>_<level>' and
# appended at the end of the frame, in the order listed here.
# Replaces the copy-pasted per-column blocks with a single loop.
for col in ('building_type', 'building_use', 'parking_way'):
    dummies = pd.get_dummies(X_imp[col], prefix=col, drop_first=True)
    X_imp = pd.concat([X_imp.drop(columns=col), dummies], axis=1)

In [19]:
# head/tail only recover the original splits if preprocessing kept every row.
assert len(X_imp) == len_train + len_test, "X_imp no longer matches train + test row counts"

# Training matrix: first len_train rows, identifier removed, target re-attached.
# The target is attached positionally (.to_numpy()) so it does not depend on
# index alignment. Non-inplace operations avoid mutating a slice of X_imp.
post_train = (
    X_imp.head(len_train)
    .drop(columns='building_id')
    .assign(total_price=train['total_price'].to_numpy())
)

# Test matrix: last len_test rows; keep the ids aside for the submission file.
X_test = X_imp.tail(len_test)
test_building_id = X_test['building_id']
X_test = X_test.drop(columns='building_id')

In [20]:
from xgboost import XGBRegressor


# Wall-clock timer around model construction and fitting.
ts = time.time()

# Hyper-parameters gathered in one place so they are easy to scan and tweak.
# NOTE(review): nthread=23 is hard-coded - presumably sized for the original
# machine; confirm before running elsewhere.
xgb_params = dict(
    n_estimators=10000,
    learning_rate=0.01,
    max_depth=8,
    subsample=0.6,
    colsample_bytree=0.4,
    nthread=23,
)

# Fit on every training row: features are all columns except the target.
model = XGBRegressor(**xgb_params).fit(
    post_train.drop(columns='total_price'),
    post_train['total_price'],
    verbose=True,
)

# Elapsed seconds.
print(time.time() - ts)

562.0260674953461


In [21]:
# Undo the target transform: expm1 gives the price per unit area, which is
# floored and then scaled by each row's building_area to get a total price.
Y_test_predict = X_test['building_area'] * np.floor(np.expm1(model.predict(X_test)))

In [22]:
# Two-column submission table (id, predicted total price), written without
# the index column.
submission = pd.DataFrame(
    data=dict(building_id=test_building_id, total_price=Y_test_predict)
)
submission.to_csv(path_or_buf="2019-07-08.csv", index=False)