# A top 1% Model for 2019 Esun AI Competition: Taiwan House Price Prediction
## This is an ensemble learning model composed of a base model and a second model that is modified based on the city feature
## In this file, the base model will be explained briefly.

### Import Packages

In [20]:
import numpy as np
import pandas as pd
import lightgbm
from scipy.special import boxcox1p
from scipy import stats
from sklearn.model_selection import train_test_split
from scipy.stats import norm, skew #for some statistics

### Functions that will be used

In [21]:
def DropIndexs(dataset):
    """Return a new frame without the outlier rows of ``dataset``.

    A row counts as an outlier when its ``building_area`` is greater than 80.
    The rows are removed by index label and the input frame is not modified.
    """
    is_oversized = dataset['building_area'] > 80
    outlier_labels = dataset.index[is_oversized]
    return dataset.drop(index=outlier_labels)

def PreProcessingForOnlyTr(dataset):
    """Replace ``total_price`` with the log1p of the price per unit of area.

    The new target is ``log1p(total_price / building_area)``. A modified copy
    is returned; the input frame is left as it was.
    """
    unit_price = dataset['total_price'] / dataset['building_area']
    return dataset.assign(total_price=np.log1p(unit_price))

def PreProcessing(dataset):
    """Build the feature frame used by the model from ``dataset``.

    Works on a copy of the input and performs, in order:

    1. log1p on ``building_area`` and ``land_area``;
    2. re-expression of the ``N_*`` counts as differences between columns;
    3. zero-filling of missing ``txn_floor``, ``parking_price``,
       ``parking_area`` (then log1p) and ``village_income_median``;
    4. derived columns ``roof``, ``house_age``, ``material_price``, ``year``;
    5. casting of the categorical columns to strings;
    6. Box-Cox (``boxcox1p``, lambda 0.6) on the numeric columns that are
       among the 10 most skewed and whose absolute skew exceeds 0.55;
    7. one-hot encoding of the categorical columns;
    8. removal of ``lat`` and the four demographic rate columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the raw columns referenced above. ``total_price`` and
        ``city`` are optional and are never skew-transformed.

    Returns
    -------
    pandas.DataFrame
        The transformed, one-hot encoded copy.
    """
    all_data = dataset.copy()

    # --- Log transformation of the area columns
    all_data["building_area"] = np.log1p(all_data["building_area"])
    all_data["land_area"] = np.log1p(all_data["land_area"])

    # --- Feature engineering on the N_* counts.
    # Each statement uses the already-updated smaller columns, so the net
    # effect is: every N_x becomes its original value minus the original
    # value of the next smaller N column.
    all_data['N_500'] = all_data['N_500'] - all_data['N_50']
    all_data['N_1000'] = all_data['N_1000'] - all_data['N_500'] - all_data['N_50']
    all_data['N_5000'] = (all_data['N_5000'] - all_data['N_1000']
                          - all_data['N_500'] - all_data['N_50'])
    all_data['N_10000'] = (all_data['N_10000'] - all_data['N_5000']
                           - all_data['N_1000'] - all_data['N_500']
                           - all_data['N_50'])

    # --- Filling missing values
    all_data['txn_floor'] = all_data['txn_floor'].fillna(0)
    all_data['parking_price'] = all_data['parking_price'].fillna(0)
    all_data['parking_area'] = all_data['parking_area'].fillna(0)
    all_data["parking_area"] = np.log1p(all_data["parking_area"])
    all_data['village_income_median'] = all_data['village_income_median'].fillna(0)

    # --- Hidden features
    # roof is 0 when the transaction floor equals the total floor count, else 1.
    all_data['roof'] = np.where(all_data['total_floor'] == all_data['txn_floor'], 0, 1)
    all_data['house_age'] = all_data['txn_dt'] - all_data['building_complete_dt']
    # material_price is 0 for building_material codes 10, 9 and 5, else 1.
    # Uses numpy directly: the ``pd.np`` alias no longer exists in pandas 2.x.
    all_data['material_price'] = np.where(
        all_data['building_material'].isin([10, 9, 5]), 0, 1)

    # --- Categorical columns are stringified so get_dummies treats every
    # distinct value as its own level. Each column is converted from itself,
    # so building_type and building_use keep their own values.
    categorical_cols = ['building_material', 'building_type', 'building_use',
                        'town', 'parking_way', 'txn_floor', 'village', 'lon']
    for col in categorical_cols:
        all_data[col] = all_data[col].apply(str)
    # NOTE(review): true division yields fractional values, so nearly every
    # distinct completion date becomes its own category -- confirm this is
    # intended rather than whole years.
    all_data['year'] = (all_data['building_complete_dt'] / 365).apply(str)

    # --- Transform skewness
    # The target (total_price) is excluded: transforming it here would make
    # the expm1-based inversion of the predictions wrong.
    # NOTE(review): the selection is recomputed on every call, so different
    # frames (train / validation / test) may get different columns transformed;
    # roof and material_price are still numeric at this point and can be picked.
    numeric_cols = all_data.select_dtypes(include="number").columns
    numeric_feats = [c for c in numeric_cols if c not in ("city", "total_price")]
    skewed_feats = (all_data[numeric_feats]
                    .apply(lambda x: skew(x.dropna()))
                    .sort_values(ascending=False))
    top_skew = skewed_feats.head(10)
    # Row filter on the Series; NaN skews (constant columns) compare False.
    skewed_features = top_skew[top_skew.abs() > 0.55].index
    lam = 0.6
    for feat in skewed_features:
        all_data[feat] = boxcox1p(all_data[feat], lam)

    # --- One-hot encoding
    all_data = pd.get_dummies(
        all_data,
        columns=['txn_floor', 'lon', 'roof', 'year', 'town',
                 'building_material', 'building_use', 'building_type',
                 'village', 'material_price', 'parking_way'])

    # --- Drop variables that are not used as features
    all_data = all_data.drop(
        columns=["lat", "born_rate", "death_rate", "marriage_rate", "divorce_rate"])
    return all_data

def PostProcessingForThePrice(dataset, priceList):
    """Convert predicted log unit prices back to total prices.

    The predictions are inverted with ``expm1`` and multiplied by the
    ``building_area`` column of ``dataset``. Every resulting price that is
    not below the 0.996 quantile of the results is then scaled by 1.2.
    """
    areas = dataset['building_area'].values
    total_prices = np.expm1(priceList) * areas
    # Brute-force uplift applied only to the highest-priced predictions.
    cutoff = np.quantile(total_prices, .996)
    below_cutoff = total_prices < cutoff
    return np.where(below_cutoff, total_prices, total_prices * 1.2)

def GetMissingColumns( oriData, target):
    """Return ``oriData`` aligned to the columns of ``target``.

    The result has exactly the columns of ``target``, in the same order:

    * columns present in ``target`` but absent from ``oriData`` are added
      and filled with 0;
    * columns of ``oriData`` that ``target`` does not have are left out.

    ``total_price`` is handled like any other column. The aligned test frame
    is later passed to ``Get_predict_of_theModel``, which drops
    ``total_price``, so the column has to exist after alignment.

    Parameters
    ----------
    oriData : pandas.DataFrame
        Frame to align. It is not modified.
    target : pandas.DataFrame
        Frame whose column set and order are the reference.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``target``'s columns.
    """
    # A single reindex avoids inserting the missing columns one at a time
    # and leaves the caller's frame untouched.
    return oriData.reindex(columns=target.columns, fill_value=0)

def Model_Wholedataset(trainset, models):
    """Fit a LightGBM regressor on ``trainset`` and register it in ``models``.

    ``total_price`` is the regression target; every other column except
    ``building_id`` is a feature. The fitted estimator is stored under the
    key ``'bigModel'`` and the same ``models`` dict is returned.
    """
    target = trainset["total_price"].values
    features = trainset.drop(columns=['building_id', 'total_price']).values

    lgbm_params = dict(
        objective='regression',
        num_leaves=125,
        learning_rate=0.012,
        n_estimators=61002,
        max_bin=1550,
        bagging_fraction=0.8,
        bagging_freq=5,
        feature_fraction=0.3319,
        feature_fraction_seed=9,
        bagging_seed=9,
        min_data_in_leaf=10,
        min_sum_hessian_in_leaf=16,
    )
    regressor = lightgbm.LGBMRegressor(**lgbm_params)
    regressor.fit(features, target)

    models['bigModel'] = regressor
    return models

def Get_predict_of_theModel(originTest, afPreprocTest, model):
    """Predict total prices for the preprocessed test frame.

    ``afPreprocTest`` (minus ``building_id`` and ``total_price``) is fed to
    ``model.predict``; the raw predictions are then converted to prices by
    ``PostProcessingForThePrice`` using ``originTest``.
    """
    features = afPreprocTest.drop(columns=['building_id', 'total_price']).values
    raw_prediction = model.predict(features)
    return PostProcessingForThePrice(originTest, raw_prediction)

### Prepare the data
#### 1. read the file
#### 2. drop the outliers
#### 3. data preprocessing, including creating hidden features, filling missing values, dropping bad features and some other feature engineering.


In [22]:
# Read the training file and hold out 20% of it as a validation split.
train_csv = pd.read_csv('train.csv')
trainBefPreproc, valBefPreproc = train_test_split(train_csv, test_size=0.2, random_state=42)

# Training split: drop outliers, build the log unit-price target, then features.
trainAfDrop = DropIndexs(trainBefPreproc)
train = PreProcessing(PreProcessingForOnlyTr(trainAfDrop))

# Validation split goes through the same three steps.
valAfDrop = DropIndexs(valBefPreproc)
val = PreProcessing(PreProcessingForOnlyTr(valAfDrop))

# The submission (test) file has no target, so only the features are built.
submissionBefPreproc = pd.read_csv('test.csv')
submission = PreProcessing(submissionBefPreproc)

# Align the one-hot encoded columns of train and submission. The frame with
# fewer columns is aligned to the other one first.
if len(train.columns) < len(submission.columns):
    train = GetMissingColumns(train, submission)
    print(f"train_data size is : {train.shape}")
    submission = GetMissingColumns(submission, train)
    print(f"submission_data size is : {submission.shape}")
else:
    submission = GetMissingColumns(submission, train)
    print(f"submission_data size is : {submission.shape}")
    train = GetMissingColumns(train, submission)
    print(f"train_data size is : {train.shape}")

submission_data size is : (10000, 4211)
train_data size is : (47820, 4211)


### Train the model

In [23]:
# Fit the base model on the aligned training frame; it is stored under "bigModel".
modelDict = Model_Wholedataset(train, {})

### Predict the housing price according to the test set with the trained model

In [24]:
# Predict with the base model and write the submission file.
ids = submission['building_id'].values
predictBybigModel = Get_predict_of_theModel(
    submissionBefPreproc,
    submission,
    modelDict["bigModel"],
)
output = pd.DataFrame({'building_id': ids, 'total_price': predictBybigModel})
output.to_csv("submissionBybigModel.csv", index=False)