In [126]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error


In [127]:
# Resale transaction tables; the `month` column is parsed into datetimes on load.
train_data, test_data = (
    pd.read_csv(csv_path, parse_dates=['month'])
    for csv_path in ('./train.csv', './test.csv')
)

# Auxiliary count tables, kept in separate CSV files under auxiliary-data/.
au_train_data, au_test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('auxiliary-data/comCountTrain.csv', 'auxiliary-data/comCountTest.csv')
)

In [128]:
# Remove the columns that are not used as features later in the notebook.
unused_columns = ['block', 'eco_category', 'elevation', 'planning_area']
train_data.drop(columns=unused_columns, inplace=True)
train_data

Unnamed: 0,month,town,flat_type,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,region,resale_price
0,2001-08-01,pasir ris,4 room,pasir ris drive 4,01 to 03,118.0,model a,1989,1.369008,103.958697,pasir ris drive,east region,209700.0
1,2014-10-01,punggol,5-room,punggol field,10 to 12,110.0,improved,2003,1.399007,103.906991,punggol field,north-east region,402300.0
2,2020-09-01,sengkang,5 room,fernvale lane,01 to 03,112.0,premium apartment,2004,1.388348,103.873815,fernvale,north-east region,351000.0
3,2000-10-01,clementi,3 room,clementi avenue 4,07 to 09,67.0,new generation,1980,1.318493,103.766702,clementi north,west region,151200.0
4,2013-01-01,bukit batok,3-room,bukit batok street 11,07 to 09,73.0,model a,1985,1.348149,103.742658,bukit batok west,west region,318600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03-01,woodlands,4 room,Woodlands Drive 16,01 to 03,101.0,model a,2000,1.429658,103.792583,woodlands south,north region,238500.0
431728,2016-04-01,sengkang,4 room,fernvale road,13 to 15,95.0,premium apartment,2012,1.390053,103.875941,fernvale,north-east region,376200.0
431729,2011-01-01,tampines,3-room,tampines street 81,01 to 03,67.0,new generation,1986,1.349224,103.934913,tampines west,east region,255600.0
431730,2013-05-01,sengkang,5-room,compassvale walk,16 to 18,123.0,improved,1999,1.389941,103.900721,sengkang town centre,north-east region,508500.0


In [129]:
# Frequency of each flat_model label in the test set.
# `com_test_data` is only created in a later cell, so referencing it here fails
# with NameError on a fresh kernel; `test_data` holds the same flat_model column.
value_counts = test_data['flat_model'].value_counts()
value_counts

model a                   31874
improved                  28496
new generation            19633
premium apartment          7208
simplified                 6058
apartment                  4509
standard                   4410
maisonette                 2953
model a2                   1769
dbss                        341
adjoined flat               235
model a maisonette          187
terrace                      86
type s1                      58
multi generation             42
type s2                      30
improved maisonette          19
premium maisonette           15
premium apartment loft       10
2 room                        1
Name: flat_model, dtype: int64

In [130]:
# Number of training rows per (latitude, longitude) pair; many rows share the
# same coordinates.
coordinate_columns = ['latitude', 'longitude']
train_data.groupby(coordinate_columns).size()

latitude  longitude 
1.270380  103.823236     58
1.270919  103.822685     62
1.271409  103.810888    144
1.271463  103.825684     55
1.271691  103.809852     18
                       ... 
1.456235  103.814292     21
1.456425  103.815858     93
1.456474  103.817181     59
1.456546  103.816764     60
1.457071  103.815308    113
Length: 9138, dtype: int64

In [131]:
# Remove the same unused columns from the test table as from the training table.
unused_columns = ['block', 'eco_category', 'elevation', 'planning_area']
test_data.drop(columns=unused_columns, inplace=True)
test_data

Unnamed: 0,month,town,flat_type,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,region
0,2004-01-01,bukit batok,4 room,bukit batok west avenue 6,04 to 06,94.0,new generation,1989,1.346581,103.744085,bukit batok west,west region
1,2001-11-01,tampines,5 room,tampines street 34,04 to 06,122.0,improved,1997,1.357618,103.961379,tampines east,east region
2,2002-07-01,jurong east,3 room,jurong east street 21,01 to 03,67.0,new generation,1982,1.337804,103.741998,toh guan,west region
3,2015-04-01,ang mo kio,3 room,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,1981,1.380084,103.849574,yio chu kang east,north-east region
4,2004-04-01,clementi,5 room,clementi avenue 2,01 to 03,117.0,standard,1978,1.313960,103.769831,clementi north,west region
...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04-01,hougang,5 room,buangkok crescent,10 to 12,110.0,improved,2003,1.380452,103.879333,trafalgar,north-east region
107930,2006-01-01,kallang/whampoa,4 room,upper boon keng road,13 to 15,102.0,model a,1999,1.314481,103.870458,boon keng,central region
107931,2000-01-01,kallang/whampoa,3 room,beach road,07 to 09,68.0,improved,1979,1.294924,103.854315,city hall,central region
107932,2009-07-01,jurong west,4 room,jurong west street 91,10 to 12,104.0,model a,1988,1.339927,103.687354,yunnan,west region


In [132]:
# Keep only the four auxiliary count columns that are joined onto the test rows.
selected_counts = ["commercialCount_5", "marketCount_3", "shoppingCount_3", "stationCount_2"]
au_test_data = au_test_data[selected_counts]
au_test_data

Unnamed: 0,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2
0,2,5,7,4
1,6,3,9,4
2,2,5,6,3
3,5,12,6,2
4,9,9,8,2
...,...,...,...,...
107929,8,6,11,8
107930,7,21,27,13
107931,4,20,59,21
107932,3,2,5,2


In [133]:
# Keep only the four auxiliary count columns that are joined onto the training rows.
selected_counts = ["commercialCount_5", "marketCount_3", "shoppingCount_3", "stationCount_2"]
au_train_data = au_train_data[selected_counts]
au_train_data

Unnamed: 0,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2
0,7,2,8,2
1,5,0,6,25
2,7,5,8,7
3,8,8,11,2
4,2,4,7,4
...,...,...,...,...
431727,3,3,7,3
431728,7,1,7,7
431729,6,9,9,3
431730,7,1,12,21


In [134]:
# Attach the auxiliary count features to the training rows column-wise.
# pd.concat(axis=1) aligns rows by index label, so two tables with different
# indexes would silently produce NaN-padded rows; fail loudly instead.
assert train_data.index.equals(au_train_data.index), \
    "train_data and au_train_data must share the same row index"
com_train_data = pd.concat([train_data, au_train_data], axis=1)
com_train_data

Unnamed: 0,month,town,flat_type,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,region,resale_price,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2
0,2001-08-01,pasir ris,4 room,pasir ris drive 4,01 to 03,118.0,model a,1989,1.369008,103.958697,pasir ris drive,east region,209700.0,7,2,8,2
1,2014-10-01,punggol,5-room,punggol field,10 to 12,110.0,improved,2003,1.399007,103.906991,punggol field,north-east region,402300.0,5,0,6,25
2,2020-09-01,sengkang,5 room,fernvale lane,01 to 03,112.0,premium apartment,2004,1.388348,103.873815,fernvale,north-east region,351000.0,7,5,8,7
3,2000-10-01,clementi,3 room,clementi avenue 4,07 to 09,67.0,new generation,1980,1.318493,103.766702,clementi north,west region,151200.0,8,8,11,2
4,2013-01-01,bukit batok,3-room,bukit batok street 11,07 to 09,73.0,model a,1985,1.348149,103.742658,bukit batok west,west region,318600.0,2,4,7,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03-01,woodlands,4 room,Woodlands Drive 16,01 to 03,101.0,model a,2000,1.429658,103.792583,woodlands south,north region,238500.0,3,3,7,3
431728,2016-04-01,sengkang,4 room,fernvale road,13 to 15,95.0,premium apartment,2012,1.390053,103.875941,fernvale,north-east region,376200.0,7,1,7,7
431729,2011-01-01,tampines,3-room,tampines street 81,01 to 03,67.0,new generation,1986,1.349224,103.934913,tampines west,east region,255600.0,6,9,9,3
431730,2013-05-01,sengkang,5-room,compassvale walk,16 to 18,123.0,improved,1999,1.389941,103.900721,sengkang town centre,north-east region,508500.0,7,1,12,21


In [135]:
# Attach the auxiliary count features to the test rows column-wise.
# pd.concat(axis=1) aligns rows by index label, so two tables with different
# indexes would silently produce NaN-padded rows; fail loudly instead.
assert test_data.index.equals(au_test_data.index), \
    "test_data and au_test_data must share the same row index"
com_test_data = pd.concat([test_data, au_test_data], axis=1)
com_test_data

Unnamed: 0,month,town,flat_type,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,region,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2
0,2004-01-01,bukit batok,4 room,bukit batok west avenue 6,04 to 06,94.0,new generation,1989,1.346581,103.744085,bukit batok west,west region,2,5,7,4
1,2001-11-01,tampines,5 room,tampines street 34,04 to 06,122.0,improved,1997,1.357618,103.961379,tampines east,east region,6,3,9,4
2,2002-07-01,jurong east,3 room,jurong east street 21,01 to 03,67.0,new generation,1982,1.337804,103.741998,toh guan,west region,2,5,6,3
3,2015-04-01,ang mo kio,3 room,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,1981,1.380084,103.849574,yio chu kang east,north-east region,5,12,6,2
4,2004-04-01,clementi,5 room,clementi avenue 2,01 to 03,117.0,standard,1978,1.313960,103.769831,clementi north,west region,9,9,8,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04-01,hougang,5 room,buangkok crescent,10 to 12,110.0,improved,2003,1.380452,103.879333,trafalgar,north-east region,8,6,11,8
107930,2006-01-01,kallang/whampoa,4 room,upper boon keng road,13 to 15,102.0,model a,1999,1.314481,103.870458,boon keng,central region,7,21,27,13
107931,2000-01-01,kallang/whampoa,3 room,beach road,07 to 09,68.0,improved,1979,1.294924,103.854315,city hall,central region,4,20,59,21
107932,2009-07-01,jurong west,4 room,jurong west street 91,10 to 12,104.0,model a,1988,1.339927,103.687354,yunnan,west region,3,2,5,2


# Data Preprocessing

In [136]:
# Distribution of the raw flat_type labels in the test set (the same room count
# appears in two spellings, e.g. '4 room' and '4-room').
value_counts = com_test_data.loc[:, 'flat_type'].value_counts()
value_counts

4 room              33940
3 room              25061
5 room              19197
4-room               8490
executive            8214
3-room               6429
5-room               5338
2 room                899
2-room                250
1 room                 59
multi generation       42
1-room                 15
Name: flat_type, dtype: int64

In [137]:
# convert flat_type to int
def process_value(value):
    """Map a raw ``flat_type`` label to an integer code.

    Labels starting with 'e' (e.g. 'executive') map to 6, labels starting with
    'm' (e.g. 'multi generation') map to 7, and every other label must start
    with its room-count digit ('4 room' and '4-room' both map to 4).
    Surrounding whitespace and letter case are ignored, so 'Executive' or
    ' 3 ROOM' are accepted as well.

    Args:
        value: the flat_type label as a string.

    Returns:
        The integer code for the label.

    Raises:
        ValueError: if the label matches none of the forms above.
    """
    label = value.strip().lower()
    if label.startswith('e'):
        return 6
    if label.startswith('m'):
        return 7
    if label[:1].isdigit():
        return int(label[0])
    raise ValueError(f"unrecognised flat_type label: {value!r}")
    

# Replace the textual flat_type labels with integer codes in both frames, then
# show the resulting distribution for the test set.
for frame in (com_train_data, com_test_data):
    frame['flat_type'] = frame['flat_type'].apply(process_value)
value_counts = com_test_data['flat_type'].value_counts()
value_counts

4    42430
3    31490
5    24535
6     8214
2     1149
1       74
7       42
Name: flat_type, dtype: int64

In [138]:
# Distribution of the raw storey_range bands ('LL to HH' strings) in the test set.
value_counts = com_test_data.loc[:, 'storey_range'].value_counts()
value_counts

04 to 06    26992
07 to 09    23967
01 to 03    21706
10 to 12    20132
13 to 15     7752
16 to 18     3041
19 to 21     1336
22 to 24      832
01 to 05      518
06 to 10      483
25 to 27      393
11 to 15      259
28 to 30      213
31 to 33       63
37 to 39       58
34 to 36       58
16 to 20       55
40 to 42       29
21 to 25       22
26 to 30       10
46 to 48        6
43 to 45        5
49 to 51        3
36 to 40        1
Name: storey_range, dtype: int64

In [139]:
# storey_range process
def transfer(x):
    """Return the midpoint of a storey band written as '<low> to <high>'.

    Both bounds are parsed as integers and their mean is returned as a float,
    e.g. '01 to 03' -> 2.0 and '10 to 13' -> 11.5.
    """
    bounds = x.split(' to ')
    lower, upper = int(bounds[0]), int(bounds[1])
    return (lower + upper) / 2


# Replace each storey band with its numeric midpoint in both frames, then show
# the resulting distribution for the test set.
for frame in (com_train_data, com_test_data):
    frame['storey_range'] = frame['storey_range'].apply(transfer)
value_counts = com_test_data['storey_range'].value_counts()
value_counts

5.0     26992
8.0     24450
2.0     21706
11.0    20132
14.0     7752
17.0     3041
20.0     1336
23.0      854
3.0       518
26.0      393
13.0      259
29.0      213
32.0       63
38.0       59
35.0       58
18.0       55
41.0       29
28.0       10
47.0        6
44.0        5
50.0        3
Name: storey_range, dtype: int64

In [140]:
# convert time to float
# The sale month becomes a fractional year: year + month / 12 with 1-based
# months, so December evaluates to year + 1.0. The datetime column is dropped
# once the numeric feature exists.
train_month = com_train_data['month']
com_train_data['sell_time'] = train_month.dt.year + train_month.dt.month / 12
com_train_data.drop(columns='month', inplace=True)

test_month = com_test_data['month']
com_test_data['sell_time'] = test_month.dt.year + test_month.dt.month / 12
com_test_data.drop(columns='month', inplace=True)

com_train_data

Unnamed: 0,town,flat_type,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,region,resale_price,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2,sell_time
0,pasir ris,4,pasir ris drive 4,2.0,118.0,model a,1989,1.369008,103.958697,pasir ris drive,east region,209700.0,7,2,8,2,2001.666667
1,punggol,5,punggol field,11.0,110.0,improved,2003,1.399007,103.906991,punggol field,north-east region,402300.0,5,0,6,25,2014.833333
2,sengkang,5,fernvale lane,2.0,112.0,premium apartment,2004,1.388348,103.873815,fernvale,north-east region,351000.0,7,5,8,7,2020.750000
3,clementi,3,clementi avenue 4,8.0,67.0,new generation,1980,1.318493,103.766702,clementi north,west region,151200.0,8,8,11,2,2000.833333
4,bukit batok,3,bukit batok street 11,8.0,73.0,model a,1985,1.348149,103.742658,bukit batok west,west region,318600.0,2,4,7,4,2013.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,woodlands,4,Woodlands Drive 16,2.0,101.0,model a,2000,1.429658,103.792583,woodlands south,north region,238500.0,3,3,7,3,2005.250000
431728,sengkang,4,fernvale road,14.0,95.0,premium apartment,2012,1.390053,103.875941,fernvale,north-east region,376200.0,7,1,7,7,2016.333333
431729,tampines,3,tampines street 81,2.0,67.0,new generation,1986,1.349224,103.934913,tampines west,east region,255600.0,6,9,9,3,2011.083333
431730,sengkang,5,compassvale walk,17.0,123.0,improved,1999,1.389941,103.900721,sengkang town centre,north-east region,508500.0,7,1,12,21,2013.416667


# Run Model and Predict

In [141]:
# Encode the categorical features as integer labels.
categorical_features = ['town', 'street_name', 'flat_model', 'subzone', 'region']
for col in categorical_features:
    lbl = LabelEncoder()
    com_train_data[col] = lbl.fit_transform(com_train_data[col])

# Split the data into the feature matrix and the target variable.
X = com_train_data.drop('resale_price', axis=1)
y = com_train_data['resale_price']

# Hold out 20% of the rows as a validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the LightGBM regressor.
lgbm = LGBMRegressor()

# Parameter grid to search.
param_grid = {
    'n_estimators': [100, 300, 600, 1000, 1500, 2000],
    'learning_rate': [0.01, 0.1, 0.2],
#     'num_leaves': [31, 50],
#     'max_depth': [-1, 3, 5],
#     'min_child_samples': [20, 30]
}

# Search for the best parameters with 5-fold cross-validation.
grid_search = GridSearchCV(lgbm, param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best parameters.
print("最佳参数：", grid_search.best_params_)

# With the default refit=True, GridSearchCV has already retrained
# best_estimator_ on all of X_train using the best parameters, so calling
# fit() again would only repeat that work.
best_lgbm = grid_search.best_estimator_

# Predict on the held-out validation rows.
y_pred = best_lgbm.predict(X_test)

# Compute and report the mean squared error.
mse = mean_squared_error(y_test, y_pred)
print("均方误差：", mse)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
最佳参数： {'learning_rate': 0.2, 'min_child_samples': 20, 'n_estimators': 300}
均方误差： 324262219.6467766


In [142]:
# Encode the categorical features of com_test_data with the SAME label -> code
# mapping that was used for training. Fitting a new LabelEncoder on the test
# column alone would assign different integers to the same category, because the
# codes depend on the set of labels present in the fitted data.
for col in categorical_features:
    if pd.api.types.is_numeric_dtype(com_test_data[col]):
        # Already encoded (e.g. this cell was re-run); leave it untouched.
        continue
    # train_data still holds the raw string labels, so fitting on it reproduces
    # the exact classes (and therefore codes) used when training was encoded.
    lbl = LabelEncoder().fit(train_data[col])
    code_by_label = {label: code for code, label in enumerate(lbl.classes_)}
    # Labels that never occurred in training have no code; mark them with -1.
    com_test_data[col] = com_test_data[col].map(code_by_label).fillna(-1).astype(int)

# Predict with the trained model, selecting the feature columns in the order
# used for training so that a missing column fails loudly.
test_pred = best_lgbm.predict(com_test_data[X.columns])

# Build a DataFrame for the predictions.
result_df = pd.DataFrame({'Id': np.arange(len(test_pred)), 'Predicted': test_pred})

# Save the result to a CSV file.
result_df.to_csv('predictions.csv', index=False)