In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error


In [43]:
train_data = pd.read_csv('./train.csv', parse_dates=['month'])
test_data = pd.read_csv('./test.csv', parse_dates=['month'])
au_train_data = pd.read_csv('auxiliary-data/comCountTrain.csv')
au_test_data = pd.read_csv('auxiliary-data/comCountTest.csv')

In [44]:
train_data.drop(['block', 'eco_category', 'elevation', 'planning_area', 'region', 'street_name'], axis=1, inplace=True)
train_data

Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,resale_price
0,2001-08-01,pasir ris,4 room,01 to 03,118.0,model a,1989,1.369008,103.958697,pasir ris drive,209700.0
1,2014-10-01,punggol,5-room,10 to 12,110.0,improved,2003,1.399007,103.906991,punggol field,402300.0
2,2020-09-01,sengkang,5 room,01 to 03,112.0,premium apartment,2004,1.388348,103.873815,fernvale,351000.0
3,2000-10-01,clementi,3 room,07 to 09,67.0,new generation,1980,1.318493,103.766702,clementi north,151200.0
4,2013-01-01,bukit batok,3-room,07 to 09,73.0,model a,1985,1.348149,103.742658,bukit batok west,318600.0
...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03-01,woodlands,4 room,01 to 03,101.0,model a,2000,1.429658,103.792583,woodlands south,238500.0
431728,2016-04-01,sengkang,4 room,13 to 15,95.0,premium apartment,2012,1.390053,103.875941,fernvale,376200.0
431729,2011-01-01,tampines,3-room,01 to 03,67.0,new generation,1986,1.349224,103.934913,tampines west,255600.0
431730,2013-05-01,sengkang,5-room,16 to 18,123.0,improved,1999,1.389941,103.900721,sengkang town centre,508500.0


In [45]:
test_data.drop(['block', 'eco_category', 'elevation', 'planning_area', 'region', 'street_name'], axis=1, inplace=True)
# test_data

In [46]:
au_test_data = au_test_data.loc[:, ["commercialCount_5","marketCount_3","shoppingCount_3","stationCount_2"]]
# au_test_data

In [47]:
au_train_data = au_train_data.loc[:, ["commercialCount_5","marketCount_3","shoppingCount_3","stationCount_2"]]
# au_train_data

In [48]:
all_train_data = pd.concat([train_data, au_train_data], axis=1, ignore_index=False)
# all_train_data

In [49]:
all_test_data = pd.concat([test_data, au_test_data], axis=1, ignore_index=False)
# com_test_data

# Data Preprocessing

In [50]:
# convert flat_type to int
def process_value(value):
    if value.startswith('e'):
        return 6
    elif value.startswith('m'):
        return 7
    else:
        return int(value[0])
    

all_train_data['flat_type'] = all_train_data['flat_type'].apply(process_value)
all_test_data['flat_type'] = all_test_data['flat_type'].apply(process_value)

In [51]:
# storey_range process
def transfer(x):
    storeys = x.split(' to ')
    return int(int(storeys[0])+int(storeys[1]))/2


all_train_data['storey_range'] = all_train_data['storey_range'].apply(transfer)
all_test_data['storey_range'] = all_test_data['storey_range'].apply(transfer)

In [52]:
# convert time to float
all_train_data['sell_time'] = all_train_data['month'].apply(lambda x: x.year + x.month / 12)
all_train_data.drop('month', axis=1, inplace=True)
all_test_data['sell_time'] = all_test_data['month'].apply(lambda x: x.year + x.month / 12)
all_test_data.drop('month', axis=1, inplace=True)


In [53]:
# # add age data
# age_train_data = pd.read_csv('auxiliary-data/WithAge.csv')
# age_train_data = age_train_data.loc[:, ["0-14", "15-29", "30-59", "60+"]]
# all_train_data = pd.concat([all_train_data, age_train_data], axis=1, ignore_index=False)

# age_test_data = pd.read_csv("auxiliary-data/TestWithAge.csv")
# age_test_data = age_test_data.loc[:, ["0-14", "15-29", "30-59", "60+"]]
# all_test_data = pd.concat([all_test_data, age_test_data], axis=1, ignore_index=False)


In [54]:
# add school data, distance = 3
school_train_data = pd.read_csv('auxiliary-data/schoolCountTrain_3.csv')
school_train_data = school_train_data.loc[:, ["primaryCount", "secondaryCount"]]
all_train_data = pd.concat([all_train_data, school_train_data], axis=1, ignore_index=False)

school_test_data = pd.read_csv('auxiliary-data/schoolCountTest_3.csv')
school_test_data = school_test_data.loc[:, ['primaryCount_3', 'secondaryCount_3']]
all_test_data = pd.concat([all_test_data, school_test_data], axis=1, ignore_index=False)
all_train_data

Unnamed: 0,town,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,resale_price,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2,sell_time,primaryCount,secondaryCount
0,pasir ris,4,2.0,118.0,model a,1989,1.369008,103.958697,pasir ris drive,209700.0,7,2,8,2,2001.666667,16,8
1,punggol,5,11.0,110.0,improved,2003,1.399007,103.906991,punggol field,402300.0,5,0,6,25,2014.833333,20,11
2,sengkang,5,2.0,112.0,premium apartment,2004,1.388348,103.873815,fernvale,351000.0,7,5,8,7,2020.750000,17,10
3,clementi,3,8.0,67.0,new generation,1980,1.318493,103.766702,clementi north,151200.0,8,8,11,2,2000.833333,9,11
4,bukit batok,3,8.0,73.0,model a,1985,1.348149,103.742658,bukit batok west,318600.0,2,4,7,4,2013.083333,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,woodlands,4,2.0,101.0,model a,2000,1.429658,103.792583,woodlands south,238500.0,3,3,7,3,2005.250000,13,10
431728,sengkang,4,14.0,95.0,premium apartment,2012,1.390053,103.875941,fernvale,376200.0,7,1,7,7,2016.333333,18,12
431729,tampines,3,2.0,67.0,new generation,1986,1.349224,103.934913,tampines west,255600.0,6,9,9,3,2011.083333,20,15
431730,sengkang,5,17.0,123.0,improved,1999,1.389941,103.900721,sengkang town centre,508500.0,7,1,12,21,2013.416667,27,16


In [55]:
all_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431732 entries, 0 to 431731
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   town                 431732 non-null  object 
 1   flat_type            431732 non-null  int64  
 2   storey_range         431732 non-null  float64
 3   floor_area_sqm       431732 non-null  float64
 4   flat_model           431732 non-null  object 
 5   lease_commence_date  431732 non-null  int64  
 6   latitude             431732 non-null  float64
 7   longitude            431732 non-null  float64
 8   subzone              431732 non-null  object 
 9   resale_price         431732 non-null  float64
 10  commercialCount_5    431732 non-null  int64  
 11  marketCount_3        431732 non-null  int64  
 12  shoppingCount_3      431732 non-null  int64  
 13  stationCount_2       431732 non-null  int64  
 14  sell_time            431732 non-null  float64
 15  primaryCount     

In [56]:
# use all_train_data and all_test_data

all_train_data = all_train_data.dropna()

# 对分类特征进行编码
categorical_features = ['town', 'flat_model', 'subzone']
for col in categorical_features:
    lbl = LabelEncoder()
    all_train_data[col] = lbl.fit_transform(all_train_data[col])
    all_test_data[col] = lbl.fit_transform(all_test_data[col])

In [58]:
all_test_data

Unnamed: 0,town,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,resale_price,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2,sell_time,primaryCount,secondaryCount,primaryCount_3,secondaryCount_3
0,16,4,2.0,118.0,7,1989,1.369008,103.958697,94,209700.0,7,2,8,2,2001.666667,16,8,12.0,14.0
1,17,5,11.0,110.0,4,2003,1.399007,103.906991,100,402300.0,5,0,6,25,2014.833333,20,11,18.0,10.0
2,20,5,2.0,112.0,12,2004,1.388348,103.873815,46,351000.0,7,5,8,7,2020.750000,17,10,12.0,13.0
3,9,3,8.0,67.0,11,1980,1.318493,103.766702,34,151200.0,8,8,11,2,2000.833333,9,11,14.0,11.0
4,3,3,8.0,73.0,7,1985,1.348149,103.742658,22,318600.0,2,4,7,4,2013.083333,13,13,9.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,24,4,2.0,101.0,7,2000,1.429658,103.792583,143,238500.0,3,3,7,3,2005.250000,13,10,,
431728,20,4,14.0,95.0,12,2012,1.390053,103.875941,46,376200.0,7,1,7,7,2016.333333,18,12,,
431729,22,3,2.0,67.0,11,1986,1.349224,103.934913,119,255600.0,6,9,9,3,2011.083333,20,15,,
431730,20,5,17.0,123.0,4,1999,1.389941,103.900721,107,508500.0,7,1,12,21,2013.416667,27,16,,


# Run Model and Predict

## Evaluation and Test predict

In [141]:
def evaluate(model, train, test):
    # 将数据分为特征和目标变量
    X = train.drop('resale_price', axis=1)
    y = train['resale_price']

    # 将数据集分为训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 定义要搜索的参数网格
    tree_param_grid = {
            'n_estimators': [7000],
            'learning_rate': [0.1,],
        #     'num_leaves': [31, 50],
            'max_depth': [None, 15, 17, 19],
        # 'min_child_samples': [20, 30]
    }
    
    linear_param_grid = {
        'fit_intercept': [True, False],
    }

    # 使用GridSearchCV搜索最佳参数
    grid_search = GridSearchCV(model, tree_param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
#     grid_search = GridSearchCV(model, linear_param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    # 输出最佳参数
    print("最佳参数：", grid_search.best_params_)

    # 使用最佳参数重新训练模型
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

    # 进行预测
    y_pred = best_model.predict(X_test)

    # 计算并输出均方误差
    mse = mean_squared_error(y_test, y_pred)
    print("均方误差：", mse)
    
    # 使用训练好的模型进行预测
    test_pred = best_model.predict(test)

    # 为预测结果创建一个新的DataFrame
    result_df = pd.DataFrame({'Id': np.arange(len(test_pred)), 'Predicted': test_pred})

    # 将结果保存到CSV文件
    result_df.to_csv('predictions.csv', index=False)

## LightGBM

In [142]:
# 创建LightGBM回归器
lgbm = LGBMRegressor()

evaluate(lgbm, all_train_data, all_test_data)

## Linear Regression

In [143]:
# 找出包含缺失值的行
missing_rows = all_train_data.isna().any(axis=1)

# 显示包含缺失值的行
data_with_missing_values = all_train_data[missing_rows]
data_with_missing_values

Unnamed: 0,town,flat_type,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,latitude,longitude,subzone,...,resale_price,commercialCount_5,marketCount_3,shoppingCount_3,stationCount_2,sell_time,0-14,15-29,30-59,60+


In [145]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()

evaluate(regressor, all_train_data, all_test_data)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
最佳参数： {'fit_intercept': True}
均方误差： 3484700396.823359


## PipelineCatBoost