In [7]:
import pandas as pd
import numpy as np
from collections import OrderedDict

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [8]:
from sklearn.externals import joblib

In [9]:
def r2_score(y,y_hat):
    ratio = np.sum((y-y_hat)**2) / np.sum((y-np.mean(y))**2)
    r_square = 1 - ratio
    return r_square

In [10]:
data = pd.read_csv(r'C:\Users\GN1504301\Desktop\競賽資料-utf8\競賽資料-utf8\租屋資訊trainset.csv',encoding='utf-8')

In [11]:
data['cp_value'] = data['price'] / data['area']

data['min_station_name'][data.address == '中山區天津街'] = '中山'
data['min_station'][data.address == '中山區天津街'] = 700

data['min_station_name'][data.address == '南港區林森路73巷'] = '南港展覽館'
data['min_station'][data.address == '南港區林森路73巷'] = 2300

data['min_station_name'][data.address == '中山區中原街'] = '行天宮'
data['min_station'][data.address == '中山區中原街'] = 550

data['min_station_name'][data.address == '中山區中山二路42巷'] = '雙連'
data['min_station'][data.address == '中山區中山二路42巷'] = 400

data['region_cp_mean'] = data.groupby('sectionname')['cp_value'].transform('mean')
data['station_cp_mean'] = data.groupby('min_station_name')['cp_value'].transform('mean')

region_mean_frame = data[['sectionname','region_cp_mean']].drop_duplicates(subset = 'sectionname')
station_mean_frame = data[['min_station_name','station_cp_mean']].drop_duplicates(subset = 'min_station_name')

In [12]:
test_set = pd.read_csv(r'C:\Users\GN1504301\Desktop\競賽資料-utf8\競賽資料-utf8\租屋資訊testset.csv',encoding='big5')

In [13]:
test_set['min_station_name'][test_set.address == '中山區天津街'] = '中山'
test_set['min_station'][test_set.address == '中山區天津街'] = 700
test_set['min_park'][test_set.address == '中山區天津街'] = 260

test_set['min_station_name'][test_set.address == '南港區林森路73巷'] = '南港展覽館'
test_set['min_station'][test_set.address == '南港區林森路73巷'] = 2300
test_set['min_park'][test_set.address == '南港區林森路73巷'] = 550

test_set['min_station_name'][test_set.address == '中山區中原街'] = '行天宮'
test_set['min_station'][test_set.address == '中山區中原街'] = 550
test_set['min_park'][test_set.address == '中山區中原街'] = 70

test_set['min_station_name'][test_set.address == '中山區中山二路77巷'] = '雙連'
test_set['min_station'][test_set.address == '中山區中山二路77巷'] = 400
test_set['min_park'][test_set.address == '中山區中山二路77巷'] = 290


In [14]:
test_list = test_set['min_station_name'].unique().tolist()
train_list = data['min_station_name'].unique().tolist()

[each for each in test_list if each not in train_list]

[]

In [15]:
merge1 = pd.merge(test_set,station_mean_frame,on = 'min_station_name',how = 'inner')
test_data = pd.merge(merge1,region_mean_frame,on = 'sectionname',how = 'inner')

In [16]:
test_data['floor_ratio'] = test_data['floor'] / test_data['allfloor']
test_data['floor_ratio'].fillna(0,inplace=True)

test_data['walk_apart'] = 1
test_data['walk_apart'][test_data['floor'] > 6] = 0

test_data['first_floor'] = 0
test_data['first_floor'][test_data['floor'] == 1] = 1

test_data['ground_space'] = 0
test_data['ground_space'][test_data['floor'] == 0] = 1

In [17]:
test_data.groupby('kind_name')['id'].count()

kind_name
其他        22
分租套房     479
整層住家    2674
獨立套房    1201
車位       169
雅房       327
Name: id, dtype: int64

In [18]:
def lux_tran(x):
    lux_list = ['豪華','裝潢','精緻','雅緻','全新','精美']
    n = 0
    for word in lux_list:
        if word in x:
            n += 1 
    if n > 0:
        return 1
    else:
        return 0
    
def up_tran(x):
    toproof = ['頂加','頂樓加蓋']
    n = 0
    for word in toproof:
        if word in x:
            n += 1
    if n > 0:
        return 1
    else:
        return 0

In [19]:
def mapping(x):
    if x == '分租套房':
        return '1房0廳1衛'
    if x == '獨立套房':
        return '1房0廳1衛'
    if x == '雅房':
        return '1房0廳0衛'
    if x == '其他':
        return '0房0廳0衛'

In [20]:
test_data['lux_dummy'] = test_data['fulladdress'].apply(lux_tran)
test_data['up_dummy'] = test_data['fulladdress'].apply(up_tran)

test_data['layout2'] = test_data['kind_name'].apply(mapping)
test_data['layout_total'] = test_data['layout'].fillna('') + test_data['layout2'].fillna('')
test_data['rooms'] = test_data['layout_total'].str.split('房').str[0]
test_data['way'] = test_data['layout_total'].str.split('廳').str[1]
test_data['way'] = test_data['way'].str.split('衛').str[0]

test_data['rooms'][test_data.kind_name =='車位'] = 0
test_data['way'][test_data.kind_name =='車位'] = 0


In [21]:
# 創造 kind_name & sectionname虛擬變數
kind_dum = pd.get_dummies(test_data.kind_name)
sect_dum = pd.get_dummies(test_data.sectionname)
dumms = pd.concat([kind_dum,sect_dum],axis = 1) 
test_dumm_data = pd.concat([test_data,dumms],axis = 1)

In [22]:
kind_dum.columns

Index(['其他', '分租套房', '整層住家', '獨立套房', '車位', '雅房'], dtype='object')

In [24]:
# 將living欄資訊轉為虛擬變數
living = pd.Series(test_dumm_data.living)
living.fillna('0',inplace=True)
liv_list = ['depart', 'advstore', 'market', 'night', 'park', 'school', 'hospital']

liv2_list = []
for i in living:
    liv_dict = OrderedDict()
    for j in i.split(','):
        '''
        if j not in liv_list:
            liv_list.append(j)
        '''
        if j in liv_list:
            liv_dict['%s' %j] = 1
    liv2_list.append(liv_dict)

liv_frame = pd.DataFrame(liv2_list)
liv_frame.fillna('0',inplace=True)

In [25]:
# 將condition欄資訊轉為虛擬變數
condition = pd.Series(test_dumm_data.condition)
condition.fillna('0',inplace=True)
cond_list = ['tv','icebox','cold','washer','hotwater','four','broadband','landpost','pet','cook','trabus','balcony_0',
 'lease','bed','wardrobe','sofa','bookTable','chair','balcony_1','naturalgas','cartplace','lift','boy','isleastrent',
 'girl','teaTable','bookshelf','dinette','watermemachine','gasstove','layout']

cond2_list = []
for i in condition:
    cond_dict = OrderedDict()
    for j in i.split(','):
        '''
        if j not in cond_list:
            cond_list.append(j)
        '''
        if j in cond_list:
            cond_dict['%s' %j] = 1
    cond2_list.append(cond_dict)

cond_frame = pd.DataFrame(cond2_list)
cond_frame.fillna('0',inplace=True)

In [26]:
dum_frame = pd.concat([liv_frame,cond_frame],axis = 1)
full = pd.concat([test_dumm_data,dum_frame],axis = 1)

In [27]:
# 刪除無用的欄位
col2drop = ['houseage','id','living','condition','address_img_title','lat','lng','loc_type','min_station_name','min_park_name',
            'unit','regionname','kind_name','fulladdress','layout','address','sectionname','room'
           ,'watermemachine','gasstove','dinette','teaTable','bookshelf','layout2','layout_total'] #
cols = [col for col in full.columns if col not in col2drop]
full_data = full[cols]

In [28]:
full_data = full_data[['area', 'floor', 'allfloor', 'min_station', 'min_park', 'floor_ratio',
       'walk_apart', 'first_floor', 'region_cp_mean', 'station_cp_mean',
       'ground_space', 'rooms', 'way', 'lux_dummy', 'up_dummy', '其他', '分租套房',
       '整層住家', '獨立套房', '車位', '雅房', '中山區', '中正區', '信義區', '內湖區', '北投區', '南港區',
       '士林區', '大同區', '大安區', '文山區', '松山區', '萬華區', 'depart', 'advstore',
       'market', 'night', 'park', 'school', 'hospital', 'tv', 'icebox', 'cold',
       'washer', 'hotwater', 'four', 'broadband', 'landpost', 'pet', 'cook',
       'trabus', 'balcony_0', 'lease', 'bed', 'wardrobe', 'sofa', 'bookTable',
       'chair', 'naturalgas', 'cartplace', 'balcony_1', 'lift', 'boy',
       'isleastrent', 'girl']]

In [106]:
clf2 = joblib.load('clf.pkl')
rig2 = joblib.load('rig.pkl')

In [107]:
pred_y = clf2.predict(full_data)

In [108]:
pred_y

array([ 13566.85611968,  14326.12432426,   9054.71411047, ...,
        79397.2946937 ,  24108.43000129,  26047.5142449 ])

In [109]:
pred_y2 = pred_y.clip(0)

In [110]:
pred_y2

array([ 13566.85611968,  14326.12432426,   9054.71411047, ...,
        79397.2946937 ,  24108.43000129,  26047.5142449 ])

In [111]:
print(len(test_data),len(pred_y2))

4872 4872


In [112]:
pred_frame = pd.DataFrame(pred_y2,columns =['predict'] )

In [113]:
data_final = full.merge(pred_frame, left_index=True, right_index=True)
data_final.head()

Unnamed: 0,id,room,area,houseage,living,condition,floor,layout,allfloor,kind_name,...,girl,lease,isleastrent,boy,watermemachine,bookshelf,teaTable,dinette,gasstove,predict
0,948443,0,8.0,0,"depart,advstore,market,night,park,school,hospital","tv,icebox,cold,washer,hotwater,four,broadband,...",2,,7,分租套房,...,0,0,0,0,0,0,0,0,0,13566.85612
1,3736040,0,13.0,0,"depart,advstore,market,night,park,school,hospital","tv,icebox,cold,washer,hotwater,four,broadband,...",5,,5,分租套房,...,0,0,0,0,0,0,0,0,0,14326.124324
2,4308390,0,10.0,0,0,"tv,icebox,cold,washer,hotwater,broadband,landp...",5,,7,分租套房,...,0,0,0,0,0,0,0,0,0,9054.71411
3,4512785,0,4.0,0,0,"tv,cold,washer,hotwater,four,broadband,natural...",4,,5,雅房,...,0,0,0,0,0,0,0,0,0,4692.918097
4,4514061,0,6.0,0,"advstore,market,night,park,school,hospital","tv,icebox,cold,washer,hotwater,four,broadband,...",2,,4,分租套房,...,0,0,0,0,0,0,0,0,0,8274.203269


In [114]:
data_final['group5'] = 5

In [115]:
upload = data_final[['group5','id','predict']]

In [35]:
upload.shape[0]

4872

In [86]:
upload.head()

Unnamed: 0,group5,id,predict
0,5,948443,11363.615974
1,5,3736040,12455.800758
2,5,4308390,8891.116519
3,5,4512785,4586.036178
4,5,4514061,7777.760147


In [116]:
upload.to_csv(r'C:\Users\GN1504301\Desktop\predict_group5.csv',index=None,encoding = 'utf-8')