In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import seaborn as sns
from mlcompetitions import *
import random
from matplotlib.ticker import FuncFormatter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import matplotlib.pyplot as plt
from collections import Counter
#rom IPython.core.interactiveshell import InteractiveShell
#nteractiveShell.ast_node_interactivity = "all"




pd.set_option('display.max_columns', 500)
%matplotlib inline

In [2]:
ml=ML_competitions(r'project')

In [3]:
macro_cols = ["balance_trade", "balance_trade_growth", "usdrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]
add_macro_cols=['average_provision_of_build_contract_moscow',
 'cpi',
 'deposits_growth',
 'gdp_quart_growth',
 'mortgage_growth',
 'net_capital_export',
 'ppi']

In [4]:
path=r'data'
df_train =pd.read_csv(os.path.join(path,'train.csv'),sep=',',parse_dates=['timestamp'])
df_test = pd.read_csv(os.path.join(path,'test.csv'),sep=',',parse_dates=['timestamp'])
df_macro = pd.read_csv(os.path.join(path,'macro.csv'),sep=',',parse_dates=['timestamp'], usecols=['timestamp'] + macro_cols+add_macro_cols)

In [5]:
ml.load_data(df_train.drop('price_doc',axis=1),df_train.price_doc.values.ravel(),df_test)

In [6]:
#ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

In [7]:
ecology_params=['park_km','green_zone_km','industrial_km','water_treatment_km','cemetery_km',
 'incineration_km','railroad_station_avto_km','public_transport_station_km','water_km',
 'water_1line','mkad_km','big_road1_1line','big_road2_km','railroad_km', 'oil_chemistry_km',
 'nuclear_reactor_km','radiation_km','power_transmission_line_km','thermal_power_plant_km','green_part_500']


house_loc_params=['ID_metro','sub_area',
 'area_m',
 'metro_min_avto',
 'metro_km_avto',
 'metro_min_walk',
 'metro_km_walk',
 'kindergarten_km',
 'school_km',
 'park_km',
 'green_zone_km',
 'industrial_km',
 'water_treatment_km',
 'cemetery_km',
 'incineration_km',
 'railroad_station_walk_km',
 'railroad_station_walk_min',
 'ID_railroad_station_walk']



def most_common(arr):
    t=arr.dropna()
    if len(t)>0:
        return Counter(list(arr)).most_common(1)[0][0]
    else: return np.nan

def eco_forest(df,ecology_params):
    eco_df=df[ecology_params+['ecology']].drop_duplicates()
    eco_df=eco_df[eco_df.ecology!='no data']
    eco_df.dropna(inplace=True,axis=1)
    lbl=LabelEncoder()
    lbl.fit(eco_df.ecology)
    y_eco=lbl.transform(eco_df.ecology)
    dummy_df=pd.get_dummies(eco_df[['big_road1_1line','water_1line']])
    eco_df.drop(['ecology','big_road1_1line','water_1line'],axis=1,inplace=True)
    rf=RandomForestClassifier(n_estimators=2000)
  
    scaler=StandardScaler().fit(eco_df)

    rf.fit(np.hstack([dummy_df.values,scaler.transform(eco_df)]),y_eco)
    return rf,lbl,scaler,eco_df.columns

def ecology_replace(x):
    x_uniq=x.unique()
    if len(x_uniq)>1:
        x=[el for el in x if el!='no data']
        res=max(set(x), key=x.count)
    else:
        res=x_uniq[0]
    return res
    
replace_arr=lambda column,data,dict_df:data[[column,'id_house']].apply(lambda row:dict_df.ix[row[1]][column],axis=1)    
replace_arr_with_nan=lambda column,data,dict_df:data[[column,'id_house']].apply(lambda row:dict_df.ix[row[1]][column]
                                                       if np.isnan(row[0]) else row[0] ,axis=1)     
    
def prerpocess_house_params(data):
    df=data.copy(deep=True)
    df['full_sq']=df[['full_sq','life_sq']].apply(lambda row:max(row[0],row[1]),axis=1)
    unique_houses=df[house_loc_params].drop_duplicates().reset_index(drop=True)
    unique_houses['id_house']=range(unique_houses.shape[0])
    df=pd.merge(df,unique_houses ,on =house_loc_params,how='left')
    unique_houses_groped=df.groupby(['id_house']).agg({

    'max_floor' : most_common, #np.median, 
    'material':most_common ,#np.median,
    'build_year':most_common, #np.median,
    'state':most_common, #np.median,
    'ecology':ecology_replace
    })  
    #df['max_floor']=df[['max_floor','id_house']].apply(lambda row:unique_houses_groped.ix[row[1]]['max_floor']
    #                                                  if np.isnan(row[0]) else row[0] ,axis=1)
    df['max_floor']=replace_arr('max_floor',df,unique_houses_groped)
    df['material']=replace_arr('material',df,unique_houses_groped)
    df['build_year']=replace_arr('build_year',df,unique_houses_groped)
    df['state']=replace_arr_with_nan('state',df,unique_houses_groped)
    #df['ecology']=replace_arr('ecology',df,unique_houses_groped)
    
    return df

def squar_fix_outliner(data):
    data['full_sq']=data['full_sq'].apply(lambda x:x/100 if x>4000 else x)
    data['life_sq']=data['life_sq'].apply(lambda x:x/100 if x>4000 else x)
    return data

def closest_in_list(myNumber,myList):
    return min(myList, key=lambda x:abs(x-myNumber))


def fix_square(data):
    manual_sq={23231:74,33974:60,36824:64}
    df=data.copy(deep=True)
    df['price_sq']=df_train.price_doc/df.full_sq
    agg_price_house=df.groupby(['id_house']).agg({
            'price_sq':np.median,'full_sq':lambda x:list(x)
        })
    df['estimated_sq']=df_train.price_doc/   df.id_house.apply(lambda x:agg_price_house.ix[x].price_sq)
   
    df['full_sq']=df[['estimated_sq','id_house','full_sq']].apply(lambda row:closest_in_list(row[0],agg_price_house.ix[row[1]].full_sq) if row[2]<10 else row[2],axis=1)
    df['full_sq']=df[['full_sq','id']].apply(lambda row:manual_sq[row[1]] if row[1] in list(manual_sq.keys())else row[0],axis=1)
    return df.drop(['price_sq','estimated_sq'],axis=1)


def correct_life_square(full_sq,life_sq,dict_val,life_prop,min_th,max_th,th):
    if pd.isnull(life_sq):
        return dict_val*full_sq
    elif life_prop> min_th*(1-th) and life_prop<max_th*(1+th):
        return life_sq
    else:
        return dict_val*full_sq
    


def life_fix_missing(data,th=0.25):

    df=data.copy(deep=True)
    df['life_prop']=df.life_sq/df.full_sq
    life_house_groped=df.groupby(['id_house','sub_area']).agg({'life_prop':np.median,}).reset_index()
    life_area_groped=df.groupby(['sub_area']).agg({'life_prop':np.median}).reset_index()

    life_house_groped=pd.merge(life_house_groped,life_area_groped ,on='sub_area',how='left')
    life_house_groped=life_house_groped.ix[:,-2:].mean(axis=1).to_dict()
    min_th=min(list(life_house_groped.values()))
    max_th=max(list(life_house_groped.values()))
    df['life_sq']=df[['life_sq','full_sq',
                      'id_house','life_prop']].apply(lambda row:
                                                     correct_life_square(row[1],
                                                                         row[0],
                                                                         life_house_groped[row[2]],row[3],
                                                                         min_th,max_th,th),axis=1)
    df.drop('life_prop',axis=1,inplace=True)
    return df
def process_build_year(data):
    
    data['build_year']=data.build_year.apply(lambda x: -10000 if x<1800 or np.isnan(x) else x)
    return data
def transform_knn(data):
    df=data.copy(deep=True)
    df.fillna('nan')
    scaler=StandardScaler()
    df['full_sq']=scaler.fit_transform(df.full_sq)
    cosine_cat_columns=['id_house','build_year','sub_area','product_type']
    df_cat=pd.get_dummies(df[cosine_cat_columns])
    df_cat['full_sq']=df['full_sq']
    return df_cat
def fix_kitch_outliners(data,up_th=1.5,low_th=0.7):
    df=data.copy(deep=True)
    df['build_year']=df[['build_year','id']].apply(lambda row:2014 if row[1]==21418 else row[0],axis=1)
    index=df[(df.kitch_sq>5) & (df.kitch_sq<df.full_sq)].index
    target=df.ix[index].kitch_sq.values
    knn_df=transform_knn(df)
    #knn=KNeighborsRegressor()
    #knn.fit(knn_df.ix[index],target)
    rf=RandomForestRegressor(n_estimators=1000)
    rf.fit(knn_df.ix[index],target)
    df['pred_kitch']=rf.predict(knn_df)
    df['kitch_sq']=df.kitch_sq.fillna(0)
    df['delta_kitch']=df.kitch_sq/df.pred_kitch
    df['kitch_sq']=df[['kitch_sq','pred_kitch','delta_kitch']].apply(lambda row: row[0] if row[2] >low_th and row[2]<up_th else row[1],axis=1)
    return df.drop(['delta_kitch','pred_kitch'],axis=1)

def life_sq_fix(data):
    data['life_sq']=data[['full_sq','life_sq','kitch_sq']].apply(lambda row:
                                                                 row[1] if row[1]<=row[0]-row[2] else row[0]-row[2],axis=1)
    return data

def fix_max_floor(data):
    err_floor=pd.concat([data[data.max_floor<data.floor],data[pd.isnull(data.max_floor)]],axis=0).drop_duplicates()
    err_floor=pd.DataFrame(err_floor.pivot_table(index='id_house',values='floor',aggfunc=np.max)).reset_index()
    err_floor.columns=['id_house','err_max_floor_flg']
    data=pd.merge(data,err_floor,on='id_house',how='left')
    data['max_floor']=data[['max_floor','err_max_floor_flg']].apply(lambda row:max(row[1],row[0]),axis=1)
    data['c']=(data['err_max_floor_flg']/data['err_max_floor_flg']).fillna(0)
    return data



In [8]:
%%time
def f_transform(X):
    
    X = pd.merge(X, df_macro, on='timestamp', how='left')
    # Add month-year
    month_year = (X.timestamp.dt.month + X.timestamp.dt.year * 100)
    month_year_cnt_map = month_year.value_counts().to_dict()
    X['month_year_cnt'] = month_year.map(month_year_cnt_map)
    
    # Add week-year count
    week_year = (X.timestamp.dt.weekofyear + X.timestamp.dt.year * 100)
    week_year_cnt_map = week_year.value_counts().to_dict()
    X['week_year_cnt'] = week_year.map(week_year_cnt_map)
    
    # Add month and day-of-week
    X['month'] = X.timestamp.dt.month
    X['dow'] = X.timestamp.dt.dayofweek
    

    #####################
    
    
    
    X=prerpocess_house_params(X)
    
    X=squar_fix_outliner(X)
    X=fix_square(X)
    ########
    X=life_fix_missing(X)
    X=process_build_year(X)
    X=fix_kitch_outliners(X)
    X=life_sq_fix(X)
    X=fix_max_floor(X)
 
    ################ 
    # Other feature engineering
    X['rel_floor'] = X['floor'] / X['max_floor'].astype(float)
    X['rel_kitch_sq'] = X['kitch_sq'] / X['full_sq'].astype(float)
    X["ratio_life_sq_full_sq"] = X["life_sq"] / X["full_sq"]
    # ratio of kitchen area to full area #
    #building year
    X['year']=X["timestamp"].dt.year
    X["age_of_building"] = X["build_year"] - X["year"]

    #last fist floor
    X['fist_floor']=X.floor.apply(lambda x:1 if x==1 else 0)
    X['last_floor']=X[['floor','max_floor']].apply(lambda x:1 if (x[1]-x[0])<1 else 0,axis=1)


    
    
    

    X.drop(['timestamp'], axis=1, inplace=True)
    X.drop(['id'], axis=1, inplace=True)
    # Deal with categorical values
    df_numeric = X.select_dtypes(exclude=['object'])
    df_obj = X.select_dtypes(include=['object']).copy()

    for c in df_obj:
        df_obj[c] = pd.factorize(X[c])[0]

    X = pd.concat([df_numeric, df_obj], axis=1)
    
    
    return X
a=ml.transform(f_transform)   

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


(7662, 323)
Wall time: 18min 7s


In [9]:
X=a[0].copy(deep=True)
X['price_meter']=a[1]/X.full_sq
median_price=np.median(X.price_meter.dropna())
X['temp_usd_compare']=X.price_meter*X.usdrub/median_price
X['price']=a[1]
X['price']=X[['price','full_sq',
              'temp_usd_compare','usdrub']].apply(lambda row:row[3]*row[1] if row[2]<3 else row[0],axis=1)

corrected_price=X['price']

In [10]:
# Convert to numpy values

num_train=df_train.shape[0]
# Create a validation set, with last 20% of data
num_val = int(num_train * 0.2)

X_train_all = a[0]
X_train = X_train_all[:num_train-num_val]
X_val = X_train_all[num_train-num_val:num_train]
#ylog_train_all =np.log1p( a[1])
ylog_train_all=a[1]
#ylog_train_all =np.log1p( corrected_price)

ylog_train = ylog_train_all[:-num_val]
ylog_val = ylog_train_all[-num_val:]

X_test = a[2]

df_columns = a[0].columns

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

X_train_all shape is (30471, 323)
X_train shape is (24377, 323)
y_train shape is (24377,)
X_val shape is (6094, 323)
y_val shape is (6094,)
X_test shape is (7662, 323)


In [11]:
#importance=pd.read_csv('importance.csv')
#importance_features=list(importance[importance.importance>=32].Feature)

importance_features=['full_sq',
 'floor',
 'life_sq',
 'micex_cbi_tr',
 'build_year',
 'usdrub',
 'kindergarten_km',
 'max_floor',
 'green_zone_km',
 'micex_rgbi_tr',
 'state',
 'metro_km_avto',
 'additional_education_km',
 'public_healthcare_km',
 'big_market_km',
 'mortgage_rate',
 'mosque_km',
 'railroad_km',
 'metro_min_avto',
 'public_transport_station_km',
 'area_m',
 'bus_terminal_avto_km',
 'water_treatment_km',
 'big_road2_km',
 'preschool_km',
 'thermal_power_plant_km',
 'hospice_morgue_km',
 'workplaces_km',
 'week_year_cnt',
 'park_km',
 'rel_floor',
 'ts_km',
 'prom_part_5000',
 'market_shop_km',
 'balance_trade',
 'nuclear_reactor_km',
 'industrial_km',
 'school_km',
 'power_transmission_line_km',
 'water_km',
 'green_part_1000',
 'church_synagogue_km',
 'deposits_rate',
 'rent_price_4+room_bus',
 'railroad_station_walk_km',
 'office_km',
 'kitch_sq',
 'fitness_km',
 'rel_kitch_sq',
 'ttk_km',
 'indust_part',
 'cemetery_km',
 'big_road1_km',
 'stadium_km',
 'trc_sqm_5000',
 'railroad_station_avto_km',
 'balance_trade_growth']

In [12]:
#importance=pd.read_csv('importance.csv')
df_columns=list( set(importance_features+add_macro_cols+['rel_floor','rel_kitch_sq',"ratio_life_sq_full_sq","age_of_building",'fist_floor','last_floor','year']))

In [13]:
dtrain_all = xgb.DMatrix(X_train_all[df_columns], ylog_train_all, feature_names=df_columns)
dtrain = xgb.DMatrix(X_train[df_columns], ylog_train, feature_names=df_columns)
dval = xgb.DMatrix(X_val[df_columns], ylog_val, feature_names=df_columns)
dtest = xgb.DMatrix(X_test[df_columns], feature_names=df_columns)

In [14]:
seeds=random.sample(range(2000), 20)
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0,'seed':123
}

In [15]:
%%time
res=[]
# Uncomment to tune XGB `num_boost_rounds`
for seed in seeds:
    xgb_params['seed']=seed
    partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                           early_stopping_rounds=20, verbose_eval=False)

    num_boost_round = partial_model.best_iteration
    best_score = partial_model.best_score
    res.append([num_boost_round,best_score,seed])
r=np.array([el[1] for el in res])
print('best score: ' ,r.mean(),' std: ',r.std())

best score:  2725750.65  std:  14004.4027147
Wall time: 6min 15s


In [16]:
r=np.array([el[1] for el in res])
'best score: ' ,r.mean(),' std: ',r.std()
#('best score: ', 0.42012769999999994, ' std: ', 0.001197290027520486)

#best score:  0.4210585  std:  0.000947589388923

('best score: ', 2725750.6499999999, ' std: ', 14004.402714726893)

In [17]:
#importance=partial_model.get_fscore()
#importance=pd.DataFrame(list(importance.items()),columns=['Feature','importance']).sort_values(by='importance',
#                                                                                               ascending=False).reset_index(drop=True)

#importance.to_csv('importance.csv',index=False)
#df_columns= list(importance[importance.importance>=32].Feature)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 60))
bar=xgb.plot_importance(partial_model, max_num_features=50, height=0.5, ax=ax)

In [None]:
res[0]

In [18]:
final_pred=[]
for el in res:
    xgb_params['seed']=el[2]
    model = xgb.train(xgb_params, dtrain_all, num_boost_round=el[0])
    ylog_pred = model.predict(dtest)#*X_test.usdrub
    y_pred =ylog_pred #(np.exp(ylog_pred) - 1)
    final_pred.append(y_pred)

In [19]:


df_sub = pd.DataFrame({'id': id_test, 'price_doc': list(pd.DataFrame(final_pred).mean())})

df_sub.to_csv(os.path.join(ml.project_save,'5.csv'), index=False)

### usd

In [20]:
# Convert to numpy values

num_train=df_train.shape[0]
# Create a validation set, with last 20% of data
num_val = int(num_train * 0.2)

X_train_all = a[0]
X_train = X_train_all[:num_train-num_val]
X_val = X_train_all[num_train-num_val:num_train]
#ylog_train_all =np.log1p( a[1])
ylog_train_all=a[1]/a[0].usdrub
#ylog_train_all =np.log1p( corrected_price)

ylog_train = ylog_train_all[:-num_val]
ylog_val = ylog_train_all[-num_val:]

X_test = a[2]

df_columns = a[0].columns

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

dtrain_all = xgb.DMatrix(X_train_all[df_columns], ylog_train_all, feature_names=df_columns)
dtrain = xgb.DMatrix(X_train[df_columns], ylog_train, feature_names=df_columns)
dval = xgb.DMatrix(X_val[df_columns], ylog_val, feature_names=df_columns)
dtest = xgb.DMatrix(X_test[df_columns], feature_names=df_columns)

X_train_all shape is (30471, 323)
X_train shape is (24377, 323)
y_train shape is (24377,)
X_val shape is (6094, 323)
y_val shape is (6094,)
X_test shape is (7662, 323)


In [21]:
%%time
res=[]
# Uncomment to tune XGB `num_boost_rounds`
for seed in seeds:
    xgb_params['seed']=seed
    partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                           early_stopping_rounds=20, verbose_eval=False)

    num_boost_round = partial_model.best_iteration
    best_score = partial_model.best_score
    res.append([num_boost_round,best_score,seed])
r=np.array([el[1] for el in res])
print('best score: ' ,r.mean(),' std: ',r.std())

best score:  56416.7824218  std:  308.851153935
Wall time: 2min 1s


In [22]:
final_pred=[]
for el in res:
    xgb_params['seed']=el[2]
    model = xgb.train(xgb_params, dtrain_all, num_boost_round=el[0])
    ylog_pred = model.predict(dtest)*X_test.usdrub
    y_pred =ylog_pred #(np.exp(ylog_pred) - 1)
    final_pred.append(y_pred)
    
df_sub = pd.DataFrame({'id': id_test, 'price_doc': list(pd.DataFrame(final_pred).mean())})

df_sub.to_csv(os.path.join(ml.project_save,'6.csv'), index=False)

### usdmeter

In [23]:
# Convert to numpy values

num_train=df_train.shape[0]
# Create a validation set, with last 20% of data
num_val = int(num_train * 0.2)

X_train_all = a[0]
X_train = X_train_all[:num_train-num_val]
X_val = X_train_all[num_train-num_val:num_train]
#ylog_train_all =np.log1p( a[1])
ylog_train_all=a[1]/a[0].usdrub/a[0].full_sq
#ylog_train_all =np.log1p( corrected_price)

ylog_train = ylog_train_all[:-num_val]
ylog_val = ylog_train_all[-num_val:]

X_test = a[2]

df_columns = a[0].columns

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

dtrain_all = xgb.DMatrix(X_train_all[df_columns], ylog_train_all, feature_names=df_columns)
dtrain = xgb.DMatrix(X_train[df_columns], ylog_train, feature_names=df_columns)
dval = xgb.DMatrix(X_val[df_columns], ylog_val, feature_names=df_columns)
dtest = xgb.DMatrix(X_test[df_columns], feature_names=df_columns)

X_train_all shape is (30471, 323)
X_train shape is (24377, 323)
y_train shape is (24377,)
X_val shape is (6094, 323)
y_val shape is (6094,)
X_test shape is (7662, 323)


In [24]:
%%time
res=[]
# Uncomment to tune XGB `num_boost_rounds`
for seed in seeds:
    xgb_params['seed']=seed
    partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                           early_stopping_rounds=20, verbose_eval=False)

    num_boost_round = partial_model.best_iteration
    best_score = partial_model.best_score
    res.append([num_boost_round,best_score,seed])
r=np.array([el[1] for el in res])
print('best score: ' ,r.mean(),' std: ',r.std())

best score:  816.2432587  std:  1.80074722324
Wall time: 2min 1s


In [25]:
final_pred=[]
for el in res:
    xgb_params['seed']=el[2]
    model = xgb.train(xgb_params, dtrain_all, num_boost_round=el[0])
    ylog_pred = model.predict(dtest)*X_test.usdrub*X_test.full_sq
    y_pred =ylog_pred #(np.exp(ylog_pred) - 1)
    final_pred.append(y_pred)
    
df_sub = pd.DataFrame({'id': id_test, 'price_doc': list(pd.DataFrame(final_pred).mean())})

df_sub.to_csv(os.path.join(ml.project_save,'7.csv'), index=False)

# meter


In [26]:
# Convert to numpy values

num_train=df_train.shape[0]
# Create a validation set, with last 20% of data
num_val = int(num_train * 0.2)

X_train_all = a[0]
X_train = X_train_all[:num_train-num_val]
X_val = X_train_all[num_train-num_val:num_train]
#ylog_train_all =np.log1p( a[1])
ylog_train_all=a[1]/a[0].full_sq
#ylog_train_all =np.log1p( corrected_price)

ylog_train = ylog_train_all[:-num_val]
ylog_val = ylog_train_all[-num_val:]

X_test = a[2]

df_columns = a[0].columns

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

dtrain_all = xgb.DMatrix(X_train_all[df_columns], ylog_train_all, feature_names=df_columns)
dtrain = xgb.DMatrix(X_train[df_columns], ylog_train, feature_names=df_columns)
dval = xgb.DMatrix(X_val[df_columns], ylog_val, feature_names=df_columns)
dtest = xgb.DMatrix(X_test[df_columns], feature_names=df_columns)

X_train_all shape is (30471, 323)
X_train shape is (24377, 323)
y_train shape is (24377,)
X_val shape is (6094, 323)
y_val shape is (6094,)
X_test shape is (7662, 323)


In [27]:
%%time
res=[]
# Uncomment to tune XGB `num_boost_rounds`
for seed in seeds:
    xgb_params['seed']=seed
    partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                           early_stopping_rounds=20, verbose_eval=False)

    num_boost_round = partial_model.best_iteration
    best_score = partial_model.best_score
    res.append([num_boost_round,best_score,seed])
r=np.array([el[1] for el in res])
print('best score: ' ,r.mean(),' std: ',r.std())

best score:  40949.5316406  std:  100.616211754
Wall time: 13min 9s


In [28]:
final_pred=[]
for el in res:
    xgb_params['seed']=el[2]
    model = xgb.train(xgb_params, dtrain_all, num_boost_round=el[0])
    ylog_pred = model.predict(dtest)*X_test.full_sq
    y_pred =ylog_pred #(np.exp(ylog_pred) - 1)
    final_pred.append(y_pred)
    
df_sub = pd.DataFrame({'id': id_test, 'price_doc': list(pd.DataFrame(final_pred).mean())})

df_sub.to_csv(os.path.join(ml.project_save,'8.csv'), index=False)