#### Import relevant libraries

In [None]:
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
import xgboost as xgb
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.base import TransformerMixin
import math
from sklearn import datasets, linear_model
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from scipy import stats

%matplotlib inline

#### Load data 

In [None]:
# Macro-economic columns that are read from macro.csv in addition to `timestamp`.
macro_cols = [
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "rent_price_4+room_bus",
    "museum_visitis_per_100_cap",
    "apartment_build",
]

## https://www.kaggle.com/bguberfain/naive-xgb-lb-0-317/comments/notebook

In [None]:
# Directory holding the competition CSV files.
path = 'F:/Kaggle/Sberbank Housing Price Prediction'
train_csv, test_csv, macro_csv, sample_csv = (
    os.path.join(path, fname)
    for fname in ("train.csv", "test.csv", "macro.csv", "sample_submission.csv")
)

train = pd.read_csv(train_csv, parse_dates=['timestamp'])
test = pd.read_csv(test_csv, parse_dates=['timestamp'])
# Only `timestamp` plus the selected macro columns are loaded from macro.csv.
macro = pd.read_csv(macro_csv, parse_dates=['timestamp'], usecols=['timestamp'] + macro_cols)
submission = pd.read_csv(sample_csv)

# Left-join the macro indicators onto each transaction by timestamp.
train = train.merge(macro, how='left', on='timestamp')
test = test.merge(macro, how='left', on='timestamp')
print(train.shape, test.shape)

### Setup right CV strategy using https://www.kaggle.com/c/sberbank-russian-housing-market/discussion/32717

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# never executed. It would thin out pre-2015 "Investment" rows whose price_doc
# is <= 1M, == 2M or == 3M, keeping every 10th / 3rd / 2nd such row and
# dropping the rest from `train`.
'''trainsub = train[train.timestamp < '2015-01-01']
trainsub = trainsub[trainsub.product_type=="Investment"]

ind_1m = trainsub[trainsub.price_doc <= 1000000].index
ind_2m = trainsub[trainsub.price_doc == 2000000].index
ind_3m = trainsub[trainsub.price_doc == 3000000].index

train_index = set(train.index.copy())

for ind, gap in zip([ind_1m, ind_2m, ind_3m], [10, 3, 2]):
    ind_set = set(ind)
    ind_set_cut = ind.difference(set(ind[::gap]))

    train_index = train_index.difference(ind_set_cut)

train = train.loc[train_index]
'''

In [None]:
### Drop columns that are entirely NaN (they contribute nothing) and keep only
### the columns that train and test have in common.
# dropna returns a new frame; the original cell discarded that result, so the
# all-NaN columns were never actually removed. Assign it back.
test = test.dropna(how="all", axis=1)
train = train.dropna(how="all", axis=1)

col_test = test.columns
col_train = train.columns
col_train_unique = set(col_train)
# Iterate over test's columns so the shared columns keep test's column order.
intersection = [val for val in col_test if val in col_train_unique]
#print (intersection)

# Save the target first: it is removed by the intersection unless test also
# carries a price_doc column.
price_doc = train['price_doc']
train = train[intersection]
test = test[intersection]

In [None]:
# Stack train on top of test (row labels are kept as-is) so feature engineering
# and imputation run once over both.
combined_set = pd.concat(objs=[train, test], axis=0)

In [None]:
def hmean(x):
    """Return the harmonic mean of ``x``, or the sentinel 10000 on failure.

    Thin wrapper around ``scipy.stats.hmean`` meant for row-wise
    ``DataFrame.apply``: if scipy raises for the given values, the sentinel
    value 10000 is returned instead of propagating the error.

    Only ``Exception`` subclasses are swallowed, so KeyboardInterrupt and
    SystemExit still interrupt a long-running ``apply``.
    """
    try:
        return stats.hmean(x)
    except Exception:
        return 10000

In [None]:
### Feature engineering
### Ideas still to incorporate -- floor/full_sq, life_sq/full_sq, build_month (new), build_week (new), kitch/full, kitch>full_sq,
### harmonic mean and mean of (school, water, kindergarten, fitness, public_transport_station_km,church_synagogue,metro, metro_min_walk)

# Ratio features (a zero denominator produces inf or NaN here).
combined_set['floor_by_full'] = combined_set['floor']/combined_set['full_sq']
combined_set['kitch_by_full'] = combined_set['kitch_sq']/combined_set['full_sq']
combined_set['life_by_full'] = combined_set['life_sq']/combined_set['full_sq']
combined_set['avg_area_per_room'] = combined_set['full_sq']/combined_set['num_room']

# Groups of distance columns (*_km) that are summarised row-wise below.
absolutely_good_stuff = ['bus_terminal_avto_km','fitness_km','green_zone_km','office_km','school_km','workplaces_km','public_healthcare_km']
educational_stuff  = ['additional_education_km','kindergarten_km','preschool_km','university_km']
religious_stuff = ['big_church_km','church_synagogue_km','mosque_km']
work_stuff = ['office_km','workplaces_km']
fitness_health_stuff = ['basketball_km','fitness_km','green_zone_km','park_km','public_healthcare_km']
bad_stuff = ['detention_facility_km','hospice_morgue_km','industrial_km','nuclear_reactor_km',]
other_good_stuff  = ['bus_terminal_avto_km','public_transport_station_km','railroad_station_avto_km']

# For each group: harmonic mean, maximum and minimum distance per row.
combined_set['absolutely_good_stuff'] = combined_set[absolutely_good_stuff].apply(hmean, axis=1)
combined_set['max_good_stuff'] = combined_set[absolutely_good_stuff].apply(max, axis=1)
combined_set['min_good_stuff'] = combined_set[absolutely_good_stuff].apply(min, axis=1)
combined_set['religious_stuff'] = combined_set[religious_stuff].apply(hmean, axis=1)
combined_set['max_religious_stuff'] = combined_set[religious_stuff].apply(max, axis=1)
combined_set['min_religious_stuff'] = combined_set[religious_stuff].apply(min, axis=1)
combined_set['work_stuff'] = combined_set[work_stuff].apply(hmean, axis=1)
combined_set['max_work_stuff'] = combined_set[work_stuff].apply(max, axis=1)
combined_set['min_work_stuff'] = combined_set[work_stuff].apply(min, axis=1)
combined_set['fitness_health_stuff'] = combined_set[fitness_health_stuff].apply(hmean, axis=1)
combined_set['max_fitness_health_stuff'] = combined_set[fitness_health_stuff].apply(max, axis=1)
combined_set['min_fitness_health_stuff'] = combined_set[fitness_health_stuff].apply(min, axis=1)

combined_set['min_bad_stuff'] = combined_set[bad_stuff].apply(min, axis=1)
combined_set['bad_stuff_hmean'] = combined_set[bad_stuff].apply(hmean, axis=1)
# Fixed: this column was computed with `min`, making it a duplicate of min_bad_stuff.
combined_set['max_bad_stuff'] = combined_set[bad_stuff].apply(max, axis=1)

combined_set['other_good_stuff'] = combined_set[other_good_stuff].apply(hmean, axis=1)
combined_set['min_other_good_stuff'] = combined_set[other_good_stuff].apply(min, axis=1)
combined_set['max_other_good_stuff'] = combined_set[other_good_stuff].apply(max, axis=1)


# Add month-year: number of rows that fall in the same (year, month)
month_year = (combined_set.timestamp.dt.month + combined_set.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
combined_set['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count: number of rows that fall in the same (year, week)
# NOTE: `dt.weekofyear` is deprecated in newer pandas in favour of
# `dt.isocalendar().week`; it is kept for the pandas version this notebook targets.
week_year = (combined_set.timestamp.dt.weekofyear + combined_set.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
combined_set['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
combined_set['month'] = combined_set.timestamp.dt.month
combined_set['dow'] = combined_set.timestamp.dt.dayofweek

# Remove timestamp column (may overfit the model in train)
combined_set.drop(['timestamp'], axis=1, inplace=True)

#### We are working towards a K-means based clustering followed by 2 stage price prediction

In [None]:
#from sklearn.base import TransformerMixin
## missing value impute 

def imputer(dataframe, impute_continuous=True, impute_categorical=True):
    """Fill missing values in ``dataframe`` in place and return it.

    Columns whose dtype is ``object`` are treated as categorical and filled
    with their most frequent value (mode); all other columns are treated as
    continuous and filled with their mean. When a column has no usable
    mean/mode (for instance because every value is missing) it is filled with
    the sentinel -99 instead.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to impute. Its columns are overwritten in place.
    impute_continuous : bool, default True
        Whether to fill the non-object columns.
    impute_categorical : bool, default True
        Whether to fill the object columns.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    categorical = []
    continuous = []
    for f in dataframe.columns:
        if dataframe[f].dtype == 'object':
            categorical.append(f)
        else:
            continuous.append(f)

    print (categorical)
    print (continuous)
    if impute_continuous:
        for c in continuous:
            mean = dataframe[c].mean()
            if pd.isnull(mean):
                # No finite mean (e.g. all values missing): use the sentinel.
                # Assign back instead of chained `fillna(inplace=True)`, which
                # may act on a temporary copy and leave the frame unchanged.
                dataframe[c] = dataframe[c].fillna(-99)
                print ("filling -99 for ",c)
            else:
                dataframe[c] = dataframe[c].fillna(mean)
                print (c+"_mean is_" + str(mean))
                print ("-------------------------")

    print ("===============================================")

    if impute_categorical:
        for c in categorical:
            modes = dataframe[c].mode()
            # mode() is empty when the column has no non-missing value; the
            # original `mode()[0]` raised there instead of reaching the -99 branch.
            if modes.empty or pd.isnull(modes.iloc[0]):
                dataframe[c] = dataframe[c].fillna(-99)
                print ("filling -99 for ",c)
            else:
                dataframe[c] = dataframe[c].fillna(modes.iloc[0])
                print (c+"_mode is_" + str(modes.iloc[0]))
                print ("-------------------------")
    return dataframe

In [None]:
# Map +inf (e.g. from a division by zero) to 0 before imputing the remaining gaps.
# NOTE(review): -inf is not replaced here -- confirm that it cannot occur.
combined_set = combined_set.replace(to_replace=np.inf, value=0)
combined_set_upd = imputer(combined_set, impute_continuous=True, impute_categorical=True)

In [None]:
### Separate test and train data sets again -- these are the final test/train sets.
### A proper CV hold-out still needs to be set up.
n_train = train.shape[0]
# .copy(): the frames are modified in place further down (label encoding,
# dropping `id`). Working on explicit copies instead of slices of
# combined_set_upd avoids SettingWithCopy ambiguity and leaves the combined
# frame untouched.
final_train = combined_set_upd[:n_train].copy() # Up to the last initial training set row
final_test = combined_set_upd[n_train:].copy() # Past the last initial training set row

print ('shape of train is:', final_train.shape)
print ('shape of test is:',final_test.shape)

In [None]:
# Label-encode every object-dtype column, fitting each encoder on the union of
# train and test values so both frames share one integer coding.
categorical_var =[]
# `.ix` is deprecated (and removed in pandas 1.0); masking the column Index
# selects the same columns without it.
for f in final_train.columns[final_train.columns != 'timestamp']:
    if final_train[f].dtype=='object':
        print(f)
        categorical_var.append(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(final_train[f].values.astype('str')) + list(final_test[f].values.astype('str')))
        final_train[f] = lbl.transform(list(final_train[f].values.astype('str')))
        final_test[f] = lbl.transform(list(final_test[f].values.astype('str')))

In [None]:
# Quick look at the first rows of the encoded test frame.
final_test.head(n=5)

In [None]:
# Target series, and the feature frames with the `id` column removed in place.
Y_train = price_doc
for frame in (final_train, final_test):
    frame.drop('id', axis=1, inplace=True)
X_test = final_test

In [None]:
### define the rmsle func
def rmsle_eval(y, y0):
    """Root-mean-squared-log-error metric, passed as ``feval`` to ``xgb.train``.

    ``y`` is the array of predictions; ``y0`` is an object exposing
    ``get_label()`` that returns the true values. Returns the pair
    ``('error', rmsle)``.
    """
    labels = y0.get_label()
    assert len(y) == len(labels)
    log_gap = np.log1p(y) - np.log1p(labels)
    score = np.sqrt(np.mean(np.power(log_gap, 2)))
    return 'error', score

In [None]:
# Names of the float64/int64 columns, excluding `id` and `timestamp` if present.
numeric_frame = final_train.select_dtypes(include=['float64', 'int64'])
train_columns = list(set(numeric_frame.columns) - {'id', 'timestamp'})

#### KMEANS FROM HERE 

In [None]:
## Standardize each column (z-score) before clustering.
from scipy.stats import zscore  # import kept in case other cells rely on it

# The original cell called `final_train.apply(zscore)` / `final_test.apply(zscore)`
# and discarded the results, so k-means ran on unscaled data. The scaled
# frames are now assigned back.
# Both frames are scaled with the TRAINING mean/std (population std, ddof=0,
# the same convention as scipy's zscore) so train and test share one scale.
train_mean = final_train.mean()
# Constant columns have std 0; divide by 1 instead so they become 0, not NaN.
train_std = final_train.std(ddof=0).replace(0, 1)
final_train = (final_train - train_mean) / train_std
final_test = (final_test - train_mean) / train_std

In [None]:
# Plain numpy arrays for the clustering / PCA steps.
y_train = price_doc.values
x_train, x_test = (frame[train_columns].values for frame in (final_train, final_test))

In [None]:
### Elbow plot: k-means distortion for different values of k (k = 1..9)
import numpy as np
from scipy import cluster
from matplotlib import pyplot

k_values = list(range(1, 10))
# cluster.vq.kmeans returns a (centroids, distortion) pair for each k.
initial = [cluster.vq.kmeans(x_train, k) for k in k_values]
# Plot against the actual k. The original plotted against the list position
# (0..8), which shifted the elbow one step to the left.
pyplot.plot(k_values, [var for (cent, var) in initial])
pyplot.xlabel('k (number of clusters)')
pyplot.ylabel('distortion')
pyplot.show()

## takeaway: 7 clusters look good for starters -- although 7 is a bit too much, but we will persist and check later on

In [None]:
from sklearn.cluster import KMeans

# Fit k-means on the training matrix, then assign every test row to a cluster.
clusters = 5
kmeans = KMeans(n_clusters=clusters, random_state=0)
kmeans.fit(x_train)
test_labels = kmeans.predict(x_test)

In [None]:
#### Now that we have n clusters --- we will do separate prediction for each using xgboost

In [None]:
#29035 / 7662

In [None]:
### Separate test and train data sets again from the combined frame, so the
### per-cluster models start from frames that still contain `id`.
n_train = train.shape[0]
# .copy(): the frames are modified in place below (label encoding). Explicit
# copies avoid SettingWithCopy ambiguity and keep combined_set untouched.
final_train = combined_set[:n_train].copy() # Up to the last initial training set row
final_test = combined_set[n_train:].copy() # Past the last initial training set row

print ('shape of train is:', final_train.shape)
print ('shape of test is:',final_test.shape)

In [None]:
### Encoding: label-encode every object-dtype column, fitting each encoder on
### the union of train and test values so both frames share one integer coding.
categorical_var =[]
# `.ix` is deprecated (and removed in pandas 1.0); masking the column Index
# selects the same columns without it.
for f in final_train.columns[final_train.columns != 'timestamp']:
    if final_train[f].dtype=='object':
        print(f)
        categorical_var.append(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(final_train[f].values.astype('str')) + list(final_test[f].values.astype('str')))
        final_train[f] = lbl.transform(list(final_train[f].values.astype('str')))
        final_test[f] = lbl.transform(list(final_test[f].values.astype('str')))

In [None]:
### Give the cluster labels the same row index as the frame they will be joined to.
assign = final_train.index.values
train_labels = pd.DataFrame({"cluster": kmeans.labels_}, index=assign)

assign1 = final_test.index.values
test_labels = pd.DataFrame({"cluster": test_labels}, index=assign1)

In [None]:
## Append the cluster label as an extra column of each frame (aligned on the row index).
final_train, final_test = (
    pd.concat([frame, labels], axis=1)
    for frame, labels in ((final_train, train_labels), (final_test, test_labels))
)

### So, at this point we have both test and train datasets appended with respective clusters

#### next: create separate test/train for each cluster and train a xgb separately 
#### automate this so that we can iterate over number of clusters

#### for those clusters where prediction is not good, we will use the global xgb estimates

In [None]:
##### real stuff from here 

## remember there is still a y_train --- which is price_doc from original train df

In [None]:
### Subset test/train by cluster: train_names[k] / test_names[k] hold the rows
### of final_train / final_test assigned to cluster k (k = 0 .. clusters-1).
train_names = []
test_names = []
# The loop variable is `cluster_id`, not `cluster`: the original name rebound
# the `cluster` module imported from scipy, so the elbow-plot cell could not be
# re-run afterwards. The lists are also filled with the frames directly instead
# of with placeholder strings that were then overwritten.
for cluster_id in range(clusters):
    print ("train" + str(cluster_id))
    train_names.append(final_train[final_train['cluster'] == cluster_id])

for cluster_id in range(clusters):
    print ("test" + str(cluster_id))
    test_names.append(final_test[final_test['cluster'] == cluster_id])

In [None]:
### PCA-based dimensionality reduction, used to plot the features against
### price_doc and inspect the clusters visually.

from sklearn.decomposition import PCA
components = 5
pca = PCA(n_components=components)
pca.fit(x_train)
# A pasted estimator repr (`PCA(copy=True, ...)`) used to sit here; it only
# built and discarded a second, unfitted PCA object, so it was removed.
print(pca.explained_variance_ratio_)
plt.plot(pca.explained_variance_, linewidth=2)
#plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

## this is a weird-looking graph --- not at all smooth
## so we need only 1 component from PCA

In [None]:
## PCA with only 1 component
pca = PCA(n_components=1)
# fit() returns the estimator itself, so the notebook still echoes the fitted
# PCA; the pasted repr that built a throwaway second PCA object was removed.
pca.fit(x_train)

In [None]:
#### Visualize PCA0 and price_doc by cluster for the train set

assign_upd = final_train.index.values

# Wrap the target and the 1-D projection as DataFrames that carry
# final_train's row index, so the column-wise concat below aligns the rows.
y_train = pd.DataFrame(y_train)
y_train.index = assign_upd
PCA0 = pd.DataFrame(pca.transform(x_train))
PCA0.index = assign_upd

for_viz = pd.concat([final_train[['cluster']], y_train, PCA0], axis=1)
for_viz.columns = ['cluster','price_doc','PCA0']
for_viz.head()

In [None]:
# Scatter of PCA0 against price_doc, coloured by cluster, with a log-scaled y axis.
plot = sns.lmplot(data=for_viz, x="PCA0", y="price_doc", hue="cluster", fit_reg=False)
plot.set(yscale="log")

### takeaway: within each cluster price varies quite a bit but pca0 is more or less static 
### this could be because we did not factor in price during clustering process

In [None]:
### Prepare separate frames for each of the five clusters --- each train frame
### is later split again into train + 33% validation.

# Training features per cluster, without the `id` and `cluster` columns.
train_c0, train_c1, train_c2, train_c3, train_c4 = [
    final_train.loc[train_names[k].index].drop(['id', 'cluster'], axis=1)
    for k in range(5)
]

# Test rows per cluster; `id` and `cluster` are kept here (`id` is needed for the submission).
test_c0, test_c1, test_c2, test_c3, test_c4 = [
    final_test.loc[test_names[k].index] for k in range(5)
]

# Targets per cluster, selected with the same row labels as the training features.
y_train_c0, y_train_c1, y_train_c2, y_train_c3, y_train_c4 = [
    y_train.loc[train_names[k].index] for k in range(5)
]

In [None]:
### xgb for c0

test_size = 0.33

# Hold out part of this cluster's training rows as a validation set for early stopping.
X_tr_c0, X_test_c0, Y_tr_c0, Y_test_c0 = train_test_split(train_c0, y_train_c0, test_size=test_size, random_state=1234)

xg_train = xgb.DMatrix(X_tr_c0, Y_tr_c0.values)
xg_valid = xgb.DMatrix(X_test_c0, Y_test_c0.values)
test_c0_upd = test_c0.drop(['id','cluster'],axis=1,inplace=False)
xg_test = xgb.DMatrix(test_c0_upd)

param = {
    'objective': 'reg:linear',
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    'eta': 0.1,
    'max_depth': 5,
    'silent': 1,
    'lambda': 1,
    'min_child_weight': 3,
}

watchlist = [ (xg_train,'train'), (xg_valid, 'test') ]
num_round = 10000

# Pass 1: early stopping on the custom RMSLE metric finds the best round.
bst = xgb.train(param, xg_train, num_round, watchlist,feval=rmsle_eval,early_stopping_rounds=30)
pred_for_rmsle = bst.predict(xg_valid)

# best_iteration is a 0-based index, so the number of rounds to train is
# best_iteration + 1 (the original refit stopped one round short).
num_boost_round = bst.best_iteration + 1
# Pass 2: refit with that many rounds and predict this cluster's test rows.
bst = xgb.train(param, xg_train, num_boost_round,watchlist)
prediction = bst.predict(xg_test)
df_c0 = pd.DataFrame({'id':test_c0.id,'price_doc':prediction})

In [None]:
### xgb for c1

test_size = 0.33

# Hold out part of this cluster's training rows as a validation set for early stopping.
X_tr_c1, X_test_c1, Y_tr_c1, Y_test_c1 = train_test_split(train_c1, y_train_c1, test_size=test_size, random_state=1234)

xg_train = xgb.DMatrix(X_tr_c1, Y_tr_c1.values)
xg_valid = xgb.DMatrix(X_test_c1, Y_test_c1.values)
test_c1_upd = test_c1.drop(['id','cluster'],axis=1,inplace=False)
xg_test = xgb.DMatrix(test_c1_upd)

param = {
    'objective': 'reg:linear',
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    'eta': 0.1,
    'max_depth': 5,
    'silent': 1,
    'lambda': 1,
    'min_child_weight': 3,
}

watchlist = [ (xg_train,'train'), (xg_valid, 'test') ]
num_round = 10000

# Pass 1: early stopping on the custom RMSLE metric finds the best round.
bst = xgb.train(param, xg_train, num_round, watchlist,feval=rmsle_eval,early_stopping_rounds=30)
pred_for_rmsle = bst.predict(xg_valid)

# best_iteration is a 0-based index, so the number of rounds to train is
# best_iteration + 1 (the original refit stopped one round short).
num_boost_round = bst.best_iteration + 1
# Pass 2: refit with that many rounds and predict this cluster's test rows.
bst = xgb.train(param, xg_train, num_boost_round,watchlist)
prediction = bst.predict(xg_test)
df_c1 = pd.DataFrame({'id':test_c1.id,'price_doc':prediction})

In [None]:
### xgb for c2

test_size = 0.33

# Hold out part of this cluster's training rows as a validation set for early stopping.
X_tr_c2, X_test_c2, Y_tr_c2, Y_test_c2 = train_test_split(train_c2, y_train_c2, test_size=test_size, random_state=1234)

xg_train = xgb.DMatrix(X_tr_c2, Y_tr_c2.values)
xg_valid = xgb.DMatrix(X_test_c2, Y_test_c2.values)
test_c2_upd = test_c2.drop(['id','cluster'],axis=1,inplace=False)
xg_test = xgb.DMatrix(test_c2_upd)

param = {
    'objective': 'reg:linear',
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    'eta': 0.1,
    'max_depth': 5,
    'silent': 1,
    'lambda': 1,
    'min_child_weight': 3,
}

watchlist = [ (xg_train,'train'), (xg_valid, 'test') ]
num_round = 10000

# Pass 1: early stopping on the custom RMSLE metric finds the best round.
bst = xgb.train(param, xg_train, num_round, watchlist,feval=rmsle_eval,early_stopping_rounds=30)
pred_for_rmsle = bst.predict(xg_valid)

# best_iteration is a 0-based index, so the number of rounds to train is
# best_iteration + 1 (the original refit stopped one round short).
num_boost_round = bst.best_iteration + 1
# Pass 2: refit with that many rounds and predict this cluster's test rows.
bst = xgb.train(param, xg_train, num_boost_round,watchlist)
prediction = bst.predict(xg_test)
df_c2 = pd.DataFrame({'id':test_c2.id,'price_doc':prediction})

In [None]:
### xgb for c3

test_size = 0.33

# Hold out part of this cluster's training rows as a validation set for early stopping.
X_tr_c3, X_test_c3, Y_tr_c3, Y_test_c3 = train_test_split(train_c3, y_train_c3, test_size=test_size, random_state=1234)

xg_train = xgb.DMatrix(X_tr_c3, Y_tr_c3.values)
xg_valid = xgb.DMatrix(X_test_c3, Y_test_c3.values)
test_c3_upd = test_c3.drop(['id','cluster'],axis=1,inplace=False)
xg_test = xgb.DMatrix(test_c3_upd)

param = {
    'objective': 'reg:linear',
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    'eta': 0.1,
    'max_depth': 5,
    'silent': 1,
    'lambda': 1,
    'min_child_weight': 3,
}

watchlist = [ (xg_train,'train'), (xg_valid, 'test') ]
num_round = 10000

# Pass 1: early stopping on the custom RMSLE metric finds the best round.
bst = xgb.train(param, xg_train, num_round, watchlist,feval=rmsle_eval,early_stopping_rounds=30)
pred_for_rmsle = bst.predict(xg_valid)

# best_iteration is a 0-based index, so the number of rounds to train is
# best_iteration + 1 (the original refit stopped one round short).
num_boost_round = bst.best_iteration + 1
# Pass 2: refit with that many rounds and predict this cluster's test rows.
bst = xgb.train(param, xg_train, num_boost_round,watchlist)
prediction = bst.predict(xg_test)
df_c3 = pd.DataFrame({'id':test_c3.id,'price_doc':prediction})

In [None]:
### xgb for c4

test_size = 0.33

# Hold out part of this cluster's training rows as a validation set for early stopping.
X_tr_c4, X_test_c4, Y_tr_c4, Y_test_c4 = train_test_split(train_c4, y_train_c4, test_size=test_size, random_state=1234)

xg_train = xgb.DMatrix(X_tr_c4, Y_tr_c4.values)
xg_valid = xgb.DMatrix(X_test_c4, Y_test_c4.values)
test_c4_upd = test_c4.drop(['id','cluster'],axis=1,inplace=False)
xg_test = xgb.DMatrix(test_c4_upd)

param = {
    'objective': 'reg:linear',
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    'eta': 0.1,
    'max_depth': 5,
    'silent': 1,
    'lambda': 1,
    'min_child_weight': 3,
}

watchlist = [ (xg_train,'train'), (xg_valid, 'test') ]
num_round = 10000

# Pass 1: early stopping on the custom RMSLE metric finds the best round.
bst = xgb.train(param, xg_train, num_round, watchlist,feval=rmsle_eval,early_stopping_rounds=30)
pred_for_rmsle = bst.predict(xg_valid)

# best_iteration is a 0-based index, so the number of rounds to train is
# best_iteration + 1 (the original refit stopped one round short).
num_boost_round = bst.best_iteration + 1
# Pass 2: refit with that many rounds and predict this cluster's test rows.
bst = xgb.train(param, xg_train, num_boost_round,watchlist)
prediction = bst.predict(xg_test)
df_c4 = pd.DataFrame({'id':test_c4.id,'price_doc':prediction})

In [None]:
# Stack the per-cluster predictions, sort the rows by index, and write the submission file.
per_cluster_predictions = [df_c0, df_c1, df_c2, df_c3, df_c4]
submit = pd.concat(per_cluster_predictions, axis=0).sort_index(axis=0)
submit.to_csv("kmeans_XGB_try1.csv", index=False)