In [50]:
import pandas as pd
import numpy as np
import gc
from sklearn import preprocessing

def get_prices():
    prices = reduce_mem_usage(pd.read_csv('./input/price.csv',
                                         encoding='cp1251',
                                         parse_dates=['datefrom','dateto']))
    prices.drop(['date_salestart'], axis = 1, inplace = True)
    # prices["datefrom_week"] = prices['datefrom'].dt.week
    prices["datefrom_month"] = prices['datefrom'].dt.month
    # prices["dateto_week"] = prices['dateto'].dt.week
    prices["dateto_month"] = prices['dateto'].dt.month
    prices["have_price"] = prices['pricem2'].apply(lambda x: 1 if x > 50000 else 0)
    prices["not_saled"] = prices['dateto'].apply(lambda x: 1 if x == '2100-01-01 00:00:00' else 0)
    prices['sales_duration'] = ((prices['dateto'] - prices['datefrom'])/30).dt.days
    prices["sales_duration"] = prices['sales_duration'].apply(lambda x: x if x > 900 else 0)

    # unique_df = prices.nunique()
    # dummy_features = list(unique_df[unique_df <= 12].index)
    # prices = pd.get_dummies(prices, columns=dummy_features, dummy_na=True)
    # del unique_df
    aggregations = {}
    aggregations = {
                'pricem2': ['min','max','mean','count'],
#                 'datefrom_month': ['mean'],
#                 'dateto_month': ['mean'],
                'have_price': ['mean', 'sum'],
                'not_saled': ['mean', 'sum'],
                'sales_duration': ['min', 'mean', 'max'],
            }

    # for cat in dummy_features:
    #         aggregations[cat] = ['mean', 'min', 'max']

    prices_agg = prices.groupby('id_flatwork').agg(aggregations)
    prices_agg.columns = pd.Index(['PRICES_' + e[0] + "_" + e[1].upper() for e in prices_agg.columns.tolist()])
    return prices_agg

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if (col_type != object) & (col_type != 'datetime64[ns]'):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def one_hot_encoder(df, nan_as_category = True, keep_columns = None, max_num_of_unique_items = 31):
    original_columns = list(df.columns)
    if keep_columns is None:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    else:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object' and col not in keep_columns]
    for col in categorical_columns:
        if len(df[col].unique()) > max_num_of_unique_items :
            categorical_columns = list(set(categorical_columns) - set([col]))
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

def get_flats(df):
    flats = reduce_mem_usage(pd.read_csv('./input/flat.csv',
                                         encoding='cp1251',
                                         parse_dates=['date_salestart','date_settle','sale']))

    # flats["date_salestart_weekday"] = flats['date_salestart'].dt.weekday
    flats["date_salestart_week"] = flats['date_salestart'].dt.week
    # flats["date_salestart_day"] = flats['date_salestart'].dt.day
    flats["date_salestart_month"] = flats['date_salestart'].dt.month

    # flats["date_settle_weekday"] = flats['date_settle'].dt.weekday
    flats["date_settle_week"] = flats['date_settle'].dt.week
    # flats["date_settle_day"] = flats['date_settle'].dt.day
    flats["date_settle_month"] = flats['date_settle'].dt.month
    flats['time_before_settle'] = ((flats['date_settle'] - flats['date_salestart'])/30).dt.days
    
    
    flats['new_index_spalen'] = flats['id_bulk'] + '-' + flats['spalen'].astype(int).astype(str)
   
    aggregations = {
            'stage_number': ['max', 'mean'],
            'spalen': ['count','sum'],
            'square': ['sum', 'mean'],
#             'date_salestart_week': ['min','max','mean'],
#             'date_salestart_month': ['min','max','mean'],
#             'date_settle_week': ['min','max','mean'],
#             'date_settle_month': ['min','max','mean'],
            'time_before_settle': ['min','max','mean'],
        }
   
    flats_agg = flats.groupby(['new_index_spalen']).agg(aggregations)
    flats_agg.columns = pd.Index(['FLATS_sp_' + e[0] + "_" + e[1].upper() for e in flats_agg.columns.tolist()])
    
    discard_columns = ["id_bulk",
    "section",
    "date_settle",
    "date_salestart",
    "id_gk",
    "id_flatwork",
    "Класс объекта",
    "Количество помещений",
    "Огорожена территория",
    "Площадь земельного участка",
    "Входные группы",
    "Детский сад",
    "Школа",
    "Поликлиника",
    "ФОК",
    "Спортивная площадка",
    "Автомойка",
    "Кладовые",
    "Колясочные",
    "Кондиционирование",
    "Вентлияция",
    "Лифт",
    "Система мусоротведения",
    "Видеонаблюдение",
    "Подземная парковка",
    "Двор без машин",
    "Машиномест",
    "Площадь пром. зоны в радиусе 500 м",
    "Площадь зеленой зоны в радиусе 500 м",
    "До Кремля",
    "До ТТК(км)",
    "До Садового(км)",
    "До большой дороги на машине(км)",
    "До удобной авторазвязки на машине(км)",
    "До метро пешком(км)",
    "До промки(км)",
    "До парка(км)",
    "До парка пешком(км)",
    "Станций метро от кольца",
    "Площадь двора",
    "vid",
    "sale",
    "plan_size",
    ]
    feats = [f for f in flats_agg.columns.tolist() if f not in discard_columns]

    gc.collect()
    return flats_agg[feats]

def add_lags(df, feat, index='new_index_spalen', by_col='month_cnt', aggfunc=np.mean):
    temp = pd.pivot_table(df, index=index, values=[feat], columns=by_col, aggfunc=aggfunc)
#     temp = df.pivot(index=index, values=[feat], columns=by_col)
    cols = [feat+'_'+by_col+'_{}'.format(j[1]) if j[1] != "" else j[0] for i, j in enumerate(temp.columns)]
    temp.columns = cols
    temp[feat+'_lag_1'] = 0
    for row_idx in range(temp.shape[0]):
        for idx in range(len(cols)):
            value = temp.iloc[row_idx, -idx-2]
            if not np.isnan(value) :
                temp.iloc[row_idx, -1] = value
                break
    new_cols = [feat+'_lag_1']
    return temp, new_cols



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Memory usage of dataframe is 43.42 MB
Memory usage after optimization is: 23.73 MB
Decreased by 45.3%
Flats df shape: (1040, 9)


In [None]:



df_train = pd.read_csv("./input/train.csv", encoding='cp1251')
df_test = pd.read_csv("./input/test.csv", encoding='cp1251')
df_add = df_train[['bulk_id', 'start_square', 'spalen',
                   'Класс объекта', 'month','value','mean_sq']]
df_add['mean_sq_spalen'] = df_add['mean_sq'] / (df_add['spalen']+1)

df_train.drop(['start_square', 'plan_s', 'plan_m', 'plan_l', 'vid_0', 'vid_1', 'vid_2'], axis=1, inplace=True)
df_test['value']=-1

target = df_train['value'].copy()
df = pd.concat([df_train, df_test[df_train.columns]])

df['mean_sq_spalen'] = df['mean_sq'] / (df['spalen']+1)
df['mean_sq*price'] = df['mean_sq']*df['price'] ## 220.94
df['spalen*Площадь зеленой зоны в радиусе 500 м'] = df['spalen']*df['Площадь зеленой зоны в радиусе 500 м'] ## 220,04
df['spalen*month_cnt'] = df['spalen']*df['month_cnt'] ## 219.77
df['mean_sq*До удобной авторазвязки на машине(км)'] = df['mean_sq']*df['До удобной авторазвязки на машине(км)'] ## 219.78
df['Станций метро от кольца*price'] = df['Станций метро от кольца']*df['price'] ## 219.19
df['price-*-mean_sq*price'] = df['price']*df['mean_sq*price']
df['price-*-Станций метро от кольца*price'] = df['price']*df['Станций метро от кольца*price']
df['mean_sq*price-*-Станций метро от кольца*price'] = df['mean_sq*price']*df['Станций метро от кольца*price']

df['new_index_spalen'] = df['bulk_id'] + '-' + df['spalen'].astype(str)

col1 = 'Класс объекта'
col2 = 'spalen'
col3 = 'month'
index_name = 'index_'+col1+'_'+col2+'_'+col3
df[index_name] = df[col1] + '-' + df[col2].astype(str) + '-' + df[col3].astype(str)
groupby_col = index_name
aggregations = {
             'price': ['mean','max','min'],
             'mean_sq_spalen': ['mean','max','min'],
             'mean_sq*price': ['mean']
        }
df_agg = df[df.value > 0].groupby([groupby_col]).agg(aggregations)
df_agg.columns = pd.Index(['AGG_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df = df.join(df_agg, how='left', on=groupby_col, rsuffix='_'+groupby_col)
df.drop(index_name, inplace=True, axis=1)

df['AGG_price_MAX'] = df['AGG_price_MAX'].fillna(df['price'])
df['AGG_price_MIN'] = df['AGG_price_MIN'].fillna(df['price'])
df['AGG_price_MEAN'] = df['AGG_price_MEAN'].fillna(df['price'])
df['AGG_mean_sq_spalen_MIN'] = df['AGG_mean_sq_spalen_MIN'].fillna(df['mean_sq_spalen'])
df['AGG_mean_sq_spalen_MAX'] = df['AGG_mean_sq_spalen_MAX'].fillna(df['mean_sq_spalen'])
df['AGG_mean_sq_spalen_MEAN'] = df['AGG_mean_sq_spalen_MEAN'].fillna(df['mean_sq_spalen'])
df['AGG_mean_sq*price_MEAN'] = df['AGG_mean_sq*price_MEAN'].fillna(df['mean_sq*price'])

df['price_diff_1'] = df['price']/df['AGG_price_MAX']
df['price_diff_2'] = df['price']/df['AGG_price_MIN']
df['price_diff_3'] = df['price']/df['AGG_price_MEAN']

    ## 223

col2 = 'spalen'
col3 = 'month'
index_name = 'index_'+'_'+col2+'_'+col3
df[index_name] = df[col2].astype(str) + '-' + df[col3].astype(str)
groupby_col = index_name
aggregations = {
                 'price': ['mean'],
                 'mean_sq_spalen':  ['mean','max','min', 'var'],
             }
df_agg = df[df.value > 0].groupby([groupby_col]).agg(aggregations)
df_agg.columns = pd.Index(['AGG_'+index_name + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df = df.join(df_agg, how='left', on=groupby_col, rsuffix='_'+groupby_col)
df.drop(index_name, inplace=True, axis=1)
    
groupby_col = 'bulk_id'
aggregations = {
                 'month_cnt': ['min'],
             }
df_agg = df[df.value > 0].groupby([groupby_col]).agg(aggregations)
df_agg.columns = pd.Index(['AGG_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df = df.join(df_agg, how='left', on=groupby_col, rsuffix='_'+groupby_col)
df['AGG_month_cnt_MIN'] = df['AGG_month_cnt_MIN'].fillna(df['month_cnt'])
df['flag_sales_started'] = np.where(df['month_cnt']==df['AGG_month_cnt_MIN'], 1, 0)
df['sales_month_cnt'] =  df['month_cnt'] - df['AGG_month_cnt_MIN']

groupby_col = 'month'
aggregations = {}
cols = ['mean_sq*price',
        'mean_sq',
#         'Машиномест'
       ]
for col in cols:
    aggregations = {
            col: ['min', 'max', 'mean'],
        }
df_agg = df.groupby([groupby_col]).agg(aggregations)
df_agg.columns = pd.Index(['AGG_'+groupby_col+ e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df = df.join(df_agg, how='left', on=groupby_col, rsuffix='_'+groupby_col)

# temp, new_cols = add_lags(df, feat='value', index='new_index_spalen', by_col='month_cnt')
# df = df.merge(temp[new_cols], on='new_index_spalen',  how='left')
    
temp, new_cols = add_lags(df, feat='price', index='new_index_spalen', by_col='month_cnt')
df = df.merge(temp[new_cols], on='new_index_spalen',  how='left')
    
temp, new_cols = add_lags(df, feat='mean_sq', index='new_index_spalen', by_col='month_cnt')
df = df.merge(temp[new_cols], on='new_index_spalen',  how='left')

groupby_col = 'spalen'
aggregations = {
            'value': ['mean'],
        }
df_agg = df[df.value > 0].groupby([groupby_col]).agg(aggregations)
df_agg.columns = pd.Index(['AGG_'+groupby_col+ e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df = df.join(df_agg, how='left', on=groupby_col, rsuffix='_'+groupby_col)


##### 199
df['Курс_ratio'] = df['Курс']/np.min(df['Курс'])

flats = get_flats(df)
flats_cols = list(flats.columns)
print("Flats df shape:", flats.shape)
df = df.join(flats, how='left', on='new_index_spalen')
del flats
gc.collect()

# df_bulk_cols = ['bulk_id_1','bulk_id_2','bulk_id_3','bulk_id_4','bulk_id_5']
# df_bulk = pd.DataFrame(df.bulk_id.str.split('-').values.tolist(), columns=df_bulk_cols
#                        , index=df.index)
# df[df_bulk_cols] = df_bulk[df_bulk_cols]
# df.drop(['bulk_id_1','bulk_id_2','bulk_id_3','bulk_id_4'], inplace=True, axis=1)

# col2 = 'spalen'
# col3 = 'bulk_id_5'
# index_name = 'index_'+col2+'_'+col3
# df[index_name] = df[col2].astype(str) + '-' + df[col3].astype(str)
# groupby_col = index_name
# aggregations = {
#                  'price': ['mean','max','min', 'var'],
# #                  'value':  ['mean','max','min', 'var'],
#                  'mean_sq_spalen': ['mean','max','min']
#              }
# df_agg = df[df.value > 0].groupby([groupby_col]).agg(aggregations)
# df_agg.columns = pd.Index(['AGG_' + index_name + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
# df = df.join(df_agg, how='left', on=groupby_col, rsuffix='_'+groupby_col)
# df.drop(index_name, inplace=True, axis=1)

# df['mean_sq_spalen_diff_1'] = df['price']/df['AGG_'+index_name+'mean_sq_spalen_MAX']
# df['mean_sq_spalen_diff_2'] = df['price']/df['AGG_'+index_name+'mean_sq_spalen_MIN']
# df['mean_sq_spalen_diff_3'] = df['price']/df['AGG_'+index_name+'mean_sq_spalen_MEAN']

# col1 = 'Класс объекта'
# col2 = 'spalen'
# index_name = 'index_'+col1+'_'+col2
# df[index_name] = df[col1] + '-' + df[col2].astype(str)
# df_add[index_name] = df_add[col1] + '-' + df_add[col2].astype(str)
# groupby_col = index_name
# aggregations = {
#              'start_square': ['mean','max','min','var'],
#              'value': ['mean','max','min','var'],
# #              'mean_sq_spalen': ['mean','max','min','var'],
# #              'mean_sq': ['mean','max','min','var'],
#         }

# df_agg = df_add.groupby([groupby_col]).agg(aggregations)
# df_agg.columns = pd.Index(['AGG_' + index_name + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
# df = df.join(df_agg, how='left', on=groupby_col, rsuffix='_'+groupby_col)
# df.drop(index_name, inplace=True, axis=1)

# df['ratio_mean_sq_1'] = df['AGG_'+index_name+'mean_sq_MEAN']/df['mean_sq']
# df['ratio_mean_sq_2'] = df['AGG_'+index_name+'mean_sq_MAX']/df['mean_sq']
# df['ratio_mean_sq_spalen_1'] = df['AGG_'+index_name+'mean_sq_MEAN']/df['mean_sq_spalen']
# df['ratio_mean_sq_spalen_2'] = df['AGG_'+index_name+'mean_sq_MAX']/df['mean_sq_spalen']


df['ratio_mean_sq_spalen'] = df['mean_sq_spalen']/df['AGG_mean_sq_spalen_MEAN']
df['log_mean_sq*price'] = np.log1p(df['mean_sq*price'])
df['log_price'] = np.log1p(df['price'])
df['log_mean_sq'] = np.log1p(df['mean_sq'])
# 198


# df['ratio_mean_sq*price'] = df['mean_sq*price']/df['AGG_mean_sq*price_MEAN']
# df['log_price-*-mean_sq*price'] = np.log1p(df['price-*-mean_sq*price'])
# df['log_price-*-Станций метро от кольца*price'] = np.log1p(df['price-*-Станций метро от кольца*price']) 
# df['log_mean_sq*price-*-Станций метро от кольца*price']  = np.log1p(df['mean_sq*price-*-Станций метро от кольца*price']) 


for col in df.columns:
    if len(df[df[col].notnull()]) < len(df):
#         print(col)
        df[col] = df[col].fillna(df[df[col].notnull()][col].mean())

cat_feats = ['spalen', 'month', 'Класс объекта',
     'Огорожена территория', 'Входные группы', 'Детский сад', 'Школа',
     'Поликлиника', 'ФОК', 'Спортивная площадка', 'Автомойка',
     'Кладовые', 'Колясочные', 'Кондиционирование', 'Вентлияция',
     'Лифт', 'Система мусоротведения', 'Видеонаблюдение', 'Подземная парковка',
     'Двор без машин', 'flag_sales_started']

lbl = preprocessing.LabelEncoder()
for col in cat_feats:
    df[col].fillna('Unknown')
    df[col] = lbl.fit_transform(df[col].astype(str)) 

df.drop(['new_index_spalen'], axis=1, inplace=True)
# df.drop(['date1'], axis=1, inplace=True)

df[df.value == -1].to_csv("./input/test_.csv", index=False)
df[df.value > -1].to_csv("./input/train_full.csv", index=False)

In [61]:
## One left period validation

df = pd.read_csv("./input/train_full.csv")

month_folds = ([30,31],[31,32]
               ,[32,33],[33,34],[34,35],[35,36],[36,37])

for i, (train_index, valid_index) in enumerate(month_folds):
    train = df[df.month_cnt<train_index]
    valid = df[df.month_cnt.between(train_index, valid_index)]

    train.to_csv(f"./input/train_month_folds_{i}.csv",index=False)
    valid.to_csv(f"./input/valid_month_folds_{i}.csv",index=False)

In [51]:
## KFold validation
from sklearn.model_selection import KFold
import pandas as pd

K = 10
df = pd.read_csv("./input/train_full.csv")

kf = KFold(n_splits=K, random_state=10, shuffle=True)
kf.get_n_splits(range(len(df)))

print(kf)  

for i, (train_index, valid_index) in enumerate(kf.split(range(len(df)))):
    train = df.loc[train_index]
    valid = df.loc[valid_index]

    train.to_csv(f"./input/train_{i}.csv",index=False)
    valid.to_csv(f"./input/valid_{i}.csv",index=False)

KFold(n_splits=10, random_state=10, shuffle=True)


In [47]:
idx = 0
for col in df.columns.tolist():
    print(idx, len(df[col].unique()), col)
    idx += 1

0 8726 id
1 233 bulk_id
2 5 spalen
3 31 date1
4 4676 value
5 7025 price
6 3792 mean_sq
7 32 mean_fl
8 12 month
9 31 month_cnt
10 3 Класс объекта
11 29 Количество помещений
12 2 Огорожена территория
13 29 Площадь земельного участка
14 2 Входные группы
15 21 Детский сад
16 16 Школа
17 4 Поликлиника
18 2 ФОК
19 1 Спортивная площадка
20 2 Автомойка
21 2 Кладовые
22 2 Колясочные
23 3 Кондиционирование
24 3 Вентлияция
25 1 Лифт
26 1 Система мусоротведения
27 3 Видеонаблюдение
28 2 Подземная парковка
29 2 Двор без машин
30 29 Машиномест
31 128 Площадь пром. зоны в радиусе 500 м
32 173 Площадь зеленой зоны в радиусе 500 м
33 17 До Кремля
34 22 До ТТК(км)
35 17 До Садового(км)
36 105 До большой дороги на машине(км)
37 79 До удобной авторазвязки на машине(км)
38 81 До метро пешком(км)
39 79 До промки(км)
40 95 До парка(км)
41 92 До парка пешком(км)
42 10 Станций метро от кольца
43 123 Площадь двора
44 31 Курс
45 30 Cтавка по ипотеке
46 30 Вклады до 1 года
47 31 Вклады от 1 года до 3 лет
48 30 Вк

In [48]:
from catboost.utils import create_cd

create_cd(
    label=4,
    #cat_features=(2, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 66),
    cat_features=(2, 8, 10, 12, 14, 15, 16, 17, 18, 19,
                  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 42, 81),
#     weight=1,
#     baseline=2,
    doc_id=0,
    #group_id=1,
#     subgroup_id=8,
    auxiliary_columns=(1,3),
    #timestamp=3,
#     feature_names=feature_names,
    output_path='./input/train2.cd'
)

In [49]:
from catboost import CatBoostRegressor, Pool
import pandas as pd
import numpy as np

K = 10

path = './input/'

TEST_FILE = path + 'test_.csv' #add
CD_FILE = path + 'train2.cd'

test_pool = Pool(TEST_FILE, column_description=CD_FILE, has_header=True, delimiter=",")
df = pd.read_csv('./input/train_full.csv', encoding='cp1251')
oof = np.zeros((len(df), 1))

dt = pd.read_csv(TEST_FILE, encoding='cp1251')
test = np.zeros((len(dt), 1))
avg_score = 0

for i in range(K):
    print('Iteration {} started'.format(i))
    TRAIN_FILE = path+f'train_{i}.csv'
    VAL_FILE = path+f'valid_{i}.csv'

    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE, has_header=True, delimiter=",")
    val_pool = Pool(VAL_FILE, column_description=CD_FILE, has_header=True, delimiter=",")

    model = CatBoostRegressor(iterations=2000, 
                              learning_rate=0.2, 
                              depth=11, 
                              random_seed = 10,
                              verbose = False
                              )
    
    model.fit(train_pool, eval_set = val_pool)
    
    pred = model.predict(test_pool)
    test += pred.reshape((len(dt),1))
    dt['value'] = pred
    dt.to_csv(f"./output/test_{i}.csv", index=False, columns=['id','value'])
    
    pred = model.predict(val_pool)
    df = pd.read_csv(VAL_FILE)
    oof[df.id] = pred.reshape((len(df),1))
    df['value'] = pred
    df.to_csv(f"./output/oof_{i}.csv", index=False, columns=['id','value'])

    full_train_pool = Pool(TRAIN_FILE, column_description=CD_FILE, has_header=True, delimiter=",")
    oof_value = pd.read_csv(VAL_FILE, encoding='cp1251')['value']
    oof_score = model.score(val_pool, oof_value)
    print('Iteration {} with oof score {}'.format(i, oof_score))
    avg_score += oof_score
    
df = pd.read_csv(path+"sample submission.csv", encoding='cp1251')

df['value'] = (test/K).clip(0, 5000)
df.to_csv('submission_catboost_'+str(avg_score/(i+1))+'.csv', index=False)

df = pd.read_csv(path+"train_full.csv", encoding='cp1251')
df['value_pred'] = oof
df.to_csv('./output/oof_'+str(avg_score/(i+1))+'.csv', index=False)

Iteration 0 started
Iteration 0 with oof score 204.28098595455506
Iteration 1 started
Iteration 1 with oof score 247.42313728455727
Iteration 2 started
Iteration 2 with oof score 200.4002340728003
Iteration 3 started
Iteration 3 with oof score 192.50499913276875
Iteration 4 started
Iteration 4 with oof score 183.58216204713682
Iteration 5 started
Iteration 5 with oof score 201.41336909537418
Iteration 6 started
Iteration 6 with oof score 217.5833455423839
Iteration 7 started
Iteration 7 with oof score 214.14163122158908
Iteration 8 started
Iteration 8 with oof score 245.6235567309891
Iteration 9 started
Iteration 9 with oof score 222.06118148682836


In [None]:
## Lightgbm

import pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import numpy as np
iter_num = 0
n_folds = 7
avg_score = 0
feature_importance_df = pd.DataFrame()
df_train = pd.read_csv('./input/train_full.csv', encoding='cp1251')
# df_train.to_csv('./input/train_full_lgbm.csv', encoding='cp1251')
# df_oof = np.zeros((len(df), n_folds))
# df_meta = pd.read_csv('submission_catboost_211.43605189630915_199_05136.csv')
df_test = pd.read_csv('./input/test_.csv')

# df_test['metafeature_catboost'] = df_meta['value']
test = np.zeros((len(df_test), 1))
oof = np.zeros((len(df_train), 1))
num_seeds = 1
for s in range(num_seeds):
    for fold in range(n_folds):
        iter_num += 1
        df_train = pd.read_csv('./input/train_month_folds_{}.csv'.format(fold))
        df_valid = pd.read_csv('./input/valid_month_folds_{}.csv'.format(fold))
        target_train = df_train.pop('value')
        target_valid = df_valid.pop('value')

        discard_feats = ['id','bulk_id','date1','bulk_id_5']

        cat_feats = ['spalen', 'month', 'Класс объекта',
         'Огорожена территория', 'Входные группы', 'Детский сад', 'Школа',
         'Поликлиника', 'ФОК', 'Спортивная площадка', 'Автомойка',
         'Кладовые', 'Колясочные', 'Кондиционирование', 'Вентлияция',
         'Лифт', 'Система мусоротведения', 'Видеонаблюдение', 'Подземная парковка',
         'Двор без машин', 'flag_sales_started']

        feats = [f for f in df_train.columns.tolist() if f not in discard_feats]
    #     feats = [f for f in feats if f not in cat_feats]

        lgtrain = lgb.Dataset(df_train[feats], target_train,
                            feature_name=feats,
                            categorical_feature = cat_feats
                             )
        lgvalid = lgb.Dataset(df_valid[feats], target_valid,
                            feature_name=feats,
                            categorical_feature = cat_feats
                             )

        lgbm_params =  {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            # 'max_depth': 15,
            'num_leaves': 34,
            'nthread':4,
            'learning_rate': 0.005,
            'colsample_bytree': 0.8,
            'subsample': 0.87,
            'max_depth': -1,
            'reg_alpha': 0.04,
            'min_split_gain': 0.017,
            'min_child_weight': 20,
            'verbose': -1,
            'silent':-1,
            'seed':s,
            'random_state':s
        }

        lgb_clf = lgb.train(
                lgbm_params,
                lgtrain,
                num_boost_round=10000,
                valid_sets=[lgtrain, lgvalid],
                valid_names=['train','valid'],
                early_stopping_rounds=300,
                verbose_eval=10000
            )
        # print('RMSE:', np.sqrt(mean_squared_error(target_valid, lgb_clf.predict(df_valid[feats]))))
        pred = lgb_clf.predict(df_test[feats])
        test += pred.reshape((len(df_test),1))
        df_test['value'] = pred
        df_test.to_csv("./output/test_lgbm_m_{}.csv.".format(fold), index=False, columns=['id','value'])
        pred = lgb_clf.predict(df_valid[feats])
        df = pd.read_csv("./input/valid_month_folds_{}.csv.".format(fold))
        oof[df.id] = pred.reshape((len(df),1))
        df['value'] = pred
        df.to_csv("./output/oof_lgbm_m_{}.csv".format(fold), index=False, columns=['id','value'])
    #     full_train = pd.read_csv('./input/train_full_lgbm.csv', encoding='cp1251')
    #     full_train['oof_'.format(fold)] = pred
    #     full_train.to_csv('./input/train_full_lgbm.csv', encoding='cp1251')
    #     oof_score = model.score(val_pool, oof_value)
    #     print('Iteration {} with oof score {}'.format(i, oof_score))
        oof_score = mean_squared_error(target_valid, pred)**0.5
        avg_score += oof_score
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = lgb_clf.feature_importance
        fold_importance_df["fold"] = fold + 1
        fold_importance_df["seed"] = s
        fold_importance_df['oof_score'] = oof_score
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

df = pd.read_csv('./input/sample submission.csv', encoding='cp1251')

df['value'] = (test/iter_num).clip(0, 3000)
df.to_csv('submission_lgbm_m_'+str(avg_score/(iter_num))+'.csv', index=False)
print('Score {}'.format(avg_score/iter_num))
# df = pd.read_csv(path+"train_full.csv", encoding='cp1251')
# df['value_pred'] = oof
# df.to_csv('./output/oof_'+str(avg_score/(i+1))+'.csv', index=False)

In [None]:
## Adding metafeatures and starting lgbm

from sklearn.model_selection import KFold
import pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import numpy as np

df_stack_meta1 = pd.DataFrame()
df_stack_meta2 = pd.DataFrame()

for i in range(10):
    df = pd.read_csv('./output/oof_{}.csv'.format(i))
    val = df['value']
    df_stack_meta1 = pd.concat([df_stack_meta1, df])

    df = pd.read_csv('./output/oof_lgbm_{}.csv'.format(i))
    val = df['value']
    df_stack_meta2 = pd.concat([df_stack_meta2, df])
df_stack_meta1 = df_stack_meta1.sort_values(['id']).set_index(['id'])
df_stack_meta2 = df_stack_meta2.sort_values(['id']).set_index(['id'])

K = 10
df = pd.read_csv("./input/train_full.csv")
df['metafeature_catboost'] = df_stack_meta1['value']
df['metafeature_lgbm'] = df_stack_meta2['value']

df_test = pd.read_csv('./input/test_.csv')
df_stack_meta1 = pd.read_csv('submission_catboost_212.64175801420384_198.csv')
df_stack_meta2 = pd.read_csv('submission_lgbm_207.2784140815164_205_224.csv')
df_test['metafeature_catboost'] = df_stack_meta1['value']
df_test['metafeature_lgbm'] = df_stack_meta2['value']

kf = KFold(n_splits=K, random_state=100, shuffle=True)
kf.get_n_splits(range(len(df)))

print(kf)  

for i, (train_index, valid_index) in enumerate(kf.split(range(len(df)))):
    train = df.loc[train_index]
    valid = df.loc[valid_index]

    train.to_csv(f"./input/train_meta_cb_{i}.csv",index=False)
    valid.to_csv(f"./input/valid_meta_cb_{i}.csv",index=False)

df_test.to_csv('./input/test_meta.csv',index=False)
    
iter_num = 0
n_folds = 10
avg_score = 0
feature_importance_df = pd.DataFrame()
df_train = pd.read_csv('./input/train_full.csv', encoding='cp1251')
# df_train.to_csv('./input/train_full_lgbm.csv', encoding='cp1251')
# df_oof = np.zeros((len(df), n_folds))
# df_meta = pd.read_csv('submission_catboost_211.43605189630915_199_05136.csv')
# df_test = pd.read_csv('./input/test_.csv')

# df_test['metafeature_catboost'] = df_meta['value']
test = np.zeros((len(df_test), 1))
oof = np.zeros((len(df_train), 1))
num_seeds = 1
for s in range(num_seeds):
    for fold in range(n_folds):
        iter_num += 1
        df_train = pd.read_csv('./input/train_meta_cb_{}.csv'.format(fold))
        df_valid = pd.read_csv('./input/valid_meta_cb_{}.csv'.format(fold))
        target_train = df_train.pop('value')
        target_valid = df_valid.pop('value')

        discard_feats = ['id','bulk_id','date1','bulk_id_5']

        cat_feats = ['spalen', 'month', 'Класс объекта',
         'Огорожена территория', 'Входные группы', 'Детский сад', 'Школа',
         'Поликлиника', 'ФОК', 'Спортивная площадка', 'Автомойка',
         'Кладовые', 'Колясочные', 'Кондиционирование', 'Вентлияция',
         'Лифт', 'Система мусоротведения', 'Видеонаблюдение', 'Подземная парковка',
         'Двор без машин', 'flag_sales_started']

        feats = [f for f in df_train.columns.tolist() if f not in discard_feats]
#         feats = ['metafeature_catboost','metafeature_lgbm']

        lgtrain = lgb.Dataset(df_train[feats], target_train,
                            feature_name=feats,
                            categorical_feature = cat_feats
                             )
        lgvalid = lgb.Dataset(df_valid[feats], target_valid,
                            feature_name=feats,
                            categorical_feature = cat_feats
                             )

        lgbm_params =  {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            # 'max_depth': 15,
            'num_leaves': 34,
            'nthread':4,
            'learning_rate': 0.01,
            'colsample_bytree': 0.8,
            'subsample': 0.87,
            'max_depth': -1,
            'reg_alpha': 0.04,
            'min_split_gain': 0.017,
            'min_child_weight': 20,
            'verbose': -1,
            'silent':-1,
            'seed':s,
            'random_state':s
        }

        lgb_clf = lgb.train(
                lgbm_params,
                lgtrain,
                num_boost_round=10000,
                valid_sets=[lgtrain, lgvalid],
                valid_names=['train','valid'],
                early_stopping_rounds=300,
                verbose_eval=10000
            )
        # print('RMSE:', np.sqrt(mean_squared_error(target_valid, lgb_clf.predict(df_valid[feats]))))
        pred = lgb_clf.predict(df_test[feats])
        test += pred.reshape((len(df_test),1))
        df_test['value'] = pred
#         df_test.to_csv("./output/test_lgbm_{}.csv.".format(fold), index=False, columns=['id','value'])
        pred = lgb_clf.predict(df_valid[feats])
        df = pd.read_csv("./output/oof_{}.csv.".format(fold))
        oof[df.id] = pred.reshape((len(df),1))
        df['value'] = pred
#         df.to_csv("./output/oof_lgbm_{}.csv".format(fold), index=False, columns=['id','value'])
    #     full_train = pd.read_csv('./input/train_full_lgbm.csv', encoding='cp1251')
    #     full_train['oof_'.format(fold)] = pred
    #     full_train.to_csv('./input/train_full_lgbm.csv', encoding='cp1251')
    #     oof_score = model.score(val_pool, oof_value)
    #     print('Iteration {} with oof score {}'.format(i, oof_score))
        oof_score = mean_squared_error(target_valid, pred)**0.5
        avg_score += oof_score
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = lgb_clf.feature_importance
        fold_importance_df["fold"] = fold + 1
        fold_importance_df["seed"] = s
        fold_importance_df['oof_score'] = oof_score
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

df = pd.read_csv('./input/sample submission.csv', encoding='cp1251')

df['value'] = (test/iter_num).clip(0, 3000)
df.to_csv('submission_lgbm_meta_'+str(avg_score/(iter_num))+'.csv', index=False)
print('Score {}'.format(avg_score/iter_num))
# df = pd.read_csv(path+"train_full.csv", encoding='cp1251')
# df['value_pred'] = oof
# df.to_csv('./output/oof_'+str(avg_score/(i+1))+'.csv', index=False)