In [10]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import catboost
import lightgbm as lgb
import gc
from sklearn.neighbors import NearestNeighbors

If you are wondering which 3 libraries you can use, here is an example:

For training models, you can, for example, use gradient boosting from the libraries: 
1) Lightgbm
2) Catboost
3) Sklearn
4) Pyboost
5) Xgboost

Also you can use for a fitting neural network:
1) Torch
2) Tensorflow

Once you have made predictions using the 3 models for the test, you can average them or aggregate them in some other way.

In [11]:
train = pd.read_csv('/kaggle/input/ioai-2025-preparation-class-lesson-4-homework/train.csv')
sub = pd.read_csv('/kaggle/input/ioai-2025-preparation-class-lesson-4-homework/sample_submission.csv')

We split the id column into id_house and date columns like its in the train

In [12]:
sub['date'] = sub['id'].apply(lambda x:x.split('_')[0])
sub['id_house'] = sub['id'].apply(lambda x:int(x.split('_')[1]))
sub['date'] = pd.to_datetime(sub['date'])
train['date'] = pd.to_datetime(train['date'])

We check how many months we have to make predictions for.

In [5]:
sub['date'].unique()
# 3 month to predict

<DatetimeArray>
['2022-06-01 00:00:00', '2022-07-01 00:00:00', '2022-08-01 00:00:00']
Length: 3, dtype: datetime64[ns]

We are making a dataset that we will use for training model and predictions for the test. Since we don't have data from the train for every **month and id_house**, we want to fill the missing data. To do this, we are making a dataset where we have records for each month and id_house from the trend. And we fill the missing data, using data from the  **past months** first, then from the **future months** for id_house. This is not the best way to fill in the form, using data from the future. For example, we can simply **throw them out** of training if we can't fill them out correctly. You can check out different ideas that might work better for yourself.

In [13]:
date_range = pd.date_range(train['date'].min(), sub['date'].max(), freq = 'MS').tolist()
city_list = []
time_list = []
for city in sub['id_house'].unique():
    time_list += date_range
    city_list += [city] * len(date_range)
data = pd.DataFrame()
data['id_house'] = city_list
data['date'] = time_list
data = data.merge(train, on = ['id_house', 'date'], how = 'left')
data_other_columns = [x for x in data.columns if x not in ['id_house', 'date']]
#fill issing data
for col in tqdm(data_other_columns):
    data[col] = data.groupby('id_house', group_keys = False)[col].apply(lambda x:x.fillna(method = 'bfill').fillna(method = 'ffill'))

data['preds'] = -1
data

  0%|          | 0/20 [00:00<?, ?it/s]

  data[col] = data.groupby('id_house', group_keys = False)[col].apply(lambda x:x.fillna(method = 'bfill').fillna(method = 'ffill'))


Unnamed: 0,id_house,date,apart_to_room,num_builds_live,mean_price,num_builds_series_live,room_three,med_price,room_four,room_one,...,room_two,vc_city_quadkey,healthcare_cnt,flats_cnt,beauty_cnt,shopping_cnt,build_year_median,lng,lat,preds
0,6123,2020-01-01,0.0,1.0,43500.000000,1.0,1.000000,42857.142188,0.0,0.000000,...,0.000000,9.0,0.0,1.0,0.0,0.0,1981.5,59.107842,79.032814,-1
1,6123,2020-02-01,0.0,1.0,43500.000000,1.0,1.000000,42857.142188,0.0,0.000000,...,0.000000,9.0,0.0,1.0,0.0,0.0,1981.5,59.107842,79.032814,-1
2,6123,2020-03-01,0.0,1.0,43500.000000,1.0,1.000000,42857.142188,0.0,0.000000,...,0.000000,9.0,0.0,1.0,0.0,0.0,1981.5,59.107842,79.032814,-1
3,6123,2020-04-01,0.0,1.0,43500.000000,1.0,1.000000,42857.142188,0.0,0.000000,...,0.000000,9.0,0.0,1.0,0.0,0.0,1981.5,59.107842,79.032814,-1
4,6123,2020-05-01,0.0,1.0,43500.000000,1.0,1.000000,42857.142188,0.0,0.000000,...,0.000000,9.0,0.0,1.0,0.0,0.0,1981.5,59.107842,79.032814,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171195,5668,2022-04-01,0.0,9.0,22493.675699,1.0,0.052632,22173.914062,0.0,0.052632,...,0.894737,23.0,3.0,0.0,0.0,0.0,1972.0,62.612480,77.772386,-1
171196,5668,2022-05-01,0.0,9.0,22866.712054,1.0,0.047619,22500.000000,0.0,0.047619,...,0.904762,23.0,3.0,0.0,0.0,0.0,1972.0,62.612480,77.772386,-1
171197,5668,2022-06-01,0.0,9.0,22866.712054,1.0,0.047619,22500.000000,0.0,0.047619,...,0.904762,23.0,3.0,0.0,0.0,0.0,1972.0,62.612480,77.772386,-1
171198,5668,2022-07-01,0.0,9.0,22866.712054,1.0,0.047619,22500.000000,0.0,0.047619,...,0.904762,23.0,3.0,0.0,0.0,0.0,1972.0,62.612480,77.772386,-1


An example of feature generation using coordinates. We can search for the nearest buildings and average the target values. So we may be making the feature more resistant to outliers.

In [14]:
coords_data = data.drop_duplicates('id_house')[['lat', 'lng']]
nn = NearestNeighbors(n_neighbors = 10) #15
nn.fit(coords_data.values)

col = 'mean_price'
dict_vals = data.groupby(['id_house', 'date'])[col].mean().to_dict()
dict_ind_house = {i:k for i, k in enumerate(data['id_house'].unique())}
list_month = data['date'].unique()

dict_mean = {}
dict_std = {}
for neighbors in nn.kneighbors(coords_data.values)[1]:
    for month in list_month:
        vals = []
        for neighbor in neighbors:
            if (dict_ind_house[neighbor], month) in dict_vals:
                vals += [dict_vals[(dict_ind_house[neighbor], month)]]
        if len(vals) > 0:
            dict_mean[(dict_ind_house[neighbors[0]], month)] = np.mean(vals)

data['mean_pricem'] = [dict_mean.get((c, m), None) for c,m in data[['id_house', 'date']].values]

Functions for training catboost and lightgbm

In [15]:
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
def catboost_train(train, target, split_list, param):
    
    bst_list = []
    for i , (train_index, val_index, test_index) in enumerate(split_list):

        tr = catboost.Pool(train[train_index], label = target[train_index])
        te = catboost.Pool(train[val_index], label = target[val_index])
        
        bst = catboost.train(tr, param, eval_set = te, iterations = 2000, early_stopping_rounds = 100, verbose =300)
        bst_list += [bst]

        gc.collect()
        del tr, te
    
    return bst_list

params_cat = {
    'loss_function' :'MAE', 
     'max_depth' : 4, 
    'eval_metric' :'MAPE', 
    'learning_rate' : .01, 
    'l2_leaf_reg' : 15, 
    'random_state' : 42 ,
    'random_strength' : 1,
    'grow_policy' : 'Depthwise',
    'bagging_temperature' : 2,
    #'subsample' : 0.85,
    'bootstrap_type' :  'Bayesian',
}

def lgb_train(train, target, split_list, param):
    
    bst_list = []
    for i , (train_index, val_index, test_index) in enumerate(split_list):

        tr = lgb.Dataset(train[train_index], target[train_index])
        te = lgb.Dataset(train[val_index], target[val_index], reference=tr)
    
        bst = lgb.train(param, tr, num_boost_round = 2000, valid_sets = te,
                        callbacks = [lgb.early_stopping(100), lgb.log_evaluation(5000)])
        bst_list += [bst]

        gc.collect()
        del tr, te
    
    return bst_list

params_lgb = {
    'objective':        'mae',
    'verbosity':        -1,
    'boosting_type':    'gbdt',
    'metric' : 'mape',
    'lambda_l1':        7,
    'lambda_l2':        5,
    'learning_rate':    0.01,
    'num_leaves':        16,
    'extra_trees' : True,
}
def xgb_train(train, target, split_list, param):
    
    bst_list = []
    
    for i, (train_index, val_index, test_index) in enumerate(split_list):
        
        tr = xgb.DMatrix(train[train_index], label=target[train_index])
        te = xgb.DMatrix(train[val_index], label=target[val_index])
        
        evallist = [(te, 'eval')]
        
        bst = xgb.train(param, tr, num_boost_round=2000,
                        evals=evallist,
                        early_stopping_rounds=100,
                        verbose_eval=5000)
        
        bst_list += [bst]
        
        gc.collect()
        del tr, te
    
    return bst_list

params_xgb = {
    'objective': 'reg:absoluteerror',  
    'learning_rate': 0.01,
    'max_depth': 4,                 
    'min_child_weight': 3,           
    'reg_alpha': 5,              
    'reg_lambda': 5,         
    'gamma': 1,
    'random_state': 42
}



Since we need to make predictions for 3 months ahead. We will build models separately for each month. And then we'll combine the predictions. As features, we will use the values of the various columns for which we apply **rolling mean and shift** with number = the month for which we are making a prediction. Plus i use year as a feature

For validation i using 2 last month in train. Then train catboost and lightgbm and average predictions from that models. Final prediction i save in columns **preds** and than I merge that column with the sample_submission.csv file. Also i check correlations.

In [17]:
train_month = sorted(train['date'].unique())
test_month = sorted(sub['date'].unique())
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

for m_predict in [1, 2, 3]:

    data[f'mean_price_shift_{m_predict}'] = data.groupby('id_house', group_keys = False)['mean_price'].apply(lambda x:x.shift(m_predict))
    data['new_target'] = data['mean_price'] / data[f'mean_price_shift_{m_predict}']

    data['year'] = data['date'].dt.year
    
    train_cols = ['year']
    
    for col in tqdm(['build_year_median', 'vc_city_quadkey', 'number_total', 'new_target', 'mean_price',
                    'num_builds_live', 'room_four', 'mean_pricem','room_one']):
        data[f'{col}_shift_rm_{m_predict}'] = data.groupby('id_house', group_keys = False)[col].apply(lambda x:
                                            x.rolling(3, min_periods = 1).mean().shift(m_predict))
        train_cols += [f'{col}_shift_rm_{m_predict}']
    
    '''train_cols2 = ['build_year_median', 'vc_city_quadkey', f'new_target_shift_rm_{m_predict}']
    for col in tqdm(['number_total','mean_price','mean_pricem', 'room_four','num_builds_live']):
            data[f'{col}_diff_{m_predict}'] = data.groupby('id_house', group_keys = False)[col].apply(lambda x:
                                                 x.shift(m_predict) - x)
            train_cols2 += [f'{col}_diff_{m_predict}']'''

    split_list = []    
    list_val_months = [-1, -2]
    for val_month in list_val_months:
        train_index = data[(data['date'] > train_month[5]) & (data['date'] <= train_month[val_month - 1])].index
        val_index = data[data['date'] == train_month[val_month]].index
        test_index = data[data['date'] == test_month[m_predict - 1]].index
        split_list += [(train_index, val_index, test_index)]

    # CHECKING CORRELATIONS
    vals = data[train_cols].corr().abs().values
    print('CHECK CORR COLS', bool(vals[~np.eye(vals.shape[0],dtype=bool)].max() < 0.95) == True, 
          'MAX CORR: ', vals[~np.eye(vals.shape[0],dtype=bool)].max() )

    
    bst_list_catboost = catboost_train(data[train_cols].values, data['new_target'].values, split_list, params_cat)

    catboost_preds = []
    for num_, bst in enumerate(bst_list_catboost):
        val_index = split_list[num_][-2]
        val_pred = bst.predict(data[train_cols].values[val_index]) * data[f'mean_price_shift_{m_predict}'][val_index]
        score = (data['mean_price'][val_index] - val_pred).abs().mean()
        print(f'VAL SCORE CATBOOST MONTH {num_}: ', score)
        test_index = split_list[num_][-1]
        catboost_preds += [ bst.predict(data[train_cols].values[test_index]) * data[f'mean_price_shift_{m_predict}'][test_index] ]
    catboost_preds = np.mean(catboost_preds, 0)

    bst_list_lgb = lgb_train(data[train_cols].values, data['new_target'].values, split_list, params_lgb)
    lgb_preds = []
    for num_, bst in enumerate(bst_list_lgb):
        val_index = split_list[num_][-2]
        val_pred = bst.predict(data[train_cols].values[val_index]) * data[f'mean_price_shift_{m_predict}'][val_index]
        score = (data['mean_price'][val_index] - val_pred).abs().mean()
        print(f'VAL SCORE LIGHTGBM MONTH {num_}: ', score)
        test_index = split_list[num_][-1]
        lgb_preds += [ bst.predict(data[train_cols].values[test_index]) * data[f'mean_price_shift_{m_predict}'][test_index] ]
    lgb_preds = np.mean(lgb_preds, 0)

    bst_list_xgb = xgb_train(data[train_cols].values, data['new_target'].values, split_list, params_xgb)
    xgb_preds = []
    for num_, bst in enumerate(bst_list_xgb):
        val_index = split_list[num_][-2]
        val_data = xgb.DMatrix(data[train_cols].values[val_index])
        val_pred = bst.predict(val_data) * data[f'mean_price_shift_{m_predict}'][val_index]
        score = (data['mean_price'][val_index] - val_pred).abs().mean()
        print(f'VAL SCORE XGB MONTH {num_}: ', score)
        test_index = split_list[num_][-1]
        xgb_preds += [ bst.predict(xgb.DMatrix(data[train_cols].values[test_index])) * data[f'mean_price_shift_{m_predict}'][test_index] ]
    xgb_preds = np.mean(xgb_preds, 0)

    
    data.loc[test_index, 'preds'] = (catboost_preds + lgb_preds + xgb_preds) / 3
    # hui


  0%|          | 0/9 [00:00<?, ?it/s]

CHECK CORR COLS True MAX CORR:  0.9292495275428049
0:	learn: 0.0170505	test: 0.0161136	best: 0.0161136 (0)	total: 20.5ms	remaining: 40.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.01598849208
bestIteration = 195

Shrink model to first 196 iterations.
0:	learn: 0.0170298	test: 0.0174846	best: 0.0174846 (0)	total: 18.5ms	remaining: 37s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.01735354303
bestIteration = 185

Shrink model to first 186 iterations.
VAL SCORE CATBOOST MONTH 0:  2229.4735030085403
VAL SCORE CATBOOST MONTH 1:  2397.1583558143334
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[329]	valid_0's mape: 0.0159959
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[569]	valid_0's mape: 0.0173576
VAL SCORE LIGHTGBM MONTH 0:  2231.0373404716183
VAL SCORE LIGHTGBM MONTH 1:  2399.117893551519
[0]	eval-mae:0.01706
[283]	eval-mae:0.01693
[0]	

  0%|          | 0/9 [00:00<?, ?it/s]

CHECK CORR COLS True MAX CORR:  0.9289505307628535
0:	learn: 0.0272533	test: 0.0268541	best: 0.0268541 (0)	total: 18.8ms	remaining: 37.5s
300:	learn: 0.0267081	test: 0.0263725	best: 0.0263649 (228)	total: 5.3s	remaining: 29.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.02636487287
bestIteration = 228

Shrink model to first 229 iterations.
0:	learn: 0.0271761	test: 0.0288663	best: 0.0288663 (0)	total: 22.4ms	remaining: 44.7s
300:	learn: 0.0266354	test: 0.0281918	best: 0.0281918 (300)	total: 5.08s	remaining: 28.7s
600:	learn: 0.0266014	test: 0.0281692	best: 0.0281692 (600)	total: 10s	remaining: 23.4s
900:	learn: 0.0265808	test: 0.0281551	best: 0.0281550 (898)	total: 15.1s	remaining: 18.4s
1200:	learn: 0.0265645	test: 0.0281484	best: 0.0281483 (1182)	total: 19.9s	remaining: 13.2s
1500:	learn: 0.0265466	test: 0.0281425	best: 0.0281418 (1488)	total: 24.7s	remaining: 8.21s
1800:	learn: 0.0265307	test: 0.0281365	best: 0.0281364 (1792)	total: 29.5s	remaining: 3.26s
19

  0%|          | 0/9 [00:00<?, ?it/s]

CHECK CORR COLS True MAX CORR:  0.928641019696598
0:	learn: 0.0340166	test: 0.0353450	best: 0.0353450 (0)	total: 19.4ms	remaining: 38.9s
300:	learn: 0.0330020	test: 0.0341189	best: 0.0341178 (284)	total: 5.12s	remaining: 28.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.03411141666
bestIteration = 431

Shrink model to first 432 iterations.
0:	learn: 0.0338774	test: 0.0369550	best: 0.0369550 (0)	total: 17.9ms	remaining: 35.8s
300:	learn: 0.0329054	test: 0.0352646	best: 0.0352554 (293)	total: 4.97s	remaining: 28s
600:	learn: 0.0328524	test: 0.0352074	best: 0.0352074 (600)	total: 9.78s	remaining: 22.8s
900:	learn: 0.0328037	test: 0.0351537	best: 0.0351537 (899)	total: 14.8s	remaining: 18s
1200:	learn: 0.0327673	test: 0.0351211	best: 0.0351210 (1199)	total: 19.5s	remaining: 12.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.03511191939
bestIteration = 1350

Shrink model to first 1351 iterations.
VAL SCORE CATBOOST MONTH 0:  4646.426393849468

making submission

In [None]:
sub

In [18]:
sub = sub.merge(data[['date', 'id_house', 'preds']], on = ['date', 'id_house'], how = 'left')
sub['target'] = sub['preds']
# delete temp files of catboost from working folder
!rm -rf /kaggle/working/
sub[['id', 'target']].to_csv('solution.csv', index = None)
sub['target'].min(), sub['target'].isnull().sum()

rm: cannot remove '/kaggle/working/': Device or resource busy


(12399.999636020199, 0)

In [19]:
import os
os.makedirs("/root/.kaggle", exist_ok=True)
text_file = open("/root/.kaggle/kaggle.json", "w")
n = text_file.write('{"username":"heriqis777","key":"51f5d196d31e37053d573e0679743000"}')
text_file.close()
!chmod 600 /root/.kaggle/kaggle.json
# Make sure it worked
!cat /root/.kaggle/kaggle.json

{"username":"heriqis777","key":"51f5d196d31e37053d573e0679743000"}

In [20]:
!kaggle competitions submit -c ioai-2025-preparation-class-lesson-4-homework -f 'solution.csv' -m "{final}" 

100%|█████████████████████████████████████████| 521k/521k [00:01<00:00, 356kB/s]
Successfully submitted to IOAI 2025 preparation class, Lesson 4, Homework

набирает 3786

Function for checking correlation between features. Its should output: True

In [None]:
train_features = ['apart_to_room', 'num_builds_live', 'med_price', 'room_four']
vals = train[train_features].corr().abs().values
print(vals)
bool(vals[~np.eye(vals.shape[0],dtype=bool)].max() < 0.95) == True