In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from training.trainer import *
import optuna
from tqdm.notebook import tqdm

In [2]:
def read_data(cluster_num):
    df_train = pd.read_parquet(f'./clean_data/train/cluster_{cluster_num}.gz')
    df_test = pd.read_parquet(f'./clean_data/test/cluster_{cluster_num}.gz')
    df_attr = pd.read_csv('./clean_data/attr.csv')
    # soc-dem features u
    df_train = df_train.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'u'],
                              right_on = ['ego_id', 'u']).rename(
                                                                columns = {
                                                                    'school':     'school_u',
                                                                    'university': 'university_u',
                                                                    'sex':        'sex_u',
                                                                    'city_id':    'city_id_u',
                                                                    'age':        'age_u'
                                                                    }
                                                                )
    df_test = df_test.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'u'],
                              right_on = ['ego_id', 'u']).rename(
                                                                columns = {
                                                                    'school':     'school_u',
                                                                    'university': 'university_u',
                                                                    'sex':        'sex_u',
                                                                    'city_id':    'city_id_u',
                                                                    'age':        'age_u'
                                                                    }
                                                                )
    # soc-dem features v
    df_train = df_train.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'v'],
                              right_on = ['ego_id', 'u']).drop(['u_y'], axis = 1).rename(
                                                                columns = {
                                                                    'school':     'school_v',
                                                                    'university': 'university_v',
                                                                    'sex':        'sex_v',
                                                                    'city_id':    'city_id_v',
                                                                    'age':        'age_v',
                                                                    'u_x': 'u'
                                                                    }
                                                                )
    df_test = df_test.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'v'],
                              right_on = ['ego_id', 'u']).drop(['u_y'], axis = 1).rename(
                                                                columns = {
                                                                    'school':     'school_v',
                                                                    'university': 'university_v',
                                                                    'sex':        'sex_v',
                                                                    'city_id':    'city_id_v',
                                                                    'age':        'age_v',
                                                                    'u_x': 'u'
                                                                    }
                                                                )
    return df_train, df_test, df_attr

In [3]:
def preproc_and_get_features(df, df_attr):
    for i in ['city_id_u', 'city_id_v', 'sex_u', 'sex_v', 'school_u', 'university_u', 'university_v', 'school_v', 't']:
        df[i].fillna(-1, inplace = True)
    iqr = np.quantile(df_attr['age'], 0.75)-np.quantile(df_attr['age'], .25)
    m_age = df_attr[~(df_attr['age'].isna())]['age'].median()
    df['age_u'].fillna(m_age, inplace = True)
    df['age_v'].fillna(m_age, inplace = True)
    df.loc[df['age_u']>m_age+1.5*iqr, 'age_u'] = m_age
    df.loc[df['age_v']>m_age+1.5*iqr, 'age_v'] = m_age

    df.loc[(df['university_u'] == df['university_v']) & (df['university_u'] != -1), 'university_is_equal'] = 1
    df.loc[(df['university_u'] == -1) | (df['university_v'] == -1), 'university_is_equal'] = -1
    df['university_is_equal'].fillna(0, inplace = True)
    
    df.loc[(df['school_u'] == df['school_v']) & (df['school_u'] != -1), 'school_is_equal'] = 1
    df.loc[(df['school_u'] == -1) | (df['school_v'] == -1), 'school_is_equal'] = -1
    df['school_is_equal'].fillna(0, inplace = True)
    
    df.loc[(df['city_id_u'] == df['city_id_v']) & (df['city_id_u'] != -1), 'city_is_equal'] = 1
    df.loc[(df['city_id_v'] == -1) | (df['city_id_u'] == -1), 'city_is_equal'] = -1
    df['city_is_equal'].fillna(0, inplace = True)
    
    df.loc[(df['sex_u'] == df['sex_v']) & (df['sex_v'] != -1), 'sex_is_equal'] = 1
    df.loc[(df['sex_u'] == -1) | (df['sex_v'] == -1), 'sex_is_equal'] = -1
    df['sex_is_equal'].fillna(0, inplace = True)
    
    df['age_diff'] = df['age_u']-df['age_v']
    df.loc[abs(df['age_diff']) < 1, 'age_diff_1'] = 1
    df.loc[~(abs(df['age_diff']) < 1), 'age_diff_1'] = 0
    
    df.loc[abs(df['age_diff']) < 2, 'age_diff_2'] = 1
    df.loc[~(abs(df['age_diff']) < 2), 'age_diff_2'] = 0
    
    df.loc[abs(df['age_diff']) < 3, 'age_diff_3'] = 1
    df.loc[~(abs(df['age_diff']) < 3), 'age_diff_3'] = 0
    
    df.loc[abs(df['age_diff']) < 4, 'age_diff_4'] = 1
    df.loc[~(abs(df['age_diff']) < 4), 'age_diff_4'] = 0
    
    df.loc[abs(df['age_diff']) < 5, 'age_diff_5'] = 1
    df.loc[~(abs(df['age_diff']) < 5), 'age_diff_5'] = 0
    
    df.loc[abs(df['age_diff']) < 10, 'age_diff_10'] = 1
    df.loc[~(abs(df['age_diff']) < 10), 'age_diff_10'] = 0
    
    df.loc[abs(df['age_diff']) < 15, 'age_diff_15'] = 1
    df.loc[~(abs(df['age_diff']) < 15), 'age_diff_15'] = 0
    
    df.loc[abs(df['age_diff']) < 20, 'age_diff_20'] = 1
    df.loc[~(abs(df['age_diff']) < 20), 'age_diff_20'] = 0
    
    df.loc[abs(df['age_diff']) < 25, 'age_diff_25'] = 1
    df.loc[~(abs(df['age_diff']) < 25), 'age_diff_25'] = 0
    
    df.loc[abs(df['age_diff']) < 30, 'age_diff_30'] = 1
    df.loc[~(abs(df['age_diff']) < 30), 'age_diff_30'] = 0
    
    df.loc[(df['school_is_equal'] == 1) & (df['age_diff_2'] == 1), 'flg_classmates'] = 1
    df.loc[~((df['school_is_equal'] == 1) & (df['age_diff_2'] == 1)), 'flg_classmates'] = 0
    
    df.loc[(df['university_is_equal'] == 1) & (df['age_diff_2'] == 1), 'flg_univermates'] = 1
    df.loc[~((df['university_is_equal'] == 1) & (df['age_diff_2'] == 1)), 'flg_univermates'] = 0
    
    df.loc[df['u']==0, 'from_ego'] = 1
    df.loc[~(df['u']==0), 'from_ego'] = 1
    df.loc[df['v']==0, 'to_ego'] = 1
    df.loc[~(df['v']==0), 'to_ego'] = 1

    df.drop(['university_u', 'university_v', 'city_id_u', 'city_id_v', 'school_u', 'school_v'], axis = 1, inplace = True)
    return df

In [4]:
tech_cols = ['ego_id', 'u', 'v', 'x1']

In [5]:
def train_model(train):
    x = train.drop(tech_cols, axis = 1)
    y = train['x1']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.8, 
                                                        shuffle = True, random_state = 42)
    best_params = calc_hps('LGBM', x_train, y_train, x_test, y_test, 'reg', trials_num = 5)
    model = lgb.LGBMRegressor()
    model.set_params(**best_params)
    model.fit(x_train, y_train)

    rmse_train = mean_squared_error(y_train, model.predict(x_train), squared = False)
    rmse_test = mean_squared_error(y_test, model.predict(x_test), squared = False)
    
    return model, rmse_train, rmse_test

In [6]:
def make_submition():
    df_subm = pd.read_csv('./clean_data/submission.csv')
    test_pred = pd.DataFrame(columns = ['ego_id', 'u', 'v', 'pred'])
    for cluster in tqdm(range(9)):
        _, test, attr = read_data(cluster)
        test = preproc_and_get_features(test, attr)
        model = lgb.Booster(model_file=f'./models/cluster_{cluster}_model.txt')
        test['pred'] = model.predict(test[model.feature_name()])
        test_pred = pd.concat([test_pred, test[['ego_id', 'u', 'v', 'pred']]], axis = 0)
    df_subm = df_subm.merge(test_pred, how = 'left', left_on = ['ego_id', 'u', 'v'],
                           right_on = ['ego_id', 'u', 'v']).drop('x1', axis = 1)\
                .rename(columns = {'pred': 'x1'}).drop_duplicates(['ego_id','u', 'v'], keep='first')[['ego_id', 'u','v', 'x1']]
    df_subm.to_csv('./submitions/submition.csv', index = False)
    return df_subm

In [7]:
models = {}
for cluster in tqdm(range(9)):
    train, test, attr = read_data(cluster)
    train = preproc_and_get_features(train, attr)
    test = preproc_and_get_features(test, attr)
    del attr
    model, rmse_train, rmse_val = train_model(train)
    models[cluster] = model
    model.booster_.save_model(f'./models/cluster_{cluster}_model.txt')
    
    print(f'----- Cluster {cluster} -----')
    print(f'RMSE TRAIN: {rmse_train:.3f}')
    print(f'RMSE VAL: {rmse_val:.3f}')

  0%|          | 0/9 [00:00<?, ?it/s]

[32m[I 2023-05-21 05:39:37,646][0m A new study created in memory with name: no-name-5b80dbb8-87bc-4c28-b69f-9f0e924065c8[0m
[32m[I 2023-05-21 05:39:45,881][0m Trial 0 finished with value: -0.6946038967059192 and parameters: {'max_depth': 5, 'num_leaves': 77, 'n_estimators': 51, 'reg_alpha': 2.0279124388934814e-06, 'reg_lambda': 8.31978304684231e-05, 'min_child_samples': 51, 'min_child_weight': 5.56115387155981}. Best is trial 0 with value: -0.6946038967059192.[0m
[32m[I 2023-05-21 05:40:01,616][0m Trial 1 finished with value: -0.6646759989570522 and parameters: {'max_depth': 10, 'num_leaves': 34, 'n_estimators': 164, 'reg_alpha': 6.50654872235369e-05, 'reg_lambda': 6.800057268745558e-05, 'min_child_samples': 10, 'min_child_weight': 3.8276430589065074}. Best is trial 1 with value: -0.6646759989570522.[0m
[32m[I 2023-05-21 05:40:07,997][0m Trial 2 finished with value: -0.8402371718088049 and parameters: {'max_depth': 1, 'num_leaves': 99, 'n_estimators': 141, 'reg_alpha': 0.003

----- Cluster 0 -----
RMSE TRAIN: 0.608
RMSE VAL: 0.661


[32m[I 2023-05-21 05:41:25,292][0m A new study created in memory with name: no-name-7ecec78f-4d40-43e1-bdec-2ba845aeede6[0m
[32m[I 2023-05-21 05:41:28,367][0m Trial 0 finished with value: -0.7692628526110515 and parameters: {'max_depth': 2, 'num_leaves': 50, 'n_estimators': 158, 'reg_alpha': 0.00042321451389922075, 'reg_lambda': 1.9453873937777516e-07, 'min_child_samples': 29, 'min_child_weight': 7.324401979644921}. Best is trial 0 with value: -0.7692628526110515.[0m
[32m[I 2023-05-21 05:41:36,500][0m Trial 1 finished with value: -0.6848939650917628 and parameters: {'max_depth': 10, 'num_leaves': 89, 'n_estimators': 145, 'reg_alpha': 0.006528053981966564, 'reg_lambda': 1.6221845543658377, 'min_child_samples': 60, 'min_child_weight': 4.20332407239607}. Best is trial 1 with value: -0.6848939650917628.[0m
[32m[I 2023-05-21 05:41:44,179][0m Trial 2 finished with value: -0.6853094378064937 and parameters: {'max_depth': 8, 'num_leaves': 63, 'n_estimators': 172, 'reg_alpha': 0.1790

----- Cluster 1 -----
RMSE TRAIN: 0.611
RMSE VAL: 0.685


[32m[I 2023-05-21 05:43:51,557][0m A new study created in memory with name: no-name-7f501d4f-15d6-4f1f-82b3-9d4a15dd50bf[0m
[32m[I 2023-05-21 05:44:45,659][0m Trial 0 finished with value: -0.6270992909900679 and parameters: {'max_depth': 10, 'num_leaves': 48, 'n_estimators': 136, 'reg_alpha': 0.002758590586016117, 'reg_lambda': 6.330186502947234, 'min_child_samples': 18, 'min_child_weight': 4.917571884382152}. Best is trial 0 with value: -0.6270992909900679.[0m
[32m[I 2023-05-21 05:45:15,186][0m Trial 1 finished with value: -0.7600565210807675 and parameters: {'max_depth': 1, 'num_leaves': 31, 'n_estimators': 174, 'reg_alpha': 0.0006420367303344119, 'reg_lambda': 0.0025840313986855777, 'min_child_samples': 70, 'min_child_weight': 6.973166037054832}. Best is trial 0 with value: -0.6270992909900679.[0m
[32m[I 2023-05-21 05:45:46,842][0m Trial 2 finished with value: -0.6358220082542136 and parameters: {'max_depth': 7, 'num_leaves': 78, 'n_estimators': 56, 'reg_alpha': 0.0002575

----- Cluster 2 -----
RMSE TRAIN: 0.617
RMSE VAL: 0.627


[32m[I 2023-05-21 05:49:05,184][0m A new study created in memory with name: no-name-98bbc715-aa93-4067-a4a5-41771f3df387[0m
[32m[I 2023-05-21 05:49:40,170][0m Trial 0 finished with value: -0.6280703630633573 and parameters: {'max_depth': 9, 'num_leaves': 98, 'n_estimators': 77, 'reg_alpha': 2.622918703137743e-07, 'reg_lambda': 1.1248718039155068e-07, 'min_child_samples': 63, 'min_child_weight': 1.7946358367006463}. Best is trial 0 with value: -0.6280703630633573.[0m
[32m[I 2023-05-21 05:49:58,739][0m Trial 1 finished with value: -0.655983223755255 and parameters: {'max_depth': 7, 'num_leaves': 18, 'n_estimators': 64, 'reg_alpha': 0.019033012384035982, 'reg_lambda': 0.1143062348061024, 'min_child_samples': 21, 'min_child_weight': 1.6056678173047667}. Best is trial 0 with value: -0.6280703630633573.[0m
[32m[I 2023-05-21 05:50:14,101][0m Trial 2 finished with value: -0.6815886027506376 and parameters: {'max_depth': 3, 'num_leaves': 55, 'n_estimators': 77, 'reg_alpha': 0.0031113

----- Cluster 3 -----
RMSE TRAIN: 0.598
RMSE VAL: 0.624


[32m[I 2023-05-21 05:52:52,179][0m A new study created in memory with name: no-name-158ea547-4d1b-4ad9-8592-f08e53c382d6[0m
[32m[I 2023-05-21 05:53:18,221][0m Trial 0 finished with value: -0.651523845750466 and parameters: {'max_depth': 9, 'num_leaves': 75, 'n_estimators': 161, 'reg_alpha': 1.6531294713133527e-05, 'reg_lambda': 0.627829810183127, 'min_child_samples': 38, 'min_child_weight': 0.11472038288336685}. Best is trial 0 with value: -0.651523845750466.[0m
[32m[I 2023-05-21 05:53:35,075][0m Trial 1 finished with value: -0.6579018635504349 and parameters: {'max_depth': 6, 'num_leaves': 51, 'n_estimators': 176, 'reg_alpha': 0.0061946988212798525, 'reg_lambda': 0.002206721657149849, 'min_child_samples': 64, 'min_child_weight': 7.037110570434314}. Best is trial 0 with value: -0.651523845750466.[0m
[32m[I 2023-05-21 05:53:45,187][0m Trial 2 finished with value: -0.69689981439149 and parameters: {'max_depth': 3, 'num_leaves': 61, 'n_estimators': 150, 'reg_alpha': 0.000448177

----- Cluster 4 -----
RMSE TRAIN: 0.597
RMSE VAL: 0.652


[32m[I 2023-05-21 05:56:50,786][0m A new study created in memory with name: no-name-2f0d0527-ae49-4b90-a86c-3f92ee55228f[0m
[32m[I 2023-05-21 05:58:05,666][0m Trial 0 finished with value: -0.6309788272391964 and parameters: {'max_depth': 10, 'num_leaves': 86, 'n_estimators': 109, 'reg_alpha': 0.05343639003627918, 'reg_lambda': 4.178470548413366, 'min_child_samples': 20, 'min_child_weight': 4.060815797788485}. Best is trial 0 with value: -0.6309788272391964.[0m
[32m[I 2023-05-21 05:58:51,415][0m Trial 1 finished with value: -0.6549598380515054 and parameters: {'max_depth': 5, 'num_leaves': 23, 'n_estimators': 90, 'reg_alpha': 0.001956918850194811, 'reg_lambda': 0.00026169461796316396, 'min_child_samples': 54, 'min_child_weight': 8.620905752282066}. Best is trial 0 with value: -0.6309788272391964.[0m
[32m[I 2023-05-21 05:59:39,583][0m Trial 2 finished with value: -0.6512874116877844 and parameters: {'max_depth': 4, 'num_leaves': 23, 'n_estimators': 184, 'reg_alpha': 6.45864309

----- Cluster 5 -----
RMSE TRAIN: 0.610
RMSE VAL: 0.631


[32m[I 2023-05-21 06:02:32,184][0m A new study created in memory with name: no-name-c17b4ef6-f2c5-4cab-9481-49d29b0a0d7e[0m
[32m[I 2023-05-21 06:02:46,868][0m Trial 0 finished with value: -0.6502562786949133 and parameters: {'max_depth': 7, 'num_leaves': 34, 'n_estimators': 107, 'reg_alpha': 5.1502187112246354e-05, 'reg_lambda': 8.042705287944094e-07, 'min_child_samples': 51, 'min_child_weight': 4.136549631706926}. Best is trial 0 with value: -0.6502562786949133.[0m
[32m[I 2023-05-21 06:02:55,628][0m Trial 1 finished with value: -0.722590591188346 and parameters: {'max_depth': 2, 'num_leaves': 82, 'n_estimators': 132, 'reg_alpha': 0.012197883875131809, 'reg_lambda': 3.582107327827566e-06, 'min_child_samples': 67, 'min_child_weight': 0.29454269072538536}. Best is trial 0 with value: -0.6502562786949133.[0m
[32m[I 2023-05-21 06:03:08,845][0m Trial 2 finished with value: -0.6543685885483624 and parameters: {'max_depth': 5, 'num_leaves': 77, 'n_estimators': 130, 'reg_alpha': 3.3

----- Cluster 6 -----
RMSE TRAIN: 0.637
RMSE VAL: 0.650


[32m[I 2023-05-21 06:05:20,898][0m A new study created in memory with name: no-name-a97c35f4-b61b-46c5-bc00-55ee4a8167b6[0m
[32m[I 2023-05-21 06:05:54,237][0m Trial 0 finished with value: -0.7852189901624597 and parameters: {'max_depth': 1, 'num_leaves': 36, 'n_estimators': 117, 'reg_alpha': 0.00016281837657437873, 'reg_lambda': 1.2828194536079013e-07, 'min_child_samples': 10, 'min_child_weight': 3.8680849691169845}. Best is trial 0 with value: -0.7852189901624597.[0m
[32m[I 2023-05-21 06:06:36,488][0m Trial 1 finished with value: -0.6400988693405734 and parameters: {'max_depth': 6, 'num_leaves': 89, 'n_estimators': 82, 'reg_alpha': 2.8190045457371753e-05, 'reg_lambda': 7.61616333967245e-07, 'min_child_samples': 23, 'min_child_weight': 2.0408221840806284}. Best is trial 1 with value: -0.6400988693405734.[0m
[32m[I 2023-05-21 06:07:20,565][0m Trial 2 finished with value: -0.6391493379787145 and parameters: {'max_depth': 9, 'num_leaves': 29, 'n_estimators': 109, 'reg_alpha': 2

----- Cluster 7 -----
RMSE TRAIN: 0.631
RMSE VAL: 0.639


[32m[I 2023-05-21 06:09:30,597][0m A new study created in memory with name: no-name-e92f82cd-c169-454f-9911-124f75661d6e[0m
[32m[I 2023-05-21 06:09:36,053][0m Trial 0 finished with value: -0.6592336258876853 and parameters: {'max_depth': 7, 'num_leaves': 53, 'n_estimators': 177, 'reg_alpha': 1.8996414067427207e-06, 'reg_lambda': 0.1006861986736221, 'min_child_samples': 51, 'min_child_weight': 1.8336229123970575}. Best is trial 0 with value: -0.6592336258876853.[0m
[32m[I 2023-05-21 06:09:39,929][0m Trial 1 finished with value: -0.6563610688653171 and parameters: {'max_depth': 9, 'num_leaves': 81, 'n_estimators': 98, 'reg_alpha': 0.6573537329229889, 'reg_lambda': 8.211224931841161e-06, 'min_child_samples': 29, 'min_child_weight': 0.9771519241303217}. Best is trial 1 with value: -0.6563610688653171.[0m
[32m[I 2023-05-21 06:09:42,747][0m Trial 2 finished with value: -0.6728359688939499 and parameters: {'max_depth': 5, 'num_leaves': 90, 'n_estimators': 100, 'reg_alpha': 0.275440

----- Cluster 8 -----
RMSE TRAIN: 0.570
RMSE VAL: 0.656


In [8]:
submition = make_submition()

  0%|          | 0/9 [00:00<?, ?it/s]