In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from training.trainer import *
import optuna
from tqdm.notebook import tqdm

In [2]:
def read_data(cluster_num):
    df_train = pd.read_parquet(f'./clean_data/train/cluster_{cluster_num}.gz')
    df_test = pd.read_parquet(f'./clean_data/test/cluster_{cluster_num}.gz')
    df_attr = pd.read_csv('./clean_data/attr.csv')
    # soc-dem features u
    df_train = df_train.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'u'],
                              right_on = ['ego_id', 'u']).rename(
                                                                columns = {
                                                                    'school':     'school_u',
                                                                    'university': 'university_u',
                                                                    'sex':        'sex_u',
                                                                    'city_id':    'city_id_u',
                                                                    'age':        'age_u'
                                                                    }
                                                                )
    df_test = df_test.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'u'],
                              right_on = ['ego_id', 'u']).rename(
                                                                columns = {
                                                                    'school':     'school_u',
                                                                    'university': 'university_u',
                                                                    'sex':        'sex_u',
                                                                    'city_id':    'city_id_u',
                                                                    'age':        'age_u'
                                                                    }
                                                                )
    # soc-dem features v
    df_train = df_train.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'v'],
                              right_on = ['ego_id', 'u']).drop(['u_y'], axis = 1).rename(
                                                                columns = {
                                                                    'school':     'school_v',
                                                                    'university': 'university_v',
                                                                    'sex':        'sex_v',
                                                                    'city_id':    'city_id_v',
                                                                    'age':        'age_v',
                                                                    'u_x': 'u'
                                                                    }
                                                                )
    df_test = df_test.merge(df_attr, 
                              how = 'left', 
                              left_on = ['ego_id', 'v'],
                              right_on = ['ego_id', 'u']).drop(['u_y'], axis = 1).rename(
                                                                columns = {
                                                                    'school':     'school_v',
                                                                    'university': 'university_v',
                                                                    'sex':        'sex_v',
                                                                    'city_id':    'city_id_v',
                                                                    'age':        'age_v',
                                                                    'u_x': 'u'
                                                                    }
                                                                )
    return df_train, df_test, df_attr

In [3]:
def preproc_and_get_features(df, df_attr):
    for i in ['city_id_u', 'city_id_v', 'sex_u', 'sex_v', 'school_u', 'university_u', 'university_v', 'school_v', 't']:
        df[i].fillna(-1, inplace = True)
    iqr = np.quantile(df_attr['age'], 0.75)-np.quantile(df_attr['age'], .25)
    m_age = df_attr[~(df_attr['age'].isna())]['age'].median()
    df['age_u'].fillna(m_age, inplace = True)
    df['age_v'].fillna(m_age, inplace = True)
    df.loc[df['age_u']>m_age+1.5*iqr, 'age_u'] = m_age
    df.loc[df['age_v']>m_age+1.5*iqr, 'age_v'] = m_age

    df.loc[(df['university_u'] == df['university_v']) & (df['university_u'] != -1), 'university_is_equal'] = 1
    df.loc[(df['university_u'] == -1) | (df['university_v'] == -1), 'university_is_equal'] = -1
    df['university_is_equal'].fillna(0, inplace = True)
    
    df.loc[(df['school_u'] == df['school_v']) & (df['school_u'] != -1), 'school_is_equal'] = 1
    df.loc[(df['school_u'] == -1) | (df['school_v'] == -1), 'school_is_equal'] = -1
    df['school_is_equal'].fillna(0, inplace = True)
    
    df.loc[(df['city_id_u'] == df['city_id_v']) & (df['city_id_u'] != -1), 'city_is_equal'] = 1
    df.loc[(df['city_id_v'] == -1) | (df['city_id_u'] == -1), 'city_is_equal'] = -1
    df['city_is_equal'].fillna(0, inplace = True)
    
    df.loc[(df['sex_u'] == df['sex_v']) & (df['sex_v'] != -1), 'sex_is_equal'] = 1
    df.loc[(df['sex_u'] == -1) | (df['sex_v'] == -1), 'sex_is_equal'] = -1
    df['sex_is_equal'].fillna(0, inplace = True)
    
    df['age_diff'] = df['age_u']-df['age_v']
    df.loc[abs(df['age_diff']) < 1, 'age_diff_1'] = 1
    df.loc[~(abs(df['age_diff']) < 1), 'age_diff_1'] = 0
    
    df.loc[abs(df['age_diff']) < 2, 'age_diff_2'] = 1
    df.loc[~(abs(df['age_diff']) < 2), 'age_diff_2'] = 0
    
    df.loc[abs(df['age_diff']) < 3, 'age_diff_3'] = 1
    df.loc[~(abs(df['age_diff']) < 3), 'age_diff_3'] = 0
    
    df.loc[abs(df['age_diff']) < 4, 'age_diff_4'] = 1
    df.loc[~(abs(df['age_diff']) < 4), 'age_diff_4'] = 0
    
    df.loc[abs(df['age_diff']) < 5, 'age_diff_5'] = 1
    df.loc[~(abs(df['age_diff']) < 5), 'age_diff_5'] = 0
    
    df.loc[abs(df['age_diff']) < 10, 'age_diff_10'] = 1
    df.loc[~(abs(df['age_diff']) < 10), 'age_diff_10'] = 0
    
    df.loc[abs(df['age_diff']) < 15, 'age_diff_15'] = 1
    df.loc[~(abs(df['age_diff']) < 15), 'age_diff_15'] = 0
    
    df.loc[abs(df['age_diff']) < 20, 'age_diff_20'] = 1
    df.loc[~(abs(df['age_diff']) < 20), 'age_diff_20'] = 0
    
    df.loc[abs(df['age_diff']) < 25, 'age_diff_25'] = 1
    df.loc[~(abs(df['age_diff']) < 25), 'age_diff_25'] = 0
    
    df.loc[abs(df['age_diff']) < 30, 'age_diff_30'] = 1
    df.loc[~(abs(df['age_diff']) < 30), 'age_diff_30'] = 0
    
    df.loc[(df['school_is_equal'] == 1) & (df['age_diff_2'] == 1), 'flg_classmates'] = 1
    df.loc[~((df['school_is_equal'] == 1) & (df['age_diff_2'] == 1)), 'flg_classmates'] = 0
    
    df.loc[(df['university_is_equal'] == 1) & (df['age_diff_2'] == 1), 'flg_univermates'] = 1
    df.loc[~((df['university_is_equal'] == 1) & (df['age_diff_2'] == 1)), 'flg_univermates'] = 0
    
    df.loc[df['u']==0, 'from_ego'] = 1
    df.loc[~(df['u']==0), 'from_ego'] = 1
    df.loc[df['v']==0, 'to_ego'] = 1
    df.loc[~(df['v']==0), 'to_ego'] = 1

    df.drop(['university_u', 'university_v', 'city_id_u', 'city_id_v', 'school_u', 'school_v'], axis = 1, inplace = True)
    return df

In [4]:
tech_cols = ['ego_id', 'u', 'v', 'x1']

In [5]:
def train_model(train):
    x = train.drop(tech_cols, axis = 1)
    y = train['x1']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.8, 
                                                        shuffle = True, random_state = 42)
    best_params = calc_hps('LGBM', x_train, y_train, 'reg', trials_num = 5)
    model = lgb.LGBMRegressor()
    model.set_params(**best_params)
    model.fit(x_train, y_train)

    rmse_train = mean_squared_error(y_train, model.predict(x_train), squared = False)
    rmse_test = mean_squared_error(y_test, model.predict(x_test), squared = False)
    
    return model, rmse_train, rmse_test

In [6]:
def make_submition():
    df_subm = pd.read_csv('./clean_data/submission.csv')
    test_pred = pd.DataFrame(columns = ['ego_id', 'u', 'v', 'pred'])
    for cluster in tqdm(range(9)):
        _, test, attr = read_data(cluster)
        test = preproc_and_get_features(test, attr)
        model = lgb.Booster(model_file=f'./models/cluster_{cluster}_model.txt')
        test['pred'] = model.predict(test[model.feature_name()])
        test_pred = pd.concat([test_pred, test[['ego_id', 'u', 'v', 'pred']]], axis = 0)
    df_subm = df_subm.merge(test_pred, how = 'left', left_on = ['ego_id', 'u', 'v'],
                           right_on = ['ego_id', 'u', 'v']).drop('x1', axis = 1)\
                .rename(columns = {'pred': 'x1'}).drop_duplicates(['ego_id','u', 'v'], keep='first')[['ego_id', 'u','v', 'x1']]
    df_subm.to_csv('./submitions/submition.csv', index = False)
    return df_subm

In [7]:
models = {}
for cluster in tqdm(range(9)):
    train, test, attr = read_data(cluster)
    train = preproc_and_get_features(train, attr)
    test = preproc_and_get_features(test, attr)
    del attr
    model, rmse_train, rmse_val = train_model(train)
    models[cluster] = model
    model.booster_.save_model(f'./models/cluster_{cluster}_model.txt')
    
    print(f'----- Cluster {cluster} -----')
    print(f'RMSE TRAIN: {rmse_train:.3f}')
    print(f'RMSE VAL: {rmse_val:.3f}')

  0%|          | 0/9 [00:00<?, ?it/s]

[32m[I 2023-05-21 04:56:38,456][0m A new study created in memory with name: no-name-3d0da58c-ff66-4d57-9de2-01edc7809bec[0m
[32m[I 2023-05-21 04:56:46,433][0m Trial 0 finished with value: -0.6038514067729256 and parameters: {'max_depth': 10, 'num_leaves': 98, 'n_estimators': 147, 'reg_alpha': 0.4181207578712726, 'reg_lambda': 2.6913857515375475e-07, 'min_child_samples': 37, 'min_child_weight': 1.9597272946419337}. Best is trial 0 with value: -0.6038514067729256.[0m
[32m[I 2023-05-21 04:56:49,580][0m Trial 1 finished with value: -0.696370783023756 and parameters: {'max_depth': 4, 'num_leaves': 28, 'n_estimators': 69, 'reg_alpha': 0.029837007071720243, 'reg_lambda': 1.8631412772492373, 'min_child_samples': 26, 'min_child_weight': 9.712061172465253}. Best is trial 0 with value: -0.6038514067729256.[0m
[32m[I 2023-05-21 04:56:53,521][0m Trial 2 finished with value: -0.700024309639072 and parameters: {'max_depth': 3, 'num_leaves': 64, 'n_estimators': 129, 'reg_alpha': 0.003286505

----- Cluster 0 -----
RMSE TRAIN: 0.604
RMSE VAL: 0.657


[32m[I 2023-05-21 04:57:50,558][0m A new study created in memory with name: no-name-261c504d-52fa-4413-b8f7-ded3110b616d[0m
[32m[I 2023-05-21 04:57:53,608][0m Trial 0 finished with value: -0.6248107670668874 and parameters: {'max_depth': 9, 'num_leaves': 82, 'n_estimators': 111, 'reg_alpha': 0.004983254087499036, 'reg_lambda': 0.004028405287950032, 'min_child_samples': 46, 'min_child_weight': 2.6613185072808148}. Best is trial 0 with value: -0.6248107670668874.[0m
[32m[I 2023-05-21 04:57:57,076][0m Trial 1 finished with value: -0.623971076996371 and parameters: {'max_depth': 8, 'num_leaves': 92, 'n_estimators': 140, 'reg_alpha': 0.09196685384261512, 'reg_lambda': 1.1880037706600035, 'min_child_samples': 57, 'min_child_weight': 0.8884351732088747}. Best is trial 1 with value: -0.623971076996371.[0m
[32m[I 2023-05-21 04:57:58,841][0m Trial 2 finished with value: -0.7038454737355568 and parameters: {'max_depth': 4, 'num_leaves': 20, 'n_estimators': 107, 'reg_alpha': 1.679942285

----- Cluster 1 -----
RMSE TRAIN: 0.600
RMSE VAL: 0.685


[32m[I 2023-05-21 04:59:58,520][0m A new study created in memory with name: no-name-6e8f0839-fbd8-445b-8728-bc1e224de7f3[0m
[32m[I 2023-05-21 05:00:08,452][0m Trial 0 finished with value: -0.7086821055703671 and parameters: {'max_depth': 2, 'num_leaves': 39, 'n_estimators': 100, 'reg_alpha': 3.8159380543001022, 'reg_lambda': 7.455000359642387e-05, 'min_child_samples': 52, 'min_child_weight': 4.830878466254876}. Best is trial 0 with value: -0.7086821055703671.[0m
[32m[I 2023-05-21 05:00:18,019][0m Trial 1 finished with value: -0.6989238098351263 and parameters: {'max_depth': 2, 'num_leaves': 60, 'n_estimators': 149, 'reg_alpha': 0.00025972828926772845, 'reg_lambda': 5.413502477457515e-06, 'min_child_samples': 56, 'min_child_weight': 0.19572671690008792}. Best is trial 1 with value: -0.6989238098351263.[0m
[32m[I 2023-05-21 05:00:31,066][0m Trial 2 finished with value: -0.6248871347892238 and parameters: {'max_depth': 6, 'num_leaves': 74, 'n_estimators': 99, 'reg_alpha': 4.638

----- Cluster 2 -----
RMSE TRAIN: 0.625
RMSE VAL: 0.633


[32m[I 2023-05-21 05:02:41,815][0m A new study created in memory with name: no-name-b7a9ebb9-f75e-4587-9487-d7c0d052e896[0m
[32m[I 2023-05-21 05:03:01,442][0m Trial 0 finished with value: -0.5983089432214226 and parameters: {'max_depth': 9, 'num_leaves': 69, 'n_estimators': 195, 'reg_alpha': 2.3111543836469234e-08, 'reg_lambda': 0.007087967274114281, 'min_child_samples': 67, 'min_child_weight': 4.82224448186533}. Best is trial 0 with value: -0.5983089432214226.[0m
[32m[I 2023-05-21 05:03:14,806][0m Trial 1 finished with value: -0.6171314524080461 and parameters: {'max_depth': 9, 'num_leaves': 31, 'n_estimators': 173, 'reg_alpha': 0.12250037011291531, 'reg_lambda': 0.0002882476836523549, 'min_child_samples': 12, 'min_child_weight': 5.093923499394223}. Best is trial 0 with value: -0.5983089432214226.[0m
[32m[I 2023-05-21 05:03:21,564][0m Trial 2 finished with value: -0.7935095235150867 and parameters: {'max_depth': 1, 'num_leaves': 85, 'n_estimators': 110, 'reg_alpha': 4.22026

----- Cluster 3 -----
RMSE TRAIN: 0.598
RMSE VAL: 0.623


[32m[I 2023-05-21 05:05:18,788][0m A new study created in memory with name: no-name-024001f9-454f-44fd-b62a-740d68af134c[0m
[32m[I 2023-05-21 05:05:22,034][0m Trial 0 finished with value: -0.6581402600790698 and parameters: {'max_depth': 5, 'num_leaves': 84, 'n_estimators': 84, 'reg_alpha': 0.009965678754213968, 'reg_lambda': 0.0008126149950126926, 'min_child_samples': 33, 'min_child_weight': 2.8912422996364966}. Best is trial 0 with value: -0.6581402600790698.[0m
[32m[I 2023-05-21 05:05:24,691][0m Trial 1 finished with value: -0.8104727019982547 and parameters: {'max_depth': 1, 'num_leaves': 70, 'n_estimators': 175, 'reg_alpha': 0.04871023028363573, 'reg_lambda': 8.865464139929352e-08, 'min_child_samples': 59, 'min_child_weight': 7.937777603439409}. Best is trial 0 with value: -0.6581402600790698.[0m
[32m[I 2023-05-21 05:05:27,343][0m Trial 2 finished with value: -0.7036617779087556 and parameters: {'max_depth': 3, 'num_leaves': 82, 'n_estimators': 86, 'reg_alpha': 0.043189

----- Cluster 4 -----
RMSE TRAIN: 0.586
RMSE VAL: 0.652


[32m[I 2023-05-21 05:07:35,029][0m A new study created in memory with name: no-name-24f24f53-18a4-4424-a7df-a3e8a0dac4ac[0m
[32m[I 2023-05-21 05:07:43,117][0m Trial 0 finished with value: -0.7201833471711339 and parameters: {'max_depth': 2, 'num_leaves': 24, 'n_estimators': 93, 'reg_alpha': 0.5011408643441198, 'reg_lambda': 0.0002470792677914736, 'min_child_samples': 20, 'min_child_weight': 7.895454970673182}. Best is trial 0 with value: -0.7201833471711339.[0m
[32m[I 2023-05-21 05:08:00,118][0m Trial 1 finished with value: -0.6111335802939656 and parameters: {'max_depth': 8, 'num_leaves': 71, 'n_estimators': 147, 'reg_alpha': 3.4620601139129164e-07, 'reg_lambda': 2.532856740850941e-06, 'min_child_samples': 50, 'min_child_weight': 3.9199016765992902}. Best is trial 1 with value: -0.6111335802939656.[0m
[32m[I 2023-05-21 05:08:10,116][0m Trial 2 finished with value: -0.6658819797467617 and parameters: {'max_depth': 3, 'num_leaves': 29, 'n_estimators': 178, 'reg_alpha': 5.7292

----- Cluster 5 -----
RMSE TRAIN: 0.611
RMSE VAL: 0.630


[32m[I 2023-05-21 05:10:06,456][0m A new study created in memory with name: no-name-781c76cd-59cd-44fe-9d26-6f3fecb267c4[0m
[32m[I 2023-05-21 05:10:14,690][0m Trial 0 finished with value: -0.6418325788156799 and parameters: {'max_depth': 7, 'num_leaves': 26, 'n_estimators': 115, 'reg_alpha': 1.3367404125029252e-05, 'reg_lambda': 0.9157312752220751, 'min_child_samples': 44, 'min_child_weight': 4.682278337574818}. Best is trial 0 with value: -0.6418325788156799.[0m
[32m[I 2023-05-21 05:10:25,170][0m Trial 1 finished with value: -0.6268075207021075 and parameters: {'max_depth': 6, 'num_leaves': 45, 'n_estimators': 160, 'reg_alpha': 1.2420998051440284e-06, 'reg_lambda': 2.374077145907645, 'min_child_samples': 67, 'min_child_weight': 2.112672308782514}. Best is trial 1 with value: -0.6268075207021075.[0m
[32m[I 2023-05-21 05:10:31,591][0m Trial 2 finished with value: -0.6360274195087404 and parameters: {'max_depth': 6, 'num_leaves': 38, 'n_estimators': 106, 'reg_alpha': 0.0089138

----- Cluster 6 -----
RMSE TRAIN: 0.627
RMSE VAL: 0.646


[32m[I 2023-05-21 05:12:43,717][0m A new study created in memory with name: no-name-f1abc3bf-e575-469f-9660-68ffa207ca35[0m
[32m[I 2023-05-21 05:13:02,487][0m Trial 0 finished with value: -0.6036868517234363 and parameters: {'max_depth': 10, 'num_leaves': 99, 'n_estimators': 109, 'reg_alpha': 0.013261049818641919, 'reg_lambda': 0.5796967997707857, 'min_child_samples': 15, 'min_child_weight': 9.492703860625026}. Best is trial 0 with value: -0.6036868517234363.[0m
[32m[I 2023-05-21 05:13:14,085][0m Trial 1 finished with value: -0.650901798317116 and parameters: {'max_depth': 4, 'num_leaves': 24, 'n_estimators': 136, 'reg_alpha': 3.130349097950584e-05, 'reg_lambda': 5.677670011742192e-06, 'min_child_samples': 11, 'min_child_weight': 6.549609550952463}. Best is trial 0 with value: -0.6036868517234363.[0m
[32m[I 2023-05-21 05:13:22,919][0m Trial 2 finished with value: -0.7080096578198266 and parameters: {'max_depth': 2, 'num_leaves': 26, 'n_estimators': 106, 'reg_alpha': 0.066402

----- Cluster 7 -----
RMSE TRAIN: 0.604
RMSE VAL: 0.627


[32m[I 2023-05-21 05:15:17,623][0m A new study created in memory with name: no-name-b8d8f7e8-b831-4103-8e07-7a815ab3f9ae[0m
[32m[I 2023-05-21 05:15:20,110][0m Trial 0 finished with value: -0.5564562508621911 and parameters: {'max_depth': 9, 'num_leaves': 78, 'n_estimators': 130, 'reg_alpha': 0.13553984668324973, 'reg_lambda': 0.00040492869611053763, 'min_child_samples': 29, 'min_child_weight': 5.715421368431086}. Best is trial 0 with value: -0.5564562508621911.[0m
[32m[I 2023-05-21 05:15:21,500][0m Trial 1 finished with value: -0.6610517884044214 and parameters: {'max_depth': 4, 'num_leaves': 43, 'n_estimators': 124, 'reg_alpha': 0.016237629257358752, 'reg_lambda': 1.2346426267112125, 'min_child_samples': 40, 'min_child_weight': 6.24102470353816}. Best is trial 0 with value: -0.5564562508621911.[0m
[32m[I 2023-05-21 05:15:22,690][0m Trial 2 finished with value: -0.6899215095149245 and parameters: {'max_depth': 3, 'num_leaves': 69, 'n_estimators': 117, 'reg_alpha': 2.89718424

----- Cluster 8 -----
RMSE TRAIN: 0.556
RMSE VAL: 0.656


In [8]:
submition = make_submition()

  0%|          | 0/9 [00:00<?, ?it/s]