In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
class DiGraph:
    """Minimal weighted directed graph stored as an edge list.

    Vertices are non-negative integer ids; the vertex count is inferred from
    the largest id seen in any added edge.
    """

    def __init__(self) -> None:
        # Largest vertex id seen so far; -1 means "no edges yet".
        self.max_v = -1
        # Edges in insertion order as (u, v, x) tuples.
        self.edges = []

    def add_edge(self, u: int, v: int, x: float) -> None:
        """Record a directed edge ``u -> v`` carrying the value ``x``."""
        self.max_v = max(self.max_v, u, v)
        self.edges.append((u, v, x))

    def get_n(self) -> int:
        """Return the number of vertices (largest vertex id + 1)."""
        return self.max_v + 1

    def get_m(self) -> int:
        """Return the number of stored edges (duplicates are counted)."""
        return len(self.edges)

    def to_array(self) -> np.ndarray:
        """Return a dense ``(n, n)`` float32 adjacency matrix.

        Entry ``[u, v]`` holds ``1 + x`` for the edge ``u -> v`` and ``0`` where
        no edge was added, so an edge whose value is ``-1`` is also stored as
        ``0``. If the same ``(u, v)`` pair was added more than once, the value
        added last wins.
        """
        n = self.get_n()
        g = np.zeros((n, n), np.float32)
        for u, v, x in self.edges:
            g[u, v] = 1 + x
        return g

In [3]:
def read_ego_net(ego_net_path):
    """Stream ego networks, one at a time, from a comma-separated edge file.

    The first line of the file is skipped as a header. Every other line is
    split on commas: field 0 is the ego id, fields 1 and 2 are the edge
    endpoints, field 3 feeds the "time" graph (an empty field is stored as
    -1) and fields 5 and 6 feed the "x2" and "x3" graphs.

    Rows belonging to one ego must be contiguous and ego ids must increase
    from one group to the next; this is checked with an ``assert``.

    Yields:
        ``(ego_id, x2_graph, x3_graph, time_graph)`` tuples of ``DiGraph``
        objects, one per ego id, emitted when the next ego id begins and once
        more at end of file for the last group.
    """
    active_id = -1
    graph_x2 = graph_x3 = graph_time = None
    with open(ego_net_path, 'r') as handle:
        handle.readline()  # header row
        for row in handle:
            fields = row.split(',')
            ego_id = int(fields[0])
            u = int(fields[1])
            v = int(fields[2])
            x2 = float(fields[5])
            x3 = float(fields[6])

            if ego_id != active_id:
                # A new group starts: flush the finished one (if any) first.
                if active_id != -1:
                    yield active_id, graph_x2, graph_x3, graph_time
                assert active_id < ego_id
                active_id = ego_id
                graph_x2, graph_x3, graph_time = DiGraph(), DiGraph(), DiGraph()

            graph_x2.add_edge(u, v, x2)
            graph_x3.add_edge(u, v, x3)
            # An empty time field is recorded with the sentinel value -1.
            time_value = -1 if fields[3] == "" else float(fields[3])
            graph_time.add_edge(u, v, time_value)

        # Flush the last group; nothing is yielded for a file with no data rows.
        if active_id != -1:
            yield active_id, graph_x2, graph_x3, graph_time

In [4]:
# Load only the id, endpoint and target columns of the training edges,
# indexed by ego id so one ego's rows can be selected by label.
train = pd.read_csv(
    './data/train.csv',
    usecols=['ego_id', 'u', 'v', 'x1'],
    index_col='ego_id',
    dtype={
        'ego_id': 'int64',
        'u': 'int32',
        'v': 'int32',
        'x1': 'float32',
    },
)
train

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,131,84,5.669200e-07
0,135,164,6.246274e-02
0,47,15,0.000000e+00
0,5,4,4.962974e-02
0,176,219,1.237935e+00
...,...,...,...
1709396984692,3,5,2.307750e+00
1709396984692,1,5,3.729143e+00
1709396984692,1,7,4.286984e+00
1709396984692,5,11,3.500757e+00


In [5]:
# Load only the id, endpoint and target columns of the test edges,
# indexed by ego id so one ego's rows can be selected by label.
test = pd.read_csv(
    './data/test.csv',
    usecols=['ego_id', 'u', 'v', 'x1'],
    index_col='ego_id',
    dtype={
        'ego_id': 'int64',
        'u': 'int32',
        'v': 'int32',
        'x1': 'float32',
    },
)
test

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,20,19,3.839089e-04
8,131,125,4.034464e-01
8,73,56,8.554643e-05
8,0,4,2.886418e-01
8,63,73,4.281692e-07
...,...,...,...
1709396984676,89,0,1.167843e+00
1709396984676,84,87,1.179100e-06
1709396984676,8,18,1.175182e+00
1709396984676,33,20,5.511019e-01


In [6]:
def _add_weight_features(X, g, u, v, name):
    """Append the six per-edge features derived from one dense weight matrix.

    Adds, in this order: ``name`` (the entry ``g[u, v]``), the row sums and row
    maxima of ``g`` for ``u`` and for ``v``, and ``SC_<name>``, the ``[u, v]``
    entry of ``g @ g.T``.
    """
    X[name] = g[u, v]

    row_sum = g.sum(axis=1)
    X[f'u_{name}_sum'] = row_sum[u]
    X[f'v_{name}_sum'] = row_sum[v]

    row_max = g.max(axis=1)
    X[f'u_{name}_max'] = row_max[u]
    X[f'v_{name}_max'] = row_max[v]

    # Index the product directly instead of flattening it and computing
    # u * n + v, which copies the n*n matrix and does the arithmetic in the
    # (possibly int32) dtype of the id columns.
    X[f'SC_{name}'] = g.dot(g.T)[u, v]


def build_X(X, ego_net_x2, ego_net_x3, ego_net_time):
    """Add graph-derived feature columns to ``X`` in place.

    Args:
        X: DataFrame with integer columns ``u`` and ``v`` holding the endpoints
            of each row's edge. It is mutated; nothing is returned.
        ego_net_x2, ego_net_x3, ego_net_time: graph objects exposing
            ``to_array()``, which must return a square matrix indexable by the
            ids in ``X``.

    Columns are appended in a fixed order: the time features first, then the
    x2 features, then the x3 features.
    """
    u = X['u'].to_numpy()
    v = X['v'].to_numpy()

    # --- time graph -------------------------------------------------------
    g = ego_net_time.to_array()

    X['t'] = g[u, v]
    X['not_frens'] = (X['t'] == 0)

    # Entries linking each endpoint to vertex 0.
    X['u_time_0'] = g[u, 0]
    X['v_time_0'] = g[v, 0]

    # Reciprocal of every positive entry; other entries are left untouched.
    inv = g.copy()
    positive = inv > 0
    inv[positive] = 1 / inv[positive]
    inv_sum = inv.sum(axis=1)
    X['u_t_sum'] = inv_sum[u]
    X['v_t_sum'] = inv_sum[v]

    # Zeros are replaced by the global maximum so they never win a minimum.
    t_min = g.copy()
    t_min[t_min == 0] = t_min.max()
    X['min_time'] = t_min.min()
    t_min = t_min.min(axis=1)
    X['u_t_min'] = t_min[u]
    X['v_t_min'] = t_min[v]

    X['SC_time'] = inv.dot(inv.T)[u, v]

    # --- x2 / x3 graphs ---------------------------------------------------
    _add_weight_features(X, ego_net_x2.to_array(), u, v, 'x2')
    _add_weight_features(X, ego_net_x3.to_array(), u, v, 'x3')

In [7]:
# Build one feature frame per ego network of the training file.
train_df = []
for ego_id, ego_net_x2, ego_net_x3, ego_net_time in tqdm(
    read_ego_net('./data/train.csv'),
    total=train.index.nunique(),  # one iteration per distinct ego id
):
    # A label slice always returns a DataFrame; `train.loc[ego_id]` would
    # return a Series for an ego with a single row and break `build_X`.
    # Slicing is valid because the file (and so the index) is ordered by
    # ego id, which `read_ego_net` asserts.
    X = train.loc[ego_id:ego_id].reset_index()
    build_X(X, ego_net_x2, ego_net_x3, ego_net_time)
    train_df.append(X)

  0%|          | 0/61786 [00:00<?, ?it/s]

In [8]:
# Stack the per-ego frames into one table. Each piece carries its own
# 0..k index, so ask for a fresh unique index instead of keeping duplicates.
# NOTE: this rebinds `train_df` from a list to a DataFrame, so the cell
# cannot be re-run without re-running the cell that builds the list.
train_df = pd.concat(train_df, ignore_index=True)
train_df

Unnamed: 0,ego_id,u,v,x1,t,not_frens,u_time_0,v_time_0,u_t_sum,v_t_sum,...,v_x2_sum,u_x2_max,v_x2_max,SC_x2,x3,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
0,0,131,84,5.669200e-07,149.000000,False,31.700001,31.600000,0.072108,0.125051,...,10.000000,1.000000,1.000000,2.000000,1.0,7.0,10.0,1.0,1.0,2.0
1,0,135,164,6.246274e-02,397.700012,False,337.799988,0.000000,0.010631,0.012619,...,5.000000,1.000000,1.000000,2.000000,1.0,4.0,5.0,1.0,1.0,2.0
2,0,47,15,0.000000e+00,0.000000,True,0.000000,381.600006,0.021978,0.006425,...,3.000000,1.000000,1.000000,0.000000,2.0,6.0,3.0,2.0,1.0,0.0
3,0,5,4,4.962974e-02,595.500000,False,595.500000,595.500000,0.166992,0.029118,...,10.000000,1.693147,1.000000,7.693147,1.0,24.0,10.0,1.0,1.0,7.0
4,0,176,219,1.237935e+00,46.500000,False,0.000000,125.000000,0.226234,0.238561,...,25.247082,3.995732,6.860786,11.860786,1.0,19.0,24.0,2.0,2.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,0,253,158,1.497307e-01,210.800003,False,211.000000,0.000000,0.015020,0.018386,...,4.000000,1.000000,1.000000,1.000000,2.0,7.0,4.0,2.0,1.0,1.0
1638,0,259,244,1.827714e+00,122.699997,False,0.000000,107.300003,0.097852,0.102518,...,15.442652,6.897154,3.833213,5.944439,2.0,15.0,15.0,2.0,2.0,7.0
1639,0,209,127,6.636844e-02,123.400002,False,0.000000,125.000000,0.182195,0.200175,...,27.579250,5.454347,3.890372,8.609438,1.0,19.0,34.0,1.0,2.0,13.0
1640,0,187,241,3.756446e+00,117.500000,False,0.000000,0.000000,0.091554,0.287136,...,21.988985,6.961005,4.295837,11.289363,1.0,10.0,18.0,1.0,1.0,4.0


In [9]:
# Build one feature frame per ego network of the test file.
test_df = []
for ego_id, ego_net_x2, ego_net_x3, ego_net_time in tqdm(
    read_ego_net('./data/test.csv'),
    total=test.index.nunique(),  # one iteration per distinct ego id
):
    # A label slice always returns a DataFrame; `test.loc[ego_id]` would
    # return a Series for an ego with a single row and break `build_X`.
    # Slicing is valid because the file (and so the index) is ordered by
    # ego id, which `read_ego_net` asserts.
    X = test.loc[ego_id:ego_id].reset_index()
    build_X(X, ego_net_x2, ego_net_x3, ego_net_time)
    test_df.append(X)

  0%|          | 0/20596 [00:00<?, ?it/s]

In [10]:
# Stack the per-ego frames into one table. Each piece carries its own
# 0..k index, so ask for a fresh unique index instead of keeping duplicates.
# NOTE: this rebinds `test_df` from a list to a DataFrame, so the cell
# cannot be re-run without re-running the cell that builds the list.
test_df = pd.concat(test_df, ignore_index=True)
test_df

Unnamed: 0,ego_id,u,v,x1,t,not_frens,u_time_0,v_time_0,u_t_sum,v_t_sum,...,v_x2_sum,u_x2_max,v_x2_max,SC_x2,x3,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
0,8,20,19,3.839089e-04,186.699997,False,0.000000,0.000000,0.041465,0.021312,...,7.000000,1.000000,1.000000,4.000000,1.0,12.0,7.0,1.0,1.0,4.0
1,8,131,125,4.034464e-01,162.399994,False,199.699997,0.000000,0.023270,0.053286,...,4.098612,1.693147,2.098612,4.553259,1.0,6.0,3.0,1.0,1.0,2.0
2,8,73,56,8.554643e-05,128.000000,False,0.000000,0.000000,0.080499,0.183724,...,12.000000,1.000000,1.000000,4.000000,1.0,11.0,12.0,2.0,1.0,5.0
3,8,0,4,2.886418e-01,595.500000,False,0.000000,0.000000,2.899330,0.000000,...,6.000000,3.564949,1.000000,0.000000,1.0,122.0,6.0,1.0,1.0,0.0
4,8,63,73,4.281692e-07,128.000000,False,0.000000,0.000000,0.184636,0.080499,...,10.000000,1.000000,1.000000,3.000000,1.0,10.0,11.0,2.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,8,132,17,1.826740e+00,25.100000,False,0.000000,169.800003,0.119841,0.381358,...,30.313343,2.791759,6.493062,0.000000,1.0,2.0,20.0,1.0,1.0,0.0
1014,8,29,14,,347.899994,False,467.000000,458.399994,0.021420,0.022891,...,10.000000,1.000000,1.000000,3.000000,1.0,6.0,10.0,1.0,1.0,3.0
1015,8,56,59,,81.000000,False,0.000000,0.000000,0.183724,0.172751,...,12.772589,1.000000,2.386294,6.386294,1.0,12.0,10.0,1.0,1.0,5.0
1016,8,14,11,,301.100006,False,458.399994,466.500000,0.022891,0.021768,...,10.000000,1.000000,1.000000,5.000000,1.0,10.0,10.0,1.0,1.0,5.0


In [11]:
# Test rows that come with a known target value are used as the validation set.
val_df = test_df.loc[test_df['x1'].notna()].copy()
val_df

Unnamed: 0,ego_id,u,v,x1,t,not_frens,u_time_0,v_time_0,u_t_sum,v_t_sum,...,v_x2_sum,u_x2_max,v_x2_max,SC_x2,x3,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
0,8,20,19,3.839089e-04,186.699997,False,0.000000,0.000000,0.041465,0.021312,...,7.000000,1.000000,1.000000,4.000000,1.0,12.0,7.0,1.0,1.0,4.0
1,8,131,125,4.034464e-01,162.399994,False,199.699997,0.000000,0.023270,0.053286,...,4.098612,1.693147,2.098612,4.553259,1.0,6.0,3.0,1.0,1.0,2.0
2,8,73,56,8.554643e-05,128.000000,False,0.000000,0.000000,0.080499,0.183724,...,12.000000,1.000000,1.000000,4.000000,1.0,11.0,12.0,2.0,1.0,5.0
3,8,0,4,2.886418e-01,595.500000,False,0.000000,0.000000,2.899330,0.000000,...,6.000000,3.564949,1.000000,0.000000,1.0,122.0,6.0,1.0,1.0,0.0
4,8,63,73,4.281692e-07,128.000000,False,0.000000,0.000000,0.184636,0.080499,...,10.000000,1.000000,1.000000,3.000000,1.0,10.0,11.0,2.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,8,140,113,6.380812e-06,423.399994,False,423.299988,0.000000,0.007242,0.008715,...,3.000000,1.000000,1.000000,0.000000,1.0,3.0,3.0,1.0,1.0,0.0
1010,8,104,91,5.176676e-01,464.299988,False,172.899994,366.500000,1.007937,0.018146,...,6.000000,1.000000,1.000000,1.000000,1.0,3.0,6.0,1.0,1.0,1.0
1011,8,121,41,2.224251e-02,318.299988,False,0.000000,0.000000,0.022978,0.017184,...,10.693148,1.000000,1.693147,1.000000,1.0,7.0,12.0,1.0,2.0,2.0
1012,8,91,13,6.132724e-15,462.899994,False,366.500000,0.000000,0.018146,0.028162,...,12.000000,1.000000,1.000000,0.000000,1.0,6.0,12.0,1.0,1.0,0.0


In [12]:
# Collect the (ego_id, u, v) triples listed in the submission file into a set
# for fast membership checks.
sub = pd.read_csv('./data/submission.csv')
sub_values = set(zip(sub.ego_id, sub.u, sub.v))

In [13]:
# One boolean per test row: True when its (ego_id, u, v) triple is in the submission set.
mask = [
    triple in sub_values
    for triple in zip(test_df.ego_id, test_df.u, test_df.v)
]

In [14]:
# Keep only the rows requested by the submission, one row per (ego_id, u, v).
test_df = (
    test_df.loc[mask]
    .drop_duplicates(subset=['ego_id', 'u', 'v'])
    .copy()
)
test_df

Unnamed: 0,ego_id,u,v,x1,t,not_frens,u_time_0,v_time_0,u_t_sum,v_t_sum,...,v_x2_sum,u_x2_max,v_x2_max,SC_x2,x3,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
120,8,7,16,,423.700012,False,0.0,0.0,0.026285,0.0,...,8.0,1.0,1.0,3.0,1.0,12.0,8.0,1.0,1.0,3.0
141,8,8,29,,0.0,True,0.0,467.0,0.009491,0.02142,...,6.0,1.0,1.0,5.0,1.0,13.0,6.0,1.0,1.0,5.0
142,8,75,0,,478.600006,False,478.600006,0.0,0.014327,2.89933,...,130.428589,1.693147,3.564949,3.693147,1.0,6.0,122.0,1.0,1.0,3.0
183,8,0,151,,1.2,False,0.0,1.2,2.89933,1.051628,...,13.328451,3.564949,5.234107,3.609438,1.0,122.0,5.0,1.0,1.0,2.0
297,8,8,20,,292.200012,False,0.0,0.0,0.009491,0.041465,...,12.0,1.0,1.0,4.0,1.0,13.0,12.0,1.0,1.0,4.0
302,8,152,65,,191.5,False,141.5,180.600006,0.206173,0.230897,...,22.590302,1.0,3.564949,10.025352,1.0,16.0,16.0,2.0,1.0,7.0
317,8,5,4,,462.5,False,0.0,0.0,0.022237,0.0,...,6.0,1.0,1.0,3.0,1.0,9.0,6.0,1.0,1.0,3.0
347,8,55,0,,121.699997,False,121.699997,0.0,0.082238,2.89933,...,130.428589,1.0,3.564949,7.0,1.0,15.0,122.0,2.0,1.0,9.0
456,8,46,110,,51.400002,False,0.0,315.5,0.045096,0.008445,...,4.0,1.0,1.0,1.0,1.0,6.0,7.0,1.0,2.0,2.0
461,8,74,159,,8.6,False,0.0,8.2,0.465849,0.318875,...,4.098612,1.0,2.098612,0.0,1.0,13.0,4.0,1.0,2.0,0.0


In [15]:
# The raw edge tables are no longer referenced below; drop the names so the memory can be reclaimed.
del train, test

In [16]:
# Run a garbage-collection pass now that the raw frames have been deleted.
import gc
gc.collect()

0

In [17]:
# Compare the mean target value of the training rows with that of the validation rows.
train_df['x1'].mean(), val_df['x1'].mean()

(0.666723, 0.43647254)

In [18]:
from catboost import CatBoostRegressor, Pool

# Features are every column except the identifiers and the target `x1`.
train_pool = Pool(
    data=train_df.drop(columns=['ego_id', 'u', 'v', 'x1']),
    label=train_df['x1'],
)

val_pool = Pool(
    data=val_df.drop(columns=['ego_id', 'u', 'v', 'x1']),
    label=val_df['x1'],
)

  from pandas import MultiIndex, Int64Index



In [19]:
# Hyper-parameters passed to CatBoostRegressor.
params = dict(
    task_type='CPU',
    loss_function='RMSE',
    iterations=300,
    max_depth=5,
)

In [20]:
# Fit the regressor on the training pool with a fixed seed, evaluating on the
# validation pool (passed as eval_set) with use_best_model=True.
model_cb = CatBoostRegressor(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x14941edcee0>

In [21]:
# Score shown for the run: 1 minus the lowest validation RMSE recorded during training.
1 - np.min(model_cb.evals_result_['validation']['RMSE'])

0.33847185870160146

In [22]:
# Predict on exactly the feature columns the model was trained on: the
# identifiers and the (all-NaN) target column are dropped, as was done for
# the training and validation pools.
x1_pred = model_cb.predict(test_df.drop(columns=['ego_id', 'u', 'v', 'x1']))
# Clamp predictions to the range [0, 50].
x1_pred = np.clip(x1_pred, 0, 50)
test_df['x1'] = x1_pred

In [23]:
# Right-join the predictions onto the submission keys so the output has one
# row per submission triple, then write it without the index.
key_cols = ['ego_id', 'u', 'v']
test_df = test_df[['ego_id', 'u', 'v', 'x1']].merge(
    sub[['ego_id', 'u', 'v']],
    on=key_cols,
    how='right',
)
test_df.to_csv('./x2,x3,t_submit.csv', index=False)