In [26]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [27]:
class DiGraph:
    def __init__(self) -> None:
        self.max_v = -1
        self.edges = []

    def add_edge(self, u: int, v: int, x: float) -> None:
        self.max_v = max(self.max_v, max(u, v))
        self.edges.append((u, v, x))
        
    def get_n(self) -> np.int32:
        return np.int32(self.max_v + 1)

    def get_m(self) -> np.int32:
        return np.int32(len(self.edges))

    def to_array(self) -> np.array:
        g = np.zeros((self.max_v + 1, self.max_v + 1), np.float32)
        for edge in self.edges:
            g[edge[0]][edge[1]] = 1 + edge[2]
        return g

In [28]:
def read_ego_net(ego_net_path):
    cur_ego_id = -1
    cur_ego_net = None
    cur_ego_net_x1 = None
    cur_ego_net_x2 = None
    cur_ego_net_x3 = None
    cur_ego_net_time = None
    cur_ego_net_f = None
    with open(ego_net_path, 'r') as ego_net_f:
        ego_net_f.readline()
        for line in ego_net_f:
            line = line.split(',')
            ego_id, u, v = int(line[0]), int(line[1]), int(line[2])
            x2, x3 = float(line[5]), float(line[6])

            if ego_id != cur_ego_id:
                if cur_ego_id != -1:
                    yield cur_ego_id, cur_ego_net, cur_ego_net_x1, cur_ego_net_x2, cur_ego_net_x3, cur_ego_net_time, cur_ego_net_f
                assert cur_ego_id < ego_id
                cur_ego_id = ego_id
                cur_ego_net = DiGraph()
                cur_ego_net_x1 = DiGraph()
                cur_ego_net_x2 = DiGraph()
                cur_ego_net_x3 = DiGraph()
                cur_ego_net_time = DiGraph()
                cur_ego_net_f = DiGraph()
            
            cur_ego_net.add_edge(u, v, 0)
            if line[4] == "":
                cur_ego_net_x1.add_edge(u, v, -1)
            else:
                cur_ego_net_x1.add_edge(u, v, float(line[4]))
            cur_ego_net_x2.add_edge(u, v, x2)
            cur_ego_net_x3.add_edge(u, v, x3)
            if line[3] == "":
                cur_ego_net_time.add_edge(u, v, t := -1)
            else:
                cur_ego_net_time.add_edge(u, v, t := float(line[3]))
                
            p = max(1, np.log10(max(0, t) + 1) - 2)                  
            cur_ego_net_f.add_edge(
                u, v, 
                np.log((1 + 2 * x2 / p) * (1 + 3 * x3 / p) * (1 + max(x2, x3) / p))
            )

        if cur_ego_id != -1:
            yield cur_ego_id, cur_ego_net, cur_ego_net_x1, cur_ego_net_x2, cur_ego_net_x3, cur_ego_net_time, cur_ego_net_f

In [29]:
train = pd.read_csv(
    './data/train.csv', index_col='ego_id', usecols=['ego_id', 'u', 'v', 'x1'], 
    dtype={'ego_id': 'int64', 'u': 'int32', 'v': 'int32', 'x1': 'float32'}
)
train

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,131,84,5.669200e-07
0,135,164,6.246274e-02
0,47,15,0.000000e+00
0,5,4,4.962974e-02
0,176,219,1.237935e+00
...,...,...,...
1709396984692,3,5,2.307750e+00
1709396984692,1,5,3.729143e+00
1709396984692,1,7,4.286984e+00
1709396984692,5,11,3.500757e+00


In [30]:
test = pd.read_csv(
    './data/test.csv', index_col='ego_id', usecols=['ego_id', 'u', 'v', 'x1'], 
    dtype={'ego_id': 'int64', 'u': 'int32', 'v': 'int32', 'x1': 'float32'}
)
test

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,20,19,3.839089e-04
8,131,125,4.034464e-01
8,73,56,8.554643e-05
8,0,4,2.886418e-01
8,63,73,4.281692e-07
...,...,...,...
1709396984676,89,0,1.167843e+00
1709396984676,84,87,1.179100e-06
1709396984676,8,18,1.175182e+00
1709396984676,33,20,5.511019e-01


In [31]:
attr_df = pd.read_csv('./data/attr.csv').set_index('ego_id')
attr_df

Unnamed: 0_level_0,u,age,city_id,sex,school,university
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,227,68,-1,1,778293348,-1
0,45,38,237065842,1,82803468,238500268
0,142,60,237065842,1,196560139,-1
0,280,66,-1,2,963209731,720783270
0,41,18,-1,2,308862409,-1
...,...,...,...,...,...,...
1709396984692,2,16,492149712,2,769209871,-1
1709396984692,12,15,-1,1,-1,-1
1709396984692,18,23,-1,1,-1,-1
1709396984692,4,16,650683235,1,-1,-1


In [32]:
def build_X(X, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time, cur_ego_net_f):    
    vertex_cnt = ego_net.get_n()
    edge_cnt = ego_net.get_m()
    
    X['vertex_cnt'] = vertex_cnt
    X['density'] = 2 * edge_cnt / vertex_cnt / (vertex_cnt - 1)
    
    g = ego_net.to_array()
    degree = g.sum(axis=1)
    
    X['u_neighbour_cnt'] = degree[X.u]
    X['v_neighbour_cnt'] = degree[X.v]
    
    sc = g.dot(g.T).flatten()
    X['SC'] = sc[X.u * vertex_cnt + X.v]
    
    degree = 1 + degree.reshape((-1, 1))
        
    aa = (g / degree).dot(g.T).flatten()
    X['AA'] = aa[X.u * vertex_cnt + X.v]
        
    g = ego_net_x1.to_array()
    
    X['r_x1'] = g[X.v, X.u]
    
    g = cur_ego_net_f.to_array()
    
    degree = 1 + (g >= 1).sum(axis=1).reshape((-1, 1))
    aa = (g / (1 + np.log(degree))).dot(g.T).flatten()
    X['AA_f_log'] = aa[X.u * vertex_cnt + X.v]
    
    g += g.T
    degree = 1 + (g >= 1).sum(axis=1).reshape((-1, 1))
    aa = (g / (1 + np.log(degree))).dot(g.T).flatten()
    X['AA_f_t_log'] = aa[X.u * vertex_cnt + X.v]
    
    g = ego_net_time.to_array()
    
    X['t'] = g[X.u, X.v]
    X['r_t'] = g[X.v, X.u]
    
    X['u_time_0'] = g[X.u, 0]
    X['v_time_0'] = g[X.v, 0]
    
    inv = g.copy()
    inv[inv > 0] = 1 / inv[inv > 0]
    inv_sum = inv.sum(axis=1)
    X['u_t_sum'] = inv_sum[X.u]
    X['v_t_sum'] = inv_sum[X.v]
    
    t_min = g.copy()
    t_min[t_min == 0] = t_min.max()
    X['min_time'] = t_min.min()
    t_min = t_min.min(axis=1)
    X['u_t_min'] = t_min[X.u]
    X['v_t_min'] = t_min[X.v]
    
    sc = inv.dot(inv.T).flatten()
    X['SC_time'] = sc[X.u * vertex_cnt + X.v]
    
    g = ego_net_x2.to_array()
    
    X['x2'] = g[X.u, X.v]
    X['r_x2'] = g[X.v, X.u]
    
    x2_sum = g.sum(axis=1)
    X['u_x2_sum'] = x2_sum[X.u]
    X['v_x2_sum'] = x2_sum[X.v]
    
    x2_max = g.max(axis=1)
    X['u_x2_max'] = x2_max[X.u]
    X['v_x2_max'] = x2_max[X.v]
    
    sc = g.dot(g.T).flatten()
    X['SC_x2'] = sc[X.u * vertex_cnt + X.v]
    X['SC_x2_mean'] = X['SC_x2'] / X['SC']
    X.loc[X['SC'] == 0, 'SC_x2_mean'] = 0
    
    g = ego_net_x3.to_array()
    
    X['x3'] = g[X.u, X.v]
    X['r_x3'] = g[X.v, X.u]
    
    x3_sum = g.sum(axis=1)
    X['u_x3_sum'] = x3_sum[X.u]
    X['v_x3_sum'] = x3_sum[X.v]
    
    x3_max = g.max(axis=1)
    X['u_x3_max'] = x3_max[X.u]
    X['v_x3_max'] = x3_max[X.v]
    
    sc = g.dot(g.T).flatten()
    X['SC_x3'] = sc[X.u * vertex_cnt + X.v]

In [33]:
def add_attr(X, ego_id, ego_net):
    cur_attr = attr_df.loc[ego_id].reset_index()
    vertex_cnt = ego_net.get_n()
    g = ego_net.to_array()
        
    age = np.full(300, -1, dtype=np.int32)
    age[cur_attr.u] = cur_attr.age
    age = age[:vertex_cnt]
    
    X['0_age'] = age[0]
    X['u_age'] = age[X.u]
    X['v_age'] = age[X.v]
    X['age_diff'] = np.abs(X['u_age'] - X['v_age'])
    X.loc[(X.u_age == -1) | (X.v_age == -1), 'age_diff'] = -1
    
    age = g * age
    mask = (age > 0).sum(axis=1)
    mask[mask==0] = 1
    X['u_mean_age'] = age.sum(axis=1)[X.u] / mask[X.u]
    X['v_mean_age'] = age.sum(axis=1)[X.v] / mask[X.v]
    
    sex = np.full(300, -1, dtype=np.int8)
    sex[cur_attr.u] = cur_attr.sex
    
    X['0_sex'] = sex[0]
    X['u_sex'] = sex[X.u]
    X['v_sex'] = sex[X.v]
    
    city = np.full(300, -1, dtype=np.int32)
    city[cur_attr.u] = cur_attr.city_id
    X['is_city_eq'] = (city[X.u] == city[X.v]) & (city[X.u] != -1)
    
    school = np.full(300, -1, dtype=np.int32)
    school[cur_attr.u] = cur_attr.school
    X['is_school_eq'] = (school[X.u] == school[X.v]) & (school[X.u] != -1)
    
    university = np.full(300, -1, dtype=np.int32)
    university[cur_attr.u] = cur_attr.university
    X['is_university_eq'] = (university[X.u] == university[X.v]) & (university[X.u] != -1)

In [34]:
train_df = []
for ego_id, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time, cur_ego_net_f in \
tqdm(read_ego_net('./data/train.csv'), total=61786):
    X = train.loc[ego_id].reset_index()
    build_X(X, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time, cur_ego_net_f)
    add_attr(X, ego_id, ego_net)
    train_df.append(X)

  0%|          | 0/61786 [00:00<?, ?it/s]

In [35]:
train_df = pd.concat(train_df)
train_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,SC,AA,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
0,0,131,84,5.669200e-07,300,0.036611,7.0,10.0,2.0,0.250000,...,50,12,39.285714,56.700000,1,1,1,True,False,False
1,0,135,164,6.246274e-02,300,0.036611,4.0,5.0,2.0,0.400000,...,92,47,48.250000,37.800000,1,2,1,False,False,True
2,0,47,15,0.000000e+00,300,0.036611,3.0,3.0,0.0,0.000000,...,42,1,41.000000,41.000000,1,2,2,False,False,False
3,0,5,4,4.962974e-02,300,0.036611,24.0,10.0,7.0,0.280000,...,40,9,41.318182,41.800000,1,1,2,False,False,False
4,0,176,219,1.237935e+00,300,0.036611,15.0,18.0,6.0,0.375000,...,21,0,26.933333,22.777778,1,1,1,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2474,25769804017,182,157,0.000000e+00,299,0.055644,18.0,27.0,4.0,0.210526,...,-1,-1,35.666667,23.400000,2,-1,-1,False,False,False
2475,25769804017,0,298,3.746179e-07,299,0.055644,177.0,0.0,0.0,0.000000,...,21,1,28.288136,0.000000,2,2,2,False,False,False
2476,25769804017,197,201,3.855797e-03,299,0.055644,4.0,4.0,1.0,0.200000,...,20,0,20.333333,20.000000,2,1,1,False,False,False
2477,25769804017,26,88,2.511534e-09,299,0.055644,25.0,11.0,0.0,0.000000,...,21,-1,28.350000,20.500000,2,-1,2,False,False,False


In [36]:
test_df = []
for ego_id, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time, cur_ego_net_f in \
tqdm(read_ego_net('./data/test.csv'), total=20596):
    X = test.loc[ego_id].reset_index()
    build_X(X, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time, cur_ego_net_f)
    add_attr(X, ego_id, ego_net)
    test_df.append(X)

  0%|          | 0/20596 [00:00<?, ?it/s]

In [37]:
test_df = pd.concat(test_df)
test_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,SC,AA,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
0,8,20,19,3.839089e-04,194,0.054377,12.0,7.0,4.0,0.307692,...,37,0,36.500000,50.333333,2,2,2,True,False,False
1,8,131,125,4.034464e-01,194,0.054377,6.0,3.0,2.0,0.285714,...,38,3,33.500000,34.666667,2,2,2,False,False,False
2,8,73,56,8.554643e-05,194,0.054377,10.0,12.0,4.0,0.363636,...,36,22,36.222222,36.600000,2,2,2,False,False,False
3,8,0,4,2.886418e-01,194,0.054377,122.0,6.0,0.0,0.000000,...,37,1,37.557895,47.666667,2,2,1,False,True,False
4,8,63,73,4.281692e-07,194,0.054377,8.0,10.0,3.0,0.333333,...,58,19,48.857143,36.222222,2,2,2,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238,85899345968,284,42,0.000000e+00,300,0.050011,9.0,10.0,0.0,0.000000,...,42,12,34.666667,42.600000,2,2,1,False,False,False
2239,85899345968,81,76,2.246893e-01,300,0.050011,23.0,15.0,2.0,0.083333,...,43,11,33.904762,46.866667,2,2,1,False,False,False
2240,85899345968,77,180,1.775982e-03,300,0.050011,6.0,6.0,2.0,0.285714,...,21,-1,30.500000,31.666667,2,-1,2,False,False,False
2241,85899345968,132,58,4.258665e-06,300,0.050011,9.0,6.0,2.0,0.200000,...,33,0,36.285714,31.800000,2,2,2,False,False,False


In [38]:
val_df = test_df[test_df.x1.notna()].copy()
val_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,SC,AA,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
0,8,20,19,3.839089e-04,194,0.054377,12.0,7.0,4.0,0.307692,...,37,0,36.500000,50.333333,2,2,2,True,False,False
1,8,131,125,4.034464e-01,194,0.054377,6.0,3.0,2.0,0.285714,...,38,3,33.500000,34.666667,2,2,2,False,False,False
2,8,73,56,8.554643e-05,194,0.054377,10.0,12.0,4.0,0.363636,...,36,22,36.222222,36.600000,2,2,2,False,False,False
3,8,0,4,2.886418e-01,194,0.054377,122.0,6.0,0.0,0.000000,...,37,1,37.557895,47.666667,2,2,1,False,True,False
4,8,63,73,4.281692e-07,194,0.054377,8.0,10.0,3.0,0.333333,...,58,19,48.857143,36.222222,2,2,2,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2237,85899345968,153,201,6.353147e-03,300,0.050011,5.0,3.0,1.0,0.166667,...,31,16,30.800000,31.666667,2,2,2,False,False,False
2238,85899345968,284,42,0.000000e+00,300,0.050011,9.0,10.0,0.0,0.000000,...,42,12,34.666667,42.600000,2,2,1,False,False,False
2239,85899345968,81,76,2.246893e-01,300,0.050011,23.0,15.0,2.0,0.083333,...,43,11,33.904762,46.866667,2,2,1,False,False,False
2240,85899345968,77,180,1.775982e-03,300,0.050011,6.0,6.0,2.0,0.285714,...,21,-1,30.500000,31.666667,2,-1,2,False,False,False


In [39]:
sub = pd.read_csv('./data/submission.csv')
sub_values = set()
for ego_id, u, v in zip(sub.ego_id, sub.u, sub.v):
    sub_values.add((ego_id, u, v))

In [40]:
mask = []
for ego_id, u, v in zip(test_df.ego_id, test_df.u, test_df.v):
    mask.append((ego_id, u, v) in sub_values)

In [41]:
test_df = test_df[mask].drop_duplicates(subset=['ego_id', 'u', 'v']).copy()
test_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,SC,AA,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
120,8,7,16,,194,0.054377,12.0,8.0,3.0,0.230769,...,36,1,54.666667,36.375000,2,1,1,False,False,False
141,8,8,29,,194,0.054377,13.0,6.0,5.0,0.357143,...,-1,-1,43.727273,52.800000,2,1,-1,False,False,False
142,8,75,0,,194,0.054377,6.0,122.0,3.0,0.428571,...,36,1,35.750000,37.557895,2,1,2,True,False,True
183,8,0,151,,194,0.054377,122.0,5.0,2.0,0.016260,...,18,18,37.557895,38.600000,2,2,2,True,False,False
297,8,8,20,,194,0.054377,13.0,12.0,4.0,0.285714,...,37,84,43.727273,36.500000,2,1,2,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1931,85899345968,185,57,,300,0.050011,12.0,13.0,1.0,0.076923,...,29,1,39.500000,31.846154,2,2,2,False,False,False
1937,85899345968,104,49,,300,0.050011,5.0,6.0,0.0,0.000000,...,35,3,32.600000,33.166667,2,1,2,False,False,False
2023,85899345968,240,153,,300,0.050011,3.0,5.0,0.0,0.000000,...,15,13,26.666667,30.800000,2,2,2,False,False,False
2059,85899345968,0,89,,300,0.050011,175.0,3.0,2.0,0.011364,...,77,44,36.361963,39.333333,2,2,2,False,False,False


In [42]:
del train, test

In [43]:
import gc
gc.collect()

0

In [44]:
train_df['x1'].mean(), val_df['x1'].mean()

(0.65888464, 0.7000451)

In [45]:
from catboost import CatBoostRegressor, Pool

train_pool = Pool(
    data=train_df.drop(['ego_id', 'u', 'v', 'x1'], axis=1),
    label=train_df['x1']
)

val_pool = Pool(
    data=val_df.drop(['ego_id', 'u', 'v', 'x1'], axis=1),
    label=val_df['x1']
)

In [46]:
del train_df, val_df
gc.collect()

0

In [47]:
params = {
    'task_type': 'CPU',
    'loss_function': 'RMSE',
    'iterations': 300,
    'max_depth': 5,
}

In [48]:
model_cb = CatBoostRegressor(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x21718e3e8e0>

In [49]:
model_cb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,x2,34.461784
1,t,27.879915
2,r_x1,17.59159
3,u_x2_max,3.430191
4,v_t_min,1.596933
5,r_t,1.228021
6,age_diff,1.182116
7,r_x2,1.181088
8,v_x2_max,1.133256
9,u_age,0.929564


In [50]:
1 - np.min(model_cb.evals_result_['validation']['RMSE'])

0.2780525008299364

In [51]:
x1_pred = model_cb.predict(test_df)
x1_pred[x1_pred<0] = 0
x1_pred[x1_pred>50] = 50
test_df['x1'] = x1_pred

In [52]:
test_df = pd.merge(test_df[['ego_id', 'u', 'v', 'x1']], sub[['ego_id', 'u', 'v']], on = ['ego_id', 'u', 'v'], how='right')
test_df.to_csv('./x1,x2,x3,t,attr_submit.csv', index=False)