In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
class DiGraph:
    def __init__(self) -> None:
        self.max_v = -1
        self.edges = []

    def add_edge(self, u: int, v: int, x: float) -> None:
        self.max_v = max(self.max_v, max(u, v))
        self.edges.append((u, v, x))
        
    def get_n(self) -> int:
        return self.max_v + 1

    def get_m(self) -> int:
        return len(self.edges)

    def to_array(self) -> np.array:
        g = np.zeros((self.max_v + 1, self.max_v + 1), np.float32)
        for edge in self.edges:
            g[edge[0]][edge[1]] = 1 + edge[2]
        return g

In [3]:
def read_ego_net(ego_net_path):
    cur_ego_id = -1
    cur_ego_net = None
    cur_ego_net_x1 = None
    cur_ego_net_x2 = None
    cur_ego_net_x3 = None
    cur_ego_net_time = None
    with open(ego_net_path, 'r') as ego_net_f:
        ego_net_f.readline()
        for line in ego_net_f:
            line = line.split(',')
            ego_id, u, v = int(line[0]), int(line[1]), int(line[2])
            x2, x3 = float(line[5]), float(line[6])

            if ego_id != cur_ego_id:
                if cur_ego_id != -1:
                    yield cur_ego_id, cur_ego_net, cur_ego_net_x1, cur_ego_net_x2, cur_ego_net_x3, cur_ego_net_time
                assert cur_ego_id < ego_id
                cur_ego_id = ego_id
                cur_ego_net = DiGraph()
                cur_ego_net_x1 = DiGraph()
                cur_ego_net_x2 = DiGraph()
                cur_ego_net_x3 = DiGraph()
                cur_ego_net_time = DiGraph()
            
            cur_ego_net.add_edge(u, v, 0)
            if line[4] == "":
                cur_ego_net_x1.add_edge(u, v, -1)
            else:
                cur_ego_net_x1.add_edge(u, v, float(line[4]))
            cur_ego_net_x2.add_edge(u, v, x2)
            cur_ego_net_x3.add_edge(u, v, x3)
            if line[3] == "":
                cur_ego_net_time.add_edge(u, v, -1)
            else:
                cur_ego_net_time.add_edge(u, v, float(line[3]))

        if cur_ego_id != -1:
            yield cur_ego_id, cur_ego_net, cur_ego_net_x1, cur_ego_net_x2, cur_ego_net_x3, cur_ego_net_time

In [4]:
train = pd.read_csv(
    './data/train.csv', index_col='ego_id', usecols=['ego_id', 'u', 'v', 'x1'], 
    dtype={'ego_id': 'int64', 'u': 'int32', 'v': 'int32', 'x1': 'float32'}
)
train

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,131,84,5.669200e-07
0,135,164,6.246274e-02
0,47,15,0.000000e+00
0,5,4,4.962974e-02
0,176,219,1.237935e+00
...,...,...,...
1709396984692,3,5,2.307750e+00
1709396984692,1,5,3.729143e+00
1709396984692,1,7,4.286984e+00
1709396984692,5,11,3.500757e+00


In [5]:
test = pd.read_csv(
    './data/test.csv', index_col='ego_id', usecols=['ego_id', 'u', 'v', 'x1'], 
    dtype={'ego_id': 'int64', 'u': 'int32', 'v': 'int32', 'x1': 'float32'}
)
test

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,20,19,3.839089e-04
8,131,125,4.034464e-01
8,73,56,8.554643e-05
8,0,4,2.886418e-01
8,63,73,4.281692e-07
...,...,...,...
1709396984676,89,0,1.167843e+00
1709396984676,84,87,1.179100e-06
1709396984676,8,18,1.175182e+00
1709396984676,33,20,5.511019e-01


In [6]:
attr_df = pd.read_csv('./data/attr.csv').set_index('ego_id')
attr_df

Unnamed: 0_level_0,u,age,city_id,sex,school,university
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,227,68,-1,1,778293348,-1
0,45,38,237065842,1,82803468,238500268
0,142,60,237065842,1,196560139,-1
0,280,66,-1,2,963209731,720783270
0,41,18,-1,2,308862409,-1
...,...,...,...,...,...,...
1709396984692,2,16,492149712,2,769209871,-1
1709396984692,12,15,-1,1,-1,-1
1709396984692,18,23,-1,1,-1,-1
1709396984692,4,16,650683235,1,-1,-1


In [7]:
def build_X(X, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time):    
    vertex_cnt = ego_net.get_n()
    edge_cnt = ego_net.get_m()
    
    X['vertex_cnt'] = vertex_cnt
    X['edge_cnt'] = edge_cnt
    X['edge-vertex_cnt'] = edge_cnt - vertex_cnt
    X['density'] = 2 * edge_cnt / vertex_cnt / (vertex_cnt - 1)
    
    g = ego_net.to_array()
    degree = g.sum(axis=1)
    
    X['u_neighbour_cnt'] = degree[X.u]
    X['v_neighbour_cnt'] = degree[X.v]
    
    sc = g.dot(g.T).flatten()
    X['SC'] = sc[X.u * vertex_cnt + X.v]
    
    degree = 1 + degree.reshape((-1, 1))
        
    aa = (g / degree).dot(g.T).flatten()
    X['AA'] = aa[X.u * vertex_cnt + X.v]
    aa = (g / (1 + np.sqrt(degree))).dot(g.T).flatten()
    X['AA_sqrt'] = aa[X.u * vertex_cnt + X.v]
    aa = (g / (1 + np.log(degree))).dot(g.T).flatten()
    X['AA_log'] = aa[X.u * vertex_cnt + X.v]
        
    g = ego_net_x1.to_array()
    
    X['r_x1'] = g[X.v, X.u]
    
    g = ego_net_time.to_array()
    
    X['t'] = g[X.u, X.v]
    X['r_t'] = g[X.v, X.u]
    
    X['u_time_0'] = g[X.u, 0]
    X['v_time_0'] = g[X.v, 0]
    
    inv = g.copy()
    inv[inv > 0] = 1 / inv[inv > 0]
    inv_sum = inv.sum(axis=1)
    X['u_t_sum'] = inv_sum[X.u]
    X['v_t_sum'] = inv_sum[X.v]
    
    t_min = g.copy()
    t_min[t_min == 0] = t_min.max()
    X['min_time'] = t_min.min()
    t_min = t_min.min(axis=1)
    X['u_t_min'] = t_min[X.u]
    X['v_t_min'] = t_min[X.v]
    
    sc = inv.dot(inv.T).flatten()
    X['SC_time'] = sc[X.u * vertex_cnt + X.v]
    
    g = ego_net_x2.to_array()
    
    X['x2'] = g[X.u, X.v]
    X['r_x2'] = g[X.v, X.u]
    
    x2_sum = g.sum(axis=1)
    X['u_x2_sum'] = x2_sum[X.u]
    X['v_x2_sum'] = x2_sum[X.v]
    
    x2_max = g.max(axis=1)
    X['u_x2_max'] = x2_max[X.u]
    X['v_x2_max'] = x2_max[X.v]
    
    sc = g.dot(g.T).flatten()
    X['SC_x2'] = sc[X.u * vertex_cnt + X.v]
    
    g = ego_net_x3.to_array()
    
    X['x3'] = g[X.u, X.v]
    X['r_x3'] = g[X.v, X.u]
    
    x3_sum = g.sum(axis=1)
    X['u_x3_sum'] = x3_sum[X.u]
    X['v_x3_sum'] = x3_sum[X.v]
    
    x3_max = g.max(axis=1)
    X['u_x3_max'] = x3_max[X.u]
    X['v_x3_max'] = x3_max[X.v]
    
    sc = g.dot(g.T).flatten()
    X['SC_x3'] = sc[X.u * vertex_cnt + X.v]

In [8]:
def add_attr(X, ego_id, ego_net):
    cur_attr = attr_df.loc[ego_id].reset_index()
    vertex_cnt = ego_net.get_n()
    g = ego_net.to_array()
        
    age = np.full(300, -1, dtype=np.int32)
    age[cur_attr.u] = cur_attr.age
    age = age[:vertex_cnt]
    
    X['0_age'] = age[0]
    X['u_age'] = age[X.u]
    X['v_age'] = age[X.v]
    X['age_diff'] = np.abs(X['u_age'] - X['v_age'])
    X.loc[(X.u_age == -1) | (X.v_age == -1), 'age_diff'] = -1
    
    age = g * age
    mask = (age > 0).sum(axis=1)
    mask[mask==0] = 1
    X['u_mean_age'] = age.sum(axis=1)[X.u] / mask[X.u]
    X['v_mean_age'] = age.sum(axis=1)[X.v] / mask[X.v]
    
    sex = np.full(300, -1, dtype=np.int8)
    sex[cur_attr.u] = cur_attr.sex
    
    X['0_sex'] = sex[0]
    X['u_sex'] = sex[X.u]
    X['v_sex'] = sex[X.v]
    
    city = np.full(300, -1, dtype=np.int32)
    city[cur_attr.u] = cur_attr.city_id
    X['is_city_eq'] = (city[X.u] == city[X.v]) & (city[X.u] != -1)
    
    school = np.full(300, -1, dtype=np.int32)
    school[cur_attr.u] = cur_attr.school
    X['is_school_eq'] = (school[X.u] == school[X.v]) & (school[X.u] != -1)
    
    university = np.full(300, -1, dtype=np.int32)
    university[cur_attr.u] = cur_attr.university
    X['is_university_eq'] = (university[X.u] == university[X.v]) & (university[X.u] != -1)

In [9]:
train_df = []
for ego_id, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time in \
tqdm(read_ego_net('./data/train.csv'), total=61786):
    X = train.loc[ego_id].reset_index()
    build_X(X, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time)
    add_attr(X, ego_id, ego_net)
    train_df.append(X)

  0%|          | 0/61786 [00:00<?, ?it/s]

In [10]:
train_df = pd.concat(train_df)
train_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,edge_cnt,edge-vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
0,0,131,84,5.669200e-07,300,1642,1342,0.036611,7.0,10.0,...,50,12,39.285714,56.700000,1,1,1,True,False,False
1,0,135,164,6.246274e-02,300,1642,1342,0.036611,4.0,5.0,...,92,47,48.250000,37.800000,1,2,1,False,False,True
2,0,47,15,0.000000e+00,300,1642,1342,0.036611,3.0,3.0,...,42,1,41.000000,41.000000,1,2,2,False,False,False
3,0,5,4,4.962974e-02,300,1642,1342,0.036611,24.0,10.0,...,40,9,41.318182,41.800000,1,1,2,False,False,False
4,0,176,219,1.237935e+00,300,1642,1342,0.036611,15.0,18.0,...,21,0,26.933333,22.777778,1,1,1,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,0,253,158,1.497307e-01,300,1642,1342,0.036611,4.0,4.0,...,40,26,37.000000,34.250000,1,1,1,True,False,False
1638,0,259,244,1.827714e+00,300,1642,1342,0.036611,10.0,11.0,...,25,4,35.000000,32.181818,1,1,2,False,False,False
1639,0,209,127,6.636844e-02,300,1642,1342,0.036611,19.0,21.0,...,21,1,25.526316,24.250000,1,2,1,True,False,True
1640,0,187,241,3.756446e+00,300,1642,1342,0.036611,10.0,18.0,...,21,0,21.600000,23.176471,1,1,1,False,False,False


In [11]:
test_df = []
for ego_id, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time in \
tqdm(read_ego_net('./data/test.csv'), total=20596):
    X = test.loc[ego_id].reset_index()
    build_X(X, ego_net, ego_net_x1, ego_net_x2, ego_net_x3, ego_net_time)
    add_attr(X, ego_id, ego_net)
    test_df.append(X)

  0%|          | 0/20596 [00:00<?, ?it/s]

In [12]:
test_df = pd.concat(test_df)
test_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,edge_cnt,edge-vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
0,8,20,19,3.839089e-04,194,1018,824,0.054377,12.0,7.0,...,37,0,36.500000,50.333333,2,2,2,True,False,False
1,8,131,125,4.034464e-01,194,1018,824,0.054377,6.0,3.0,...,38,3,33.500000,34.666667,2,2,2,False,False,False
2,8,73,56,8.554643e-05,194,1018,824,0.054377,10.0,12.0,...,36,22,36.222222,36.600000,2,2,2,False,False,False
3,8,0,4,2.886418e-01,194,1018,824,0.054377,122.0,6.0,...,37,1,37.557895,47.666667,2,2,1,False,True,False
4,8,63,73,4.281692e-07,194,1018,824,0.054377,8.0,10.0,...,58,19,48.857143,36.222222,2,2,2,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,8,132,17,1.826740e+00,194,1018,824,0.054377,2.0,20.0,...,36,2,37.500000,35.789474,2,2,2,True,False,False
1014,8,29,14,,194,1018,824,0.054377,6.0,10.0,...,36,-1,52.800000,51.200000,2,-1,1,False,False,False
1015,8,56,59,,194,1018,824,0.054377,12.0,10.0,...,36,0,36.600000,46.400000,2,2,2,False,False,False
1016,8,14,11,,194,1018,824,0.054377,10.0,10.0,...,18,18,51.200000,44.800000,2,1,1,True,False,False


In [13]:
val_df = test_df[test_df.x1.notna()].copy()
val_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,edge_cnt,edge-vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
0,8,20,19,3.839089e-04,194,1018,824,0.054377,12.0,7.0,...,37,0,36.500000,50.333333,2,2,2,True,False,False
1,8,131,125,4.034464e-01,194,1018,824,0.054377,6.0,3.0,...,38,3,33.500000,34.666667,2,2,2,False,False,False
2,8,73,56,8.554643e-05,194,1018,824,0.054377,10.0,12.0,...,36,22,36.222222,36.600000,2,2,2,False,False,False
3,8,0,4,2.886418e-01,194,1018,824,0.054377,122.0,6.0,...,37,1,37.557895,47.666667,2,2,1,False,True,False
4,8,63,73,4.281692e-07,194,1018,824,0.054377,8.0,10.0,...,58,19,48.857143,36.222222,2,2,2,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,8,140,113,6.380812e-06,194,1018,824,0.054377,3.0,3.0,...,36,-1,37.666667,34.500000,2,-1,2,False,False,False
1010,8,104,91,5.176676e-01,194,1018,824,0.054377,3.0,6.0,...,36,1,36.000000,33.750000,2,2,2,False,False,True
1011,8,121,41,2.224251e-02,194,1018,824,0.054377,7.0,10.0,...,41,7,38.428571,38.333333,2,1,1,False,False,False
1012,8,91,13,6.132724e-15,194,1018,824,0.054377,6.0,12.0,...,36,0,33.750000,50.583333,2,2,2,False,False,False


In [14]:
sub = pd.read_csv('./data/submission.csv')
sub_values = set()
for ego_id, u, v in zip(sub.ego_id, sub.u, sub.v):
    sub_values.add((ego_id, u, v))

In [15]:
mask = []
for ego_id, u, v in zip(test_df.ego_id, test_df.u, test_df.v):
    mask.append((ego_id, u, v) in sub_values)

In [16]:
test_df = test_df[mask].drop_duplicates(subset=['ego_id', 'u', 'v']).copy()
test_df

Unnamed: 0,ego_id,u,v,x1,vertex_cnt,edge_cnt,edge-vertex_cnt,density,u_neighbour_cnt,v_neighbour_cnt,...,v_age,age_diff,u_mean_age,v_mean_age,0_sex,u_sex,v_sex,is_city_eq,is_school_eq,is_university_eq
120,8,7,16,,194,1018,824,0.054377,12.0,8.0,...,36,1,54.666667,36.375,2,1,1,False,False,False
141,8,8,29,,194,1018,824,0.054377,13.0,6.0,...,-1,-1,43.727273,52.8,2,1,-1,False,False,False
142,8,75,0,,194,1018,824,0.054377,6.0,122.0,...,36,1,35.75,37.557895,2,1,2,True,False,True
183,8,0,151,,194,1018,824,0.054377,122.0,5.0,...,18,18,37.557895,38.6,2,2,2,True,False,False
297,8,8,20,,194,1018,824,0.054377,13.0,12.0,...,37,84,43.727273,36.5,2,1,2,False,False,False
302,8,152,65,,194,1018,824,0.054377,15.0,16.0,...,46,22,36.0,38.866667,2,2,2,True,False,False
317,8,5,4,,194,1018,824,0.054377,9.0,6.0,...,37,0,33.875,47.666667,2,1,1,False,False,False
347,8,55,0,,194,1018,824,0.054377,11.0,122.0,...,36,82,38.444444,37.557895,2,2,2,False,False,False
456,8,46,110,,194,1018,824,0.054377,6.0,4.0,...,32,24,38.666667,33.666667,2,2,1,True,False,True
461,8,74,159,,194,1018,824,0.054377,13.0,3.0,...,18,23,37.545455,31.0,2,2,2,False,False,False


In [17]:
del train, test

In [18]:
import gc
gc.collect()

0

In [19]:
train_df['x1'].mean(), val_df['x1'].mean()

(0.666723, 0.43647254)

In [20]:
from catboost import CatBoostRegressor, Pool

train_pool = Pool(
    data=train_df.drop(['ego_id', 'u', 'v', 'x1'], axis=1),
    label=train_df['x1']
)

val_pool = Pool(
    data=val_df.drop(['ego_id', 'u', 'v', 'x1'], axis=1),
    label=val_df['x1']
)

  from pandas import MultiIndex, Int64Index



In [21]:
params = {
    'task_type': 'CPU',
    'loss_function': 'RMSE',
    'iterations': 300,
    'max_depth': 5,
}

In [22]:
model_cb = CatBoostRegressor(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x19cc67faf10>

In [23]:
model_cb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,x2,58.101831
1,r_x1,9.028127
2,t,5.680937
3,v_x2_max,2.850402
4,r_t,2.097052
5,v_mean_age,1.864781
6,SC_time,1.71236
7,v_age,1.657277
8,v_time_0,1.539702
9,r_x2,1.402195


In [24]:
1 - np.min(model_cb.evals_result_['validation']['RMSE'])

0.3556318460448743

In [25]:
x1_pred = model_cb.predict(test_df)
x1_pred[x1_pred<0] = 0
x1_pred[x1_pred>50] = 50
test_df['x1'] = x1_pred

In [26]:
test_df = pd.merge(test_df[['ego_id', 'u', 'v', 'x1']], sub[['ego_id', 'u', 'v']], on = ['ego_id', 'u', 'v'], how='right')
test_df.to_csv('./x1,x2,x3,t,attr_submit.csv', index=False)