In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
class DiGraph:
    """Weighted directed graph stored as a plain edge list.

    The number of vertices is inferred as ``largest vertex id seen + 1``.
    Vertex ids are assumed to be non-negative integers; this is not validated.
    """

    def __init__(self) -> None:
        self.max_v = -1  # largest vertex id seen so far; -1 means no edge was added yet
        self.edges = []  # (u, v, x) tuples in insertion order

    def add_edge(self, u: int, v: int, x: float) -> None:
        """Record the directed edge ``u -> v`` with weight ``x``."""
        self.max_v = max(self.max_v, u, v)
        self.edges.append((u, v, x))

    def get_n(self) -> int:
        """Return the vertex count (0 for a graph without edges)."""
        return self.max_v + 1

    def get_m(self) -> int:
        """Return the number of recorded edges, duplicates included."""
        return len(self.edges)

    def to_array(self) -> np.ndarray:
        """Return the dense ``n x n`` float32 adjacency matrix.

        Entry ``[u, v]`` is ``1 + x`` for an edge ``u -> v`` with weight ``x``
        and ``0`` where no edge exists, so a zero-weight edge is still
        distinguishable from a missing one. If the same ``(u, v)`` pair was
        added several times, the last weight wins.
        """
        g = np.zeros((self.max_v + 1, self.max_v + 1), np.float32)
        for u, v, x in self.edges:
            g[u, v] = 1 + x
        return g

In [7]:
def read_ego_net(ego_net_path):
    """Stream a CSV edge list and yield one pair of graphs per ego network.

    The first line of the file is skipped as a header. Every following line is
    split on commas: columns 0, 1 and 2 are parsed as the integers ``ego_id``,
    ``u`` and ``v``; columns 5 and 6 are parsed as the float weights ``x2`` and
    ``x3``. Rows of one ego network must be contiguous and ego ids must appear
    in strictly increasing order (checked with ``assert``).

    Yields:
        ``(ego_id, graph_x2, graph_x3)`` tuples. Both graphs are ``DiGraph``
        instances holding the same edges, weighted by ``x2`` and ``x3``
        respectively.
    """
    active_id = -1  # sentinel: no ego network has been started yet
    graph_x2 = graph_x3 = None

    with open(ego_net_path, 'r') as src:
        src.readline()  # discard the header row
        for row in src:
            fields = row.split(',')
            ego_id = int(fields[0])
            u = int(fields[1])
            v = int(fields[2])
            x2 = float(fields[5])
            x3 = float(fields[6])

            starts_new_net = ego_id != active_id
            if starts_new_net:
                # Hand out the finished network before opening the next one.
                if active_id != -1:
                    yield active_id, graph_x2, graph_x3
                assert active_id < ego_id
                active_id = ego_id
                graph_x2, graph_x3 = DiGraph(), DiGraph()

            graph_x2.add_edge(u, v, x2)
            graph_x3.add_edge(u, v, x3)

        # Flush the last network; a header-only file yields nothing.
        if active_id != -1:
            yield active_id, graph_x2, graph_x3

In [4]:
# Training edge list indexed by ego_id; only the edge endpoints (u, v) and the
# x1 column are loaded from the file.
train = pd.read_csv(
    './data/train.csv',
    usecols=['ego_id', 'u', 'v', 'x1'],
    index_col='ego_id',
)
train

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,131,84,5.669200e-07
0,135,164,6.246274e-02
0,47,15,0.000000e+00
0,5,4,4.962974e-02
0,176,219,1.237935e+00
...,...,...,...
1709396984692,3,5,2.307750e+00
1709396984692,1,5,3.729143e+00
1709396984692,1,7,4.286984e+00
1709396984692,5,11,3.500757e+00


In [5]:
# Test edge list indexed by ego_id; only the edge endpoints (u, v) and the
# x1 column are loaded from the file.
test = pd.read_csv(
    './data/test.csv',
    usecols=['ego_id', 'u', 'v', 'x1'],
    index_col='ego_id',
)
test

Unnamed: 0_level_0,u,v,x1
ego_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,20,19,3.839089e-04
8,131,125,4.034464e-01
8,73,56,8.554643e-05
8,0,4,2.886418e-01
8,63,73,4.281692e-07
...,...,...,...
1709396984676,89,0,1.167843e+00
1709396984676,84,87,1.179100e-06
1709396984676,8,18,1.175182e+00
1709396984676,33,20,5.511019e-01


In [22]:
def _add_graph_features(X, graph, tag):
    """Append the per-edge features of one weighted ego graph to ``X`` in place.

    ``X`` must have integer columns ``u`` and ``v``; ``graph`` must provide
    ``to_array()`` returning a square adjacency matrix. With ``tag='x2'`` the
    added columns are, in this order:

    * ``x2``                   - matrix entry ``[u, v]`` of the edge itself
    * ``u_x2_sum, v_x2_sum``   - sums of matrix rows ``u`` and ``v``
    * ``u_x2_max, v_x2_max``   - maxima of matrix rows ``u`` and ``v``
    * ``SC_x2``                - dot product of matrix rows ``u`` and ``v``
    """
    g = graph.to_array()
    u = X['u'].to_numpy()
    v = X['v'].to_numpy()

    X[tag] = g[u, v]

    row_sum = g.sum(axis=1)
    X[f'u_{tag}_sum'] = row_sum[u]
    X[f'v_{tag}_sum'] = row_sum[v]

    row_max = g.max(axis=1)
    X[f'u_{tag}_max'] = row_max[u]
    X[f'v_{tag}_max'] = row_max[v]

    # Index the n x n product directly instead of flattening it first, which
    # avoids copying the whole matrix just to read a few entries.
    X[f'SC_{tag}'] = g.dot(g.T)[u, v]


train_df = []
# total only sizes the progress bar; 61786 is hard-coded (presumably the number
# of ego networks in train.csv - TODO confirm).
for ego_id, ego_net_x2, ego_net_x3 in tqdm(read_ego_net('./data/train.csv'), total=61786):
    # Label slice instead of ``train.loc[ego_id]``: a scalar lookup returns a
    # Series when the ego network has exactly one row, which loses the u/v
    # columns after reset_index(). The slice always returns a DataFrame. It
    # requires the ego_id index to be sorted, which read_ego_net also asserts
    # for the same file.
    X = train.loc[ego_id:ego_id].reset_index()

    if X.empty:
        continue

    _add_graph_features(X, ego_net_x2, 'x2')
    _add_graph_features(X, ego_net_x3, 'x3')

    train_df.append(X)

  0%|          | 0/61786 [00:00<?, ?it/s]

In [23]:
# Stack the per-ego-network frames into a single frame. This rebinds train_df
# from a list to a DataFrame, so this cell cannot be re-run on its own: the
# cell that builds the list has to be executed again first.
train_df = pd.concat(train_df)
train_df

Unnamed: 0,ego_id,u,v,x1,x2,u_x2_sum,v_x2_sum,u_x2_max,v_x2_max,SC_x2,x3,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
0,0,131,84,5.669200e-07,1.000000,7.000000,10.000000,1.000000,1.000000,2.000000,1.0,7.0,10.0,1.0,1.0,2.0
1,0,135,164,6.246274e-02,1.000000,4.000000,5.000000,1.000000,1.000000,2.000000,1.0,4.0,5.0,1.0,1.0,2.0
2,0,47,15,0.000000e+00,1.000000,3.000000,3.000000,1.000000,1.000000,0.000000,2.0,6.0,3.0,2.0,1.0,0.0
3,0,5,4,4.962974e-02,1.000000,24.693148,10.000000,1.693147,1.000000,7.693147,1.0,24.0,10.0,1.0,1.0,7.0
4,0,176,219,1.237935e+00,1.000000,24.057190,25.247082,3.995732,6.860786,11.860786,1.0,19.0,24.0,2.0,2.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,0,253,158,1.497307e-01,1.000000,4.000000,4.000000,1.000000,1.000000,1.000000,2.0,7.0,4.0,2.0,1.0,1.0
1638,0,259,244,1.827714e+00,2.609438,20.451031,15.442652,6.897154,3.833213,5.944439,2.0,15.0,15.0,2.0,2.0,7.0
1639,0,209,127,6.636844e-02,1.000000,28.983776,27.579250,5.454347,3.890372,8.609438,1.0,19.0,34.0,1.0,2.0,13.0
1640,0,187,241,3.756446e+00,3.833213,31.558306,21.988985,6.961005,4.295837,11.289363,1.0,10.0,18.0,1.0,1.0,4.0


In [11]:
def _add_graph_features(X, graph, tag):
    """Append the per-edge features of one weighted ego graph to ``X`` in place.

    ``X`` must have integer columns ``u`` and ``v``; ``graph`` must provide
    ``to_array()`` returning a square adjacency matrix. With ``tag='x2'`` the
    added columns are, in this order:

    * ``x2``                   - matrix entry ``[u, v]`` of the edge itself
    * ``u_x2_sum, v_x2_sum``   - sums of matrix rows ``u`` and ``v``
    * ``u_x2_max, v_x2_max``   - maxima of matrix rows ``u`` and ``v``
    * ``SC_x2``                - dot product of matrix rows ``u`` and ``v``
    """
    g = graph.to_array()
    u = X['u'].to_numpy()
    v = X['v'].to_numpy()

    X[tag] = g[u, v]

    row_sum = g.sum(axis=1)
    X[f'u_{tag}_sum'] = row_sum[u]
    X[f'v_{tag}_sum'] = row_sum[v]

    row_max = g.max(axis=1)
    X[f'u_{tag}_max'] = row_max[u]
    X[f'v_{tag}_max'] = row_max[v]

    # Index the n x n product directly instead of flattening it first, which
    # avoids copying the whole matrix just to read a few entries.
    X[f'SC_{tag}'] = g.dot(g.T)[u, v]


test_df = []
# total only sizes the progress bar; 20596 is hard-coded (presumably the number
# of ego networks in test.csv - TODO confirm).
for ego_id, ego_net_x2, ego_net_x3 in tqdm(read_ego_net('./data/test.csv'), total=20596):
    # Label slice instead of ``test.loc[ego_id]``: a scalar lookup returns a
    # Series when the ego network has exactly one row, which loses the u/v
    # columns after reset_index(). The slice always returns a DataFrame. It
    # requires the ego_id index to be sorted, which read_ego_net also asserts
    # for the same file.
    X = test.loc[ego_id:ego_id].reset_index()

    if X.empty:
        continue

    _add_graph_features(X, ego_net_x2, 'x2')
    _add_graph_features(X, ego_net_x3, 'x3')

    test_df.append(X)

  0%|          | 0/20596 [00:00<?, ?it/s]

In [12]:
# Stack the per-ego-network frames into a single frame. This rebinds test_df
# from a list to a DataFrame, so this cell cannot be re-run on its own: the
# cell that builds the list has to be executed again first.
test_df = pd.concat(test_df)
test_df

Unnamed: 0,ego_id,u,v,x1,u_x2_sum,v_x2_sum,u_x2_max,v_x2_max,SC_x2,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
0,8,20,19,3.839089e-04,12.000000,7.000000,1.000000,1.000000,4.000000,12.0,7.0,1.0,1.0,4.0
1,8,131,125,4.034464e-01,6.693147,4.098612,1.693147,2.098612,4.553259,6.0,3.0,1.0,1.0,2.0
2,8,73,56,8.554643e-05,10.000000,12.000000,1.000000,1.000000,4.000000,11.0,12.0,2.0,1.0,5.0
3,8,0,4,2.886418e-01,130.428589,6.000000,3.564949,1.000000,0.000000,122.0,6.0,1.0,1.0,0.0
4,8,63,73,4.281692e-07,8.000000,10.000000,1.000000,1.000000,3.000000,10.0,11.0,2.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1082,1709396984676,89,0,1.167843e+00,7.484907,88.665810,3.484907,5.025352,3.000000,5.0,67.0,1.0,1.0,3.0
1083,1709396984676,84,87,1.179100e-06,4.000000,4.000000,1.000000,1.000000,1.000000,4.0,4.0,1.0,1.0,1.0
1084,1709396984676,8,18,1.175182e+00,54.314777,38.044987,8.426549,3.079442,34.393356,40.0,39.0,2.0,2.0,26.0
1085,1709396984676,33,20,5.511019e-01,45.578659,43.570210,5.143135,7.453625,23.533068,46.0,26.0,2.0,2.0,18.0


In [13]:
# Rows of the test frame whose x1 is present form the validation set.
val_df = test_df.loc[test_df['x1'].notna()]
val_df

Unnamed: 0,ego_id,u,v,x1,u_x2_sum,v_x2_sum,u_x2_max,v_x2_max,SC_x2,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
0,8,20,19,3.839089e-04,12.000000,7.000000,1.000000,1.000000,4.000000,12.0,7.0,1.0,1.0,4.0
1,8,131,125,4.034464e-01,6.693147,4.098612,1.693147,2.098612,4.553259,6.0,3.0,1.0,1.0,2.0
2,8,73,56,8.554643e-05,10.000000,12.000000,1.000000,1.000000,4.000000,11.0,12.0,2.0,1.0,5.0
3,8,0,4,2.886418e-01,130.428589,6.000000,3.564949,1.000000,0.000000,122.0,6.0,1.0,1.0,0.0
4,8,63,73,4.281692e-07,8.000000,10.000000,1.000000,1.000000,3.000000,10.0,11.0,2.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1082,1709396984676,89,0,1.167843e+00,7.484907,88.665810,3.484907,5.025352,3.000000,5.0,67.0,1.0,1.0,3.0
1083,1709396984676,84,87,1.179100e-06,4.000000,4.000000,1.000000,1.000000,1.000000,4.0,4.0,1.0,1.0,1.0
1084,1709396984676,8,18,1.175182e+00,54.314777,38.044987,8.426549,3.079442,34.393356,40.0,39.0,2.0,2.0,26.0
1085,1709396984676,33,20,5.511019e-01,45.578659,43.570210,5.143135,7.453625,23.533068,46.0,26.0,2.0,2.0,18.0


In [14]:
# Keep only the rows whose x1 is missing. This overwrites test_df, so any
# filter on non-missing x1 applied to test_df after this cell returns no rows.
test_df = test_df.loc[test_df['x1'].isna()]
test_df

Unnamed: 0,ego_id,u,v,x1,u_x2_sum,v_x2_sum,u_x2_max,v_x2_max,SC_x2,u_x3_sum,v_x3_sum,u_x3_max,v_x3_max,SC_x3
9,8,109,154,,6.000000,3.000000,1.000000,1.000000,1.000000,11.0,3.0,2.0,1.0,2.0
10,8,51,30,,4.000000,4.791759,1.000000,2.791759,2.791759,4.0,6.0,1.0,2.0,2.0
20,8,54,56,,14.000000,12.000000,1.000000,1.000000,6.000000,14.0,12.0,1.0,1.0,6.0
26,8,106,9,,7.386294,18.000000,1.693147,1.000000,1.693147,6.0,18.0,1.0,1.0,1.0
28,8,135,119,,2.000000,3.791759,1.000000,2.098612,0.000000,2.0,2.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,1709396984676,58,33,,17.721666,45.578659,4.367296,5.143135,16.420691,8.0,46.0,1.0,2.0,5.0
1064,1709396984676,41,0,,2.000000,88.665810,1.000000,5.025352,1.000000,4.0,67.0,2.0,1.0,2.0
1069,1709396984676,79,33,,21.385914,45.578659,4.806663,5.143135,5.386294,11.0,46.0,1.0,2.0,8.0
1074,1709396984676,80,30,,27.414795,17.890371,8.340187,2.791759,1.693147,10.0,19.0,2.0,2.0,1.0


In [15]:
import gc
# Run a full garbage-collection pass before the memory-heavy modelling cells;
# the value displayed by this cell is the return value of gc.collect().
gc.collect()

0

In [16]:
# Sanity check: mean of the target column x1 in the training frame and in the
# validation frame, displayed side by side.
train_df['x1'].mean(), val_df['x1'].mean()

(0.6909816146178823, 0.6974638859342772)

In [17]:
from catboost import CatBoostRegressor, Pool

# Identifier columns (ego_id, u, v) and the target (x1) are removed from the
# feature matrix; x1 is passed separately as the label.
# NOTE(review): the saved output of this cell is a MemoryError (4.56 GiB
# float32 allocation). drop() builds a new frame from the remaining columns,
# which may be where the allocation fails - confirm before re-running.
train_pool = Pool(
    data=train_df.drop(columns=['ego_id', 'u', 'v', 'x1']),
    label=train_df['x1'],
)

val_pool = Pool(
    data=val_df.drop(columns=['ego_id', 'u', 'v', 'x1']),
    label=val_df['x1'],
)

  from pandas import MultiIndex, Int64Index



MemoryError: Unable to allocate 4.56 GiB for an array with shape (10, 122280277) and data type float32

In [None]:
# Keyword arguments forwarded to the CatBoost regressor constructor.
params = dict(task_type='CPU', loss_function='RMSE')

In [None]:
# Fixed seed so repeated fits use the same random state.
model_cb = CatBoostRegressor(random_seed=56, **params)
# Fit on the training pool and evaluate on the validation pool; the fit call
# stays the last expression so its return value is what the cell displays.
model_cb.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True,
    verbose=False,
    plot=True,
)

In [None]:
# Best validation score across boosting iterations. RMSE is lower-is-better,
# so the best iteration is the minimum; np.max would report the worst one.
np.min(model_cb.evals_result_['validation']['RMSE'])