In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
%matplotlib inline
sns.set()

from tqdm import tqdm_notebook
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
import keras as K
from keras.callbacks import EarlyStopping
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import os
from gensim.models import KeyedVectors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the pre-trained node2vec embeddings (stored in word2vec format); the
# Node2Vec feature cells below read vectors and similarities from this object.
# NOTE(review): load_word2vec_format defaults to binary=False; confirm that
# "n2v.bin" was really saved in the text format, otherwise pass binary=True.
n2v = KeyedVectors.load_word2vec_format("n2v.bin")

In [0]:
%%time
edges = pd.read_csv('finnet_data/edges.csv')
vertices = pd.read_csv('finnet_data/vertices.csv')
ids = pd.read_csv('finnet_data/ids.csv')

# Categorical vertex attributes are handled as strings everywhere below.
vertices['main_okved'] = vertices['main_okved'].astype(str)
vertices['region_code'] = vertices['region_code'].astype(str)

# Fold a handful of region codes into the code they should be treated as.
# No replacement value is itself a key, so one simultaneous replace is equivalent
# to applying the rules one after another. Assigning the replaced column back
# avoids chained indexing (`df[col][mask] = value`), which triggers
# SettingWithCopyWarning and is not guaranteed to modify `vertices`.
REGION_CODE_REMAP = {
    '82': '41',
    '81': '59',
    '80': '75',
    '84': '24',
    '88': '24',
    '85': '38',
    '0': '16',
    '99': '77',
}
vertices['region_code'] = vertices['region_code'].replace(REGION_CODE_REMAP)

# 1. Data preprocessing
### 1.1. Aggregations

In [0]:
# Per-id statistics over the edge list.
# Every edge is listed in both orientations so that an id is credited with an
# edge regardless of which endpoint column it appears in.
edges_mirrored = edges.rename(columns={'id_1':'id_2', 'id_2':'id_1'})
groupped = pd.concat((edges, edges_mirrored)).drop('id_2', axis=1).groupby('id_1')
ids_agg = groupped.agg(['mean', 'sum', 'min', 'max', 'std', 'count'])
ids_agg.columns = ['_'.join(el) for el in ids_agg.columns]
ids_agg = ids_agg.reset_index().rename(columns={'id_1':'id'})
ids_agg = ids_agg.fillna(0).drop('value_count', axis=1)

# The same two-way edge list, keyed by `id` and enriched with that vertex's attributes.
vertex_attrs = vertices[['id', 'main_okved', 'region_code', 'company_type']]
edges_conc = (
    pd.concat((edges, edges_mirrored))
    .rename(columns={'id_1':'id'})
    .merge(vertex_attrs, on='id')
    .drop('id_2', axis=1)
)


def _edge_stats_by(attribute, suffix):
    """Aggregate `value` and `n_transactions` of `edges_conc` per `attribute`.

    Column names are '<column>_<statistic><suffix>'. The count of `value` is
    dropped (the count of `n_transactions` is kept) and missing statistics are
    filled with 0. The result stays indexed by `attribute`.
    """
    stats = edges_conc.groupby([attribute])[['value', 'n_transactions']].agg(
        ['mean', 'sum', 'max', 'min', 'count', 'std']
    )
    stats.columns = ['_'.join(el)+suffix for el in stats.columns]
    return stats.drop('value_count'+suffix, axis=1).fillna(0)


# Edge statistics per main_okved, per region_code and per company_type
main_okved_aggs_1 = _edge_stats_by('main_okved', '_okved')
region_aggs_1 = _edge_stats_by('region_code', '_region')
company_aggs_1 = _edge_stats_by('company_type', '_company')

### 1.2. Node2Vec

In [0]:
# Flatten the node2vec model into a DataFrame: one row per embedded node, an
# integer `id` column and one `vec_<k>` column per embedding dimension.
vocab_words = list(n2v.vocab.keys())
keys = np.array(vocab_words)
vectors = n2v[vocab_words]
n2v_df = pd.DataFrame(vectors, index=keys).add_prefix('vec_')
n2v_df = n2v_df.reset_index().rename(columns={'index':'id'})
n2v_df['id'] = n2v_df['id'].astype(int)

### 1.3. Count edges for each main_okved and region_code

In [0]:
def _pair_count_matrix(pairs, source_col, target_col, labels):
    """Count how often each (source, target) label pair occurs in `pairs`.

    Returns a square integer DataFrame whose rows (source) and columns (target)
    are both ordered by `labels`; pairs that never occur are 0. Rows with a
    missing label on either side are not counted. Index and columns are left
    unnamed because later cells call `reset_index()` on these matrices and
    expect the resulting column to be called 'index'.
    """
    counts = pd.crosstab(pairs[source_col], pairs[target_col])
    counts = counts.reindex(index=labels, columns=labels, fill_value=0)
    return counts.rename_axis(index=None, columns=None).astype(int)


# OKVED Counter
# Attach the main_okved of both endpoints to every edge, then add the mirrored
# pairs so that the counts are symmetric.
okved2okved = edges[['id_1', 'id_2']].merge(vertices[['id', 'main_okved']].add_suffix('_1'),  on='id_1', how='left').merge(vertices[['id', 'main_okved']].add_suffix('_2'), on='id_2', how='left').drop(['id_1', 'id_2'], axis=1).astype(str)
okved2okved = pd.concat((okved2okved, okved2okved.rename(columns={'main_okved_1':'main_okved_2', 'main_okved_2':'main_okved_1'})[['main_okved_1', 'main_okved_2']]))

okved_list = list(okved2okved['main_okved_1'].unique())
# One vectorised cross-tabulation instead of a Python loop over every label pair.
okved_df_proba = _pair_count_matrix(okved2okved, 'main_okved_1', 'main_okved_2', okved_list)


# Region Counter
region2region = edges[['id_1', 'id_2']].merge(vertices[['id', 'region_code']].add_suffix('_1'),  on='id_1', how='left').merge(vertices[['id', 'region_code']].add_suffix('_2'), on='id_2', how='left').drop(['id_1', 'id_2'], axis=1)
region2region = pd.concat((region2region, region2region.rename(columns={'region_code_1':'region_code_2', 'region_code_2':'region_code_1'})[['region_code_1', 'region_code_2']]))

# Every region present in `vertices` gets a row/column, even if it has no edges.
region_list = list(vertices['region_code'].unique())
region_df_proba = _pair_count_matrix(region2region, 'region_code_1', 'region_code_2', list(map(str, region_list)))

In [0]:
# Normalise the pair counts: every column is divided by its own total and the
# matrix is then transposed. A column whose total is 0 becomes NaN (0/0).
region_df_proba = region_df_proba.div(region_df_proba.sum(axis=0), axis='columns').T
okved_df_proba = okved_df_proba.div(okved_df_proba.sum(axis=0), axis='columns').T

### 1.3.1. Dimension reduction for okved with an autoencoder
Here is a simple non-linear autoencoder whose middle layer is used as a compact representation of our **okved_df_proba** matrix. It reduces the size from 1073 to 128 for each **main_okved**.

In [0]:
okved = okved_df_proba
n_rows, n_cols = okved.shape

# Symmetric dense autoencoder trained to reproduce its own input; the 128-unit
# layer in the middle is the representation vector.
model = Sequential()
model.add(Dense(n_rows, input_shape=(n_cols,), activation='sigmoid'))
for units in (512, 256, 128, 256, 512):
    model.add(Dense(units, activation='sigmoid'))
model.add(Dense(n_cols, activation='sigmoid'))

model.compile(loss='mse', optimizer='adam', metrics=['mse'])
model.fit(okved.values, okved.values, epochs=20, batch_size=10)

# Encoder = the first four trained layers, i.e. up to and including the 128-unit layer.
new_model = Sequential()
for layer in model.layers[:4]:
    new_model.add(layer)
new_model.summary()

# One encoded row per okved; reset_index() moves the okved labels into an 'index' column.
encoded = new_model.predict(okved.values)
okved = pd.DataFrame(encoded, index=okved.index).reset_index()

### 1.4. Region and OKVED stats from  **§1.3**

In [0]:
def _column_stats(proba, suffix, key):
    """Summarise every column of `proba` by its sum, mean, std and max.

    Returns one row per column of `proba`; the column labels end up in a
    column named `key` and the statistics in '<stat>_<suffix>' columns.
    """
    stats = pd.DataFrame({
        'sum_'+suffix: proba.sum(axis=0),
        'mean_'+suffix: proba.mean(axis=0),
        'std_'+suffix: proba.std(axis=0),
        'max_'+suffix: proba.max(axis=0),
    })
    return stats.reset_index().rename(columns={'index': key})


okved_stats = _column_stats(okved_df_proba, 'okved', 'main_okved')
region_stats = _column_stats(region_df_proba, 'region', 'region_code')

### 1.5. Merging **§1.1** and **§1.4**

In [0]:
# Attach the probability-matrix statistics to the edge aggregations of the same key.
# NOTE: this rebinds the aggregation frames, so re-running the cell merges the
# statistics a second time.
main_okved_aggs_1 = pd.merge(main_okved_aggs_1, okved_stats, on='main_okved', how='left')
region_aggs_1 = pd.merge(region_aggs_1, region_stats, on='region_code', how='left')

### 1.6. TF-IDF for okved and main_okved and region_code

In [0]:
# Fit one TF-IDF vocabulary per attribute on "<code of id_1> <code of id_2>"
# documents, one document per edge.
# NOTE(review): TfidfVectorizer's default token pattern only keeps tokens of two
# or more word characters, so one-character codes (if the data contains any)
# would be ignored - confirm this is intended.

# OKVED: only the part of the code before the first dot is used. It is computed
# on a side frame, so `vertices` itself ends up without the helper column.
okved_section = vertices[['id']].assign(
    main_okved_splitted=vertices['main_okved'].apply(lambda x: x.split('.')[0])
)
tmp = (
    edges
    .merge(okved_section.add_suffix('_1'), on='id_1')
    .merge(okved_section.add_suffix('_2'), on='id_2')
)[['main_okved_splitted_1', 'main_okved_splitted_2']]
corpus = (tmp['main_okved_splitted_1']+' '+tmp['main_okved_splitted_2']).values
okved_vec = TfidfVectorizer()
okved_vec.fit(corpus)
vertices = vertices.copy()

# Region codes are used as they are.
region_pairs = vertices[['id', 'region_code']]
tmp = (
    edges
    .merge(region_pairs.add_suffix('_1'), on='id_1')
    .merge(region_pairs.add_suffix('_2'), on='id_2')
)[['region_code_1', 'region_code_2']]
corpus = (tmp['region_code_1']+' '+tmp['region_code_2']).values
region_vec = TfidfVectorizer()
region_vec.fit(corpus)
del corpus; gc.collect()

# 2. Models training
If you do not want to go through the large number of models trained on different types of data, go straight to the **Ensemble** section.

In [0]:
# Prepare directory for models results.
# exist_ok=True makes the cell safe to re-run and, unlike a membership test on
# os.listdir(), fails loudly if 'Results' exists but is not a directory.
os.makedirs('Results', exist_ok=True)

In [0]:
# 1) TFIDF
# One CatBoost model per target id: vertices already linked to the id are
# positives, all other vertices are negatives, and the highest-scoring negatives
# become the predicted edges for that id.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']].set_index('id')


def _okved_section(code):
    """Return the part of an OKVED code before the first dot."""
    return code.split('.')[0]


per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')
    df = features.join(target).fillna(0)

    # "<code of i> <code of candidate>" documents, vectorised with the TF-IDF
    # models fitted on the existing edges. The OKVED section is derived here
    # because `vertices` does not carry a 'main_okved_splitted' column.
    own = features.loc[[i]].iloc[0]
    corpus_okved = _okved_section(own['main_okved']) + ' ' + df['main_okved'].map(_okved_section)
    corpus_region = own['region_code'] + ' ' + df['region_code']
    df = pd.concat((df, pd.DataFrame(okved_vec.transform(corpus_okved).toarray(), index=df.index).add_suffix('_okved')), axis=1)
    df = pd.concat((df, pd.DataFrame(region_vec.transform(corpus_region).toarray(), index=df.index).add_suffix('_region')), axis=1)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=100, task_type='GPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/tfidf.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 2) Node2Vec vectors + cosine similarity between ids + main_okved_proba + region_proba
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(n2v_df, on='id', how='left').fillna(0)

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')

    # Similarity of i to every other embedded node (the query node itself is not returned).
    vecs = pd.DataFrame(n2v.most_similar(str(i), topn=len(keys)), columns=['id', 'similarity'])
    vecs['id'] = vecs['id'].astype(int)
    df = features.merge(vecs, on='id').fillna(-1).set_index('id').join(target).fillna(0)

    # Probability of the candidate's okved / region given the okved / region of i.
    own = vertices[vertices['id'] == i]
    df['okved_proba'] = okved_df_proba.loc[own['main_okved'], df['main_okved']].values.T.reshape(-1)
    df['region_proba'] = region_df_proba.loc[own['region_code'], df['region_code']].values.T.reshape(-1)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=100, task_type='GPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/n2v_df_sim_okved_region_proba_not_seq.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 3) Node2Vec vectors + cosine similarity between ids + main_okved_aggs + region_aggs
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(n2v_df, on='id', how='left').fillna(0)
features = features.merge(main_okved_aggs_1, on='main_okved', how='left').fillna(0)
features = features.merge(region_aggs_1, on='region_code', how='left').fillna(0)

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')

    # Similarity of i to every other embedded node (the query node itself is not returned).
    vecs = pd.DataFrame(n2v.most_similar(str(i), topn=len(keys)), columns=['id', 'similarity'])
    vecs['id'] = vecs['id'].astype(int)
    df = features.merge(vecs, on='id').fillna(-1).set_index('id').join(target).fillna(0)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=100, task_type='GPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/n2v_df_sim_okved_region_aggs.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 4) Node2Vec vectors + main_okved_aggs + region_aggs
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(n2v_df, on='id', how='left').fillna(0)
features = features.merge(main_okved_aggs_1, on='main_okved', how='left').fillna(0)
features = features.merge(region_aggs_1, on='region_code', how='left').fillna(0)
features = features.set_index('id')

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')
    df = features.join(target).fillna(0)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=100, task_type='GPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/n2v_df_okved_region_aggs.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 5) main_okved_aggs + region_aggs + company_aggs + okved_proba
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(company_aggs_1.reset_index(), on='company_type', how='left').fillna(0)
features = features.merge(main_okved_aggs_1, on='main_okved', how='left').fillna(0)
features = features.merge(region_aggs_1, on='region_code', how='left').fillna(0)
# Inner merge: vertices whose main_okved has no encoded representation are dropped.
features = features.merge(okved.add_suffix('_4'), left_on='main_okved', right_on='index_4').drop('index_4', axis=1)
features = features.set_index('id')

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')
    df = features.join(target).fillna(0)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=100, task_type='GPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/okved_region_company_aggs_okved_proba.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 6) okved_proba + region_proba + cpu + logloss + 100 iters
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
# Both merges are inner merges: vertices without a matching okved / region row are dropped.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(okved.add_suffix('_4'), left_on='main_okved', right_on='index_4').drop('index_4', axis=1)
features = features.merge(region_df_proba.reset_index().add_suffix('_5'), left_on='region_code', right_on='index_5').drop('index_5', axis=1)
features = features.set_index('id')

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')
    df = features.join(target).fillna(0)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=100, task_type='CPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save (the Ensemble cell reads this file through `name`)
name = 'okved_region_proba_100i_logloss_cpu.csv'
result.to_csv('Results/'+name, index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 7) okved_proba + region_proba + crossentropy + 300 iters
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
# Both merges are inner merges: vertices without a matching okved / region row are dropped.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(okved.add_suffix('_4'), left_on='main_okved', right_on='index_4').drop('index_4', axis=1)
features = features.merge(region_df_proba.reset_index().add_suffix('_5'), left_on='region_code', right_on='index_5').drop('index_5', axis=1)
features = features.set_index('id')

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')
    df = features.join(target).fillna(0)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=300, task_type='CPU', random_state=42, loss_function='CrossEntropy', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/okved_region_proba_300i_crossentropy.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 8) main_okved_aggs + region_aggs + company_aggs
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(company_aggs_1.reset_index(), on='company_type', how='left').fillna(0)
features = features.merge(main_okved_aggs_1, on='main_okved', how='left').fillna(0)
features = features.merge(region_aggs_1, on='region_code', how='left').fillna(0)
features = features.set_index('id')

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')
    df = features.join(target).fillna(0)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=100, task_type='GPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/okved_region_company_aggs.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

In [0]:
# 9) okved_proba + region_proba + 200 iters
# One CatBoost model per target id: vertices already linked to the id are
# positives, the rest are negatives, and the best-scoring negatives are kept.

# The feature table is built from a copy of the raw vertex columns, so this cell
# neither depends on nor leaks columns merged in by other model cells.
# Both merges are inner merges: vertices without a matching okved / region row are dropped.
features = vertices[['id', 'main_okved', 'region_code', 'company_type']]
features = features.merge(okved.add_suffix('_4'), left_on='main_okved', right_on='index_4').drop('index_4', axis=1)
features = features.merge(region_df_proba.reset_index().add_suffix('_5'), left_on='region_code', right_on='index_5').drop('index_5', axis=1)
features = features.set_index('id')

per_id_results = []
for i in tqdm_notebook(ids.id):
    # Neighbours of i, whichever endpoint column i appears in.
    neighbours = pd.concat((edges.loc[edges['id_1'] == i, 'id_2'], edges.loc[edges['id_2'] == i, 'id_1'])).unique()
    target = pd.Series(1, index=pd.Index(neighbours, name='id'), name='target')
    df = features.join(target).fillna(0)

    X = df.drop(['target'], axis=1)
    y = df['target']

    model = CatBoostClassifier(iterations=200, task_type='GPU', random_state=42, loss_function='Logloss', verbose=0)
    # Columns 0-2 of X (main_okved, region_code, company_type) are categorical.
    model.fit(X, y, cat_features=[0, 1, 2])

    df['preds'] = model.predict_proba(X)[:, 1]
    df['id_2'] = i

    # Rank only vertices that are neither i itself nor already linked to i.
    candidates = df[(df['target'] != 1) & (df.index != i)]
    res = candidates.sort_values(by='preds', ascending=False).reset_index()[['id', 'id_2', 'preds']]
    res.columns = ['id_1', 'id_2', 'proba']
    per_id_results.append(res[:10000])

# Concatenate once instead of growing a DataFrame inside the loop.
if per_id_results:
    result = pd.concat(per_id_results, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'proba'])

# Drop duplicates from results: store each pair as (smaller id, larger id) so
# that mirrored predictions collapse into one row (the first one is kept).
ordered_pairs = np.sort(result[['id_1', 'id_2']].values, axis=1)
result['id_1'] = ordered_pairs[:, 0]
result['id_2'] = ordered_pairs[:, 1]
result = result.drop_duplicates(['id_1', 'id_2'])
# Save
result.to_csv('Results/okved_region_proba.csv', index=False)

# `vertices` was not modified by this cell, so there is nothing to reset.

# 3. Ensemble
Here we merge the results of all models and take their plain (unweighted) average as the final score.

In [0]:
# Result files written by the model cells in section 2.
# NOTE(review): `other_models` was not defined anywhere in the notebook; this
# list assumes every saved model output is meant to enter the average - confirm.
other_models = [
    'tfidf.csv',
    'n2v_df_sim_okved_region_proba_not_seq.csv',
    'n2v_df_sim_okved_region_aggs.csv',
    'n2v_df_okved_region_aggs.csv',
    'okved_region_company_aggs_okved_proba.csv',
    'okved_region_proba_300i_crossentropy.csv',
    'okved_region_company_aggs.csv',
    'okved_region_proba.csv',
]
# Set here as well so the Ensemble section can be run without the model cells.
name = 'okved_region_proba_100i_logloss_cpu.csv'

merged_df = None
for el in other_models + [name]:
    scores = pd.read_csv('Results/'+el)
    # A single "<id_1> <id_2>" key identifies the pair across all result files.
    scores['id_1_2'] = scores['id_1'].astype(str) + ' ' + scores['id_2'].astype(str)
    # One uniquely named score column per model, so that repeated merges never
    # collide on a shared 'proba' column name.
    scores = scores[['id_1_2', 'proba']].rename(columns={'proba': 'proba_'+el})
    if merged_df is None:
        merged_df = scores
    else:
        merged_df = merged_df.merge(scores, on='id_1_2', how='outer')

# A pair that a model did not return counts as probability 0 for that model.
merged_df = merged_df.fillna(0)
merged_df['mean_proba'] = merged_df.drop('id_1_2', axis=1).mean(axis=1)

So, we have a pandas DataFrame with a score for each pair of ids. The last step is just to choose the top_n value (how many edges we want to predict).

# 4. Final submission
Under the competition rules, only 100k edges must be chosen. This approach gives ~6400 points on the leaderboard.

In [0]:
# Keep the 100k best-scoring pairs. `.copy()` makes `out` an independent frame,
# so adding columns below does not write into a slice of `merged_df`.
out = merged_df.sort_values('mean_proba', ascending=False).iloc[:10**5].copy()

# The pair key column is called 'id_1_2' ("<id_1> <id_2>"); split it back into its two ids.
pair_parts = out['id_1_2'].str.split(' ')
out['id_1'] = pair_parts.str[0]
out['id_2'] = pair_parts.str[1]
out[['id_1', 'id_2']].to_csv('submission.csv', index=False)