In [1]:
import pandas as pd
import numpy as np
import networkx as nx

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale

### Loading and consolidating input data

In [52]:
# Read the three input tables from the working directory, in this order:
# the edge table, the training node table and the public test node table.
df_edges_orig, df_nodes_train, df_nodes_test_publ = (
    pd.read_csv(csv_name)
    for csv_name in ('df_edges_orig.csv', 'df_nodes_train.csv', 'df_nodes_test_publ.csv')
)

In [53]:
# Attach the training-node columns (Feat3, Feat4, Y) to both endpoints of every edge.
# Left joins keep edges whose endpoint is missing from df_nodes_train; such rows
# get NaN in the corresponding *Left / *Right columns.
train_merged = (
    df_edges_orig
    .merge(
        df_nodes_train.rename(columns={'Feat3': 'Feat3Left', 'Feat4': 'Feat4Left', 'Y': 'YLeft'}),
        how="left", left_on='NodeLeft', right_on='Node'
    )
    .drop('Node', axis=1)  # join key of the node table, redundant with NodeLeft
    .merge(
        df_nodes_train.rename(columns={'Feat3': 'Feat3Right', 'Feat4': 'Feat4Right', 'Y': 'YRight'}),
        how="left", left_on='NodeRight', right_on='Node'
    )
    .drop('Node', axis=1)  # join key of the node table, redundant with NodeRight
)

# 1 when both endpoints carry the same label, 0 otherwise. NaN never compares
# equal, so edges with a missing label on either side also get 0 here.
train_merged['SameIndustry'] = train_merged['YLeft'].eq(train_merged['YRight']).astype(int)

### Inference of classes on outbound payment transactions with a balanced decision tree classifier
#### Hyper-parameters are selected by a grid search that maximises accuracy under 5-fold stratified cross-validation

In [182]:
# Fixed seed for this cell. Both the shuffled CV split and the decision tree
# (which permutes features at every split) are random; without a seed each run
# gives slightly different scores and possibly a different final tree.
# Scores printed by earlier, unseeded runs will therefore not match exactly.
RANDOM_SEED = 42

# Train only on rows without any missing value; dropna() is evaluated once so
# that X and y are guaranteed to come from the same set of rows.
labelled_edges = train_merged.dropna()
X = labelled_edges[['Feat1', 'Feat2', 'Feat3Right', 'Feat4Right', 'YRight']]
y = labelled_edges['SameIndustry']

estm = DecisionTreeClassifier(class_weight = "balanced", random_state = RANDOM_SEED)
tuned_parameters = [{'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 2, 10]}]

clf = GridSearchCV(estimator=estm, param_grid=tuned_parameters,
                   cv=StratifiedKFold(n_splits=5, shuffle = True, random_state = RANDOM_SEED),
                   scoring=make_scorer(accuracy_score))
clf.fit(X, y)

# Report mean accuracy (+/- two standard deviations across folds) per parameter set.
print("Grid accuracy scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, 2 * std, params))
print()
print("Best accuracy score is %0.3f." % clf.best_score_, "Best parameters set found on development set:")
print(clf.best_params_)

# GridSearchCV works on clones, so `estm` itself is still unfitted here:
# give it the winning parameters and fit it on all labelled rows.
estm.set_params(**clf.best_params_)
estm.fit(X, y);

Grid accuracy scores on development set:
0.879 (+/-0.001) for {'min_samples_split': 2, 'min_samples_leaf': 1}
0.875 (+/-0.002) for {'min_samples_split': 3, 'min_samples_leaf': 1}
0.828 (+/-0.002) for {'min_samples_split': 10, 'min_samples_leaf': 1}
0.853 (+/-0.002) for {'min_samples_split': 2, 'min_samples_leaf': 2}
0.853 (+/-0.001) for {'min_samples_split': 3, 'min_samples_leaf': 2}
0.821 (+/-0.002) for {'min_samples_split': 10, 'min_samples_leaf': 2}
0.751 (+/-0.004) for {'min_samples_split': 2, 'min_samples_leaf': 10}
0.751 (+/-0.003) for {'min_samples_split': 3, 'min_samples_leaf': 10}
0.751 (+/-0.003) for {'min_samples_split': 10, 'min_samples_leaf': 10}

Best accuracy score is 0.879. Best parameters set found on development set:
{'min_samples_split': 2, 'min_samples_leaf': 1}


In [55]:
# Edges whose left endpoint is unlabelled while the right one is labelled:
# ask the fitted classifier whether both ends belong to the same class and,
# if so, copy the right label over to the left side.
left_missing = train_merged['YLeft'].isnull() & train_merged['YRight'].notnull()
outbound_features = ['Feat1', 'Feat2', 'Feat3Right', 'Feat4Right', 'YRight']

pending_left = train_merged[left_missing].copy()
pending_left['SameIndustry'] = estm.predict(pending_left[outbound_features])
pending_left['YLeft'] = np.where(pending_left['SameIndustry'], pending_left['YRight'], np.nan)

# DataFrame.update skips NaN values of the other frame, so rows predicted as
# "different class" keep their missing YLeft in train_merged.
train_merged.update(pending_left)
del pending_left, left_missing, outbound_features

### Inference of classes on inbound payment transactions with a balanced decision tree classifier
#### Hyper-parameters are selected by a grid search that maximises accuracy under 5-fold stratified cross-validation

In [183]:
# Fixed seed for this cell. Both the shuffled CV split and the decision tree
# (which permutes features at every split) are random; without a seed each run
# gives slightly different scores and possibly a different final tree.
# Scores printed by earlier, unseeded runs will therefore not match exactly.
RANDOM_SEED = 42

# Train only on rows without any missing value; dropna() is evaluated once so
# that X and y are guaranteed to come from the same set of rows.
labelled_edges = train_merged.dropna()
X = labelled_edges[['Feat1', 'Feat2', 'Feat3Left', 'Feat4Left', 'YLeft']]
y = labelled_edges['SameIndustry']

# NOTE: this rebinds X, y, estm and clf; from here on `estm` is the model that
# uses the left-hand node columns as features.
estm = DecisionTreeClassifier(class_weight = "balanced", random_state = RANDOM_SEED)
tuned_parameters = [{'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 2, 10]}]

clf = GridSearchCV(estimator=estm, param_grid=tuned_parameters,
                   cv=StratifiedKFold(n_splits=5, shuffle = True, random_state = RANDOM_SEED),
                   scoring=make_scorer(accuracy_score))
clf.fit(X, y)

# Report mean accuracy (+/- two standard deviations across folds) per parameter set.
print("Grid accuracy scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, 2 * std, params))
print()
print("Best accuracy score is %0.3f." % clf.best_score_, "Best parameters set found on development set:")
print(clf.best_params_)

# GridSearchCV works on clones, so `estm` itself is still unfitted here:
# give it the winning parameters and fit it on all labelled rows.
estm.set_params(**clf.best_params_)
estm.fit(X, y);

Grid accuracy scores on development set:
0.844 (+/-0.002) for {'min_samples_split': 2, 'min_samples_leaf': 1}
0.840 (+/-0.002) for {'min_samples_split': 3, 'min_samples_leaf': 1}
0.805 (+/-0.001) for {'min_samples_split': 10, 'min_samples_leaf': 1}
0.822 (+/-0.001) for {'min_samples_split': 2, 'min_samples_leaf': 2}
0.822 (+/-0.001) for {'min_samples_split': 3, 'min_samples_leaf': 2}
0.797 (+/-0.002) for {'min_samples_split': 10, 'min_samples_leaf': 2}
0.744 (+/-0.003) for {'min_samples_split': 2, 'min_samples_leaf': 10}
0.744 (+/-0.003) for {'min_samples_split': 3, 'min_samples_leaf': 10}
0.744 (+/-0.003) for {'min_samples_split': 10, 'min_samples_leaf': 10}

Best accuracy score is 0.844. Best parameters set found on development set:
{'min_samples_split': 2, 'min_samples_leaf': 1}


In [57]:
# Edges whose right endpoint is unlabelled while the left one is labelled:
# ask the fitted classifier whether both ends belong to the same class and,
# if so, copy the left label over to the right side.
right_missing = train_merged['YRight'].isnull() & train_merged['YLeft'].notnull()
inbound_features = ['Feat1', 'Feat2', 'Feat3Left', 'Feat4Left', 'YLeft']

pending_right = train_merged[right_missing].copy()
pending_right['SameIndustry'] = estm.predict(pending_right[inbound_features])
pending_right['YRight'] = np.where(pending_right['SameIndustry'], pending_right['YLeft'], np.nan)

# DataFrame.update skips NaN values of the other frame, so rows predicted as
# "different class" keep their missing YRight in train_merged.
train_merged.update(pending_right)
del pending_right, right_missing, inbound_features

### Dimensionality reduction of payment amounts and values to a single variable

In [181]:
# Project the two edge features onto their first principal component and
# min-max scale the projection; the result is stored as the edge 'Length'.
edge_features = train_merged[['Feat1', 'Feat2']]

pca = PCA(n_components=1, svd_solver='full').fit(edge_features)
print('Explained portion of variance:', np.round(np.cumsum(pca.explained_variance_ratio_), 2)[0])

train_merged['Length'] = minmax_scale(pca.transform(edge_features))
del edge_features

Explained portion of variance: 0.97


### Constructing payments graph with semi-classified nodes and a single measure of edge length

In [59]:
# Undirected graph with one edge per (NodeLeft, NodeRight) pair, weighted by 'Length'.
# NOTE(review): from_pandas_dataframe and Graph.node belong to the older networkx
# API; newer releases expose from_pandas_edgelist / Graph.nodes instead - confirm
# the networkx version this notebook is pinned to.
train_graph = nx.from_pandas_dataframe(
    df=train_merged,
    source='NodeLeft',
    target='NodeRight',
    edge_attr='Length',
    create_using=nx.Graph(),
)

# Every node starts with an empty list of labels under the 'Y' attribute.
nx.set_node_attributes(G = train_graph, name = 'Y', values = [])

# Collect, per node, the distinct labels seen for it on any edge (left side first,
# then right side). A new list is built on every addition instead of appending in
# place, so the initial list object passed above is never mutated.
for _, edge in train_merged.iterrows():
    for node_col, label_col in (('NodeLeft', 'YLeft'), ('NodeRight', 'YRight')):
        label = edge[label_col]
        if np.isnan(label):
            continue
        node_attrs = train_graph.node[edge[node_col]]
        if label not in node_attrs['Y']:
            node_attrs['Y'] = node_attrs['Y'] + [int(label)]

### Inference of classes through label propagation on payments graph

In [180]:
# Fallback classifier: predicts a node's class from its 'Feat4' value alone.
# It is used when the graph gives no usable information for a test node.
X, y = df_nodes_train['Feat4'], df_nodes_train['Y']

# scikit-learn expects a 2-D feature matrix, hence the single-column view.
feat4_matrix = X.values[:, np.newaxis]
IndCLF = DecisionTreeClassifier(
    class_weight = "balanced", min_samples_split = 2, min_samples_leaf = 1
).fit(feat4_matrix, y)
del feat4_matrix

In [169]:
def _graph_label_votes(graph, node):
    """Accumulate weighted class votes for ``node`` from its 1st- and 2nd-level neighbours.

    Every label attached to a direct neighbour adds the 'Length' of the connecting
    edge; every label attached to a neighbour's neighbour (other than ``node``
    itself) adds the product of the two edge lengths along that path.

    Returns a dict mapping class label -> accumulated weight; it is empty when no
    labelled node was found within two hops.
    """
    votes = {}
    for neighbour, first_edge in graph[node].items():
        first_length = first_edge['Length']

        # 1st level: labels of the direct neighbour
        for label in graph.node[neighbour]['Y']:
            votes[label] = votes.get(label, 0) + first_length

        # 2nd level: labels of the neighbour's own neighbours
        for second, second_edge in graph[neighbour].items():
            if second == node:
                continue
            path_weight = first_length * second_edge['Length']
            for label in graph.node[second]['Y']:
                votes[label] = votes.get(label, 0) + path_weight
    return votes


out_arr = []

# The columns are iterated directly instead of using iterrows(): iterrows builds
# one Series per row and does not preserve dtypes, so integer node ids could be
# upcast to float and end up as e.g. "123.0" in the submission.
# Local names are chosen so that the notebook-level `y` is not overwritten.
for current_node, feat4 in zip(df_nodes_test_publ['Node'], df_nodes_test_publ['Feat4']):
    out_val = None

    if train_graph.has_node(current_node):
        known_labels = train_graph.node[current_node]['Y']

        if len(known_labels) == 1:
            # exactly one label is attached to the node: take it as is
            out_val = known_labels[0]
        else:
            # no label or several conflicting ones: let the neighbourhood vote
            votes = _graph_label_votes(train_graph, current_node)
            if votes:
                # class with the largest accumulated weight among near neighbours
                out_val = max(votes, key=votes.get)

    if out_val is None:
        # graph data does not help to make an inference: fall back to the
        # Feat4-only classifier (expects a 2-D array with a single row)
        out_val = IndCLF.predict(np.array([[feat4]]))[0]

    out_arr.append([current_node, out_val])

### Output of results

In [179]:
# Build the frame with its column names in one step: assigning `columns` afterwards
# raises a length-mismatch error when out_arr is empty, whereas this form yields a
# valid header-only file in that case.
Out_pd = pd.DataFrame(out_arr, columns=['Node', 'Y'])

# Write the submission file (overwrites an existing file of the same name).
Out_pd.to_csv('OlegMitsik-13112017-SberbankIndustry.csv', header = True, index = False, mode = 'w')