In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
cd ../src

/home/theo/mva/altegrad/challenge/src


In [35]:
from util import *
from params import *
from imports import *

from post_processing import *

In [36]:
# Fix the random state up front so the run is repeatable.
# NOTE(review): seed_everything comes from a star import; presumably it seeds
# python / numpy (and any DL framework) RNGs — confirm in util.
seed_everything(SEED)

# Data

## Labels & nodes

In [37]:
# Baseline submission file, used as the template for the final submission:
# its CLASSES columns are overwritten with the predictions in the last cell.
sub = pd.read_csv(DATA_PATH + 'graph_baseline.csv')

In [38]:
# Column names are supplied through `names=`, so the first line of each file
# is read as data (the files are assumed to have no header row — TODO confirm).
# train.csv: node id + class label; test.csv: node id only.
df_train = pd.read_csv(DATA_PATH + 'train.csv', names=['node', 'class'])
df_test = pd.read_csv(DATA_PATH + 'test.csv', names=['node'])

In [39]:
df_train.head()

Unnamed: 0,node,class
0,9032,health/medical
1,5346,entertainment
2,18778,entertainment
3,11939,education/research
4,17502,tech/science


In [40]:
# Node ids of the labelled (train) and unlabelled (test) samples, as plain lists.
nodes_train = df_train['node'].tolist()
nodes_test = df_test['node'].tolist()

In [41]:
# Every node id that has a row in the challenge files: test ids first, then train ids.
nodes = [*nodes_test, *nodes_train]

In [42]:
# Encode each training label as its position in CLASSES.
y_train = np.array(list(map(CLASSES.index, df_train['class'].values)))

## Graph

In [43]:
# Directed, weighted graph built from the edge list file.
G = nx.read_weighted_edgelist(
    DATA_PATH + 'edgelist.txt',
    create_using=nx.DiGraph(),
)

# Node labels cast to integers, kept in the graph's own node order.
graph_node_ids = np.array(list(G.nodes)).astype(int)
nodes_indexing = list(graph_node_ids)

In [44]:
# Dense adjacency matrix of G (`.toarray()` densifies the sparse matrix, so
# memory grows with the square of the number of nodes).
# NOTE(review): with no nodelist given, networkx orders rows/columns like
# G.nodes, which is the order `nodes_indexing` was built from — so
# nodes_indexing should map row positions to node ids; confirm.
full_graph = nx.adjacency_matrix(G).toarray()

## Texts

In [None]:
# Pre-extracted texts table. Later cells size the feature matrix with
# len(df_texts) and index it by node id, so this is assumed to hold one row
# per node, ordered by node id — TODO confirm.
df_texts = pd.read_csv('../output/df_texts.csv')

In [None]:
df_texts.head()

In [None]:
# Per-text error flag: 1 where the `error` column is >= 1, 0 otherwise.
# A stricter variant that also flagged non-French texts used to be computed
# on the line above, but it was overwritten immediately and never used, so
# the dead assignment (and its dependency on a `language` column) was removed.
errors = (df_texts['error'].values >= 1).astype(int)

In [None]:
Counter(errors)

# $k$-fold

In [None]:
def k_fold_lgb(graph, nodes_indexing, nodes, nodes_test, features, errors, y, 
               seed=2019, k=5, n_neigh=5, remove_error=False):
    """
    Trains a LightGBM model on graph-neighbourhood features with stratified k-fold CV.

    Features for every node are built by `get_knns_features` from the graph and the
    per-node `features` matrix. One model is fitted per fold with `run_lgb`; its
    predictions fill the out-of-fold matrix for the validation nodes, and the test
    predictions of the k models are averaged.

    Arguments:
        graph: Graph representation forwarded to get_knns_features.
        nodes_indexing: Node ordering forwarded to get_knns_features.
        nodes: Training nodes (one per entry of y).
        nodes_test: Nodes to predict on.
        features: Per-node features forwarded to get_knns_features.
        errors: Per-node error flags forwarded to get_knns_features.
        y: Integer class label of each training node.

    Keyword Arguments:
        seed {int} -- Random state used to shuffle the folds (default: {2019})
        k {int} -- Number of folds (default: {5})
        n_neigh {int} -- Passed to get_knns_features as `k` (default: {5})
        remove_error {bool} -- Passed to get_knns_features (default: {False})

    Returns:
        pred_oof -- Out-of-fold predictions, shape (len(nodes), NUM_CLASSES)
        pred_test -- Fold-averaged test predictions, shape (len(nodes_test), NUM_CLASSES)
    """
    def build_features(node_list):
        # Same feature extraction for train and test nodes.
        return np.array([get_knns_features(node, graph, features, nodes_indexing, errors,
                                           k=n_neigh, remove_error=remove_error)
                         for node in node_list])

    X = build_features(nodes)
    X_test = build_features(nodes_test)

    # Fold indices are applied to y with fancy indexing, which needs an array.
    y = np.asarray(y)

    print("Training data shape :", X.shape)

    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn versions raise a ValueError when a random_state is given
    # without shuffling.
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    splits = list(skf.split(X=nodes, y=y))

    pred_oof = np.zeros((len(nodes), NUM_CLASSES))
    pred_test = np.zeros((len(nodes_test), NUM_CLASSES))

    for train_idx, val_idx in splits:
        print('')
        # NOTE(review): run_lgb is assumed to return a fitted model whose
        # predict() yields one row of NUM_CLASSES scores per sample — confirm.
        clf = run_lgb(X[train_idx], X[val_idx], y[train_idx], y[val_idx])

        pred_oof[val_idx] = clf.predict(X[val_idx])
        # Each fold contributes 1/k of the final test prediction.
        pred_test += clf.predict(X_test) / k

    val_loss = cross_entropy(pred_oof, y)
    print(f'\n CV = {val_loss :.3f}') 

    return pred_oof, pred_test

# Modeling

In [None]:
# Identifiers of the fine-tuned text models to stack. Each name is used to
# locate the saved predictions '../output/pred_ft_{model}.npy' and
# '../output/pred_oof_{model}.npy'.
# NOTE(review): the number in each name looks like that model's CV loss and the
# suffixes like training options (augmentation / translation) — confirm.
models = [
    'camembert-base_1.151_augment_notrad',
    'camembert-base_1.151_noaugment_notrad',
    'camembert-base_1.137_noaugment_trad',
    'camembert-base_1.144_augment_trad',
    'camembert-base_1.136_noaugment_trad_avg',
]

In [None]:
# Accumulators for the per-model saved predictions (filled by the loading loop below).
# Re-run this cell before re-running the loop, otherwise entries are appended twice.
pred_fts = []
pred_oofs = []

In [None]:
# Load the two saved prediction arrays ("ft" and "oof") of every model.
for model in models:
    ft_path = f'../output/pred_ft_{model}.npy'
    oof_path = f'../output/pred_oof_{model}.npy'

    pred_fts.append(np.load(ft_path))
    pred_oofs.append(np.load(oof_path))

In [None]:
# Blend the text models by averaging their predictions across the model axis.
pred_oof = np.mean(pred_oofs, axis=0)
pred_ft = np.mean(pred_fts, axis=0)

In [None]:
# One row of class scores per node: out-of-fold predictions for the training
# nodes, `pred_ft` for every other node.
# NOTE(review): assumes pred_oof rows follow the order of nodes_train and
# pred_ft rows follow ascending node index — confirm against the files' producer.
features = np.zeros((len(df_texts), pred_oof.shape[-1]))

# Membership is tested against a set: `i not in nodes_train` on the list
# rescans the whole list for every node (quadratic in the number of nodes).
train_node_set = set(nodes_train)
ft_nodes = np.array([i for i in range(len(df_texts)) if i not in train_node_set])

features[np.array(nodes_train)] = pred_oof
features[ft_nodes] = pred_ft

In [None]:
# Stack a LightGBM model on top of the averaged text-model predictions.
pred_oof, pred_test = k_fold_lgb(
    graph=full_graph,
    nodes_indexing=nodes_indexing,
    nodes=nodes_train,
    nodes_test=nodes_test,
    features=features,
    errors=errors,
    y=y_train,
    seed=SEED,
    k=5,
    n_neigh=2,
    remove_error=False,
)

In [None]:
# Cross-entropy of the out-of-fold predictions against the training labels
# (same quantity k_fold_lgb prints as "CV").
score = cross_entropy(pred_oof, y_train)
print(f" -> Local CV score is {score:.3f}")

# Blending after modeling

In [None]:
# Accumulators for the per-model k_fold_lgb outputs.
# Note: `pred_oofs` is re-used here; from this point on it no longer holds the
# arrays loaded from disk in the "Modeling" section.
pred_oofs = []
pred_tests = []

In [None]:
# Run the graph-level LightGBM stacking separately for every text model and
# collect the out-of-fold / test predictions so they can be averaged afterwards.

# Both index arrays are identical for every model, so they are built once.
# A set makes the membership test O(1) instead of rescanning the nodes_train
# list for every node.
train_node_set = set(nodes_train)
train_node_idx = np.array(nodes_train)
ft_nodes = np.array([i for i in range(len(df_texts)) if i not in train_node_set])

for model in models:
    
    print(f'\n Model : {model}')
    pred_ft = np.load(f'../output/pred_ft_{model}.npy')
    pred_oof = np.load(f'../output/pred_oof_{model}.npy')
    
    # One row of class scores per node: out-of-fold predictions for training
    # nodes, `pred_ft` for all the others.
    features = np.zeros((len(df_texts), pred_oof.shape[-1]))
    features[train_node_idx] = pred_oof
    features[ft_nodes] = pred_ft

    pred_oof, pred_test = k_fold_lgb(full_graph, nodes_indexing, nodes_train,
                                     nodes_test, features, errors, y_train,
                                     seed=SEED, k=5, n_neigh=1, remove_error=True)
    pred_oofs.append(pred_oof)
    pred_tests.append(pred_test)

In [None]:
# Final blend: average the stacked predictions over all models.
pred_oof = np.mean(pred_oofs, axis=0)
pred_test = np.mean(pred_tests, axis=0)

In [None]:
# Cross-entropy of the blended out-of-fold predictions; `score` is also used
# in the submission file name written in the last cell.
score = cross_entropy(pred_oof, y_train)
print(f" -> Local CV score is {score:.3f}")

In [None]:
# Explicit re-import of the project helper.
# NOTE(review): presumably needed because a later star import (e.g. `from imports import *`)
# brings in another `plot_confusion_matrix` that shadows util's — confirm, and
# if not needed move this import to the import cell at the top.
from util import plot_confusion_matrix
# Confusion matrix of the arg-max out-of-fold predictions vs. the training labels.
plot_confusion_matrix(np.argmax(pred_oof, -1), y_train)
plt.show()

In [None]:
# Write the blended test predictions into the submission template and save it
# under a name that carries the local CV score.
sub[CLASSES] = pred_test

submission_path = f'../output/sub_{score:.3f}_graph.csv'
sub.to_csv(submission_path, index=False)