In [14]:
# Enable IPython's autoreload extension so edits to the project modules are
# picked up without restarting the kernel (mode 2 reloads all modules before
# each cell execution, except those excluded with %aimport).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
cd ../src

/home/theo/mva/altegrad/challenge/src


In [16]:
from util import *
from params import *
from imports import *

from models import *

from data.dataset import *

from training.train import *
from training.freeze import *
from training.sampler import *

from tqdm import tqdm_notebook as tqdm

In [17]:
# Fix the random state once up front; `seed_everything` and `SEED` are provided
# by the project star-imports above (presumably seeds python/numpy/torch —
# confirm in util).
seed_everything(SEED)

# Labels

In [18]:
# Baseline submission file. In this notebook it is only referenced by the
# commented-out submission cell at the very end.
sub = pd.read_csv(DATA_PATH + 'graph_baseline.csv')

In [19]:
# Column names are supplied explicitly via `names`, so pandas treats the first
# line of each file as data rather than as a header row.
df_train = pd.read_csv(DATA_PATH + 'train.csv', names=['node', 'class'])
df_test = pd.read_csv(DATA_PATH + 'test.csv', names=['node'])

In [20]:
# Node identifiers as plain Python lists; the combined list keeps the test
# nodes first, followed by the training nodes.
nodes_train = df_train['node'].tolist()
nodes_test = df_test['node'].tolist()
nodes = [*nodes_test, *nodes_train]

In [21]:
# Encode each training label as its position in CLASSES.
y_train = np.array(list(map(CLASSES.index, df_train['class'].values)))

# $k$-fold

In [22]:
def k_fold(selected_model, df_texts, nodes, nodes_test, y,
           augment=False, avg_pool=False, extract_ft=False,
           k=5, seed=2019, verbose=1, save=True, cp=False):
    """
    Train one Transformer per stratified fold and collect the predictions.

    For every fold a fresh model is built, trained in two stages (first only
    the 'logit' and 'pooler' layers, then all layers), evaluated on the
    held-out part and used to predict the test nodes.

    Arguments:
        selected_model {str} -- Model identifier passed to Transformer
        df_texts {pandas dataframe} -- Texts, passed as-is to the dataset classes
        nodes {array-like} -- Training nodes, indexed with the fold indices
        nodes_test {array-like} -- Test nodes, passed as-is to AltegradTestDataset
        y {array-like} -- Class indices aligned with nodes

    Keyword Arguments:
        augment {bool} -- Forwarded to the training dataset of each fold (default: {False})
        avg_pool {bool} -- Forwarded to Transformer (default: {False})
        extract_ft {bool} -- Also predict on rows of df_texts whose positional index is not in nodes (default: {False})
        k {int} -- Number of folds (default: {5})
        seed {int} -- Seed of the fold shuffling; fold i is trained after seed_everything(seed + i) (default: {2019})
        verbose {int} -- Currently unused, kept for interface compatibility (default: {1})
        save {bool} -- Currently unused, kept for interface compatibility (default: {True})
        cp {bool} -- Forwarded to fit; when True the "<model>_<fold>_cp.pt" weights are reloaded before evaluating (default: {False})

    Returns:
        numpy array -- Out-of-fold predictions, shape (len(nodes), NUM_CLASSES)
        numpy array -- Test predictions averaged over the folds, shape (len(nodes_test), NUM_CLASSES)
        numpy array or None -- Fold-averaged predictions on the extra texts, None unless extract_ft is set
    """
    # Fold indices are used for fancy indexing below, which plain lists do not support.
    nodes = np.asarray(nodes)
    y = np.asarray(y)

    # `random_state` only has an effect together with shuffle=True; recent
    # scikit-learn versions raise a ValueError when it is set without shuffling.
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    splits = list(splitter.split(X=nodes, y=y))

    pred_ft = None
    if extract_ft:
        # NOTE(review): this model is only handed to the dataset constructor
        # (presumably for tokenization) — confirm against AltegradTestDataset.
        model = Transformer(selected_model, nb_layers=1, pooler_ft=None).cuda()
        # Set lookup instead of scanning the whole array for every row index.
        known_nodes = set(nodes.tolist())
        ft_nodes = [i for i in range(len(df_texts)) if i not in known_nodes]
        ft_dataset = AltegradTestDataset(df_texts, ft_nodes, model)
        pred_ft = np.zeros((len(ft_nodes), NUM_CLASSES))

    pred_oof = np.zeros((len(nodes), NUM_CLASSES))
    pred_test = np.zeros((len(nodes_test), NUM_CLASSES))

    print(f'Doing {k} folds with {len(nodes)} texts')

    for i, (train_idx, val_idx) in enumerate(splits):
        print(f"\n-------------   Fold {i + 1}  -------------")
        seed_everything(seed + i)

        model = Transformer(selected_model, avg_pool=avg_pool).cuda()
        model.zero_grad()

        train_dataset = AltegradTrainDataset(df_texts, nodes[train_idx], y[train_idx], model, augment=augment)
        val_dataset = AltegradTrainDataset(df_texts, nodes[val_idx], y[val_idx], model)
        test_dataset = AltegradTestDataset(df_texts, nodes_test, model)

        print('\n- Training logits & pooler only : ')

        # Stage 1: everything frozen except the 'logit' and 'pooler' layers.
        freeze(model)
        for layer in ['logit', 'pooler']:
            unfreeze_layer(model, layer)

        n_parameters = count_parameters(model)
        print(f'    -> {n_parameters} trainable parameters\n')

        weight_decay = 0
        epochs = 2

        fit(model, train_dataset, val_dataset, epochs=epochs, batch_size=64, weight_decay=weight_decay, lr=1e-3)

        print('\n- Training all layers: ')

        # Stage 2: all layers trainable, small batches and lower learning rates.
        unfreeze(model)
        n_parameters = count_parameters(model)
        print(f'    -> {n_parameters} trainable parameters\n')

        epochs = 2
        batch_size = 6
        acc_steps = 1

        lr_transfo = 3e-5
        lr = 1e-4
        lr_decay = 0.95

        fit(model, train_dataset, val_dataset, epochs=epochs, batch_size=batch_size, acc_steps=acc_steps,
            weight_decay=weight_decay, lr=lr, lr_transfo=lr_transfo, lr_decay=lr_decay,
            cp=cp, model_name=f'{selected_model}_{i + 1}')

        print('\n- Evaluating: ')

        if cp:
            load_model_weights(model, f"{selected_model}_{i + 1}_cp.pt", verbose=1)

        pred_val = predict(model, val_dataset, batch_size=64)
        pred_oof[val_idx] = pred_val

        val_loss = cross_entropy(pred_val, y[val_idx])
        print(f'\n Scored {val_loss :.3f} on validation data')

        # Each fold contributes 1/k of the final test (and extra-text) predictions.
        pred_test += predict(model, test_dataset, batch_size=64) / k
        if extract_ft:
            pred_ft += predict(model, ft_dataset, batch_size=64) / k

        # Drop every per-fold object before the next model is allocated on the GPU.
        del model, train_dataset, val_dataset, test_dataset
        torch.cuda.empty_cache()
        gc.collect()

    return pred_oof, pred_test, pred_ft

In [23]:
# Model identifier handed to k_fold; it is also embedded in the names of the
# saved prediction files further down.
SELECTED_MODEL = "camembert-base"
# When True, k_fold additionally predicts on the texts outside the training nodes.
extract_ft = False

In [24]:
# model = Transformer(SELECTED_MODEL)
# dataset = AltegradTrainDataset(df_texts, nodes_train, y_train, model)

In [25]:
# Experiment switches.
augment = False  # forwarded to k_fold, which passes it to the training dataset
translate = False  # selects which texts CSV is loaded in the next cell
avg_pool = False  # forwarded to k_fold, which passes it to Transformer

In [26]:
# Load the translated texts when `translate` is on, otherwise the original ones.
df_texts = pd.read_csv('../output/df_texts_trans.csv' if translate else '../output/df_texts.csv')

In [None]:
# Run the 5-fold training. Nodes are converted to numpy arrays because k_fold
# indexes them with the fold indices. pred_ft is None unless extract_ft is set.
pred_oof, pred_test, pred_ft = k_fold(SELECTED_MODEL, df_texts, np.array(nodes_train), np.array(nodes_test),
                                      y_train, extract_ft=extract_ft, augment=augment, avg_pool=avg_pool,
                                      k=5, seed=SEED, verbose=1, save=False, cp=False)

In [None]:
# Cross-entropy of the out-of-fold predictions against the training labels;
# the value is reused below in the names of the saved prediction files.
score = cross_entropy(pred_oof, y_train)
print(f" -> Local CV score is {score:.3f}")

In [None]:
# Free-form tag appended to the names of the saved prediction files.
# NOTE(review): the tag reads like "translated + average pooling" while
# `translate` and `avg_pool` are both False above — confirm it matches the run.
name = 'noaugment_trad_avg'

In [None]:
# Persist the predictions as .npy files; the file names encode the model,
# the CV score (3 decimals) and the experiment tag.
if extract_ft:
    # pred_ft only holds predictions when extract_ft was set for k_fold.
    np.save(f'../output/pred_ft_{SELECTED_MODEL}_{score:.3f}_{name}.npy', pred_ft)

np.save(f'../output/pred_oof_{SELECTED_MODEL}_{score:.3f}_{name}.npy', pred_oof)
np.save(f'../output/pred_test_{SELECTED_MODEL}_{score:.3f}_{name}.npy', pred_test)

In [None]:
# Confusion matrix of the out-of-fold class predictions (argmax over the last
# axis) against the training labels.
# NOTE(review): arguments are passed as (predicted, true) — confirm this is the
# order plot_confusion_matrix expects.
from util import plot_confusion_matrix
plot_confusion_matrix(np.argmax(pred_oof, -1), y_train)
plt.show()

In [None]:
# sub[CLASSES] = pred_test
# sub.to_csv(f'../output/sub_{score:.3f}.csv', index=False)