# DeepTrajectory training example

In [4]:
import os

import numpy as np

import casp_datafeeder as df
import casp_rnn_train_util as ru

### Data files

In [None]:
# Single CSV file holding both the training and the validation data
csv_fn = "../data/casp_normalized_all_data_df.csv"
# Validation trajectories: taken from cross-validation fold 4
VALIDATION_TRJS = df.get_validation_set(
    4,
    "../data/cv_folds.csv",
)

### Model parameters

In [None]:
# Unique name of this training run (also used to build the log and
# checkpoint paths further down)
model_name = "test_training_cv4"

# Trajectory feature columns used as network inputs during training
FEATURES = [
    'N_DOPE',
    'N_DOPE_HR',
    'N_MOLPDF',
    'N_BOND',
    'N_ANGLE',
    'N_DIHEDRAL',
    'N_IMPROPER',
    'N_DOOP',
    'N_GOAP',
    'N_GOAPAG',
    'N_CALRW',
    'N_CALRWPLUS',
    'N_DDFIRESUM',
    'N_DDFIRETERM1',
    'N_DDFIRETERM2',
    'N_DDFIRETERM3',
    'N_DDFIRETERM4',
    'N_RMSD_SM',
    'N_GDTTS_SM',
]
FEATURESIZE = len(FEATURES)

# Column holding the ground truth (Y_)
LABEL = "LABEL_D_GDTTS_COARSE_CRY"
# Number of classes, derived from the label column of the CSV file
ALPHASIZE = df.get_class_size(csv_fn, LABEL)

# Sequence length of the RNN
SEQLEN = 30
# Mini-batch size
BATCHSIZE = 50
# Internal size of each GRU cell
INTERNALSIZE = 1024
# Number of stacked hidden layers
NLAYERS = 3

# Learning rate for the training
learning_rate = 0.0001
# Dropout probability (named pkeep, i.e. presumably the keep probability)
dropout_pkeep = 0.9
# Maximum number of epochs used for training
nb_epochs = 300

# Interval (in batches) for visualising training progress
vis_progress_nbatches = 1000
# Save training progress every 5000 batches
save_checkpoint_nbatches = 5000



# Per-class weights for the weighted loss function, indexed by class id.
# Every class starts at the neutral weight 1.0; the three state classes are
# then set explicitly. Requires numpy to be imported as `np` at the top of
# the notebook.
class_weights = np.ones(ALPHASIZE, dtype=np.float64)
class_weights[0] = 0.05  # improved state class
class_weights[1] = 1.0   # no-change state class
class_weights[2] = 10.0  # decreased state class




In [None]:
# Create the log and checkpoint directories if they do not exist yet
for out_dir in ("../log", "../checkpoints"):
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

# Per-run log and checkpoint locations handed to the training routine.
# NOTE(review): the directories above are created under "../" while these
# paths carry no "../" prefix - presumably train_rnn resolves them relative
# to the parent directory; confirm.
log = "log/" + model_name
checkpoint_dir = "checkpoints/" + model_name

### Training metrics

In [None]:
# Aggregate metrics requested for the training run
metrics = [
    "precision_micro", "precision_macro", "precision_weighted",
    "recall_micro", "recall_macro", "recall_weighted",
    "f1_micro", "f1_macro", "f1_weighted",
    "accuracy",
]
# Add precision, recall and F1 (in that order) for each of the three
# individual class labels
for class_id in range(3):
    prefix = "class_" + str(class_id)
    metrics.extend([prefix + "_precision", prefix + "_recall", prefix + "_f1"])

In [None]:
# Show which metrics will be computed. Written as a single-argument call so
# the statement is valid, and prints the same text, on Python 2 and Python 3.
print("COMPUTE METRICS: %s" % (metrics,))

### Start training

In [None]:
# Build the model description from the parameters defined above
model_desc = ru.model_rnn_in_out_dropout_const_lr_weighted_loss(
    FEATURESIZE,
    ALPHASIZE,
    model_name,
    SEQLEN,
    BATCHSIZE,
    INTERNALSIZE,
    NLAYERS,
    learning_rate,
    dropout_pkeep,
    class_weights,
)

In [None]:
# Launch the training run with the model, data and settings defined above
ru.train_rnn(
    model_desc,
    csv_fn,
    FEATURES,
    LABEL,
    VALIDATION_TRJS,
    log,
    checkpoint_dir,
    nb_epochs,
    vis_progress_nbatches,
    save_checkpoint_nbatches,
    checkpoint_fn=None,
    num_save_checkpoint=0,
    metrics=metrics,
    monitoring_metric="class_0_precision",
    track_vali_predictions=False,
    track_last_hidden_state=False,
)