### Maximization of MSA sequence probability using L-BFGS (and CG) 
Port of GREMLIN_CPP (https://github.com/sokrypton/GREMLIN_CPP) using Python and numba.

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

In [2]:
# Local .py file imports
from gremlin_inference_numba import (
    get_sequences_from_file, seqs2int, filt_gaps, lbfgs, cg, get_seqs_from_var_name,
    eval_v, eval_vw, oh_1bd_predict, oh_2bd_predict, aa_count_predict
)

Loading the MSA sequences, converting the alphabetical amino acid sequences to sequences of integer values, and trimming the MSA.

In [3]:
# Read the aligned sequences from the A2M file. Only the first returned item
# is used here; any remaining return values are collected into `_`.
alignment = '../../datasets/AVGFP/uref100_avgfp_jhmmer_119.a2m'
msa, *_ = get_sequences_from_file(alignment)
print(msa)

# Integer-encode the alignment and show the encoded array with its shape.
msa_int = seqs2int(msa)
print(msa_int, msa_int.shape, sep='\n')

# filt_gaps returns the trimmed alignment together with `gaps`; a later cell
# passes `gaps` to np.delete(..., axis=1) to trim the variant sequences too.
msa_trimmed, gaps = filt_gaps(msa_int)
print(msa_trimmed, msa_trimmed.shape, sep='\n')

['mskgeELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKqhDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMadkqKNGIKVNFKIRHNIEDGSVQLADHyqQNTPIGDGPVLLPDNHYLSTQSALSKDpNEKRDHMVLLEFVTAAGithgmdelyk'
 'metgrALFSKPMTCQTEIDGEINGNKFKVVGNGDS-PGGGDFSIHAYCTTGELPMSWVVLGSPLQYGFHMFSGYPDDII..HYFQECFPEGYILTRSLRFEYDGTLTTTHHYSLEGNCVKAKVTLKGEGFDPNGPTMTKEEEQHPSQVQIFPH....GSGIRLLSNVVFKKKDGTTQLALQdcSVKPLGSRDVPLPNVHFLRTQIIQKKDdSDKRDHVVQREIAIAEH..........'
 '..rgrALFSNSMTSKTEIDGEINGKKFKVVGEGDS-PGGGDFTIRAYCTTGELPMSWVVMGSPLQYGFHMLSHYPDDIV..HYFQECFPEGYTLTRKLRFEGDGTLTTHHRYELAGTCVKAKVSLTGESFDPSGPTMTKTVEQLPNQVQVFPH....ADGIRLLSDVVFVKNDGTTQIAHQdcSVKPLATRKITLPRFHFLHTQISQWKDrSDKRDHVVQREVSKAE-..........'
 ...
 'mersaSFFTGTAKSKVIAEIMVDDTEYKVSGEGFACPLEGHQTLELHCSGKAMSINWSILGTIIQSNFKLFTQYTGSCV.yDFFKTSFPGGLKVETTASFSDGAVIRGNSSLTYVKDTVICRCNIQCEGFCEESPARARDLGQTLPCYEVIEG..ykADEVTCTMDLEWNDSDKQKYLCRLesSFVSGGTGNF-APPRHFIGHHFKITDK.SPNNLHFAQRCKSRANRi.........'
 'mersiSLFTGTAKSKVIATVKIDDM

Optimization of MSA probability using L-BFGS and CG.

In [4]:
# Iteration budget shared by all four optimizer runs in this cell.
n_iterations = 100

# Limited-memory Broyden–Fletcher–Goldfarb–Shanno (L-BFGS) algorithm.
# eval_v / mode='v' yields the parameters later used for the "1-body term (V)"
# predictions, eval_vw / mode='vw' those for the "2-body term (VW)" predictions.
x_opt_1bd = lbfgs(msa_trimmed, eval_v, mode='v', max_iter=n_iterations)
x_opt_2bd = lbfgs(msa_trimmed, eval_vw, mode='vw', max_iter=n_iterations)

# Conjugate Gradient (CG) method, run on the same alignment with the same
# objectives and iteration budget so both optimizers can be compared.
x_opt_1bd_cg = cg(msa_trimmed, eval_v, mode='v', max_iter=n_iterations)
x_opt_2bd_cg = cg(msa_trimmed, eval_vw, mode='vw', max_iter=n_iterations)

lbfgs::iter S_S fx:  107388.27079431039 gnorm: 1225.1515293096897
10 / 100  :  60609.63108277179
20 / 100  :  59947.64281511532
30 / 100  :  59869.53178623354
40 / 100  :  59859.941731911356
50 / 100  :  59857.8797206444
60 / 100  :  59857.31594100581
70 / 100  :  59857.1502888995
80 / 100  :  59857.0950468184
90 / 100  :  59857.08003259427
100 / 100  :  59857.07454350492
lbfgs::iter S_S fx:  107388.27079430803 gnorm: 15388.644842125897
10 / 100  :  31533.55176383793
20 / 100  :  31060.845347498376
30 / 100  :  28888.57338624914
40 / 100  :  28396.431248611538
50 / 100  :  28342.80078480024
60 / 100  :  28254.500872720135
70 / 100  :  28227.88848252197
80 / 100  :  28188.926903353127
90 / 100  :  28171.676015847323
100 / 100  :  28158.183078581496
# cg::iter S_S fx:  107388.27079431039  gnorm:  1225.15152930969
10 / 100  :  95195.27951793215
20 / 100  :  83096.38618130801
30 / 100  :  71395.61723248461
40 / 100  :  67400.86027747851
50 / 100  :  66669.79690088452
60 / 100  :  65910.633

Loading variant sequences collected from the literature, converting them to integer sequences, and trimming them with the same gap columns as the MSA, so that their fitness can be predicted and compared to the measured fitness later.

In [5]:
# Wild-type reference sequence that the variant names are applied to.
wt_sequence = (
    'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL'
    'VTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV'
    'NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD'
    'HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'
)

# Semicolon-separated table: column 0 holds the variant names, column 1 the
# measured fitness values.
variant_fitness_data = pd.read_csv('../../datasets/AVGFP/avgfp.csv', sep=';')

# "just" using the first 2000 variants for faster processing
variants = variant_fitness_data.iloc[:2000, 0].values
fitness_values = variant_fitness_data.iloc[:2000, 1].values

# Each variant name is split on '/' into a list of its parts.
variants_split = [variant.split('/') for variant in variants]

# Build the sequences from the wild type and the split variant names;
# `variants` and `fitness_values` are rebound to the values returned alongside.
variants, fitness_values, sequences = get_seqs_from_var_name(
    wt_sequence, variants_split, fitness_values
)

# Integer-encode the sequences and remove the columns listed in `gaps`
# (obtained from filt_gaps on the MSA).
sequences_int = seqs2int(sequences)
sequences_int_trimmed = np.delete(sequences_int, gaps, axis=1)

L-BFGS optimization: Rank correlation of measured and predicted sequence fitness.

In [6]:
# Predictions for every trimmed variant sequence using the L-BFGS parameters,
# plus the amino acid count prediction computed from the trimmed MSA.
y_pred_1bd = oh_1bd_predict(sequences_int_trimmed, x_opt_1bd)
y_pred_2bd = oh_2bd_predict(sequences_int_trimmed, x_opt_2bd)
y_pred_aac = aa_count_predict(msa_trimmed, sequences_int_trimmed)

# 1D-sequence encodings for further machine learning tasks (disabled; the
# encoder functions are not imported in this notebook's import cell):
#x_pred_1bd = oh_1bd_1d_encode(sequences_int_trimmed, x_opt_1bd)
#x_pred_2bd = oh_2bd_1d_encode(sequences_int_trimmed, x_opt_2bd)

# Spearman rank correlation between measured fitness and each prediction.
corr_aac = spearmanr(fitness_values, y_pred_aac)
corr_1bd = spearmanr(fitness_values, y_pred_1bd)
corr_2bd = spearmanr(fitness_values, y_pred_2bd)

print(f"{'AA counts:':<18}", corr_aac)
print(f"{'1-body term (V):':<18}", corr_1bd)
print(f"{'2-body term (VW):':<18}", corr_2bd)

AA counts:         SignificanceResult(statistic=0.5029247668771557, pvalue=1.0850574512849578e-128)
1-body term (V):   SignificanceResult(statistic=0.6585532465117758, pvalue=5.395952272751471e-249)
2-body term (VW):  SignificanceResult(statistic=0.7183498752449397, pvalue=3.4022843e-317)


CG optimization: Rank correlation of measured and predicted sequence fitness.

In [7]:
# Predictions for every trimmed variant sequence using the CG parameters.
# The results carry a `_cg` suffix (matching x_opt_1bd_cg / x_opt_2bd_cg) so
# that this cell does not overwrite the L-BFGS predictions y_pred_1bd and
# y_pred_2bd; both sets stay available for side-by-side comparison.
y_pred_1bd_cg = oh_1bd_predict(sequences_int_trimmed, x_opt_1bd_cg)
y_pred_2bd_cg = oh_2bd_predict(sequences_int_trimmed, x_opt_2bd_cg)

# The amino acid count prediction takes only the MSA and the sequences, not
# the optimizer output; it is recomputed here so the cell runs on its own.
y_pred_aac = aa_count_predict(msa_trimmed, sequences_int_trimmed)

# 1D-sequence encodings for further machine learning tasks (disabled; the
# encoder functions are not imported in this notebook's import cell):
#x_pred_1bd_cg = oh_1bd_1d_encode(sequences_int_trimmed, x_opt_1bd_cg)
#x_pred_2bd_cg = oh_2bd_1d_encode(sequences_int_trimmed, x_opt_2bd_cg)

# Spearman rank correlation between measured fitness and each prediction.
print(f"{'AA counts:':<18}", spearmanr(fitness_values, y_pred_aac))
print(f"{'1-body term (V):':<18}", spearmanr(fitness_values, y_pred_1bd_cg))
print(f"{'2-body term (VW):':<18}", spearmanr(fitness_values, y_pred_2bd_cg))

AA counts:         SignificanceResult(statistic=0.5029247668771557, pvalue=1.0850574512849578e-128)
1-body term (V):   SignificanceResult(statistic=0.5459010681666135, pvalue=9.995612748011931e-156)
2-body term (VW):  SignificanceResult(statistic=0.6943000690360119, pvalue=9.57615754836693e-288)


Notebook end.