In [55]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
from itertools import combinations

In [72]:
def entropy(p):
    """Return the Shannon entropy, in bits, of the weights in ``p``.

    A small constant (1e-10) is added to every entry before the weights are
    rescaled to sum to one, so zero entries never reach ``log2`` as exact zeros.
    The input does not need to be normalised.
    """
    smoothed = p + 1e-10
    distribution = smoothed / np.sum(smoothed)
    log_terms = distribution * np.log2(distribution)
    return -np.sum(log_terms)

def comb(n, k):
    """Return every k-element combination of the labels 1..n as rows of an array.

    Rows follow the order produced by ``itertools.combinations``; each row lists
    its labels in increasing order.
    """
    labels = range(1, n + 1)
    subsets = [subset for subset in combinations(labels, k)]
    return np.array(subsets)

def get_k_to_num_sequences(df, wt_sequences, all_possibilities=False):
    """Count, for every k, the sequences obtained by varying exactly k positions.

    Each position contributes a number of "choices" derived from its PWM row
    after the wild-type residue(s) at that position have been removed. The count
    for one set of varied positions is the product of the choices of its
    members, and the entry for k is the sum of those products over all
    k-element sets of positions.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per position, one column per amino acid. Rows are looked up by
        label via ``df.loc[i]`` for ``i`` in ``1..len(df)``, so the index must
        contain exactly those labels.
    wt_sequences : sequence of str
        Wild-type sequences. Character ``i - 1`` of each one names the column
        that is removed from row ``i``; every sequence therefore needs at least
        ``len(df)`` characters drawn from ``df.columns``.
    all_possibilities : bool, default False
        If True, a position contributes the number of remaining columns.
        If False, it contributes ``2 ** entropy(row)`` of the remaining
        values, where ``entropy`` re-normalises the row.

    Returns
    -------
    dict
        Maps each k in ``0..len(df)`` to its count. The entry for k = 0 is 1
        (the empty product).

    Raises
    ------
    KeyError
        If a wild-type residue is not a column of ``df`` or a position label is
        missing from the index.
    IndexError
        If a wild-type sequence is shorter than ``len(df)``.
    """
    n = len(df)
    aa_to_idx = {aa: col for col, aa in enumerate(df.columns.values)}

    # The contribution of a position does not depend on which other positions
    # are varied, so it is computed once per position here instead of once per
    # combination inside the enumeration below.
    choices_at = {}
    for pos in range(1, n + 1):
        probas = df.loc[pos, :].values

        # need to exclude the WT amino-acid(s); duplicates are harmless for np.delete
        wt_cols = [aa_to_idx[wt_seq[pos - 1]] for wt_seq in wt_sequences]
        probas = np.delete(probas, wt_cols)

        if all_possibilities:
            choices_at[pos] = len(probas)
        else:
            choices_at[pos] = 2**entropy(probas)

    k_to_num_sequences = {}
    for k in range(0, n + 1):
        num_sequences = 0
        # combinations are of positions to vary, not to keep same as WT;
        # iterating lazily avoids materialising them all in an array
        for positions in combinations(range(1, n + 1), k):
            curr_num_seq = 1
            for pos in positions:
                curr_num_seq *= choices_at[pos]
            num_sequences += curr_num_seq

        k_to_num_sequences[k] = num_sequences

    return k_to_num_sequences

In [71]:
# One entry per experiment: (PWM csv, wild-type peptides). The comment after each
# entry records the JSON file the result used to be written to; saving is
# currently disabled and the dictionaries are only printed.
# NOTE(review): SLLMWITQC (NY-ESO-1) is usually reported as HLA-A*02:01-restricted
# and EVDPIGHLY (MAGE-A3) as HLA-A*01:01-restricted, yet they are paired with the
# A0101 and A0201 PWMs respectively — confirm the pairing is intended.
pwm_and_wt = [
    ('../auxiliary_data/class_I_A0101_9_pwm.csv', ['SLLMWITQC', 'SLLMWITQV']),
    # -> '../nyeso/post_experiment_analysis/mhc_constrainted_class_I_A0101_9_nyeso_k_to_num_sequences.json'
    ('../auxiliary_data/class_I_A0201_9_pwm.csv', ['EVDPIGHLY', 'ESDPIVAQY']),
    # -> '../magea3_and_titin/post_experiment_analysis/mhc_constrainted_class_I_A0201_9_mage_k_to_num_sequences.json'
    ('../auxiliary_data/class_I_B3501_11_pwm.csv', ['HPVGEADYFEY', 'HPVGQADYFEY']),
    # -> '../ebv/post_experiment_analysis/mhc_constrainted_class_I_B3501_11_ebv_k_to_num_sequences.json'
]

for pwm_csv, wt_sequences in pwm_and_wt:
    df = pd.read_csv(pwm_csv, index_col=0)
    k_to_num_sequences = get_k_to_num_sequences(df, wt_sequences)
    print(k_to_num_sequences)



{0: 1, 1: 106, 2: 4852, 3: 125166, 4: 1990791, 5: 20054568, 6: 126400196, 7: 473412800, 8: 939084800, 9: 731136000}
{0: 1, 1: 101, 2: 4447, 3: 111659, 4: 1754356, 5: 17784524, 6: 115434768, 7: 457932096, 8: 995113728, 9: 894136320}
{0: 1, 1: 123, 2: 6807, 3: 223447, 4: 4826709, 5: 71905017, 6: 752089325, 7: 5507349213, 8: 27572663430, 9: 89504718680, 10: 168733607328, 11: 139341121920}


In [76]:
# df = pd.read_csv('../auxiliary_data/class_I_A0101_9_pwm.csv', index_col=0)
# wt_sequences = ['SLLMWITQC', 'SLLMWITQV']
# k_to_num_sequences = get_k_to_num_sequences(df, wt_sequences)
# # with open('../nyeso/post_experiment_analysis/mhc_constrainted_class_I_A0101_9_nyeso_k_to_num_sequences.json', 'w') as f:
# #     json.dump(k_to_num_sequences, f, indent=4)
# print(k_to_num_sequences)

# df = pd.read_csv('../auxiliary_data/class_I_A0201_9_pwm.csv', index_col=0)
# wt_sequences = ['EVDPIGHLY', 'ESDPIVAQY']
# k_to_num_sequences = get_k_to_num_sequences(df, wt_sequences)
# # with open('../magea3_and_titin/post_experiment_analysis/mhc_constrainted_class_I_A0201_9_mage_k_to_num_sequences.json', 'w') as f:
# #     json.dump(k_to_num_sequences, f, indent=4)
# print(k_to_num_sequences)

# df = pd.read_csv('../auxiliary_data/class_I_B3501_11_pwm.csv', index_col=0)
# wt_sequences = ['HPVGEADYFEY', 'HPVGQADYFEY']
# k_to_num_sequences = get_k_to_num_sequences(df, wt_sequences)
# # with open('../ebv/post_experiment_analysis/mhc_constrainted_class_I_B3501_11_ebv_k_to_num_sequences.json', 'w') as f:
# #     json.dump(k_to_num_sequences, f, indent=4)
# print(k_to_num_sequences)

# computed by armita
# Each list holds the counts for k = 1, 2, ...; the i-th value (1-based) is
# stored in the JSON file under the string key str(i).
# NOTE(review): the last A0201 value, 2147483648, is exactly 2**31, which may be
# an overflow/saturation artifact of the upstream computation — confirm.
mhc_constrained_counts = [
    ('../nyeso/post_experiment_analysis/mhc_constrainted_class_I_A0101_9_nyeso_k_to_num_sequences.json',
     [106, 4832, 123424, 1928192, 18866176, 113770496, 400556032]),
    ('../magea3_and_titin/post_experiment_analysis/mhc_constrainted_class_I_A0201_9_mage_k_to_num_sequences.json',
     [112, 5456, 151168, 2613248, 29065216, 206569472, 897581056, 2147483648]),
    ('../ebv/post_experiment_analysis/mhc_constrainted_class_I_B3501_11_ebv_k_to_num_sequences.json',
     [123, 6698, 212272, 4327296, 59171840, 548667392, 3404464128, 13581156352, 32145145856]),
]

for json_path, counts in mhc_constrained_counts:
    k_to_count = {str(k): count for k, count in enumerate(counts, start=1)}
    with open(json_path, 'w') as f:
        json.dump(k_to_count, f, indent=4)


In [75]:
# df = pd.read_csv('../auxiliary_data/class_I_A0101_9_pwm.csv', index_col=0)
# wt_sequences = ['SLLMWITQC', 'SLLMWITQV']
# k_to_num_sequences = get_k_to_num_sequences(df, wt_sequences, all_possibilities=True)
# # with open('../nyeso/post_experiment_analysis/all_possibilities_class_I_A0101_9_nyeso_k_to_num_sequences.json', 'w') as f:
# #     json.dump(k_to_num_sequences, f, indent=4)
# print(k_to_num_sequences)

# df = pd.read_csv('../auxiliary_data/class_I_A0201_9_pwm.csv', index_col=0)
# wt_sequences = ['EVDPIGHLY', 'ESDPIVAQY']
# k_to_num_sequences = get_k_to_num_sequences(df, wt_sequences, all_possibilities=True)
# # with open('../magea3_and_titin/post_experiment_analysis/all_possibilities_class_I_A0201_9_mage_k_to_num_sequences.json', 'w') as f:
# #     json.dump(k_to_num_sequences, f, indent=4)
# print(k_to_num_sequences)

# df = pd.read_csv('../auxiliary_data/class_I_B3501_11_pwm.csv', index_col=0)
# wt_sequences = ['HPVGEADYFEY', 'HPVGQADYFEY']
# k_to_num_sequences = get_k_to_num_sequences(df, wt_sequences, all_possibilities=True)
# # with open('../ebv/post_experiment_analysis/all_possibilities_class_I_B3501_11_ebv_k_to_num_sequences.json', 'w') as f:
# #     json.dump(k_to_num_sequences, f, indent=4)
# print(k_to_num_sequences)

# computed by armita
# Each list holds the counts for k = 1..9; the i-th value (1-based) is stored in
# the JSON file under the string key str(i). A0101 and A0201 share one list.
counts_a0101_a0201 = [171, 12996, 576156, 16420446, 311988474, 3951854004, 32179382604, 152852067369, 322687697779]
counts_b3501 = [209, 19855, 1131735, 43005930, 1143957738, 21735197022, 294977673870, 2802287901765, 17747823377845]

all_possibilities_counts = [
    ('../nyeso/post_experiment_analysis/all_possibilities_class_I_A0101_9_nyeso_k_to_num_sequences.json',
     counts_a0101_a0201),
    ('../magea3_and_titin/post_experiment_analysis/all_possibilities_class_I_A0201_9_mage_k_to_num_sequences.json',
     counts_a0101_a0201),
    ('../ebv/post_experiment_analysis/all_possibilities_class_I_B3501_11_ebv_k_to_num_sequences.json',
     counts_b3501),
]

for json_path, counts in all_possibilities_counts:
    k_to_count = {str(k): count for k, count in enumerate(counts, start=1)}
    with open(json_path, 'w') as f:
        json.dump(k_to_count, f, indent=4)


In [63]:
# Scratch cell: evaluates a non-integer power of two (presumably a sanity check
# for the 2**entropy term used in get_k_to_num_sequences — confirm or remove).
2**2.5

5.656854249492381