In [1]:
import io
from IPython.nbformat import current
def execute_notebook(nbfile):
    """Run every code cell of the notebook stored at `nbfile` in this IPython session.

    The notebook is read as JSON through `IPython.nbformat.current`, and only the
    cells of its first worksheet are considered. Non-code cells are skipped.
    """
    with io.open(nbfile) as handle:
        notebook = current.read(handle, 'json')
    shell = get_ipython()
    # Lazy filter: each cell is inspected right before it is executed.
    code_cells = (cell for cell in notebook.worksheets[0].cells
                  if cell.cell_type == 'code')
    for code_cell in code_cells:
        shell.run_cell(code_cell.input)
# Execute the code cells of two sibling notebooks inside this kernel.
# NOTE(review): the rest of this notebook uses names it never defines itself (pd, pickle,
# SeqIO, Seq, generic_rna, compliment, gather_protein_sequences_all) — presumably these
# two notebooks provide them; confirm.
execute_notebook("/cellar/users/ramarty/Projects/hla_ii/bin/imports.ipynb")
execute_notebook("/cellar/users/ramarty/Projects/hla_ii/bin/gathering_affinities.ipynb")


- use IPython.nbformat for read/write/validate public API
- use IPython.nbformat.vX directly to composing notebooks of a particular version

  """)


Populating the interactive namespace from numpy and matplotlib


### Start by getting the random mutation set up and running to look at the ROC curve, then go from there

In [2]:
# Notebook-level table that generate_peptides reads through its `gene` and `sequence` columns.
# NOTE(review): gather_protein_sequences_all is not defined in this notebook — presumably it
# comes from one of the notebooks executed above; confirm.
merged = gather_protein_sequences_all()



In [3]:
def generate_peptides(mutations, status='mut', protein_table=None):
    """Build peptide windows around single-residue substitutions.

    Each mutation is a string of the form '<gene>_<old_aa><position><new_aa>', where
    <position> is 1-based. For every mutation, the sequences listed for <gene> are tried
    in order; the first one that is long enough and carries <old_aa> at <position> is used.
    The peptide is the window of up to 14 residues on either side of the substituted
    position (shorter when the position is near either end of the sequence).

    Parameters
    ----------
    mutations : iterable of str
        Mutation identifiers as described above.
    status : str, optional
        'mut' (default) cuts the window from the substituted sequence; any other value
        cuts it from the unmodified sequence.
    protein_table : pandas.DataFrame, optional
        Table with `gene` and `sequence` columns. Defaults to the notebook-level `merged`.

    Returns
    -------
    (peptides, mutations_used) : tuple of two lists
        Parallel lists; mutations for which no sequence matched are left out of both.
    """
    if protein_table is None:
        # Fall back to the notebook-level table, as the function always did.
        protein_table = merged
    want_mutant = status == 'mut'
    peptides, mutations_used = [], []
    for mutation in mutations:
        fields = mutation.split('_')
        gene = fields[0]
        sequences = list(protein_table[protein_table.gene == gene].sequence)
        if not sequences:
            # Unknown gene: skip before parsing the residue, so malformed identifiers for
            # unknown genes keep being ignored rather than raising.
            continue
        residue = fields[1]
        position = int(residue[1:-1]) - 1
        if position < 0:
            # A residue number of 0 would wrap around to the end of the sequence.
            continue
        old_aa, new_aa = residue[0], residue[-1:]
        for sequence in sequences:
            if len(sequence) > position and sequence[position] == old_aa:
                source = sequence
                if want_mutant:
                    source = sequence[:position] + new_aa + sequence[position + 1:]
                # Clamp the left edge so positions near the N-terminus keep their prefix.
                peptides.append(source[max(position - 14, 0):position + 15])
                mutations_used.append(mutation)
                break
    return peptides, mutations_used

In [4]:
def get_length(x):
    """Return the number of items in `x` (used as a pandas `apply` callback)."""
    size = len(x)
    return size

In [5]:
def generate_viral_peptides(mutations):
    """Build peptide windows from the viral proteome tables.

    Each entry of `mutations` is '<species id>_<protein entry>_<1-based position>'.
    The per-virus TSV files are loaded and tagged with their species id, and proteins
    of length 50 or less are dropped. Every sequence matching the species and entry
    contributes the window of up to 14 residues on either side of the position.

    Returns the parallel lists (peptides, mutations_used).
    """
    virus_sources = [
        ('Hepatitis_A', 12092),
        ('Human_adenovirus', 28285),
        ('Human_herpesvirus_1', 10299),
        ('Human_immunodeficiency_virus', 11676),
        ('Human_papillomavirus', 333761),
        ('Human_parainfluenza', 11221),
        ('Human_rhinovirus', 31708),
        ('Measles_virus', 36408),
        ('Salivirus_A', 688455),
        ('Dengue_Virus', 33741),
    ]
    frames = []
    for virus_name, virus_id in virus_sources:
        table = pd.read_csv(
            '/cellar/users/ramarty/Data/nrnb01/hla/pathogens/{0}_{1}.tsv'.format(virus_name, virus_id),
            sep='\t')
        table['species'] = virus_id
        frames.append(table)
    virus = pd.concat(frames)
    virus['protein_length'] = virus.Sequence.apply(len)
    virus = virus[virus['protein_length'] > 50]

    peptides, mutations_used = [], []
    for mutation in mutations:
        fields = mutation.split('_')
        species = int(fields[0])
        protein = fields[1]
        matches = virus[(virus.species == species) & (virus.Entry == protein)]
        for sequence in list(matches.Sequence):
            position = int(fields[2]) - 1
            # Same window as slicing from the start whenever position <= 13.
            window_start = position - 14 if position > 13 else 0
            peptides.append(sequence[window_start:position + 15])
            mutations_used.append(mutation)
    return peptides, mutations_used

In [6]:
def generate_bacterial_peptides(mutations):
    """Build peptide windows from the bacterial proteome tables.

    Each entry of `mutations` is '<species id>_<protein entry>_<1-based position>'.
    The per-species TSV files are loaded and tagged with their species id, and proteins
    of length 50 or less are dropped. Every sequence matching the species and entry
    contributes the window of up to 14 residues on either side of the position.

    Returns the parallel lists (peptides, mutations_used).
    """
    bacteria_sources = [
        ('Bacillus_anthracis', 1392),
        ('Chlamydia_pneumoniae', 83558),
        ('Clostridium_difficile', 272563),
        ('Escherichia_coli', 562),
        ('Mycoplasma_pneumoniae', 710127),
        ('Salmonella_typhi', 90370),
        ('Staphylococcus_aureus', 93061),
        ('Streptococcus_agalactiae', 1311),
        ('Treponema_pallidum', 243276),
        ('Yersinia_pestis', 553480),
    ]
    frames = []
    for bacteria_name, bacteria_id in bacteria_sources:
        table = pd.read_csv(
            '/cellar/users/ramarty/Data/nrnb01/hla/pathogens/{0}_{1}.tsv'.format(bacteria_name, bacteria_id),
            sep='\t')
        table['species'] = bacteria_id
        frames.append(table)
    bacteria = pd.concat(frames)
    bacteria['protein_length'] = bacteria.Sequence.apply(len)
    bacteria = bacteria[bacteria['protein_length'] > 50]

    peptides, mutations_used = [], []
    for mutation in mutations:
        fields = mutation.split('_')
        species = int(fields[0])
        protein = fields[1]
        matches = bacteria[(bacteria.species == species) & (bacteria.Entry == protein)]
        for sequence in list(matches.Sequence):
            position = int(fields[2]) - 1
            # Same window as slicing from the start whenever position <= 13.
            window_start = position - 14 if position > 13 else 0
            peptides.append(sequence[window_start:position + 15])
            mutations_used.append(mutation)
    return peptides, mutations_used

In [7]:
def generate_indel_peptides(mutations, status='mut'):
    """Build native or mutant peptide windows around in-frame indels.

    Every key in `mutations` is looked up in the pickled indel dictionary. The record's
    transcript is fetched from the CDS FASTA (and, for pure deletions, from the protein
    FASTA). Records whose `variant_classification` is 'In_Frame_Del' are handled as
    deletions; everything else is handled as an in-frame insertion.

    The window runs from 14 residues before the first affected protein position to
    14 residues after it (plus the inserted length for insertions). The left edge is
    clamped at 0 so indels within 14 residues of the N-terminus keep their prefix,
    where a negative slice start would otherwise wrap around.

    Parameters
    ----------
    mutations : iterable
        Keys of the indel dictionary. Each record is read through the keys
        'variant_classification', 'cds_position', 'protein_position', 'amino_acids',
        'transcript' and (for insertions) 'replacement'.
    status : str, optional
        'mut' (default) returns the mutant windows, anything else the native ones.

    Returns
    -------
    (peptides, mutations_used) : tuple of two parallel lists.

    Relies on `pickle`, `pd`, `SeqIO`, `Seq`, `generic_rna` and `compliment` being
    available in the notebook namespace.
    """
    def _first_position(field):
        # Fields look like 'a-b/...' or 'a/...'; either way take the leading number.
        if '-' in field:
            return int(field.split('-')[0])
        return int(field.split('/')[0])

    def _last_position(field):
        # Second number of 'a-b/...'; for 'a/...' the range is a single position.
        if '-' in field:
            return int(field.split('-')[1].split('/')[0])
        return int(field.split('/')[0])

    # NOTE(review): pickle.load can execute arbitrary code; only load a trusted dictionary.
    # Binary mode is what pickle expects and is equivalent to the previous text mode on
    # Python 2 / POSIX.
    with open('/cellar/users/ramarty/Data/hla/mutations/indel_dictionary.p', 'rb') as input_file:
        indel_dict = pickle.load(input_file)

    # Coding sequences, keyed by un-versioned transcript id parsed from the FASTA header.
    transcripts, chromosomes, gene_starts, gene_names, sequences, strands = [], [], [], [], [], []
    for record in SeqIO.parse("/cellar/users/ramarty/Data/hla/mutations/Homo_sapiens.GRCh38.cds.all.fa", "fasta"):
        description = record.description
        transcripts.append(description.split(' ')[0].split('.')[0])
        chromosomes.append(description.split(' ')[2].split(':')[2])
        gene_starts.append(description.split(' ')[2].split(':')[3])
        strands.append(description.split(' ')[2].split(':')[5])
        gene_names.append(description.split(' ')[6].split(':')[1])
        sequences.append(str(record.seq))
    transcripts_df = pd.DataFrame({'Transcript': transcripts,
                                   'Chromosome': chromosomes,
                                   'Gene_start': gene_starts,
                                   'Strand': strands,
                                   'Gene': gene_names,
                                   'Sequence': sequences})

    # Protein sequences, keyed by the same un-versioned transcript id.
    transcripts, chromosomes, gene_starts, gene_names, sequences = [], [], [], [], []
    for record in SeqIO.parse("/cellar/users/ramarty/Data/hla/mutations/Homo_sapiens.GRCh38.pep.all.fa", "fasta"):
        description = record.description
        transcripts.append(description.split(' ')[4].split(':')[1].split('.')[0])
        chromosomes.append(description.split(' ')[2].split(':')[2])
        gene_starts.append(description.split(' ')[2].split(':')[3])
        gene_names.append(description.split(' ')[7].split(':')[1])
        sequences.append(str(record.seq))
    proteins_df = pd.DataFrame({'Transcript': transcripts,
                                'Chromosome': chromosomes,
                                'Gene_start': gene_starts,
                                'Gene': gene_names,
                                'Sequence': sequences})

    peptides, mutations_used = [], []
    for indel in mutations:
        isd = indel_dict[indel]
        is_deletion = isd['variant_classification'] == 'In_Frame_Del'

        if is_deletion:
            cds_pos1 = _first_position(isd['cds_position'])
            cds_pos2 = _last_position(isd['cds_position'])
        else:  # 'In_Frame_Ins'
            cds_pos = _first_position(isd['cds_position'])

        indel_position_protein = _first_position(isd['protein_position']) - 1
        # Clamp: a negative start would be read as an offset from the end of the protein.
        window_start = max(indel_position_protein - 14, 0)

        transcript_rows = transcripts_df[transcripts_df.Transcript == isd['transcript']]
        cdna = list(transcript_rows['Sequence'])[0]
        native_protein = str(Seq(cdna, generic_rna).translate(to_stop=True))

        if is_deletion:
            window_end = indel_position_protein + 14
            if isd['amino_acids'].split('/')[1] == '-':
                # Pure deletion: edit the annotated protein sequence directly.
                pos = indel_position_protein
                protein = list(proteins_df[proteins_df.Transcript == isd['transcript']]['Sequence'])[0]
                # NOTE(review): `pos` is already 0-based, so this drops the residue at
                # index pos - 1 and only ever one residue — confirm this is intended.
                new_protein = protein[:pos - 1] + protein[pos:]
            else:
                # Deletion that also changes residues: edit the CDS and re-translate.
                # NOTE(review): cds positions look 1-based, in which case cdna[:cds_pos1]
                # keeps the first deleted base — confirm.
                new_cdna = cdna[:cds_pos1] + cdna[cds_pos2+1:]
                new_protein = str(Seq(new_cdna, generic_rna).translate(to_stop=True))
        else:
            insertion_length = len(isd['amino_acids'].split('/')[1])
            window_end = indel_position_protein + insertion_length + 14
            if list(transcript_rows.Strand)[0] == '-1':
                # Transcript is on strand '-1': pass the replacement through compliment().
                new_cdna = cdna[:cds_pos] + compliment(isd['replacement']) + cdna[cds_pos:]
            else:
                new_cdna = cdna[:cds_pos] + isd['replacement'] + cdna[cds_pos:]
            new_protein = str(Seq(new_cdna, generic_rna).translate(to_stop=True))

        native = native_protein[window_start:window_end]
        mut = new_protein[window_start:window_end]

        if status == 'mut':
            peptides.append(mut)
        else:
            peptides.append(native)
        mutations_used.append(indel)

    return peptides, mutations_used

In [5]:
def output_fasta(category, peptides, mutations_used, status='mut', out_dir=None):
    """Write peptides to a FASTA file named after `category` and return the record count.

    Parameters
    ----------
    category : str
        Base name of the output file.
    peptides : sequence of str
        Peptide sequences, parallel to `mutations_used`.
    mutations_used : sequence of str
        Record identifiers; each becomes a '>' header line.
    status : str, optional
        'mut' (default) writes '<category>.fa'; any other value writes '<category>.wt.fa'.
    out_dir : str, optional
        Directory to write into. Defaults to the project's fasta_files directory.

    Returns
    -------
    int
        len(mutations_used).
    """
    import os  # local import: the notebook's shared import cell is not visible from here

    if out_dir is None:
        out_dir = '/cellar/users/ramarty/Data/hla_ii/presentation/fasta_files'
    suffix = '.fa' if status == 'mut' else '.wt.fa'
    out_file = os.path.join(out_dir, '{0}{1}'.format(category, suffix))
    with open(out_file, 'w') as outfile:
        for mutation, sequence in zip(mutations_used, peptides):
            outfile.write('>{0}\n{1}\n'.format(mutation, sequence))

    return len(mutations_used)

Oncogenes

In [12]:
# Mutant peptides for the oncogene residue list -> oncogenes.fa
category = 'oncogenes'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

643

TSgenes

In [13]:
# Mutant peptides for the tsgenes residue list -> tsgenes.fa
category = 'tsgenes'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

602

CGC

In [7]:
# Mutant peptides for the cgc_new residue list -> cgc_new.fa
category = 'cgc_new'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

1109

Random

In [11]:
# Mutant peptides for the random_long residue list -> random_long.fa
category = 'random_long'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

20000

In [7]:
# Mutant peptides for the residue lists random13, random14 and random19.
for i in [13, 14, 19]:
    print(i)  # single-argument call form prints the same text on Python 2 and 3
    category = 'random'+str(i)
    status = 'mut'
    residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
    with open(residue_path) as residue_file:  # close the handle instead of leaking it
        mutations = [line.strip() for line in residue_file]
    peptides, mutations_used = generate_peptides(mutations, status)
    output_fasta(category, peptides, mutations_used, status)

13
14
19


Germline

In [8]:
# Mutant peptides for the germline residue list -> germline.fa
# `status` is passed explicitly: generate_peptides and output_fasta both take it.
category = 'germline'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

1000

Indel

In [25]:
# Mutant peptides for the indel list -> indels.fa
# `status` is passed explicitly: generate_indel_peptides and output_fasta both take it.
category = 'indels'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_indel_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

18

Common

In [9]:
# The one-argument output_fasta('common') call no longer matches output_fasta's signature,
# so the peptides are built explicitly, the same way the passenger cell does it.
# NOTE(review): assumes the 'common' residue list uses the '<gene>_<substitution>' format
# that generate_peptides parses — confirm.
category = 'common'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

1000

Passenger

In [10]:
# Mutant peptides for the passenger residue list -> passenger.fa
# (replaces the older one-argument form output_fasta('passenger')).
# `status` is passed explicitly: generate_peptides and output_fasta both take it.
category = 'passenger'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

1000

Non_cancer

In [6]:
# Mutant peptides for the non_cancer residue list -> non_cancer.fa
category = 'non_cancer'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

1000

In [7]:
# Mutant peptides for the known_non_cancer residue list -> known_non_cancer.fa
category = 'known_non_cancer'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

1000

Bacterial

In [8]:
# Peptides for the bacterial position list -> bacterial.fa
# generate_bacterial_peptides takes no status; output_fasta needs one, and 'mut' selects
# the plain '<category>.fa' file name.
category = 'bacterial'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_bacterial_peptides(mutations)
output_fasta(category, peptides, mutations_used, status)

1000

Viral

In [9]:
# Peptides for the viral position list -> viral.fa
# generate_viral_peptides takes no status; output_fasta needs one, and 'mut' selects
# the plain '<category>.fa' file name.
category = 'viral'
status = 'mut'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_viral_peptides(mutations)
output_fasta(category, peptides, mutations_used, status)

1000

### Wild type

In [14]:
# Wild-type peptides for the oncogene residue list -> oncogenes.wt.fa
category = 'oncogenes'
status = 'wt'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

643

In [15]:
# Wild-type peptides for the tsgenes residue list -> tsgenes.wt.fa
category = 'tsgenes'
status = 'wt'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

602

In [18]:
# Native (wild-type) peptides for the indel list -> indels.wt.fa
category = 'indels'
status = 'wt'
residue_path = '/cellar/users/ramarty/Data/hla_ii/presentation/residues/{0}.txt'.format(category)
with open(residue_path) as residue_file:  # close the handle instead of leaking it
    mutations = [line.strip() for line in residue_file]
peptides, mutations_used = generate_indel_peptides(mutations, status)
output_fasta(category, peptides, mutations_used, status)

18