In [36]:
import re
import pandas as pd
import itertools
import cobra.flux_analysis
import time
from functools import partial
from src.mp_functions import combinations_subset, parallelize_dataframe, knockout_FBA


"""A mess of a document with different code cells.
Good to to use for any testing that involves the Recon3D model as it takes some time to load in."""

start_time = time.time()
model_file_path = 'C:/Users/Sigve/Genome_Data/Recon3D/JSON/Recon3D.json'
model = cobra.io.load_json_model(model_file_path)

SNP_results = pd.read_table('C:/Users/Sigve/Genome_Data/results/SNPs_effect.tsv', index_col=0)

end_time = time.time()
print('Model load time: %.6f seconds' % (end_time - start_time))

Model load time: 8.129111 seconds


In [54]:
# Prepare SNP combinations, reaction constraints
combinations = pd.read_table('C:/Users/Sigve/Genome_Data/results/SNP_combinations.tsv', index_col=0)
combinations['combinations'] = combinations['combinations'].apply(lambda x: x.split(';'))

combinations['gene_model_ids'] = combinations['combinations']\
    .apply(lambda x: SNP_results.loc[SNP_results['variant_name'].isin(x), ['model_gene_number']].iloc[:, 0].tolist())

id_list = ';' + ';'.join(model.genes.list_attr('id'))

combinations['gene_model_ids'] = combinations['gene_model_ids'].apply(lambda x:
                                list(set(itertools.chain.from_iterable(
                                [re.findall(r"(?:;)(" + str(i) + r"_AT\d+)", id_list) for i in x]))))

combinations.to_csv(path_or_buf='C:/Users/Sigve/Genome_Data/results/test.tsv', sep='\t')

In [55]:
start_time = time.time()


combinations = parallelize_dataframe(combinations, partial(combinations_subset, partial(knockout_FBA, model)), 8)
combinations.to_csv(path_or_buf='C:/Users/Sigve/Genome_Data/results/test.tsv', sep='\t')


end_time = time.time()
print('FBA run time: %.6f seconds' % (end_time - start_time))

FBA run time: 139.829631 seconds


In [56]:
# Check for non nominal results
for i, result in combinations.iterrows():
    res = result['results'].objective_value
    if int(res) < 700:
        print(result)



combinations      [rs533568327, rs190156724, rs118121926, rs6714...
gene_model_ids    [6565_AT1, 50484_AT1, 1757_AT1, 124583_AT1, 49...
results                         <Solution 699.461 at 0x219be7b56a0>
Name: 13, dtype: object
combinations      [rs536105762, rs550006246, rs117832164, rs1459...
gene_model_ids    [11000_AT1, 7086_AT1, 178_AT4, 5138_AT1, 11343...
results                         <Solution 699.461 at 0x219c9c2d820>
Name: 62, dtype: object
combinations      [rs73920193, rs72922900, rs114264176, rs190762...
gene_model_ids    [8140_AT1, 23396_AT1, 10396_AT1, 7389_AT1, 650...
results                         <Solution 674.472 at 0x219c1ccdaf0>
Name: 102, dtype: object
combinations      [rs138651156, rs146704698, rs536105762, rs5388...
gene_model_ids    [2731_AT1, 10864_AT1, 549_AT1, 6511_AT1, 9056_...
results                         <Solution 674.472 at 0x219c9010730>
Name: 109, dtype: object
combinations      [rs114264176, rs558047438, rs147862130, rs7579...
gene_model_ids    

In [23]:
def get_model_gene_info():
    """Function to extract all unique ids, not including "_ATXX" to a file"""
    ids = model.genes.list_attr('id')
    gene_ids = []

    for i in ids[1:]:
        s = i.split('_')
        gene_ids.append(int(s[0]))


    results = pd.Series(gene_ids, name='gene_number').drop_duplicates().sort_values().reset_index(drop=True)
    results.to_csv(path_or_buf='C:/Users/Sigve/Genome_Data/exon_model_data/gene_ids_in_model.tsv', sep='\t')


get_model_gene_info()

In [34]:
def get_recon_store_names_by_model_id():
    """function to get gene names by using ids from the model.
    Needed because the names in the model and from the store/ensembl are not necessarily the same.
    Remember the ensembl_gene ids are not necessarily the same due to alternative sequence genes"""

    gene_ids = pd.read_table('C:/Users/Sigve/Genome_Data/exon_model_data/gene_ids_in_model.tsv', index_col=0)
    recon_store = pd.read_table('C:/Users/Sigve/Genome_Data/exon_model_data/recon-store-genes.tsv')
    recon_store['gene_number'] = recon_store['gene_number'].astype(int)

    new_df = pd.merge(gene_ids, recon_store, on='gene_number')
    new_df = new_df[['gene_number', 'symbol', 'chromosome', 'ensembl_gene']]
    new_df.drop_duplicates(subset=['gene_number'], inplace=True)
    new_df.rename(columns={'symbol': 'gene_name'}, inplace=True)
    new_df.reset_index(drop=True, inplace=True)
    new_df.to_csv(path_or_buf='C:/Users/Sigve/Genome_Data/exon_model_data/model_gene_ids.tsv', sep='\t')
    print(new_df.head())

get_recon_store_names_by_model_id()

   gene_number gene_name chromosome     ensembl_gene
0           13     AADAC          3  ENSG00000114771
1           15     AANAT         17  ENSG00000129673
2           18      ABAT         16  ENSG00000183044
3           19     ABCA1          9  ENSG00000165029
4           21     ABCA3         16  ENSG00000167972


In [3]:
# Test stuff

#print(model.reactions.HMR_9505.gene_name_reaction_rule)
#model.reactions.MAOX.genes


gene = model.genes.get_by_id('4519_AT1')
print(gene.__getattribute__('name'))
gene

CYTB


0,1
Gene identifier,4519_AT1
Name,CYTB
Memory address,0x0205601db310
Functional,True
In 2 reaction(s),"CYOR_u10mi, CYOOm2i"


In [20]:
new_SNPs = pd.read_table('C:/Users/Sigve/Genome_Data/SNP_data/SNPs_all_chrom_orig.tsv')
columns = ['rsids', 'Variant source', 'chrom', 'pos', 'ref', 'alt']
new_SNPs = new_SNPs.reindex(columns=columns)
new_SNPs.rename(columns={'rsids': 'Variant name', 'chrom': 'Chromosome/scaffold name', 'pos': 'Chromosome/scaffold position start (bp)'}, inplace=True)
new_SNPs['Chromosome/scaffold position end (bp)'] = new_SNPs['Chromosome/scaffold position start (bp)']
new_SNPs['Variant alleles'] = new_SNPs['ref'] + '/' + new_SNPs['alt']
new_SNPs.drop(columns=['ref', 'alt'], inplace=True)
new_SNPs['Strand'] = 1
new_SNPs['Chromosome/scaffold position start (bp)'] = new_SNPs['Chromosome/scaffold position start (bp)'].apply(int)
new_SNPs['Chromosome/scaffold position end (bp)'] = new_SNPs['Chromosome/scaffold position end (bp)'].apply(int)
new_SNPs.set_index(['Variant name'], drop=True, inplace=True)
print(new_SNPs.head())

new_SNPs.to_csv(path_or_buf='C:/Users/Sigve/Genome_Data/SNP_data/SNPs_all_chrom.tsv', sep='\t')

              Variant source  Chromosome/scaffold name  \
Variant name                                             
rs1000036                NaN                        14   
rs1000036                NaN                        14   
rs10000745               NaN                         4   
rs1000108                NaN                         8   
rs10001104               NaN                         4   

              Chromosome/scaffold position start (bp)  \
Variant name                                            
rs1000036                                    22772136   
rs1000036                                    22772136   
rs10000745                                  126429447   
rs1000108                                    27429649   
rs10001104                                    1527546   

              Chromosome/scaffold position end (bp) Variant alleles  Strand  
Variant name                                                                 
rs1000036                            