In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot  as plt
import hashlib
from collections import Counter
import base64
pd.options.display.max_columns = None

In [2]:
def fill_missing_cols(df):
    
    if 'node_id' not in df.columns:
        raise ValueError('Must have at least a "node_id" column.')
        
    all_cols = set([ 'node_label', 'node_synonyms', 'node_dbxrefs',
            'node_definition','node_namespace','value','lowerbound','upperbound','unit'])
   
    missing_cols = list(all_cols - set(df.columns))
    nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),columns=missing_cols)
    nan_cols_df.index = df.index
    return pd.concat([df,nan_cols_df],axis=1)

In [2]:
#!jupyter nbconvert --to script IMPC_ftp_JS.ipynb

# Main notebook for Mouse Phenotype (MP) - Mouse gene (MGI)

# The end of this workflow is different from the original IMPC_ftp.ipynb notebook located in /Users/stearb/Dropbox/CHOP/R03/code/genotype_phenotype, we are using Jonathan Silversteins workflow for the Neo4j CSV creation (meaning the files produced by this workflow will be the inputs into JS's workflow) ...so we only need to create 2 files, a nodes.tsv and an edges.tsv (instead of the ~6 files, CUIs, CUI-CUIs, Code-CUIs, Terms, etc.)

## The guide for how to create these new nodes and edges files can be found in the Data Distillerys [github](https://ubkg.docs.xconsortia.org/formats/)

### Input files:
    * IMPC
        - genotype-phenotype-assertions-ALL.csv
        - statistical-results-ALL.csv
    * MGI
        - MGI_PhenoGenoMP.rpt (Table 5)
        - mgi_master_geno2pheno.csv

### Output  files:
        - nodes.tsv
        - edges.tsv

# IPMC README.txt
http://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/latest/results/README.txt

## Download and clean genotype-phenotype data from IMPC
### First look at just the genotype-phenotype-assertions-ALL.csv file

In [3]:
gno_phno = '/Users/stearb/desktop/DESKTOP_TRANSFER/R03_local/impc_ftp/genotype-phenotype-assertions-ALL.csv'

cols_to_drop = ['phenotyping_center','colony_id','sex','zygosity','strain_name','project_name',
                'project_fullname','pipeline_name','pipeline_stable_id','procedure_stable_id',
                'parameter_stable_id','p_value','percentage_change','effect_size','statistical_method',
                   'resource_name','strain_accession_id','allele_name','procedure_name','marker_accession_id',
               'allele_symbol']

df=pd.read_csv(gno_phno).drop(cols_to_drop,axis=1)

print(df.shape)
df.head(3)

(60174, 7)


Unnamed: 0,marker_symbol,allele_accession_id,parameter_name,top_level_mp_term_id,top_level_mp_term_name,mp_term_id,mp_term_name
0,Gimap3,MGI:5883410,Cornea,MP:0005391,vision/eye phenotype,MP:0001312,abnormal cornea morphology
1,Prss8,MGI:5646004,Embryo Size,"MP:0005380,MP:0005378","embryo phenotype,growth/size/body region pheno...",MP:0001697,abnormal embryo size
2,Pstpip2,MGI:5755044,Seminal vesicle,"MP:0005389,MP:0005379","reproductive system phenotype,endocrine/exocri...",MP:0002059,abnormal seminal vesicle morphology


In [4]:
# Show number of unique values in each column
df.nunique().to_frame().T

Unnamed: 0,marker_symbol,allele_accession_id,parameter_name,top_level_mp_term_id,top_level_mp_term_name,mp_term_id,mp_term_name
0,6007,6235,972,59,59,746,746


In [5]:
# How many nulls in each col?
df.isna().sum()

marker_symbol                2
allele_accession_id          2
parameter_name               0
top_level_mp_term_id      1150
top_level_mp_term_name    1150
mp_term_id                 877
mp_term_name               877
dtype: int64

In [6]:
rows_w_na = df.shape[0]
df.dropna(inplace=True)
print(f'Dropped {rows_w_na-df.shape[0]} rows after getting rid of Nan')

Dropped 1150 rows after getting rid of Nan


In [7]:
# Save to check consistency later
#df[['marker_symbol','parameter_name',
#    'allele_accession_id','mp_term_id',
#    'mp_term_name','top_level_mp_term_id']].to_csv(
#    '/Users/stearb/desktop/R03_local/data/ingest_files/genopheno/geno2pheno_mapping.csv')

In [8]:
df_dropdup = df.drop_duplicates(['marker_symbol','mp_term_id','mp_term_name','allele_accession_id','parameter_name'])
df_dropdup.shape

(36671, 7)

In [9]:
# Seperate the 'top_level_mp_term_id'  column by '|'
all_top_level_mp_terms = np.ravel(list(df['top_level_mp_term_id'].apply(lambda x: x.split('|')).values))

# Split results by ','
all_top_level_mp_terms_ = list([i.split(',') for i in all_top_level_mp_terms])

# Flatten it.
flat_list = [item for sublist in all_top_level_mp_terms_ for item in sublist]

print('There are '+str(len(np.unique(flat_list))) + ' unique top level MP terms in this dataset.')

There are 24 unique top level MP terms in this dataset.


In [10]:
print('There are '+str(len(df['mp_term_id'].unique()))+' unique MP terms in this dataset.')

There are 740 unique MP terms in this dataset.


In [11]:
print('There are '+str(df['marker_symbol'].unique().shape[0])+' unique genes in this dataset.')

There are 5966 unique genes in this dataset.


## Now look at another IMPC file called statistical-results-ALL.csv file

In [12]:
##### This is a larger file, slighlty over 1 GB
stats_file = '/Users/stearb/Desktop/DESKTOP_TRANSFER/R03_local/data/use_config/R03_DATA/statistical-results-ALL.csv'
stats = pd.read_csv(stats_file)

In [13]:
#list(stats.columns)  # See all available columns,  there are 87 columns

In [14]:

# Just select the most important columns right now.
stats_cols_to_include = ['marker_symbol','parameter_name','allele_name','marker_accession_id',
                         'mp_term_id','mp_term_name','top_level_mp_term_id','top_level_mp_term_name',
                         'allele_symbol','allele_accession_id']

stats_slct = stats[stats_cols_to_include]

print(stats.shape)
stats_slct.head(3) 

(1905809, 87)


Unnamed: 0,marker_symbol,parameter_name,allele_name,marker_accession_id,mp_term_id,mp_term_name,top_level_mp_term_id,top_level_mp_term_name,allele_symbol,allele_accession_id
0,Gpank1,Skeletal muscle,tm1.1(KOMP)Vlcg,MGI:2148975,,,,,Gpank1<tm1.1(KOMP)Vlcg>,MGI:5574607
1,Prrc2c,Tail Morphology,em1(IMPC)J,MGI:1913754,,,MP:0005371,limbs/digits/tail phenotype,Prrc2c<em1(IMPC)J>,MGI:6143826
2,Adh5,Thyroid,tm1b(EUCOMM)Wtsi,MGI:87929,,,,,Adh5<tm1b(EUCOMM)Wtsi>,MGI:5548916


In [15]:
# How many nans per column?
stats_slct.isna().sum().to_frame().T

Unnamed: 0,marker_symbol,parameter_name,allele_name,marker_accession_id,mp_term_id,mp_term_name,top_level_mp_term_id,top_level_mp_term_name,allele_symbol,allele_accession_id
0,1,0,42155,1,1871375,1871376,770079,770079,1,1


In [16]:
stats_slct.dropna(inplace=True) 
print(stats_slct.shape)
stats_slct.head(3)

(33157, 10)


Unnamed: 0,marker_symbol,parameter_name,allele_name,marker_accession_id,mp_term_id,mp_term_name,top_level_mp_term_id,top_level_mp_term_name,allele_symbol,allele_accession_id
23,Cd3eap,Limb Plate Morphology,tm1b(KOMP)Wtsi,MGI:1917583,MP:0004576,abnormal embryonic autopod plate morphology,MP:0005371,limbs/digits/tail phenotype,Cd3eap<tm1b(KOMP)Wtsi>,MGI:5692700
73,Pxdn,Whole arena average speed,em1(IMPC)J,MGI:1916925,MP:0003313,abnormal locomotor activation,MP:0005386,behavior/neurological phenotype,Pxdn<em1(IMPC)J>,MGI:5829376
110,Mkrn3,Fasted blood glucose concentration,em1(IMPC)Tcp,MGI:2181178,MP:0013277,abnormal fasting circulating glucose level,MP:0005376,homeostasis/metabolism phenotype,Mkrn3<em1(IMPC)Tcp>,MGI:6156558


In [17]:
# There are only 694 unique MP terms in the statistical-results-ALL.csv dataset.
stats_slct['mp_term_id'].unique().shape

(694,)

In [18]:
# Lets see the most common MP terms in this dataset.
Counter(stats_slct['mp_term_id'].values).most_common(5)

[('MP:0003313', 2014),
 ('MP:0011100', 1790),
 ('MP:0001515', 698),
 ('MP:0011110', 663),
 ('MP:0001697', 651)]

In [19]:
# Top 3 most common MP terms
#print(stats_slct[stats_slct['mp_term_id'] == 'MP:0003313']['mp_term_name'].unique()[0])
#print(stats_slct[stats_slct['mp_term_id'] == 'MP:0011100']['mp_term_name'].unique()[0])
#print(stats_slct[stats_slct['mp_term_id'] == 'MP:0001515']['mp_term_name'].unique()[0])

## Now lets compare the first genotype-phenotype IMPC dataset we looked at (genotype-phenotype-assertions-ALL.csv) with genotype-phenotype datasets from MGI from http://www.informatics.jax.org/downloads/reports/index.html#pheno

# Lets look at the MGI Table 5 Dataset 
### All Genotypes and Mammalian Phenotype Annotations (tab-delimited)

In [20]:
mgi_table5_extracted_geno_pheno_data = '/Users/stearb/dropbox/CHOP/DataDistillery/code/ingestion_notebooks/genotype_phenotype/geno2pheno_MGI_table5.csv'

mgi5 =pd.read_csv(mgi_table5_extracted_geno_pheno_data,sep='\t',index_col=0)
print(mgi5.shape)
mgi5.head(3)  

(270193, 2)


Unnamed: 0,Mammalian Phenotype ID,Gene
0,MP:0000600,Rb1
1,MP:0001716,Rb1
2,MP:0001698,Rb1


### Lets see the overlap in MP terms 
#### All of  the MP terms in the genotype-phenotype-assertions-ALL IMPC dataset are in the MGI Table 5 dataset.

In [21]:
print('Total unique MP terms in IMPC dataset: ' + str(len(df['mp_term_id'].unique())))
print('Total unique MP terms in MGI Table 5 dataset: ' + str(len(mgi5['Mammalian Phenotype ID'].unique())))

o = len(set(mgi5['Mammalian Phenotype ID']).intersection(set(df['mp_term_id'])))
print(f'Overlap in MP terms between the 2 datasets: {o}')


Total unique MP terms in IMPC dataset: 740
Total unique MP terms in MGI Table 5 dataset: 10525
Overlap in MP terms between the 2 datasets: 740


#### Side Note: Most (97%; 5825/5966) of the genes in both datasets overlap, so we still might want to include data set in the master phenotype to genotype list to get those 3% of genes. 

In [22]:
print('There are '+str(df['marker_symbol'].unique().shape[0])+' unique genes in the IMPC dataset.')
print('There are '+str(mgi5['Gene'].unique().shape[0])+ ' unique genes in the  MGI Table 5 dataset.')
g = len(set(mgi5['Gene']).intersection(set(df['marker_symbol'])))
print(f'Overlap in genes between the 2 datasets: {g}')


There are 5966 unique genes in the IMPC dataset.
There are 23518 unique genes in the  MGI Table 5 dataset.
Overlap in genes between the 2 datasets: 5825


### So, by including the MGI Table 5 dataset, we increase the number of MP terms (that are mapped to at least one gene) from 740 to 10,525. Lets see if there is more overlap between these MP terms and the MP terms that were cross-linked from HPO by Tiffany. This is where the bottleneck in the graph was when we were using just the 740 MP terms from the IMPC dataset.

In [23]:
hpo2mp = pd.read_csv('/Users/stearb/dropbox/CHOP/R03/code/phenotype_mapping/tiffs_mappings_ravel.csv',index_col=0)
print(hpo2mp.shape)
hpo2mp.head(3)

(1221, 5)


Unnamed: 0,HP_ID,HP_LABEL,HP_SYNONYM,MPO_URI,MPO_LABEL
0,HP:0000011,neurogenic bladder,lack of bladder control due to nervous system ...,MP:0005302,neurogenic bladder
1,HP:0000023,inguinal hernia,,MP:0006077,inguinal hernia
2,HP:0000028,cryptorchidism,undescended testes | undescended testis | cryp...,MP:0002286,cryptorchism


In [24]:
len(set(hpo2mp['MPO_URI'].unique()).intersection(set(mgi5['Mammalian Phenotype ID'].unique())))

436

About 9x better coverage, there were only ~50 MP terms overlapping in Tiffanys dataset (hpo--mp mappings) and the MP terms in the genotype-phenotype-assertions-ALL.csv IMPC dataset. 

By Including the MGI Table 5 dataset we increase our coverage to 436!  Meaning we have 436 (of the ~462 unique Kids First phenotypes) fully connected in our graph. Fully connected means we have HP--MP--Mouse_genes-Human_genes. The MP--Mouse_genes connections are what we are examining in this notebook. 

Lets see exactly which MP terms we have covered now

In [25]:
# Extract rows from Tiffs hpo-mp mapping file that overlap so we can see the names of the 276 MP terms.
overlapping_mp = hpo2mp[hpo2mp['MPO_URI'].isin(mgi5['Mammalian Phenotype ID'].unique())]
overlap = overlapping_mp.drop_duplicates(['MPO_URI'])
overlap.head(3)

Unnamed: 0,HP_ID,HP_LABEL,HP_SYNONYM,MPO_URI,MPO_LABEL
0,HP:0000011,neurogenic bladder,lack of bladder control due to nervous system ...,MP:0005302,neurogenic bladder
1,HP:0000023,inguinal hernia,,MP:0006077,inguinal hernia
2,HP:0000028,cryptorchidism,undescended testes | undescended testis | cryp...,MP:0002286,cryptorchism


# Create a master list of MP--mouseGene relationships from the 2 IMPC datasets and then add in the MGI master list created in MGI.ipynb

In [26]:
# Lets get the MP terms and the associated mouse genes from each of the 2 IMPC datasets. Add the MGI table 5 
# data to the master MGI list

# 1st IMPC dataset ( genotype-phenotype-assertions-ALL.csv)
impc_1 = df[['mp_term_id','marker_symbol']]

# 2nd IMPC dataset (statistical-resultsabs-ALL.csv)
impc_2 = stats_slct[['mp_term_id','marker_symbol']].dropna() # There are only 34,434 rows here where mp_term_id is NOT NAN  

# This is already included in the MGI master list, MGI Table 5 dataset
#mgi_5 = mgi5[['Mammalian Phenotype ID','Gene']]

In [27]:
# Rename the columns in each dataframe to 'MP_term'  and 'Gene'
impc_1.columns = ['MP_term','Gene']
impc_2.columns = ['MP_term','Gene']
#mgi_5.columns = ['MP_term','Gene']

# Rename the columns in each dataframe to 'MP_term'  and 'Gene'
impc_1.columns = impc_2.columns = ['MP_term','Gene']

In [28]:
# Combine just impc_1 and impc_2
# Dont include mgi_5 here, put it in the master_mgi list with mgi table 9 and 10.
master_impc_list = pd.concat([impc_1,impc_2])#,mgi_5])
master_impc_list.shape

(92181, 2)

In [29]:
# Only drop if the row is a duplicate of both MP term and gene 
master_impc_filt = master_impc_list.drop_duplicates() # .drop_duplicates(['MP_term','Gene'])  
master_impc_filt.shape

(30757, 2)

In [30]:
# How many unique mp terms and genes?
master_impc_filt.nunique()

MP_term     740
Gene       5966
dtype: int64

## Combine IMPC master list with MGI master list

In [31]:
mgi_master_list = pd.read_csv('mgi_master_geno2pheno.csv') # this file is from the MGI.ipynb
print(mgi_master_list.shape)
mgi_master_list.columns = ['MP_term','Gene']
mgi_master_list.head(3)

(232890, 2)


Unnamed: 0,MP_term,Gene
0,MP:0000600,Rb1
1,MP:0001716,Rb1
2,MP:0001698,Rb1


In [32]:
# Create the master genotype-phenotype (gene - phenotype term)
MASTER_G2P = pd.concat([master_impc_filt,mgi_master_list])
print(MASTER_G2P.shape)
MASTER_G2P.head(3)

(263647, 2)


Unnamed: 0,MP_term,Gene
0,MP:0001312,Gimap3
1,MP:0001697,Prss8
2,MP:0002059,Pstpip2


In [33]:
MASTER_G2P.dropna(inplace=True)

In [34]:
MASTER_G2P.drop_duplicates(inplace=True)
print(MASTER_G2P.shape)

# Before saving, check for the weird gene entries again (we already did for the MGI master list in MGI.ipynb)
# ie. Tg(Thy1-MAPT*)1Avil, bc we are not including them right now
MASTER_G2P_fixed = MASTER_G2P[~MASTER_G2P["Gene"].str.contains(pat='\/|\)|\(|\*',regex=True)]
MASTER_G2P_fixed.shape

(234054, 2)


(234042, 2)

## Need to add CUIs, CODEs and  CodeIDs
Just recreating the HCOP and MP CUIs and  CodeIDs here?

Add CUIs, CODEs and CodeIDs for mouse genes

In [35]:
def CUIbase64(series):
    '''Pass in a Pandas Series to have each element base-64 encoded using list comprehension.'''
    #check dtype is all str
    assert (series.apply(type) == str).all()
    encoded_series = pd.Series([base64.urlsafe_b64encode(i.encode('UTF-8')).decode('ascii') for  i in series])
    assert len(encoded_series) == len(series), 'Different number of elements between series and encoded_series.'
    #assert len(encoded_series) == encoded_series.nunique(), 'Collision occured!' # sometimes we'll pass a series with duplicates (dbSNP)
    return encoded_series

In [36]:
MASTER_G2P_fixed['CODE_mouse_gene'] = ['HCOP:'+i for i in MASTER_G2P_fixed['Gene']]
MASTER_G2P_fixed['CodeID_mouse_gene'] = ['HCOP '+i for i in MASTER_G2P_fixed['CODE_mouse_gene']]

#CUI_LEN = 14
#MASTER_G2P_fixed['CUI_mouse_gene']  = ['KC' + str(int(hashlib.sha256(uid.encode('utf8')).hexdigest(),
#                                                      base=16))[:CUI_LEN] for uid in MASTER_G2P_fixed['Gene']]


# THIS  SHOULD ACTUALLY BE CREATED FROM THE CODE ID not the 'Gene', like below
# CUI_mouse_genes = CUIbase64(MASTER_G2P_fixed['Gene'])    

CUI_mouse_genes = CUIbase64(MASTER_G2P_fixed['CodeID_mouse_gene'])
MASTER_G2P_fixed['CUI_mouse_gene'] = [i for i in CUI_mouse_genes]

assert MASTER_G2P_fixed['CodeID_mouse_gene'].nunique()  ==  MASTER_G2P_fixed['CUI_mouse_gene'].nunique() 
assert MASTER_G2P_fixed['Gene'].nunique()  ==  MASTER_G2P_fixed['CODE_mouse_gene'].nunique()
assert MASTER_G2P_fixed.isna().sum().sum() == 0

Add CUIs, CODEs and CodeIDs for MP terms

In [37]:
MASTER_G2P_fixed.rename(columns={'MP_term':'CODE_mp_term'},inplace=True)

MASTER_G2P_fixed['CodeID_mp_term'] = ['MP '+i for i in MASTER_G2P_fixed['CODE_mp_term']]

#MASTER_G2P_fixed['CUI_mp_term']  = ['KC' + str(int(hashlib.sha256(uid.encode('utf8')).hexdigest(),
#                                                      base=16))[:CUI_LEN] for uid in MASTER_G2P_fixed['CODE_mp_term']]

#MASTER_G2P_fixed['CUI_mp_term'] = [i for i in CUIbase64(MASTER_G2P_fixed['CodeID_mp_term'])]


#assert MASTER_G2P_fixed['CodeID_mp_term'].nunique()  ==  MASTER_G2P_fixed['CUI_mp_term'].nunique()
assert MASTER_G2P_fixed['CODE_mp_term'].nunique()  ==  MASTER_G2P_fixed['CodeID_mp_term'].nunique()

# Create  edges file 

In [38]:
MASTER_G2P_fixed['predicate'] = 'RO:0002331' # 'involved in'      old rel =  'phenotype_has_associated_gene'
edges = MASTER_G2P_fixed[['CodeID_mouse_gene','predicate','CodeID_mp_term']]

In [39]:
edges

Unnamed: 0,CodeID_mouse_gene,predicate,CodeID_mp_term
0,HCOP HCOP:Gimap3,RO:0002331,MP MP:0001312
1,HCOP HCOP:Prss8,RO:0002331,MP MP:0001697
2,HCOP HCOP:Pstpip2,RO:0002331,MP MP:0002059
3,HCOP HCOP:Ap4e1,RO:0002331,MP MP:0001486
4,HCOP HCOP:S100a14,RO:0002331,MP MP:0012441
...,...,...,...
231765,HCOP HCOP:Hvf,RO:0002331,MP MP:0004600
231766,HCOP HCOP:Hvf,RO:0002331,MP MP:0004601
231767,HCOP HCOP:Hvf,RO:0002331,MP MP:0004621
231768,HCOP HCOP:Hvf,RO:0002331,MP MP:0004682


In [40]:
edges.columns = ['subject','predicate','object']
edges['object'] = edges['object'].str.replace(' MP:',' ')

In [43]:
edges = edges.drop_duplicates().reset_index(drop=True)

Unnamed: 0,subject,predicate,object
0,HCOP HCOP:Gimap3,RO:0002331,MP 0001312
1,HCOP HCOP:Prss8,RO:0002331,MP 0001697
2,HCOP HCOP:Pstpip2,RO:0002331,MP 0002059
3,HCOP HCOP:Ap4e1,RO:0002331,MP 0001486
4,HCOP HCOP:S100a14,RO:0002331,MP 0012441
...,...,...,...
233084,HCOP HCOP:Tsix,RO:0002331,MP 0001672
233085,HCOP HCOP:Tsix,RO:0002331,MP 0008877
233086,HCOP HCOP:Tsix,RO:0002331,MP 0011110
233087,HCOP HCOP:Mbd4,RO:0002331,MP 0011998


In [44]:
# Merge in MGI codes to use instead of HCOP
mgi_map = pd.read_csv('/Users/stearb/Dropbox/CHOP/DataDistillery/code/ingestion_notebooks/orthologs/mgi_name_map.csv')

mgi_map = mgi_map.rename(columns={'CodeID_mouse':'subject'})

assert np.all([i.startswith('HCOP') for i in edges['subject']])

edges[edges['subject'].isin(mgi_map['subject'])]

Unnamed: 0,subject,predicate,object
0,HCOP HCOP:Gimap3,RO:0002331,MP 0001312
1,HCOP HCOP:Prss8,RO:0002331,MP 0001697
2,HCOP HCOP:Pstpip2,RO:0002331,MP 0002059
3,HCOP HCOP:Ap4e1,RO:0002331,MP 0001486
4,HCOP HCOP:S100a14,RO:0002331,MP 0012441
...,...,...,...
233084,HCOP HCOP:Tsix,RO:0002331,MP 0001672
233085,HCOP HCOP:Tsix,RO:0002331,MP 0008877
233086,HCOP HCOP:Tsix,RO:0002331,MP 0011110
233087,HCOP HCOP:Mbd4,RO:0002331,MP 0011998


In [45]:
edges = pd.merge(mgi_map,edges,how='inner')\
                        .drop_duplicates().dropna().reset_index(drop=True)\
                .drop('subject',axis=1).rename(columns={'mgi_id':'subject'})
edges

Unnamed: 0,subject,predicate,object
0,MGI:1917115,RO:0002331,MP 0005292
1,MGI:1917115,RO:0002331,MP 0001544
2,MGI:1917115,RO:0002331,MP 0002127
3,MGI:1917115,RO:0002331,MP 0002083
4,MGI:1917115,RO:0002331,MP 0001636
...,...,...,...
219513,MGI:2444286,RO:0002331,MP 0003062
219514,MGI:2444286,RO:0002331,MP 0003960
219515,MGI:2444286,RO:0002331,MP 0001589
219516,MGI:2444286,RO:0002331,MP 0005640


In [46]:
edges['subject'] = [i.replace(':',' ') for i in edges['subject']]
edges

Unnamed: 0,subject,predicate,object
0,MGI 1917115,RO:0002331,MP 0005292
1,MGI 1917115,RO:0002331,MP 0001544
2,MGI 1917115,RO:0002331,MP 0002127
3,MGI 1917115,RO:0002331,MP 0002083
4,MGI 1917115,RO:0002331,MP 0001636
...,...,...,...
219513,MGI 2444286,RO:0002331,MP 0003062
219514,MGI 2444286,RO:0002331,MP 0003960
219515,MGI 2444286,RO:0002331,MP 0001589
219516,MGI 2444286,RO:0002331,MP 0005640


# Create nodes file

In [47]:
# No need to create a nodes file, MP is already in UMLS and HCOP should be in as well, 
# just need to make sure its ingested first

nodes_mp = pd.DataFrame(MASTER_G2P_fixed['CodeID_mp_term'])
nodes_mp.columns = ['node_id']
nodes_hcop = MASTER_G2P_fixed[['CodeID_mouse_gene','Gene']]
nodes_hcop.columns = ['node_id','node_label']

nodes_mp['node_label'] = np.nan


In [51]:
nodes_hcop = nodes_hcop.drop_duplicates()

In [83]:
mgi_map

Unnamed: 0,subject,mgi_id
0,HCOP HCOP:A1bg,MGI:2152878
1,HCOP HCOP:A1cf,MGI:1917115
2,HCOP HCOP:A2m,MGI:2449119
3,HCOP HCOP:Mug2,MGI:99836
4,HCOP HCOP:Mug1,MGI:99837
...,...,...
69932,HCOP HCOP:Zyg11b,MGI:2685277
69933,HCOP HCOP:Zyg11a,MGI:2446208
69934,HCOP HCOP:Zyx,MGI:103072
69935,HCOP HCOP:Zzef1,MGI:2444286


In [52]:
nodes_hcop

Unnamed: 0,node_id,node_label
0,HCOP HCOP:Gimap3,Gimap3
1,HCOP HCOP:Prss8,Prss8
2,HCOP HCOP:Pstpip2,Pstpip2
3,HCOP HCOP:Ap4e1,Ap4e1
4,HCOP HCOP:S100a14,S100a14
...,...,...
231738,HCOP HCOP:Anta,Anta
231740,HCOP HCOP:Cal8,Cal8
231742,HCOP HCOP:Stmb,Stmb
231749,HCOP HCOP:Lpy,Lpy


In [53]:
nodes_hcop = pd.merge(nodes_hcop,mgi_map.rename(columns={'subject':'node_id'}))\
                .drop_duplicates().reset_index(drop=True).drop('node_id',axis=1)\
                .rename(columns={'mgi_id':'node_id'})
len(nodes_hcop)

12321

In [55]:
nodes_hcop['node_id'] = [i.replace(':',' ') for i in nodes_hcop['node_id']]

In [56]:

nodes = pd.concat([nodes_hcop,nodes_mp])

nodes = fill_missing_cols(nodes)

nodes = nodes.drop_duplicates(subset=['node_id'])

In [57]:
nodes

Unnamed: 0,node_label,node_id,node_namespace,lowerbound,value,node_definition,upperbound,node_dbxrefs,node_synonyms,unit
0,Gimap3,MGI 1932723,,,,,,,,
1,Prss8,MGI 1923810,,,,,,,,
2,Pstpip2,MGI 1335088,,,,,,,,
3,Ap4e1,MGI 1336993,,,,,,,,
4,S100a14,MGI 1913416,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
230797,,MP MP:0009855,,,,,,,,
231009,,MP MP:0002956,,,,,,,,
231360,,MP MP:0009470,,,,,,,,
231641,,MP MP:0012002,,,,,,,,


In [58]:
nodes.columns

nodes = nodes[['node_id','node_label', 'upperbound', 'unit', 'node_definition',
       'value', 'node_namespace', 'node_dbxrefs', 'lowerbound','node_synonyms']]
nodes

Unnamed: 0,node_id,node_label,upperbound,unit,node_definition,value,node_namespace,node_dbxrefs,lowerbound,node_synonyms
0,MGI 1932723,Gimap3,,,,,,,,
1,MGI 1923810,Prss8,,,,,,,,
2,MGI 1335088,Pstpip2,,,,,,,,
3,MGI 1336993,Ap4e1,,,,,,,,
4,MGI 1913416,S100a14,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
230797,MP MP:0009855,,,,,,,,,
231009,MP MP:0002956,,,,,,,,,
231360,MP MP:0009470,,,,,,,,,
231641,MP MP:0012002,,,,,,,,,


In [64]:
assert len(edges) ==  len(edges[edges['subject'].isin(nodes['node_id'])])

In [63]:
nodes['node_id'] = [i.replace(':',' ') for i in nodes['node_id']]
nodes

Unnamed: 0,node_id,node_label,upperbound,unit,node_definition,value,node_namespace,node_dbxrefs,lowerbound,node_synonyms
0,MGI 1932723,Gimap3,,,,,,,,
1,MGI 1923810,Prss8,,,,,,,,
2,MGI 1335088,Pstpip2,,,,,,,,
3,MGI 1336993,Ap4e1,,,,,,,,
4,MGI 1913416,S100a14,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
230797,MP 0009855,,,,,,,,,
231009,MP 0002956,,,,,,,,,
231360,MP 0009470,,,,,,,,,
231641,MP 0012002,,,,,,,,,


In [65]:
edges

Unnamed: 0,subject,predicate,object
0,MGI 1917115,RO:0002331,MP 0005292
1,MGI 1917115,RO:0002331,MP 0001544
2,MGI 1917115,RO:0002331,MP 0002127
3,MGI 1917115,RO:0002331,MP 0002083
4,MGI 1917115,RO:0002331,MP 0001636
...,...,...,...
219513,MGI 2444286,RO:0002331,MP 0003062
219514,MGI 2444286,RO:0002331,MP 0003960
219515,MGI 2444286,RO:0002331,MP 0001589
219516,MGI 2444286,RO:0002331,MP 0005640


In [61]:
nodes['node_id'] = [i.replace('MP ','') if i.startswith('MP') else i for i in nodes['node_id'] ]


# Save files
## this data source depends on MP and HCOP being in the graph

In [66]:
nodes.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/genotype_phenotype/OWLNETS_node_metadata_mgi.txt',
             sep='\t',index=False)

edges.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/genotype_phenotype/OWLNETS_edgelist_mgi.txt',
             sep='\t',index=False)

In [46]:
nodes_mp.drop_duplicates().shape

(10297, 2)

In [47]:
nodes_hcop.drop_duplicates().shape

(17391, 2)

In [None]:
# Cypher check
# match (cc:Code {SAB:'HCOP'})-[x]-(c:Concept)-[b]-(z:Concept)-[v]-(aa:Code {SAB:'MP'})   
# return count(distinct cc) AS connected_hcop_nodes ,count(distinct aa) as connected_mp_nodes
# 17391  10297

# OLD CODE

### Save CUI-CUI, CODEs and CUI-CODE files.

Save CUIs

In [46]:
# The two columns in the CUI-CUI data frame should be combined into a single CUIs file.
pd.DataFrame(MASTER_G2P_fixed['CUI_mouse_gene'].append(
            MASTER_G2P_fixed['CUI_mp_term']).drop_duplicates(
                ),columns=['CUI']).to_csv(
                    '/Users/stearb/desktop/R03_local/data/ingest_files/genopheno/CUIs_genotype.csv',index=False)

Save CUI-CUIs

What SAB for CUI-CUI relationships?  - IMPC  
What relationship for gene -- phenotype?  - gene_associated_with_disease and disease_has_associated_gene

In [47]:
# Connect CUIs
CUI_CUI = MASTER_G2P_fixed[['CUI_mouse_gene','CUI_mp_term']].rename(columns={'CUI_mouse_gene':':START_ID',
                                                                             'CUI_mp_term':':END_ID'})

# Add relationship type and  sab to match format of umls CUI-CUI import file
CUI_CUI[':TYPE'] = 'gene_associated_with_disease'

# Add inverse relationship (phenotype to gene). Just reverse names of columns,  then concatenate 
CUI_CUI_inverse = MASTER_G2P_fixed[['CUI_mouse_gene','CUI_mp_term']].rename(columns={'CUI_mouse_gene':':END_ID',
                                                                             'CUI_mp_term':':START_ID'})

CUI_CUI_inverse[':TYPE'] = 'disease_has_associated_gene'

CUI_CUI_all =  pd.concat([CUI_CUI,CUI_CUI_inverse])

CUI_CUI_all['SAB'] = 'IMPC'

CUI_CUI_all.to_csv('/Users/stearb/desktop/R03_local/data/ingest_files/genopheno/CUI-CUI_genotype.csv',index=False)

Save CODEs

In [48]:
# Create MP Codes and create gene Codes 
# Combine these two files ^^^ to create just one CODEs file. (rename cols and add SAB)
codes_mouse_genes = MASTER_G2P_fixed[['CodeID_mouse_gene','CODE_mouse_gene']]
codes_mp_terms = MASTER_G2P_fixed[['CodeID_mp_term','CODE_mp_term']]
codes_mouse_genes['SAB'] = 'HGNC_HCOP'
codes_mp_terms['SAB']  = 'MP'

CODEs = pd.DataFrame(np.concatenate([codes_mouse_genes.values,codes_mp_terms.values], axis=0),columns=['CodeID',
                      'CODE','SAB'])

CODEs = CODEs[['CodeID','SAB', 'CODE']] # Reorder

CODEs.drop_duplicates(['CODE','CodeID']).to_csv('/Users/stearb/desktop/R03_local/data/ingest_files/genopheno/CODEs_genotype.csv',index=False)


Save CUI-CODEs

In [49]:
# Connect MP Concept to Codes and Connect mouse gene Concepts to CODEs
# Combine these two files ^^^ to create just one CUI-CODEs file. 
code_cui_genes = MASTER_G2P_fixed[['CUI_mouse_gene','CodeID_mouse_gene']]
code_cui_mp = MASTER_G2P_fixed[['CUI_mp_term','CodeID_mp_term']]

pd.DataFrame(np.concatenate([code_cui_genes.values,code_cui_mp.values]),
             columns=['CUI','CODE']).drop_duplicates(['CODE','CUI']).to_csv('CUI-CODE_genotype.csv',index=False)
# 27688 x 2
! mv CUI-CODE_genotype.csv /Users/stearb/desktop/R03_local/data/ingest_files/genopheno/

### Save whole df so we can check that there are no collisions with these CUIs and CUIs from the other steps

In [50]:
MASTER_G2P_fixed.to_csv('/Users/stearb/desktop/R03_local/data/UI_check/MASTER_G2P.csv',index=False)

### Extra Code

#### After adding MGI Tables 5, 9 and 10 mappings we increased the number of unique mp terms and genes greatly.
####  Although the number of mp-hp mappings we gain is only 2. Maybe add tables 5,9,10 when we have more complete mp-hp mappings 

In [44]:
print('\n---- IMPC and MGI (Tables 5,9 and 10) data -----')
print(MASTER_G2P_fixed.nunique())
print('\n---- IMPC data only -----')
print(master_impc_filt.nunique())


---- IMPC and MGI (Tables 5,9 and 10) data -----
CODE_mp_term         10297
Gene                 17391
CODE_mouse_gene      17391
CodeID_mouse_gene    17391
CUI_mouse_gene       17391
CodeID_mp_term       10297
CUI_mp_term          10297
dtype: int64

---- IMPC data only -----
MP_term     740
Gene       5966
dtype: int64


### Lets see if we have anymore coverage with Tiffanys mp-hp mappings

In [45]:
len(set(hpo2mp['MPO_URI'].values).intersection(
                                set(MASTER_G2P_fixed['MP_term'].values)))

KeyError: 'MP_term'

In [46]:
hpo2mp['MPO_URI'].unique().shape

(462,)

In [47]:
[i for i in hpo2mp['MPO_URI'].unique() if ':' not in i]

[]

In [101]:
both = []

for i in hpo2mp['MPO_URI'].unique():
    if i in MASTER_G2P_fixed['MP_term'].unique():
        both.append(i)

len(both)

429

Not really...only 2 more

In [117]:
MASTER_G2P_fixed['Gene'].isna().sum()

(234054, 2)

In [None]:
x=dict(Counter(df['marker_symbol'].values))

allele_freq = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

n=16
keys = list(allele_freq.keys())[:n]
values = list(allele_freq.values())[:n]
plt.figure(figsize=(10,4))
plt.bar(keys, values)
plt.xticks(fontsize=15,rotation=40)
plt.grid(alpha=.3)
plt.title('Genes with the most Allele Entries in IMPC')

In [None]:
#### Use Regex to search for specific phenotype groups so we can see which phenotypes we have
HEART_QUERY = 'heart|cardiac'
KIDNEY_QUERY = 'kidney'
BRAIN_QUERY = 'brain|nervous'
STRUCTURAL_QUERY = 'structural'
CANCER_QUERY = 'cancer|neoplasm|neoplasia'
BONE_QUERY = 'bone'
LUNG_QUERY = 'lung|pulmonary'
DEVELOPMENT_QUERY = 'development|embryo|gestation|fetus|fetal'

heart_mask = pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(HEART_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)
kidney_mask = pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(KIDNEY_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)
brain_mask = pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(BRAIN_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)
struct_mask = pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(STRUCTURAL_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)
cancer_mask = pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(CANCER_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)
bone_mask = pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(BONE_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)
lung_mask = pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(LUNG_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)
dev_mask =  pd.DataFrame(overlap['MPO_LABEL']).apply(lambda x: x.str.contains(DEVELOPMENT_QUERY,regex=True,flags=re.IGNORECASE)).any(axis=1)

#list(overlap['MPO_LABEL'][heart_mask])
#list(overlap['MPO_LABEL'][kidney_mask])
#list(overlap['MPO_LABEL'][brain_mask])

