## Data from http://www.informatics.jax.org/downloads/reports/index.html#pheno

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot  as plt

In [2]:
# Table 2. Mouse/Human Orthology with Phenotype Annotations (tab-delimited).
# Holds the human gene, the mouse gene and the associated MP terms.

table_2 = 'http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt'

# Labels for the seven columns that are kept, in file order.
table_2_columns = [
    'Human Marker Symbol',
    'Human Entrez Gene ID',
    'HomoloGene ID',
    'HGNC Association?',
    'Mouse Marker Symbol',
    'MGI Marker Accession ID',
    'High-level Mammalian Phenotype ID',
]

# The report is read without a header row; positional column 7 is discarded
# before the remaining columns are named.
# NOTE(review): column 7 is presumably an empty trailing field — confirm.
df = pd.read_csv(table_2, sep='\t', header=None).drop(columns=7)
df.columns = table_2_columns
print(df.shape)
df.head()

(18792, 7)


Unnamed: 0,Human Marker Symbol,Human Entrez Gene ID,HomoloGene ID,HGNC Association?,Mouse Marker Symbol,MGI Marker Accession ID,High-level Mammalian Phenotype ID
0,A1BG,1,11167.0,yes,A1bg,MGI:2152878,
1,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,"MP:0005367, MP:0005369, MP:0005370, MP:0005376..."
2,A2M,2,37248.0,yes,A2m,MGI:2449119,MP:0005376
3,A3GALT2,127550,16326.0,yes,A3galt2,MGI:2685279,
4,A4GALT,53947,9690.0,yes,A4galt,MGI:3512453,"MP:0005376, MP:0005386, MP:0005387, MP:0005397..."


In [4]:
# Show only the rows that have a high-level phenotype annotation.
df.dropna(subset=['High-level Mammalian Phenotype ID'])

Unnamed: 0,Human Marker Symbol,Human Entrez Gene ID,HomoloGene ID,HGNC Association?,Mouse Marker Symbol,MGI Marker Accession ID,High-level Mammalian Phenotype ID
1,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,"MP:0005367, MP:0005369, MP:0005370, MP:0005376..."
2,A2M,2,37248.0,yes,A2m,MGI:2449119,MP:0005376
4,A4GALT,53947,9690.0,yes,A4galt,MGI:3512453,"MP:0005376, MP:0005386, MP:0005387, MP:0005397..."
5,A4GNT,51146,87446.0,yes,A4gnt,MGI:2143261,"MP:0002006, MP:0005381, MP:0005384, MP:0005385..."
6,AAAS,8086,9232.0,yes,Aaas,MGI:2443767,"MP:0005378, MP:0005386, MP:0005389"
...,...,...,...,...,...,...,...
18777,ZSWIM7,125150,15565.0,yes,Zswim7,MGI:1916997,"MP:0005379, MP:0005384, MP:0005389"
18782,ZWILCH,55055,32381.0,yes,Zwilch,MGI:1915264,"MP:0005371, MP:0005378, MP:0005380, MP:0005385"
18783,ZWINT,11130,48496.0,yes,Zwint,MGI:1289227,"MP:0005380, MP:0005384, MP:0005386, MP:0010768"
18789,ZYX,7791,31164.0,yes,Zyx,MGI:103072,MP:0005384


In [16]:
# Sanity check: True when no row is missing its mouse marker symbol.
df['Mouse Marker Symbol'].notna().to_numpy().all()

True

In [5]:
# Keep only the rows that carry a phenotype annotation. The null check is
# limited to the phenotype column so that annotated rows are not discarded
# merely because another column is empty.
# NOTE(review): 'HomoloGene ID' is displayed as float, which suggests it holds
# missing values; a bare dropna() would remove those rows too — confirm.
df.dropna(subset=['High-level Mammalian Phenotype ID'], inplace=True)

In [10]:
# Split each comma-separated phenotype string into its list of MP term IDs.
mp_terms = df['High-level Mammalian Phenotype ID'].str.split(',').values

# Flatten the per-gene lists into one list, trimming the blank left after each comma.
flat_list = []
for gene_terms in mp_terms:
    for term in gene_terms:
        flat_list.append(term.strip())

# Number of distinct high-level MP terms across all annotated genes.
len(set(flat_list))

In [None]:
## Table 5 All Genotypes and Mammalian Phenotype Annotations (tab-delimited)

In [13]:
# Source report for Table 5 (genotypes with Mammalian Phenotype annotations).
table_5 = 'http://www.informatics.jax.org/downloads/reports/MGI_PhenoGenoMP.rpt'

# The report is tab-separated and read without a header row.
df5 = pd.read_csv(table_5, sep='\t', header=None)

# Column labels, also kept in `cols` for later use.
cols = [
    'Allelic Composition',
    'Allele Symbol(s)',
    'Genetic Background',
    'Mammalian Phenotype ID',
    'PubMed ID',
    'MGI Marker Accession ID (comma-delimited)',
]
df5.columns = cols

In [14]:
# Derive a gene symbol from the allele symbol by keeping the text before the
# first '<' (e.g. 'Rb1<tm1Tyj>' -> 'Rb1'). The vectorised .str accessor avoids
# a Python-level loop and leaves a missing allele symbol as NaN instead of
# raising AttributeError.
# NOTE(review): if a row lists several alleles, only the leading symbol is
# kept — confirm this is the intended definition of 'Gene'.
df5['Gene'] = df5['Allele Symbol(s)'].str.split('<', n=1).str[0]

In [15]:
# Display the frame to inspect the newly added 'Gene' column next to the allele symbols.
df5

Unnamed: 0,Allelic Composition,Allele Symbol(s),Genetic Background,Mammalian Phenotype ID,PubMed ID,MGI Marker Accession ID (comma-delimited),Gene
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,Rb1
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,Rb1
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,Rb1
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,Rb1
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,Rb1
...,...,...,...,...,...,...,...
336960,Mink1<tm1.2Lgrl>/Mink1<tm1.2Lgrl>,Mink1<tm1.2Lgrl>,involves: 129S6/SvEvTac * C57BL/6,MP:0005464,26598717,MGI:1355329,Mink1
336961,Mink1<tm1.2Lgrl>/Mink1<tm1.2Lgrl>,Mink1<tm1.2Lgrl>,involves: 129S6/SvEvTac * C57BL/6,MP:0005606,26598717,MGI:1355329,Mink1
336962,Mink1<tm1.2Lgrl>/Mink1<tm1.2Lgrl>,Mink1<tm1.2Lgrl>,involves: 129S6/SvEvTac * C57BL/6,MP:0009446,26598717,MGI:1355329,Mink1
336963,Mink1<tm1.2Lgrl>/Mink1<tm1.2Lgrl>,Mink1<tm1.2Lgrl>,involves: 129S6/SvEvTac * C57BL/6,MP:0009549,26598717,MGI:1355329,Mink1


In [22]:
# Count the distinct Mammalian Phenotype IDs present in Table 5.
distinct_mp_ids = df5['Mammalian Phenotype ID'].unique()
unique_pheno = len(distinct_mp_ids)
print(f'This dataset has {unique_pheno} unique MP terms.')

This dataset has 10489 unique MP terms


In [23]:
# Count the distinct values of the derived 'Gene' column.
distinct_genes = df5['Gene'].unique()
unique_genes = len(distinct_genes)
print(f'This dataset has {unique_genes} unique mouse genes.')

This dataset has 23153 unique mouse genes.
