In [1]:
import pandas
import numpy
import datetime
import synapseutils
import synapseclient
from synapseclient import Entity, Project, Column, Team, Wiki

In [2]:
def synapseLogin():
    """
    First tries to login to synapse by finding the local auth key cached on user's computing platform, if not found,
    prompts the user to provide their synapse user name and password, then caches the auth key on their computing
    platform.

    :return:
    """
    try:
        syn = synapseclient.login()
    except Exception as e:
        print('Please provide your synapse username/email and password (You will only be prompted once)')
        username = input("Username: ")
        password = getpass.getpass(("Password for " + username + ": ").encode('utf-8'))
        syn = synapseclient.login(email=username, password=password, rememberMe=True)

    return syn

In [3]:
synapseLogin()

Welcome, xengie.doan!



<synapseclient.client.Synapse at 0x10b826390>

In [4]:
def getdf(syn, id):
    """

    :param syn:
    :param id:
    :return:
    """
    df = syn.tableQuery('select * from {id}'.format(id=id)).asDataFrame()
    return df

In [5]:
"""
Create a master matrix/table for consortium metrics.

:param args:
:param syn:
:return:
"""
# project and publication attributes
p_atr = ['projectName',
         'id',
         'fundingAgency',
         'citation',
         'doi',
         'tumorType',
         'diseaseFocus']

# project attributes
# ### use project view syn11391664 for createdOn modifiedOn but missing projects
# p_view_atr = [ 'id',
#             'name',
#             'fundingAgency',
#             'consortium',
#             'etag',
#             'modifiedOn',
#             'modifiedBy',
#             'concreteType',
#             'dataContact',
#             'diseaseFocus',
#             'grantEnd',
#             'grantStart',
#             'isActive',
#             'principalInvestigator',
#             'projectTeam',
#             'tumorType',
#             'createdOn']

### from table syn16787123
p_view_atr = ['projectName',
              'id',
              'projectFileviewId',
              'projectStatus',
              'dataStatus',
              'fundingAgency',
              'summary',
              'summarySource',
              'projectLeads',
              'institutions',
              'tumorType',
              'diseaseFocus']


# file view attributes
f_atr = ['id',
        'name',
        'projectId',
        'assay',
        'consortium',
        'dataSubtype',
        'dataType',
        'diagnosis',
        'tumorType',
        'fileFormat',
        'fundingAgency',
        'individualID',
        'nf1Genotype',
        'nf2Genotype',
        'species',
        'resourceType',
        'isCellLine',
        'isMultiSpecimen',
        'isMultiIndividual',
        'studyId',
        'studyName',
        'benefactorId',
        'specimenID',
        'sex'
        'age',
        'readPair',
        'createdOn',
        'modifiedOn']

# csbc project info integration 
csbc_atr = ["projectId",
            "name_project",
            "consortium",
            "institution",
            "grantNumber",
            "grantType",
            "teamMembersProfileId",
            "teamProfileId",
            "createdOn_project",
            "modifiedOn_project",
            "publication_count",
            "publication_geodata_produced",
            "fileId","name_file",
            "createdOn_file",
            "modifiedOn_file",
            "age",
            "analysisType",
            "assay",
            "cellLine",
            "cellSubType",
            "cellType",
            "compoundDose",
            "compoundName",
            "concreteType",
            "dataSubtype",
            "dataType",
            "diagnosis",
            "diseaseSubtype",
            "dnaAlignmentMethod",
            "experimentalCondition",
            "experimentalTimePoint",
            "fileFormat",
            "fundingAgency",
            "individualID",
            "individualIdSource",
            "inputDataType",
            "isCellLine",
            "isPrimaryCell",
            "isStranded",
            "libraryPrep",
            "modelSystem",
            "organ",
            "outputDataType",
            "peakCallingMethod",
            "platform",
            "readLength",
            "resourceType",
            "rnaAlignmentMethod",
            "runType",
            "scriptLanguageVersion",
            "sex","softwareAuthor",
            "softwareLanguage",
            "softwareRepository",
            "softwareRepositoryType",
            "softwareType",
            "species",
            "specimenID",
            "study",
            "tissue",
            "transcriptQuantificationMethod",
            "transplantationDonorSpecies",
            "transplantationDonorTissue",
            "transplantationRecipientTissue",
            "transplantationType",
            "tumorType"]

In [6]:
# merging all the things
# 0 publications view syn16857542
# 1 project table  syn16787123
# 2 all portal - files syn16858331
# 3 tools syn9898965
views = ['syn16857542', 'syn16787123', 'syn16858331', 'syn16859448']

dfs = [getdf(synapseclient.login(), synid) for synid in views]
[d.reset_index(inplace=True, drop=True) for d in dfs]

Welcome, xengie.doan!

Welcome, xengie.doan!

Welcome, xengie.doan!

Welcome, xengie.doan!



[None, None, None, None]

In [7]:
# Project attributes
# change columns to represent project attributes and unify key name to be projectId
dfs[0].rename(index=str, columns={"id": "projectId", "name" : "projectName"}, inplace=True)
dfs[1].rename(index=str, columns={"id": "projectId", "name": "projectName"}, inplace=True)

In [8]:
# take out non NTAP funded projects
dfs[0] = dfs[0][~dfs[0].fundingAgency.isin(['CTF', 'NIH-NCI'])]
dfs[1] = dfs[1][~dfs[1].fundingAgency.isin(['CTF', 'NIH-NCI'])]
dfs[2] = dfs[2][~dfs[2].fundingAgency.isin(['CTF', 'NIH-NCI', ''])]
dfs[3] = dfs[3][~dfs[3].fundingAgency.isin(['CTF', 'NIH-NCI', ''])]

In [9]:
# pandas.options.display.max_columns=50

In [10]:
# there are projects without publications
len(set(dfs[1].projectId.unique()) - set(dfs[0].projectId.unique()))


32

In [11]:
# Associate publications information to projects
project_info_df = pandas.merge(dfs[1], dfs[0], on=['projectId','projectName', 'fundingAgency', 'diseaseFocus', 'tumorType'], how='left')


In [12]:
project_info_df

Unnamed: 0,projectName,projectId,projectFileviewId,projectStatus,dataStatus,fundingAgency,summary,summarySource,projectLeads,institutions,tumorType,diseaseFocus,featured_x,consortium,citation,doi,featured_y
0,3D Models of Cutaneous Neurofibromas,syn11374354,syn11601495,Active,Under Embargo,NTAP,Given the absence of therapeutic options and t...,https://www.synapse.org/#!Synapse:syn11374354/...,Alice Soragni,"University of California, Los Angeles",Cutaneous Neurofibroma,Neurofibromatosis 1,,,,,
1,A Nerve Sheath Tumor Bank from Patients with NF1,syn4939902,syn13363852,Active,Under Embargo,NTAP,Surgery is currently the only treatment option...,https://www.synapse.org/#!Synapse:syn4939902/w...,Christine Pratilas,Johns Hopkins Medical Institute,Plexiform Neurofibroma | MPNST | Cutaneous Neu...,Neurofibromatosis 1,yes,,,,
2,A Robust Plexiform Neurofibroma Model for Prec...,syn8016635,syn12582871,Completed,Under Embargo,NTAP,We utilized a genetically engineered mouse mod...,https://www.synapse.org/#!Synapse:syn8016635/w...,Lu Le,University of Texas Southwestern Medical Center,Plexiform Neurofibroma,Neurofibromatosis 1,,,,,
3,Acceptance and Commitment Therapy in NF1,syn4939896,syn15666830,Active,Under Embargo,NTAP,NF1-related pain is often not well-controlled ...,https://www.synapse.org/#!Synapse:syn4939896/w...,Staci Martin,National Institutes of Health,Plexiform Neurofibroma,Neurofibromatosis 1,,,"Allen, TM et al. The Relationship Between Hear...",10.1111/papr.12695,
4,Can Targeted Therapy Prevent Neurofibroma Growth,syn4939872,,Completed,,NTAP,Preliminary efficacy studies conducted through...,https://www.synapse.org/#!Synapse:syn4939872/w...,Nancy Ratner,Cincinnati Children's Hospital Medical Hospital,Plexiform Neurofibroma,Neurofibromatosis 1,,,,,
5,Child NF1 Quality of Life Measures,syn4939878,syn9922960,Completed,Under Embargo,NTAP,"To develop and test the feasibility, reliabili...",https://www.synapse.org/#!Synapse:syn4939878/w...,Nancy Swigonski,Indiana University,Plexiform Neurofibroma,Neurofibromatosis 1,,,Nutakki K et al. Development of the pediatric ...,10.1007/s11060-016-2351-2,
6,Child NF1 Quality of Life Measures,syn4939878,syn9922960,Completed,Under Embargo,NTAP,"To develop and test the feasibility, reliabili...",https://www.synapse.org/#!Synapse:syn4939878/w...,Nancy Swigonski,Indiana University,Plexiform Neurofibroma,Neurofibromatosis 1,,,Draucker CB et al. The health-related quality ...,10.1111/jspn.12174,
7,Combination Index Validation Studies,syn4939876,syn8449585,Completed,Under Embargo,NTAP,Preliminary High-throughput screening (HTS) da...,https://www.synapse.org/#!Synapse:syn4939876/w...,Wade Clapp,Indiana University,Plexiform Neurofibroma,Neurofibromatosis 1,,,,,
8,"Cutaneous Neurofibroma - Models, Biology, and ...",syn11374333,syn11601447,Active,Under Embargo,NTAP,There are still gaps in our knowledge of cNF p...,https://www.synapse.org/#!Synapse:syn11374333/...,Lu Le,University of Texas Southwestern Medical Center,Cutaneous Neurofibroma,Neurofibromatosis 1,,,,,
9,Deconstruction and Reconstruction of NF1 cNFs,syn11374357,syn11601503,Active,Under Embargo,NTAP,Our goal is to substantially increase understa...,https://www.synapse.org/#!Synapse:syn11374357/...,Raymond Mattingly,Wayne State University,Cutaneous Neurofibroma,Neurofibromatosis 1,,,,,


In [13]:
project_info_df = project_info_df[
    [ 'projectName',
     'projectId',
     'projectFileviewId',
     'dataStatus',
     'fundingAgency',
     'projectLeads',
     'institutions',
     'tumorType',
     'diseaseFocus',
     'citation', 
     'doi']
]

In [14]:
publication_count = list(project_info_df.groupby(['projectId']))
dfs[1]['publication_count'] = [len(x[1]) if len(x[1]) != 1 else 0 for x in publication_count]

In [15]:
dfs[0] = dfs[0].astype(object).replace(numpy.nan, '')

In [16]:
dfs[1]['publication_geodata_produced'] = 0  ### don't have data location...run getPMIDDF or set to zero

In [17]:
# File attributes
# remove tools files (subset of all datafiles) from all datafiles
tools_files_id = list(set(dfs[2]["id"].unique()).intersection(set(dfs[3]["study"].unique())))

# no files that are also tools for NTAP
list(set(dfs[3]["study"].unique()).intersection( set(dfs[2]["id"].unique())))

[]

In [18]:
dfs[2].rename(index=str, columns={"id": "fileId", "name": "name_file", "createdOn": "createdOn_file",
                                  "modifiedOn": "modifiedOn_file", "modifiedBy": "modifiedBy_file"}, inplace=True)
dfs[3].rename(index=str, columns={"id": "fileId", "name": "name_file", "createdOn": "createdOn_file",
                                  "modifiedOn": "modifiedOn_file", "modifiedBy": "modifiedBy_file", "study" :"projectId"}, inplace=True)


In [19]:
cols_to_add2 = dfs[3].columns.difference(dfs[2].columns)
cols_to_add3 = dfs[2].columns.difference(dfs[3].columns)
dfs[2] = pandas.concat([dfs[2], pandas.DataFrame(columns=cols_to_add2)])
dfs[3] = pandas.concat([dfs[3], pandas.DataFrame(columns=cols_to_add3)])


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.



In [20]:
# concat files and tools to get all the files information data frame
file_info_df = pandas.concat([dfs[3], dfs[2]], sort = False)

In [21]:
# file_info_df[[cols for cols in list(file_info_df.columns) if cols in csbc_atr]].columns


In [22]:
final_df = pandas.merge( dfs[1], file_info_df, on= ['projectId'], how='left')

In [23]:
# (dfs[1]["consortium"]).describe()

In [24]:
final_df = final_df.drop(
    ["summary_x",
     "summarySource",
     "featured_x",
     "consortium_x",
     "fundingAgency_y",
     "featured_y",
     "tumorType_y",
     "etag"]
    , axis = 1)

In [25]:
final_df.columns

Index(['projectName', 'projectId', 'projectFileviewId', 'projectStatus',
       'dataStatus', 'fundingAgency_x', 'projectLeads', 'institutions',
       'tumorType_x', 'diseaseFocus', 'publication_count',
       'publication_geodata_produced', 'age', 'assay', 'benefactorId',
       'consortium_y', 'contact', 'createdOn_file', 'dataSubtype', 'dataType',
       'diagnosis', 'disease', 'fileFormat', 'fileId', 'individualID',
       'isCellLine', 'isMultiIndividual', 'isMultiSpecimen', 'link',
       'modifiedOn_file', 'name_file', 'nf1Genotype', 'nf2Genotype',
       'parentId', 'readPair', 'reportMilestone', 'resourceType', 'sex',
       'softwareLink', 'softwareName', 'species', 'specimenID', 'studyId',
       'studyName', 'subtype', 'summary_y', 'type'],
      dtype='object')

In [26]:
final_df.rename(columns={
    "fundingAgency_x":"fundingAgency",
    "tumorType_x":"tumorType", 
    "projectName":'name_project',
    "isCellLine":"cellLine",
    "consortium_y" : "consortium"},
                inplace=True)

In [27]:
# annotate tools files to be a resourceType tool - for now
final_df.loc[final_df.summary_y.isin(list(dfs[3].summary)), 'resourceType'] = 'tool'

In [37]:
pandas.set_option('display.max_columns', 500)
final_df.describe(include="all")

Unnamed: 0,name_project,projectId,projectFileviewId,projectStatus,dataStatus,fundingAgency,projectLeads,institutions,tumorType,diseaseFocus,publication_count,publication_geodata_produced,age,assay,benefactorId,consortium,contact,createdOn_file,dataSubtype,dataType,diagnosis,disease,fileFormat,fileId,individualID,cellLine,isMultiIndividual,isMultiSpecimen,link,modifiedOn_file,name_file,nf1Genotype,nf2Genotype,parentId,readPair,reportMilestone,resourceType,sex,softwareLink,softwareName,species,specimenID,studyId,studyName,subtype,summary_y,type
count,4623,4623,4619,4623,4623,4623,4623,4621,4622,4623,4623.0,4623.0,120.0,4105,4600,4590,3,4623.0,3996,4133,4093,3,4395,4600,3714,2710,2703,2657,3,4623.0,4600,2738,1619,4600,4623.0,671.0,4594,2535,3,3,4078,3719,4213,4192,3,3,3
unique,45,45,41,2,4,1,37,28,6,2,,,26.0,16,43,6,3,4601.0,4,9,1,1,32,4600,199,2,2,2,3,4601.0,4577,4,1,183,1.0,,6,4,3,3,4,871,32,21,3,3,2
top,A Nerve Sheath Tumor Bank from Patients with NF1,syn4939902,syn13363852,Active,Under Embargo,NTAP,Christine Pratilas,Johns Hopkins Medical Institute,Plexiform Neurofibroma | MPNST | Cutaneous Neu...,Neurofibromatosis 1,,,41.0,exomeSeq,syn17866464,Open,Eduard Serra,,raw,genomicVariants,Neurofibromatosis 1,NF1,csv,syn18133885,BI386,False,False,False,https://www.hopkinsmedicine.org/kimmel_cancer_...,,.DS_Store,-/-,+/+,syn11492076,,,experimentalData,male,https://www.hopkinsmedicine.org/kimmel_cancer_...,The Johns Hopkins NF1 Biospecimen Repository,Human,BI386-014,syn4939902,A Nerve Sheath Tumor Bank from Patients with NF1,visualization,A set of immortalized pluripotent stem cells (...,bench science
freq,1898,1898,1898,3743,3914,4623,1898,1899,1898,4530,,,12.0,1810,1805,1970,1,23.0,2139,2087,4093,3,1324,1,149,1951,2636,2547,1,23.0,3,2446,1619,600,4623.0,,4272,1295,1,1,3945,49,1853,1842,1,1,2
mean,,,,,,,,,,,0.289855,0.0,,,,,,,,,,,,,,,,,,,,,,,,11.400894,,,,,,,,,,,
std,,,,,,,,,,,0.70536,0.0,,,,,,,,,,,,,,,,,,,,,,,,1.800149,,,,,,,,,,,
min,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,6.0,,,,,,,,,,,
25%,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,12.0,,,,,,,,,,,
50%,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,12.0,,,,,,,,,,,
75%,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,12.0,,,,,,,,,,,


In [29]:
# # double check if we didn't loose a project
if len(final_df.projectId.unique()) == len(dfs[1].projectId):
    print("All projects were successfully associated with files")
else:
    print("lost a project")

All projects were successfully associated with files


In [30]:
# check types
col_types = [col for col in list( final_df.columns ) if final_df[col].dtype == numpy.float64]
print("column names of type numpy.float64 \n:", col_types)


column names of type numpy.float64 
: ['createdOn_file', 'modifiedOn_file', 'readPair', 'reportMilestone']


In [39]:
len(final_df.columns)

47

In [32]:
def changeFloatToInt(final_df, col):
    """

    :param final_df:
    :param col:
    :return:
    """
    final_df[col] = final_df[col].fillna(0).astype(int)
    final_df[col].replace(0, '', inplace=True)


In [33]:
cols = ['createdOn_file','modifiedOn_file','readPair']

[changeFloatToInt(final_df, col) for col in cols]

[None, None, None]

In [40]:
### parent is NTAP now
table = synapseclient.table.build_table("NTAP Project Information Integration", 'syn4939478', final_df)

In [41]:
syn = synapseclient.Synapse()
syn.login()

Welcome, xengie.doan!



In [42]:
table = syn.store(table)