In [79]:
# Stretch the notebook container to the full browser width so wide tables are readable.
# `display` and `HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [97]:
%%bash
# Remove the .tsv chunks left by a previous run. When rm fails (for example because
# nothing matches the glob) a short note is printed instead of aborting the cell.
rm fqData*tsv || echo "No fqData*.tsv files to remove"
rm sampleData*tsv || echo "No sampleData*.tsv files to remove"

In [7]:
def buildEnaSampleSubmissionFile(psDf, xenoDf, fqDf):
    """Run the whole sample-metadata pipeline and write the ENA submission files.

    Steps, in order:
      1. add the constant columns to the patient and xenograft frames (in place),
      2. concatenate both frames and rename the columns,
      3. derive titles/aliases and normalise values (in place),
      4. join the sample metadata with the FASTQ metadata and write the TSV files.

    Parameters
    ----------
    psDf : pandas.DataFrame
        Patient-sample export; modified in place by ``adjustHeaderNames``.
    xenoDf : pandas.DataFrame
        Xenograft-sample export; modified in place by ``adjustHeaderNames``.
    fqDf : pandas.DataFrame
        FASTQ metadata; must contain a ``sample_alias`` column.

    Returns
    -------
    None
        The results are the files written by ``joinSampleDataAndFastq``.
    """
    adjustHeaderNames(psDf, xenoDf)
    sampleData = concatonateSamplesAndSetHeaders(psDf, xenoDf)
    # The notebook defines `transformSampleData`; the previously called
    # `transformSampleDataAndSave` is not defined in any visible cell and raised
    # NameError on a fresh kernel.
    transformSampleData(sampleData)
    joinSampleDataAndFastq(sampleData, fqDf)

In [87]:
def joinSampleDataAndFastq(sampleData, fqDf):
    """Join sample metadata with FASTQ metadata on ``sample_alias`` and write two TSVs.

    Files written to the current working directory:
      * ``completedEnaAllSamplesMetadata.tsv`` - FASTQ columns followed by the sample
        columns, one row per matching pair.
      * ``matchFastqs.tsv`` - the FASTQ columns only, keeping the last row for each
        duplicated ``sample_alias``.

    Parameters
    ----------
    sampleData : pandas.DataFrame
        Sample metadata with a ``sample_alias`` column. Not modified.
    fqDf : pandas.DataFrame
        FASTQ metadata with a ``sample_alias`` column. Not modified.

    Returns
    -------
    None
    """
    # Index copies instead of set_index(..., inplace=True): the in-place form removed
    # the 'sample_alias' column from the caller's frames, so calling this function a
    # second time on the same frames raised KeyError.
    fastqByAlias = fqDf.set_index('sample_alias')
    samplesByAlias = sampleData.set_index('sample_alias')

    # `on` names the index level of both frames, so the merged frame stays indexed by
    # 'sample_alias'.
    # NOTE(review): 19 is a positional cut-off that presumably equals the number of
    # sample-metadata columns, leaving only the FASTQ columns - confirm against the
    # real column count of sampleData.
    joinedFqDf = samplesByAlias.merge(
        fastqByAlias, how='inner', on='sample_alias', sort=True).iloc[:, 19:]

    # to_csv returns None when given a path, so its result is not kept.
    joinedFqDf.merge(samplesByAlias, how='inner', on='sample_alias', sort=True).to_csv(
        'completedEnaAllSamplesMetadata.tsv', sep='\t', index=True)

    # One FASTQ row per alias: keep the last occurrence of each duplicated index value.
    joinedFqDf = joinedFqDf.loc[~joinedFqDf.index.duplicated(keep='last')]
    joinedFqDf.to_csv("matchFastqs.tsv", sep='\t', index=True, header=True)

In [77]:
def transformSampleData(sampleData):
    """Derive ENA title/alias columns and normalise values, modifying the frame in place.

    Reads the columns ``mod.sourcePdxId``, ``sample unique id``, ``patient tumor type``
    and ``patient age at collection of tumor``; writes ``sample_title``,
    ``sample_alias``, ``engrafted tumor sample passage`` (for ORIGINATOR rows) and a
    set of constant-valued columns.

    Parameters
    ----------
    sampleData : pandas.DataFrame
        Concatenated sample metadata. Modified in place.

    Returns
    -------
    None
    """
    # Double quotes are stripped from the sample id only; this is what the original
    # expression did, because the method call binds tighter than `+`.
    sampleIds = sampleData['sample unique id'].replace('"', '', regex=True)
    sampleTitle = sampleData['mod.sourcePdxId'] + '~' + sampleIds

    # The results are assigned back instead of calling replace(..., inplace=True) on
    # sampleData['sample_title']: a chained in-place call works on a temporary object
    # and is not guaranteed to update the frame (it does not under Copy-on-Write).
    sampleTitle = sampleTitle.replace('~ ', '~', regex=True)
    # Turn the first '-' that follows the leading alphanumeric run into '~'.
    sampleTitle = sampleTitle.replace('(^[A-Za-z0-9]+)-', '\\1~', regex=True)
    sampleData['sample_title'] = sampleTitle
    sampleData['sample_alias'] = sampleTitle

    # na=False keeps the mask strictly boolean when a title is missing.
    isOriginator = sampleData['sample_title'].str.contains("ORIGINATOR", na=False)
    passageColumn = 'engrafted tumor sample passage'
    if passageColumn in sampleData.columns:
        # Writing a string into a numeric column is deprecated in recent pandas;
        # make the column object-typed first.
        sampleData[passageColumn] = sampleData[passageColumn].astype(object)
    sampleData.loc[isOriginator, passageColumn] = "not applicable"

    sampleData['patient tumor type'] = sampleData['patient tumor type'].str\
        .replace(r'^(Primary|Metastatic|Recurrent)', '\\1 Neoplasm', regex=True)
    sampleData['patient tumor type'] = sampleData['patient tumor type'].str\
        .replace('Not Specified', 'not provided', regex=True)
    sampleData['patient age at collection of tumor'] = sampleData['patient age at collection of tumor'].str\
        .replace('Not Specified', 'not provided', regex=True)

    # Constant columns, identical for every row.
    sampleData['tax_id'] = '9606'
    # Fixed typo: the scientific name is 'Homo sapiens' (was 'Homo sapien'), matching
    # the spelling used for 'sample taxon name' in adjustHeaderNames.
    sampleData['scientific_name'] = 'Homo sapiens'
    sampleData['common_name'] = 'human'
    sampleData['sample material'] = 'tissue fragment'
    sampleData['engraftment host strain name'] = 'NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ'
    sampleData['was the pdx model humanised?'] = 'No'

In [10]:
def adjustHeaderNames(psDf, xenoDf):
    """Add the constant-valued descriptor columns to both frames, in place.

    Parameters
    ----------
    psDf : pandas.DataFrame
        Patient-sample rows; receives five constant columns.
    xenoDf : pandas.DataFrame
        Xenograft-sample rows; receives three constant columns.

    Returns
    -------
    None
    """
    # (column, value) pairs, listed in the order the columns are added to each frame.
    # NOTE(review): ' sampleId' carries a leading space while 'passage' does not; the
    # later rename targets ' passage', so this column is presumably left un-renamed -
    # confirm whether that is intended.
    patientColumns = (
        ('sample origin', "Patient tumor"),
        (' sampleId', "ORIGINATOR"),
        ('sample taxon name', "Homo sapiens"),
        ('passage', '0'),
        ('engrafted tumor collection site', ""),
    )
    xenograftColumns = (
        ('sample origin', "Engrafted tumor"),
        ('sample taxon name', "Homo sapiens/Mus musculus xenograft"),
        ('engrafted tumor collection site', "Not Specified"),
    )

    for columnName, constant in patientColumns:
        psDf[columnName] = constant
    for columnName, constant in xenograftColumns:
        xenoDf[columnName] = constant

In [11]:
def concatonateSamplesAndSetHeaders(psDf, xenoDf):
    """Stack xenograft and patient rows, drop rows with an empty sample id, rename columns.

    Parameters
    ----------
    psDf : pandas.DataFrame
        Patient-sample rows; must contain a ``' sampleId'`` column.
    xenoDf : pandas.DataFrame
        Xenograft-sample rows; must contain a ``' sampleId'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame (xenograft rows first, columns sorted by name) using the
        ENA-style column names. The inputs are not modified.
    """
    sampleData = ps.concat([xenoDf, psDf], sort=True)

    # Filter with a boolean mask rather than drop(index labels): after concat both
    # inputs contribute the same index labels (0, 1, ...), so dropping by label also
    # removed unrelated rows from the other frame that shared a label with an
    # empty-id row.
    hasSampleId = sampleData[' sampleId'] != ''

    # NOTE(review): three source names map to 'patient age at collection of tumor';
    # if more than one of them is present the result has duplicate column names.
    return sampleData.loc[hasSampleId].rename(
        columns={
            ' passage': 'engrafted tumor sample passage',
            ' tt.name': 'patient tumor type',
            ' sampleId': 'sample unique id',
            'ps.ageAtCollection': 'patient age at collection of tumor',
            'ot.name': 'patient age at collection of tumor',
            ' originTissue.name': 'patient tumor primary site',
            ' sex': 'patient sex',
            ' age': 'patient age at collection of tumor',
            ' diagnosis': 'patient tumor diagnosis at time of collection',
            ' sampleTissue.name': 'patient tumor site of collection'
        })

In [81]:
%%bash
# Export the patient samples of PDMR models from Neo4j into the file `patientSamples`.
# The Cypher query follows Patient -- PatientSnapshot -- Sample -[:MODEL_SAMPLE_RELATION]-
# ModelCreation, keeps models whose dataSource is "PDMR", requires an OntologyTerm on
# the sample, and optionally picks up the tumor type, the sample-site tissue and the
# origin tissue. Only distinct rows are returned.
# NOTE(review): the Neo4j user name and password are hard-coded on the command line;
# consider supplying them from the environment instead.
bash -c 'cypher-shell -u neo4j -p neo5j "MATCH (p:Patient)--(ps:PatientSnapshot)--(psamp:Sample)
-[:MODEL_SAMPLE_RELATION]-(mod:ModelCreation)
WHERE mod.dataSource = \"PDMR\"
WITH p,ps, psamp, mod
MATCH (psamp)--(on:OntologyTerm)
OPTIONAL MATCH (psamp)--(tt:TumorType)
OPTIONAL MATCH (psamp)-[:SAMPLE_SITE]-(sampleTissue:Tissue)
OPTIONAL MATCH (psamp)-[:ORIGIN_TISSUE]-(originTissue:Tissue)
return distinct mod.sourcePdxId, psamp.sourceSampleId as sampleId,sampleTissue.name, tt.name, 
ps.ageAtCollection, on.label as diagnosis, originTissue.name, p.sex as sex, ps.ageAtCollection as age";' > patientSamples

In [82]:
%%bash
# Export the xenograft samples of PDMR models from Neo4j into the file `xenoSamples`.
# The first part of the query is the same patient-sample match used for
# `patientSamples`; it then follows ModelCreation -- Sample -- Specimen and returns the
# xenograft sample id and the specimen passage together with the patient-level fields.
# NOTE(review): the Neo4j user name and password are hard-coded on the command line;
# consider supplying them from the environment instead.
bash -c 'cypher-shell -u neo4j -p neo5j "MATCH (p:Patient)--(ps:PatientSnapshot)--(psamp:Sample)
-[:MODEL_SAMPLE_RELATION]-(mod:ModelCreation)
WHERE mod.dataSource = \"PDMR\"
WITH p,ps, psamp, mod
MATCH (psamp)--(on:OntologyTerm)
OPTIONAL MATCH (psamp)--(tt:TumorType)
OPTIONAL MATCH (psamp)-[:SAMPLE_SITE]-(sampleTissue:Tissue)
OPTIONAL MATCH (psamp)-[:ORIGIN_TISSUE]-(originTissue:Tissue)
WITH p,ps, psamp, mod, on, tt, sampleTissue, originTissue
MATCH (mod)--(xenoSample:Sample)--(spe:Specimen)
RETURN distinct mod.sourcePdxId, xenoSample.sourceSampleId as sampleId, spe.passage as passage,sampleTissue.name,
tt.name, ps.ageAtCollection, on.label as diagnosis, originTissue.name, p.sex as sex, ps.ageAtCollection as age";' > xenoSamples

In [83]:
%%bash
# Print the number of lines in each of the two cypher-shell export files.
wc -l patientSamples
wc -l xenoSamples

379 patientSamples
1118 xenoSamples


In [14]:
import pandas as ps
import re

In [107]:
# Load the two cypher-shell exports and the tab-separated FASTQ metadata.
# Each file is opened in a `with` block so its handle is closed as soon as pandas has
# read it; the handles were previously left open.
# NOTE(review): DataFrame.replace('"', '') without regex=True only replaces cells whose
# whole value is a single double-quote character; it does not strip quotes embedded in
# longer strings. Left unchanged because downstream steps may rely on the current values.
with open('patientSamples', 'r') as patientSampleFile:
    psDf = ps.read_csv(patientSampleFile).replace('"', '')
with open('xenoSamples', 'r') as xenoSampleFile:
    xenoDf = ps.read_csv(xenoSampleFile).replace('"', '')
with open('fastqMetadataPdmrRnaSeq_filled.csv', 'r') as fastqMetadata:
    fqDf = ps.read_csv(fastqMetadata, sep='\t')


In [108]:
# Run the pipeline on the loaded frames. Its final step, joinSampleDataAndFastq, writes
# completedEnaAllSamplesMetadata.tsv and matchFastqs.tsv to the working directory.
buildEnaSampleSubmissionFile(psDf, xenoDf, fqDf)

In [109]:
%%bash
# Save the header row (first line) of matchFastqs.tsv; it is prepended to every fqData
# chunk in a later cell.
head -n1 matchFastqs.tsv > fastqHeaders

In [110]:
%%bash
# Copy completedEnaAllSamplesMetadata.tsv without its first (header) line.
# NOTE(review): no visible cell saves that header line to the `headers` file that the
# sampleData loop later reads - confirm where `headers` comes from.
tail -n +2 completedEnaAllSamplesMetadata.tsv > headerlessData

In [111]:
%%bash
# Copy matchFastqs.tsv without its first (header) line.
tail -n +2 matchFastqs.tsv > headerlessFsData

In [112]:
%%bash
# Cut both header-less files into pieces of at most 200 lines each. -d selects numeric
# suffixes, so the pieces are named sampleData00, sampleData01, ... and fqData00, ...
split -l 200 -d headerlessData sampleData
split -l 200 -d headerlessFsData fqData

In [113]:
%%bash
# Prepend the sample header row to every sampleData chunk, writing <chunk>.tsv.
# No visible cell in this notebook creates `headers`, so when the file is missing it is
# derived from the first line of completedEnaAllSamplesMetadata.tsv (the file the
# chunks were cut from). An existing `headers` file is left untouched.
if [ ! -f headers ]; then
    echo "headers not found; using the first line of completedEnaAllSamplesMetadata.tsv" >&2
    head -n1 completedEnaAllSamplesMetadata.tsv > headers
fi
for i in sampleData* ; do
    # Skip .tsv outputs from an earlier run so they are not wrapped again as *.tsv.tsv.
    case "$i" in *.tsv) continue ;; esac
    cat headers "$i" > "$i".tsv
done


In [114]:
%%bash
# Prepend the FASTQ header row (fastqHeaders) to every fqData chunk, writing <chunk>.tsv.
for i in fqData* ; do
    # Skip .tsv outputs from an earlier run so they are not wrapped again as *.tsv.tsv.
    case "$i" in *.tsv) continue ;; esac
    cat fastqHeaders "$i" > "$i".tsv
done


In [115]:
%%bash
# Delete the header-less chunks. `??` matches exactly two characters after the prefix,
# so fqData00 / sampleData00 are removed while the *.tsv copies are kept.
rm fqData??
rm sampleData??