In [5]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [134]:
%%bash
cd currentSubmission
rm fqData*tsv || echo "No fqta.tsv's"
rm sampleData*tsv || echo "No xenoData.tsv's"
cd ..

In [135]:
def buildEnaSampleSubmissionFile(psDf, xenoDf, fqDf, ena):
    adjustHeaderNames(psDf, xenoDf)
    sampleData = concatonateSamplesAndSetHeaders(psDf, xenoDf)
    tSampleMetaData = transformSampleData(sampleData)
    finalSampleMetadata = adjustHeadersToEna(ena, tSampleMetaData)
    joinSampleDataAndFastq(finalSampleMetadata,fqDf)
    return finalSampleMetadata

In [136]:
def adjustHeadersToEna(ena, tSampleMetaData):
    finalSampleMetadata = tSampleMetaData.reindex(columns=ena.columns)
    return finalSampleMetadata
    

In [137]:
def joinSampleDataAndFastq(sampleData, fqDf):
    fqDf.set_index('sample_alias', inplace=True)
    sampleData.set_index('sample_alias', inplace=True)
    joinedFqDf = sampleData.merge(fqDf, how='inner', on='sample_alias', sort=True).iloc[:, 19:]
    joinedSampleData = joinedFqDf.merge(sampleData, how='inner', on='sample_alias', sort=True).to_csv('./currentSubmission/completedEnaAllSamplesMetadata.tsv', sep='\t', index=True)
    joinedFqDf = joinedFqDf.loc[~joinedFqDf.index.duplicated(keep='last')]
    joinedFqDf.to_csv("./currentSubmission/matchFastqs.tsv", sep='\t', index=True, header=True)

In [138]:
def transformSampleData(sampleData):
    sampleData['sample_title'] = sampleData['mod.sourcePdxId'] + '~' + sampleData['sample unique id'] \
        .replace('"', '', regex=True)
    sampleData['sample_title'].replace('~ ', '~', regex=True, inplace=True)
    sampleData['sample_title'].replace('(^[A-Za-z0-9]+)-', '\\1~', regex=True, inplace=True)
    sampleData['sample_alias'] = sampleData['sample_title']
    sampleData.loc[
        sampleData['sample_title'].str.contains("ORIGINATOR"), 'engrafted tumor sample passage'] = "not applicable"
    sampleData['patient tumor type'] = sampleData['patient tumor type'].str.replace('(Primary|Metastatic|Recurrent)','\\1 Neoplasm', regex=True)
    sampleData['patient tumor type'] = sampleData['patient tumor type'].str.replace('Not Specified', 'not provided',regex=True)
    print(sampleData['patient tumor type'])
    sampleData['patient age at collection of tumor'] = sampleData['patient age at collection of tumor'].str.replace('Not Specified', 'not provided', regex=True)
    sampleData['tax_id'] = '9606'
    sampleData['scientific_name'] = 'Homo sapien'
    sampleData['common_name'] = 'human'
    sampleData['sample material'] = 'tissue fragment'
    sampleData['engraftment host strain name'] = 'NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ'
    sampleData['was the pdx model humanised?'] = 'No'
    return sampleData

In [139]:
def adjustHeaderNames(psDf, xenoDf):
    psDf['sample origin'] = "Patient tumor"
    psDf[' sampleId'] = "ORIGINATOR"
    xenoDf['sample origin'] = "Engrafted tumor"
    xenoDf['sample taxon name'] = "Homo sapiens/Mus musculus xenograft"
    psDf['sample taxon name'] = "Homo sapiens"
    psDf['passage'] = '0'
    xenoDf['engrafted tumor collection site'] = "Not Specified"
    psDf['engrafted tumor collection site'] = ""

In [140]:
def concatonateSamplesAndSetHeaders(psDf, xenoDf):
    sampleData = ps.concat([xenoDf, psDf], sort=True)
    emptySampleIds = sampleData[sampleData[' sampleId'] == ''].index
    sampleData.drop(emptySampleIds, inplace=True)
    sampleData.rename(
        columns={
            ' passage': 'engrafted tumor sample passage',
            ' tt.name': 'patient tumor type',
            ' sampleId': 'sample unique id',
            'ps.ageAtCollection': 'patient age at collection of tumor',
            'ot.name': 'patient age at collection of tumor',
            ' originTissue.name': 'patient tumor primary site',
            ' sex': 'patient sex',
            ' age': 'patient age at collection of tumor',
            ' diagnosis': 'patient tumor diagnosis at time of collection',
            ' sampleTissue.name': 'patient tumor site of collection'
        }, inplace=True)
    return sampleData

In [47]:
%%bash
#for sample
bash -c 'cypher-shell -u neo4j -p neo5j "MATCH (p:Patient)--(ps:PatientSnapshot)--(psamp:Sample)
-[:MODEL_SAMPLE_RELATION]-(mod:ModelCreation)
WHERE mod.dataSource = \"PDMR\"
WITH p,ps, psamp, mod
MATCH (psamp)--(on:OntologyTerm)
OPTIONAL MATCH (psamp)--(tt:TumorType)
OPTIONAL MATCH (psamp)-[:SAMPLE_SITE]-(sampleTissue:Tissue)
OPTIONAL MATCH (psamp)-[:ORIGIN_TISSUE]-(originTissue:Tissue)
return distinct mod.sourcePdxId, psamp.sourceSampleId as sampleId,sampleTissue.name, tt.name, 
ps.ageAtCollection, on.label as diagnosis, originTissue.name, p.sex as sex, ps.ageAtCollection as age";' > ./currentSubmission/patientSamples

In [48]:
%%bash
#for sample
bash -c 'cypher-shell -u neo4j -p neo5j "MATCH (p:Patient)--(ps:PatientSnapshot)--(psamp:Sample)
-[:MODEL_SAMPLE_RELATION]-(mod:ModelCreation)
WHERE mod.dataSource = \"PDMR\"
WITH p,ps, psamp, mod
MATCH (psamp)--(on:OntologyTerm)
OPTIONAL MATCH (psamp)--(tt:TumorType)
OPTIONAL MATCH (psamp)-[:SAMPLE_SITE]-(sampleTissue:Tissue)
OPTIONAL MATCH (psamp)-[:ORIGIN_TISSUE]-(originTissue:Tissue)
WITH p,ps, psamp, mod, on, tt, sampleTissue, originTissue
MATCH (mod)--(xenoSample:Sample)--(spe:Specimen)
RETURN distinct mod.sourcePdxId, xenoSample.sourceSampleId as sampleId, spe.passage as passage,sampleTissue.name,
tt.name, ps.ageAtCollection, on.label as diagnosis, originTissue.name, p.sex as sex, ps.ageAtCollection as age";' > ./currentSubmission/xenoSamples

In [141]:
%%bash
wc -l ./currentSubmission/patientSamples
wc -l ./currentSubmission/xenoSamples

379 ./currentSubmission/patientSamples
1118 ./currentSubmission/xenoSamples


In [142]:
import pandas as ps
import re

In [143]:
patientSampleFile = open('./currentSubmission/patientSamples', 'r')
xenoSampleFile = open('./currentSubmission/xenoSamples', 'r')
fastqMetadata = open('fastqMetadataPdmrRnaSeq_filled.csv', 'r')
enaSampleTemplate = open('enaSampleTemplate.csv', 'r')
psDf = ps.read_csv(patientSampleFile).replace('"', '')
xenoDf = ps.read_csv(xenoSampleFile).replace('"', '')
fqDf = ps.read_csv(fastqMetadata, sep='\t').replace('"','')
ena = ps.read_csv(enaSampleTemplate, sep='\t', header=2).replace('"','')


In [144]:
buildEnaSampleSubmissionFile(psDf, xenoDf, fqDf, ena)

0          "Primary Neoplasm"
1          "Primary Neoplasm"
2          "Primary Neoplasm"
3          "Primary Neoplasm"
4       "Metastatic Neoplasm"
                ...          
373        "Primary Neoplasm"
374        "Primary Neoplasm"
375        "Primary Neoplasm"
376        "Primary Neoplasm"
377     "Metastatic Neoplasm"
Name: patient tumor type, Length: 1495, dtype: object


Unnamed: 0_level_0,tax_id,scientific_name,common_name,sample_title,sample_description,sample origin,sample taxon name,sample material,engrafted tumor sample passage,engrafted tumor collection site,patient tumor site of collection,patient tumor type,sample unique id,engraftment host strain name,patient age at collection of tumor,patient tumor diagnosis at time of collection,patient tumor primary site,was the pdx model humanised?,patient sex
sample_alias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
515863~333-R~A1PPN5,9606,Homo sapien,human,515863~333-R~A1PPN5,,Engrafted tumor,Homo sapiens/Mus musculus xenograft,tissue fragment,"""1""",Not Specified,"""Not Specified""","""Primary Neoplasm""","""A1PPN5""",NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""67""","""Cutaneous Melanoma""","""Skin""",No,"""Male"""
515863~333-R~AYW,9606,Homo sapien,human,515863~333-R~AYW,,Engrafted tumor,Homo sapiens/Mus musculus xenograft,tissue fragment,"""0""",Not Specified,"""Not Specified""","""Primary Neoplasm""","""AYW""",NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""67""","""Cutaneous Melanoma""","""Skin""",No,"""Male"""
918122~036-R~LR0,9606,Homo sapien,human,918122~036-R~LR0,,Engrafted tumor,Homo sapiens/Mus musculus xenograft,tissue fragment,"""0""",Not Specified,"""Not Specified""","""Primary Neoplasm""","""LR0""",NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""57""","""Myxofibrosarcoma""","""Musculoskeletal""",No,"""Male"""
918122~036-R~LR2MY8,9606,Homo sapien,human,918122~036-R~LR2MY8,,Engrafted tumor,Homo sapiens/Mus musculus xenograft,tissue fragment,"""1""",Not Specified,"""Not Specified""","""Primary Neoplasm""","""LR2MY8""",NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""57""","""Myxofibrosarcoma""","""Musculoskeletal""",No,"""Male"""
513682~313-R~H30E35G36,9606,Homo sapien,human,513682~313-R~H30E35G36,,Engrafted tumor,Homo sapiens/Mus musculus xenograft,tissue fragment,"""2""",Not Specified,"""Not Specified""","""Metastatic Neoplasm""","""H30E35G36""",NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""67""","""Leiomyosarcoma""","""Musculoskeletal""",No,"""Male"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281475~159-R~ORIGINATOR,9606,Homo sapien,human,281475~159-R~ORIGINATOR,,Patient tumor,Homo sapiens,tissue fragment,not applicable,,"""Not Specified""","""Primary Neoplasm""",ORIGINATOR,NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""72""","""Urothelial Carcinoma""","""Genitourinary""",No,"""Male"""
457572~356-R~ORIGINATOR,9606,Homo sapien,human,457572~356-R~ORIGINATOR,,Patient tumor,Homo sapiens,tissue fragment,not applicable,,"""Not Specified""","""Primary Neoplasm""",ORIGINATOR,NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""71""","""Gastric Adenocarcinoma""","""Digestive/Gastrointestinal""",No,"""Male"""
555926~031-R~ORIGINATOR,9606,Homo sapien,human,555926~031-R~ORIGINATOR,,Patient tumor,Homo sapiens,tissue fragment,not applicable,,"""Not Specified""","""Primary Neoplasm""",ORIGINATOR,NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""59""","""Colon Adenocarcinoma""","""Digestive/Gastrointestinal""",No,"""Female"""
665939~344-R~ORIGINATOR,9606,Homo sapien,human,665939~344-R~ORIGINATOR,,Patient tumor,Homo sapiens,tissue fragment,not applicable,,"""Not Specified""","""Primary Neoplasm""",ORIGINATOR,NOD.Cg-PrkdcscidIl2rgtm1Wjl/SzJ,"""73""","""Bladder Papillary Urothelial Neoplasm""","""Genitourinary""",No,"""Male"""


In [145]:
rnaMetadata = ps.read_csv('./currentSubmission/matchFastqs.tsv', sep='\t')
sampleAcc = ps.read_csv('sampleAccessions', sep='\t', names=['type', 'accession1', 'sample_alias'], header=None)
allSampleMetadata = ps.read_csv('./currentSubmission/completedEnaAllSamplesMetadata.tsv', sep='\t')

In [146]:
missedSampleMetadata = rnaMetadata[~rnaMetadata.sample_alias.isin(sampleAcc.sample_alias)]
missedMetadata = allSampleMetadata[allSampleMetadata.sample_alias.isin(missedSampleMetadata.sample_alias)].set_index('sample_alias')
missedMetadata.iloc[:,12:].to_csv('missingRnaSampleMetadata.tsv', sep='\t')

In [147]:
%%bash
cat enaSampleTemplate.csv <(tail -n +2 missingRnaSampleMetadata.tsv) > completedRnaSampleMetadata.tsv
sed -i 's/"//g' completedRnaSampleMetadata.tsv
sed -i $'s/\t /\t/g' completedRnaSampleMetadata.tsv

In [148]:
%%bash
cd currentSubmission
head -n 1 matchFastqs.tsv > fastqHeaders
tail -n +2 completedEnaAllSamplesMetadata.tsv > headerlessData
uniq -w 100 headerlessData > uniqHeaderlessData
tail -n +2 matchFastqs.tsv > headerlessFsData
split -l 200 -d uniqHeaderlessData sampleData
split -l 200 -d headerlessFsData fqData
for i in sampleData* ; do cat headers "$i" > "$i".tsv; done;
for i in fqData* ; do cat fastqHeaders "$i" > "$i".tsv; done;
rm fqData??
rm sampleData??

cat: headers: No such file or directory
cat: headers: No such file or directory
cat: headers: No such file or directory
cat: headers: No such file or directory
cat: headers: No such file or directory
