In [1]:
%%bash
# Print the command-line usage of eu_register.py (the tool used for the
# POST/PATCH submissions in the cells below).
eu_register.py --help

usage: eu_register.py [-h] [-m DCC_MODE] [-d] [--no-aliases] -p PROFILE_ID -i
                      INFILE [--patch] [-w]

Given a tab-delimited or JSON input file containing one or more records belonging to one of the profiles
listed on the ENCODE Portal (such as https://www.encodeproject.org/profiles/biosample.json),
either POSTS or PATCHES the records. The default is to POST each record; to PATCH instead, see
the ``--patch`` option.

When POSTING file records, the md5sum of each file will be calculated for you if you haven't
already provided the `md5sum` property. Then, after the POST operation completes, the actual file
will be uploaded to AWS S3. In order for this to work, you must set the `submitted_file_name`
property to the full, local path to your file to upload. Alternatively, you can set
`submitted_file_name` to and existing S3 object, i.e. s3://mybucket/reads.fastq.

Note that there is a special 'trick' defined in the ``encode_utils.connection.Connection()``
class that can 

In [2]:
%%bash
# Run excel_to_text_for_ENCODE.py on the submission workbook for the
# functional_characterization_ser sheet; -o is given the metadata directory,
# which is created first if it does not exist.
# (presumably the script writes one text file per sheet there — verify
# against the script itself.)
SUBMISSION_DIR=/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR
METADATA_DIR="${SUBMISSION_DIR}/metadata"

mkdir -p "${METADATA_DIR}"
source /data/reddylab/software/miniconda3/bin/activate revathy_py3
python /data/reddylab/Revathy/scripts/excel_to_text_for_ENCODE.py \
    -i "${SUBMISSION_DIR}/2023_Tcell_CRISPRi_Keith_encode_metadata.xlsx" \
    -o "${METADATA_DIR}" \
    --sheet-names functional_characterization_ser

In [3]:
%%bash
# POST the functional_characterization_series record(s) described in
# functional_characterization_ser.txt, with eu_register.py in "prod" mode.
METADATA_DIR=/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/metadata

source /data/reddylab/software/miniconda3/bin/activate revathy_py3
cd "${METADATA_DIR}"
eu_register.py \
    -m prod \
    -p functional_characterization_series \
    -i functional_characterization_ser.txt

2023-09-25 17:42:21,566:eu_debug:	Connecting to www.encodeproject.org
2023-09-25 17:42:21,592:eu_debug:	submission=False: In non-submission mode.
2023-09-25 17:42:22,288:eu_debug:	submission=True: In submission mode.
2023-09-25 17:42:22,292:eu_debug:	
IN post().
2023-09-25 17:42:22,292:eu_debug:	<<<<<< POST functional_characterization_series record tim-reddy:mm10-th17-crispri-GITR-series To DCC with URL https://www.encodeproject.org/functional_characterization_series and this payload:

{
  "aliases": [
    "tim-reddy:mm10-th17-crispri-GITR-series"
  ],
  "award": "/awards/UM1HG009428/",
  "lab": "/labs/tim-reddy/",
  "related_datasets": [
    "tim-reddy:mm10-th17-crispri-GITR-low-bin",
    "tim-reddy:mm10-th17-crispri-GITR-high-bin"
  ]
}


2023-09-25 17:42:22,834:eu_debug:	Success.


In [4]:
%%bash
# Run UCSC validateFiles on one raw guide-count table, checking it as
# bed3+22 against the element_quant_format autoSql definition.
# NOTE(review): -chromInfo points at the hg38 chrom.sizes, while the file
# records submitted elsewhere in this notebook declare assembly mm10 —
# confirm which chrom.sizes file is intended here.
VALIDATE_FILES=/data/common/shared_conda_envs/ucsc/bin/validateFiles
CHROM_SIZES=/data/reddylab/software/encValData/hg38/chrom.sizes
AUTOSQL=/data/reddylab/Alex/software/encValData/as/element_quant_format.as
COUNTS_FILE=/data/reddylab/Keith/collab/220131_TaniaRotation/230921_GITR_Submission/TG2-GITR-Hi-rep1.trim20.alignment.counts.txt

"${VALIDATE_FILES}" \
    -type=bed3+22 \
    -chromInfo="${CHROM_SIZES}" \
    -as="${AUTOSQL}" \
    "${COUNTS_FILE}"

Error [file=/data/reddylab/Keith/collab/220131_TaniaRotation/230921_GITR_Submission/TG2-GITR-Hi-rep1.trim20.alignment.counts.txt, line=1]: found 2 columns, expected 25 [chr4_156026280_156026302_+_peak_10__66_GGCCTGAAGCCCAGTCTGAG	71]
Aborting ... found error.


### Convert .counts.txt files to element_quant_format.as format

In [14]:
# Create the directory that the converted guide-quantification TSVs are
# written to (-p: no error if it already exists).
!mkdir -p /data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data

In [17]:
import warnings
import glob
import re
import pandas as pd
import numpy as np

# SettingWithCopyWarning is exposed from pandas.errors in newer pandas and
# from pandas.core.common in older releases; try both so this cell imports
# on either, and skip the filter entirely if the class no longer exists.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

COUNTS_GLOB = '/data/reddylab/Keith/collab/220131_TaniaRotation/230921_GITR_Submission/' + '*.counts.txt'
OUTPUT_TEMPLATE = '/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data/%s.guide_quantifications.tsv'

# Number of trailing rows in each counts table whose guide name carries no
# chr/start/end/strand prefix and therefore gets an 'NA_NA_NA_NA_' prefix.
N_UNLOCATED_TAIL_ROWS = 40

# Columns that are filled with the literal string 'NA'.
PLACEHOLDER_COLUMNS = ['col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col17']

# Column order of the written file (17 columns, no header).
OUTPUT_COLUMNS = ['chr', 'start', 'end', 'col4', 'count', 'strand', 'col7',
                  'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14',
                  'seq', 'guidetype', 'col17']


def counts_to_element_quant(counts, n_unlocated=N_UNLOCATED_TAIL_ROWS):
    """Reshape a two-column guide count table into the 17-column layout.

    Parameters
    ----------
    counts : pandas.DataFrame
        Columns ``name`` and ``count``. ``name`` is underscore-delimited:
        ``chr_start_end_strand_<id...>_<sequence>``. The last ``n_unlocated``
        rows are expected to lack the four coordinate tokens.
    n_unlocated : int
        How many trailing rows get the ``NA_NA_NA_NA_`` coordinate prefix.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``OUTPUT_COLUMNS``; rows whose id does not contain
        'nontargeting' come first, followed by the 'nontargeting' rows.
        The input frame is not modified.
    """
    table = counts.copy()

    # Prefix the trailing rows positionally. Building the full column and
    # assigning it once avoids the chained `tail(...)[col] = ...` write,
    # which only reaches the parent frame when tail() happens to be a view.
    n_tail = min(max(int(n_unlocated), 0), len(table))
    is_tail = np.arange(len(table)) >= len(table) - n_tail
    table['name'] = table['name'].mask(is_tail, 'NA_NA_NA_NA_' + table['name'])

    tokens = table['name'].str.split('_')
    table['seq'] = tokens.str[-1]
    tokens = tokens.str[:-1]  # everything except the trailing sequence token
    table['name'] = tokens.str.join('_')
    table['chr_start_end_strand'] = tokens.str[:4].str.join('_')
    table['id'] = tokens.str[4:].str.join('_')
    table[['chr', 'start', 'end', 'strand']] = (
        table['chr_start_end_strand'].str.split('_', expand=True)
    )

    # Explicit copies so the per-group columns are added to independent
    # frames rather than to slices of `table`.
    is_nontargeting = table['id'].str.contains('nontargeting')
    nontargeting = table[is_nontargeting].copy()
    targeting = table[~is_nontargeting].copy()

    nontargeting['guidetype'] = 'Non-targeting'
    nontargeting['col4'] = 'NA|' + nontargeting['id']
    nontargeting['col7'] = nontargeting['id']

    locus = (targeting['chr'] + ':' + targeting['start'] + '-'
             + targeting['end'] + ':' + targeting['strand'])
    targeting['guidetype'] = 'targeting'
    targeting['col4'] = 'NA|' + locus
    targeting['col7'] = locus

    combined = pd.concat([targeting, nontargeting], axis=0)
    for column in PLACEHOLDER_COLUMNS:
        combined[column] = 'NA'
    return combined[OUTPUT_COLUMNS]


# Sorted so the files are processed in the same order on every run.
files = sorted(glob.glob(COUNTS_GLOB))

for file in files:
    # '<sample>.trim20.alignment.counts.txt' -> '<sample>.trim20.alignment'
    file_name = '.'.join(file.split('/')[-1].split('.')[:3])

    tst = pd.read_csv(file, sep='\t', names=['name', 'count'])
    tst_v2 = counts_to_element_quant(tst)
    tst_v2.to_csv(OUTPUT_TEMPLATE % (file_name), sep='\t', header=False, index=False)
    

In [1]:
%%bash
# POST the file records described in crispr_files_guide-quantifications.txt
# with eu_register.py in "prod" mode.
METADATA_DIR=/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/metadata

source /data/reddylab/software/miniconda3/bin/activate revathy_py3
cd "${METADATA_DIR}"
eu_register.py \
    -m prod \
    -p file \
    -i crispr_files_guide-quantifications.txt

2023-10-05 11:20:34,712:eu_debug:	Connecting to www.encodeproject.org
2023-10-05 11:20:34,743:eu_debug:	submission=False: In non-submission mode.
2023-10-05 11:20:35,427:eu_debug:	submission=True: In submission mode.
2023-10-05 11:20:35,435:eu_debug:	
IN post().
2023-10-05 11:20:35,463:eu_debug:	<<<<<< POST file record tim-reddy:mm10-th17-crispri-GITR-low-bin-rep1-guide-quantifications To DCC with URL https://www.encodeproject.org/file and this payload:

{
  "aliases": [
    "tim-reddy:mm10-th17-crispri-GITR-low-bin-rep1-guide-quantifications"
  ],
  "assembly": "mm10",
  "award": "/awards/UM1HG009428/",
  "dataset": "tim-reddy:mm10-th17-crispri-GITR-low-bin",
  "derived_from": [
    "tim-reddy:mm10-th17-crispri-GITR-low-bin-rep1-fastq"
  ],
  "file_format": "tsv",
  "file_size": 62161,
  "lab": "/labs/tim-reddy/",
  "md5sum": "b174f0e300fdf2d27299e1e9a4c2e540",
  "output_type": "guide quantifications",
  "replicate": "tim-reddy:mm10-th17-crispri-GITR-low-bin-rep1",
  "submitted_file_n

In [15]:
# List the converted guide-quantification files in the submission data directory.
!ls /data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data/

TG2-GITR-Hi-rep1.trim20.alignment.guide_quantifications.tsv
TG2-GITR-Hi-rep2.trim20.alignment.guide_quantifications.tsv
TG2-GITR-Hi-rep3.trim20.alignment.guide_quantifications.tsv
TG2-GITR-Hi-rep4.trim20.alignment.guide_quantifications.tsv
TG2-GITR-Lo-rep1.trim20.alignment.guide_quantifications.tsv
TG2-GITR-Lo-rep2.trim20.alignment.guide_quantifications.tsv
TG2-GITR-Lo-rep3.trim20.alignment.guide_quantifications.tsv
TG2-GITR-Lo-rep4.trim20.alignment.guide_quantifications.tsv


In [21]:
# Spot-check the last rows of one converted file.
!tail /data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data/TG2-GITR-Hi-rep1.trim20.alignment.guide_quantifications.tsv

NA	NA	NA	NA|nontargeting_31	474	NA	nontargeting_31	NA	NA	NA	NA	NA	NA	NA	GGGTAGACAGTCTCGGAGAG	Non-targeting	NA
NA	NA	NA	NA|nontargeting_32	440	NA	nontargeting_32	NA	NA	NA	NA	NA	NA	NA	AGTGGTGTCGCTCACCTTCG	Non-targeting	NA
NA	NA	NA	NA|nontargeting_33	551	NA	nontargeting_33	NA	NA	NA	NA	NA	NA	NA	AGAGTGGCGTCAGGCTGGCG	Non-targeting	NA
NA	NA	NA	NA|nontargeting_34	198	NA	nontargeting_34	NA	NA	NA	NA	NA	NA	NA	AAGCGCGGAATTACCCGTCA	Non-targeting	NA
NA	NA	NA	NA|nontargeting_35	296	NA	nontargeting_35	NA	NA	NA	NA	NA	NA	NA	TTGTGCCTATAGGTAAAATC	Non-targeting	NA
NA	NA	NA	NA|nontargeting_36	497	NA	nontargeting_36	NA	NA	NA	NA	NA	NA	NA	TCGCCCCCCACTACCAAGAA	Non-targeting	NA
NA	NA	NA	NA|nontargeting_37	375	NA	nontargeting_37	NA	NA	NA	NA	NA	NA	NA	TCCTGACCTATCCGAAAAAA	Non-targeting	NA
NA	NA	NA	NA|nontargeting_38	452	NA	nontargeting_38	NA	NA	NA	NA	NA	NA	NA	CCCACCCCGCTGTTTGCACG	Non-targeting	NA
NA	NA	NA	NA|nontargeting_39	393	NA	nontargeting_39	NA	NA	NA	NA	NA	NA	NA	GGAACCTCGCTAGTACCATT	Non-targeting	NA
N

In [23]:
%%bash
# Take the last 40 rows of one converted file and keep columns 7 and 15
# (the guide id and guide sequence columns of the converted layout),
# writing them to nt_guides.txt.
# NOTE(review): this relies on the non-targeting guides being exactly the
# final 40 rows of the converted file — confirm if the library changes.
DATA_DIR=/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data

tail -n 40 "${DATA_DIR}/TG2-GITR-Hi-rep1.trim20.alignment.guide_quantifications.tsv" \
    | cut -f7,15 \
    > "${DATA_DIR}/nt_guides.txt"

In [1]:
# List the source directory holding the raw count tables, FASTQs and the guideScan table.
!ls /data/reddylab/Keith/collab/220131_TaniaRotation/230921_GITR_Submission/

220222_GITR_FullLibrary.fa
220222_TG1_GITR_InputLibrary_guideIDs_CountTable.csv
220304_TG2_GITR_KRAB_Output_CountTable.txt
guideScan_output.spec_score_over_0_2.noTTTTnoGGGGG.original.uniqueName.5bp_max_overlap.accessibilityScore_ranked_in_peaks.final.v2.txt
TG1-GITR-gRNA-InputLibrary_S5_L001_R1_001.fastq.gz
TG1-GITR-gRNA-InputLibrary.trim20.fastq
TG2-GITR-Hi-rep1_S6_L001_R1_001.fastq.gz
TG2-GITR-Hi-rep1.trim20.alignment.counts.txt
TG2-GITR-Hi-rep2_S8_L001_R1_001.fastq.gz
TG2-GITR-Hi-rep2.trim20.alignment.counts.txt
TG2-GITR-Hi-rep3_S10_L001_R1_001.fastq.gz
TG2-GITR-Hi-rep3.trim20.alignment.counts.txt
TG2-GITR-Hi-rep4_S12_L001_R1_001.fastq.gz
TG2-GITR-Hi-rep4.trim20.alignment.counts.txt
TG2-GITR-Lo-rep1_S5_L001_R1_001.fastq.gz
TG2-GITR-Lo-rep1.trim20.alignment.counts.txt
TG2-GITR-Lo-rep2_S7_L001_R1_001.fastq.gz
TG2-GITR-Lo-rep2.trim20.alignment.counts.txt
TG2-GITR-Lo-rep3_S9_L001_R1_001.fastq.gz
TG2-GITR-Lo-rep3.trim20.alignment.counts.txt
TG2-GITR-Lo-rep4_S11_L001_R

In [4]:
# Preview the header and first rows of the guideScan output table.
!head /data/reddylab/Keith/collab/220131_TaniaRotation/230921_GITR_Submission/guideScan_output.spec_score_over_0_2.noTTTTnoGGGGG.original.uniqueName.5bp_max_overlap.accessibilityScore_ranked_in_peaks.final.v2.txt

chromosome	target site start coordinate	target site end coordinate	gRNA	cutting efficiency score	cutting specificity score	strand	offtargets sum	offtargets summary	annotation	gRNA label	accessibility_cpm	peak	rank_in_peak
chr4	156026280	156026302	GGCCTGAAGCCCAGTCTGAG	56	0.24014734	+	22	2:1|3:21	*	peak_10__66	6.566831376149846	peak_10_	1
chr4	156026211	156026233	TCTGCTCTACACTTCACAGA	65	0.23964306	+	25	2:1|3:24	*	peak_10__67	6.304071908621422	peak_10_	2
chr4	156026240	156026262	CAAACACCTCAGATGTCTGC	56	0.22785936	+	15	2:2|3:13	*	peak_10__73	4.398388513167472	peak_10_	3
chr4	156026323	156026345	GAGACAAGGCAAGTTGGAGC	54	0.30288193	+	22	2:1|3:21	*	peak_10__48	4.253951801960835	peak_10_	4
chr4	156026183	156026205	GTTAGGAGATGTCCAGAAAG	63	0.22126558	+	16	2:2|3:14	*	peak_10__74	4.206091344460994	peak_10_	5
chr4	156026534	156026556	ACTCTCCTCCTTGCCTTACC	37	0.29646987	-	10	2:1|3:9	"Tnfrsf18_ENSMUST00000103173.9_exon_1_of_5	Tnfrsf18_ENSMUST00000040274.12_exon_1_of_4	Tnfrsf18_ENSMUST00000122001.

In [29]:
import pandas as pd

GUIDESCAN_TABLE = '/data/reddylab/Keith/collab/220131_TaniaRotation/230921_GITR_Submission/guideScan_output.spec_score_over_0_2.noTTTTnoGGGGG.original.uniqueName.5bp_max_overlap.accessibilityScore_ranked_in_peaks.final.v2.txt'
NT_GUIDES_TABLE = '/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data/nt_guides.txt'
GUIDE_INPUT_TSV = '/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data/guide_input.tsv'

# Columns of the combined guide table, in output order.
# NOTE(review): the coordinate column is named 'hg38_coordinate', but the
# records submitted elsewhere in this notebook declare assembly mm10 —
# confirm the intended column name before regenerating the file.
GUIDE_INPUT_COLUMNS = ['name', 'hg38_coordinate', 'gRNA', 'cutting efficiency score',
                       'cutting specificity score', 'offtargets summary']

# Targeting guides: build the 'chr:start-end:strand' string once and reuse it
# for both the guide name and the coordinate column.
guide = pd.read_csv(GUIDESCAN_TABLE, sep='\t')
coordinate = (guide['chromosome'] + ':'
              + guide['target site start coordinate'].astype(str) + '-'
              + guide['target site end coordinate'].astype(str) + ':'
              + guide['strand'])
guide['name'] = guide['gRNA label'] + '_' + coordinate
guide['hg38_coordinate'] = coordinate
guide_subset = guide[GUIDE_INPUT_COLUMNS]

# Non-targeting guides: only name and sequence are known; every other
# column is filled with the literal string 'NA'.
nt = pd.read_csv(NT_GUIDES_TABLE, sep='\t', names=['name', 'gRNA'])
for column in GUIDE_INPUT_COLUMNS:
    if column not in nt.columns:
        nt[column] = 'NA'
nt = nt[GUIDE_INPUT_COLUMNS]

# ignore_index gives the combined table a single 0..n-1 index instead of
# two overlapping ranges; index=False keeps that index out of the TSV
# (previously it was written as an unnamed first column).
# NOTE(review): a file at this path was already registered in a later cell;
# regenerating it with this cell changes its content, so its md5sum will
# differ from the registered one.
guide_input = pd.concat([guide_subset, nt], axis=0, ignore_index=True)
guide_input.to_csv(GUIDE_INPUT_TSV, sep='\t', index=False)

In [31]:
%%bash
# POST the gRNA file record(s) described in file_gRNA.txt with
# eu_register.py in "prod" mode.
# NOTE(review): the recorded run of this cell ended with HTTP 409
# "Keys conflict" on the record alias. eu_register.py offers --patch for
# updating existing records — confirm whether a PATCH was intended before
# re-running.
METADATA_DIR=/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/metadata

source /data/reddylab/software/miniconda3/bin/activate revathy_py3
cd "${METADATA_DIR}"
eu_register.py \
    -m prod \
    -p file \
    -i file_gRNA.txt

2023-10-16 10:16:20,348:eu_debug:	Connecting to www.encodeproject.org
2023-10-16 10:16:20,349:eu_debug:	submission=False: In non-submission mode.
2023-10-16 10:16:21,009:eu_debug:	submission=True: In submission mode.
2023-10-16 10:16:21,015:eu_debug:	
IN post().
2023-10-16 10:16:21,016:eu_debug:	<<<<<< POST file record tim-reddy:mm10-th17-crispri-GITR-low-bin-gRNAs To DCC with URL https://www.encodeproject.org/file and this payload:

{
  "aliases": [
    "tim-reddy:mm10-th17-crispri-GITR-low-bin-gRNAs"
  ],
  "award": "/awards/UM1HG009428/",
  "dataset": "tim-reddy:mm10-th17-crispri-GITR-low-bin",
  "file_format": "tsv",
  "file_size": 47763,
  "lab": "/labs/tim-reddy/",
  "md5sum": "9f6bc06651d670d5dd0b5c4af6f229b2",
  "output_type": "gRNAs",
  "submitted_file_name": "/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/data/guide_input.tsv"
}


2023-10-16 10:16:21,738:eu_debug:	{'code': 409, 'status': 'error', 'detail': "Keys conflict: [('alias', 'tim-reddy:mm10-th17-crispri

Traceback (most recent call last):
  File "/data/reddylab/software/miniconda3/envs/revathy_py3/bin/eu_register.py", line 345, in <module>
    main()
  File "/data/reddylab/software/miniconda3/envs/revathy_py3/bin/eu_register.py", line 156, in main
    conn.post(payload, require_aliases=not no_aliases)
  File "/data/reddylab/software/miniconda3/envs/revathy_py3/lib/python3.5/site-packages/encode_utils/connection.py", line 1066, in post
    response.raise_for_status()
  File "/data/reddylab/software/miniconda3/envs/revathy_py3/lib/python3.5/site-packages/requests/models.py", line 943, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 409 Client Error: Conflict for url: https://www.encodeproject.org/file


### Submit new biosamples

In [1]:
%%bash
# POST the biosample records described in biosample_v1.txt with
# eu_register.py in "prod" mode.
METADATA_DIR=/data/reddylab/Revathy/collabs/Keith/ENCODE_submission/CRISPR/metadata

source /data/reddylab/software/miniconda3/bin/activate revathy_py3
cd "${METADATA_DIR}"
eu_register.py \
    -m prod \
    -p biosample \
    -i biosample_v1.txt

2024-02-26 09:33:11,040:eu_debug:	Connecting to www.encodeproject.org
2024-02-26 09:33:11,068:eu_debug:	submission=False: In non-submission mode.
2024-02-26 09:33:11,727:eu_debug:	submission=True: In submission mode.
2024-02-26 09:33:11,746:eu_debug:	
IN post().
2024-02-26 09:33:11,746:eu_debug:	<<<<<< POST biosample record tim-reddy:mm10-th17-crispri-GITR-biosample-rep2 To DCC with URL https://www.encodeproject.org/biosample and this payload:

{
  "aliases": [
    "tim-reddy:mm10-th17-crispri-GITR-biosample-rep2"
  ],
  "award": "/awards/UM1HG009428/",
  "biosample_ontology": "/biosample-types/primary_cell_CL_0000899/",
  "donor": "encode:C57BL_6NJ",
  "genetic_modifications": [
    "tim-reddy:mm10-th17-crispri-GITR"
  ],
  "lab": "/labs/tim-reddy/",
  "organism": "mouse",
  "source": "/sources/maria-ciofani/"
}


2024-02-26 09:33:12,268:eu_debug:	Success.
2024-02-26 09:33:12,279:eu_debug:	
IN post().
2024-02-26 09:33:12,280:eu_debug:	<<<<<< POST biosample record tim-reddy:mm10-th17-c

### Submit elements reference 