In [1]:
import pandas as pd

# NC000913.3

In [2]:
v3 = pd.read_csv('./unformatted/sequence.gff3', sep='\t', header=None)
v3 = v3[v3[2] == 'CDS']

#NEED TO CHANGE AFTER GENOME - chromosome to v2
chromosome = 'NC_000913.3'
v3[0] = chromosome

#format info column 8
#split
all_info = v3[8].str.split(';',expand=True).replace('=',' ', regex=True)
all_info[7] = all_info[7].replace(' ','_', regex=True).replace('product_','product ', regex=True)

#filter desired info
cols = [5,6,7]
desired_info = all_info[cols].apply(lambda row: ';'.join(row.values.astype(str)), axis=1)
v3[8] = desired_info

v3.to_csv('genes.gff', sep='\t', header=None, index=None)

v3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
2,NC_000913.3,RefSeq,CDS,190,255,.,+,0,gene thrL;locus_tag b0001;product thr_operon_l...
4,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,gene thrA;locus_tag b0002;product fused_aspart...
6,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,gene thrB;locus_tag b0003;product homoserine_k...
8,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,gene thrC;locus_tag b0004;product threonine_sy...
10,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,gene yaaX;locus_tag b0005;product DUF2502_doma...


# Bitome TFBS

In [3]:
tfbs = pd.read_csv('./unformatted/tfbs.csv', index_col=0).drop_duplicates()
tfbs = tfbs.dropna(subset=['tf'])
tfbs.insert(0, 'chromosome', chromosome)

tfbs = tfbs[['chromosome','left', 'right', 'tf']]
tfbs.insert(1, 'Source', '')
tfbs.insert(2, 2, 'binding_site')
tfbs.insert(5, 5, '')
tfbs.insert(6, 6, '')
tfbs.insert(7, 7, '')
tfbs['tf'] = "TF " + tfbs['tf'].astype(str)

tfbs.to_csv('tfbs.gff', sep='\t', header=None, index=None)

tfbs.head()

Unnamed: 0,chromosome,Source,2,left,right,5,6,7,tf
0,NC_000913.3,,binding_site,612650,612668,,,,TF Fur
2,NC_000913.3,,binding_site,4516744,4516762,,,,TF Fur
3,NC_000913.3,,binding_site,1619005,1619019,,,,TF Fis
4,NC_000913.3,,binding_site,3305958,3305975,,,,TF TyrR
5,NC_000913.3,,binding_site,1386588,1386605,,,,TF TyrR


# Bitome TSS

In [4]:
tss = pd.read_csv('./unformatted/tss.csv', index_col=0).drop_duplicates()
tss = tss.dropna(subset=['tss', 'name'])
tss.insert(0, 'chromosome', chromosome)

tss = tss[['chromosome','tss', 'tss', 'name']]
tss.insert(1, 'Source', '')
tss.insert(2, 2, 'start_site')
tss.insert(5, 5, '')
tss.insert(6, 6, '')
tss.insert(7, 7, '')
tss['name'] = "tss_name " + tss['name'].astype(str)

tss.to_csv('tss.gff', sep='\t', header=None, index=None)

tss.head()

Unnamed: 0,chromosome,Source,2,tss,tss.1,5,6,7,name
0,NC_000913.3,,start_site,2976569.0,2976569.0,,,,tss_name galRp
1,NC_000913.3,,start_site,1116709.0,1116709.0,,,,tss_name lpxLp
2,NC_000913.3,,start_site,1116772.0,1116772.0,,,,tss_name yceAp
3,NC_000913.3,,start_site,89596.0,89596.0,,,,tss_name mraZp
4,NC_000913.3,,start_site,1329284.0,1329284.0,,,,tss_name sohBp1


# Bitome TU

In [4]:
tu

Unnamed: 0,TU,promoter,genes,operon,TU_POSLEFT,TU_POSRIGHT,TU_STRAND,POS_1
0,ECK120008913,ECK120009851,astCADBE,ECK120014360,1825955,1832013,reverse,1832013.0
1,ECK120008914,ECK120009852,astCADBE,ECK120014360,1825955,1832327,reverse,1832044.0
2,ECK120008915,ECK120009853,astCADBE,ECK120014360,1825955,1832039,reverse,1832039.0
3,ECK120008916,ECK120009855,nrdHIEF,ECK120014362,2800586,2804461,forward,2800656.0
4,ECK120008917,ECK120010879,cpxPQ,ECK125242769,4105726,4106387,forward,4105785.0
...,...,...,...,...,...,...,...,...
3555,ECK125257177,ECK125257027,ymdAB-clsC,ECK125272722,1105201,1107776,forward,1105201.0
3556,ECK125257178,ECK125257028,ymdB,ECK125272722,1105803,1106353,forward,1105803.0
3557,ECK125257179,ECK125257030,leuO,ECK120013165,84000,85312,forward,84298.0
3558,ECK125257180,ECK125257027,ymdAB,ECK125272722,1105201,1106353,forward,1105201.0


In [6]:
tu = pd.read_csv('./unformatted/transcription_unit.csv', index_col = 0)
tu = tu.rename({'TRANSCRIPTION_UNIT_NAME':'genes', 
                'TRANSCRIPTION_UNIT_ID':'TU',
                'OPERON_ID':'operon',
                'PROMOTER_ID': 'promoter'}, axis = 1)
strand = {'reverse':'-', 'forward':'+'}

new_tu = tu[['TU_POSLEFT', 'TU_POSRIGHT']]

for r, row in tu.iterrows():
    # flip directions if reverse
    new_tu.loc[r, 'Strand'] = strand[row.TU_STRAND]
        
    # build the ninth column
    col9 = ''
    for col in ['genes', 'TU', 'operon', 'promoter']:
        col9 += col + ' ' + str(row[col]) + ';'
    col9 = col9[:-1]
    new_tu.loc[r, 'properties'] = col9

new_tu.insert(0, 'chromosome', chromosome)
new_tu.insert(1, 'Source', '')
new_tu.insert(2, 2, 'tu')
new_tu.insert(5, 5, '')
new_tu.insert(7, 7, '')

new_tu.to_csv('tu.gff', sep='\t', header=None, index=None)

new_tu.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,chromosome,Source,2,TU_POSLEFT,TU_POSRIGHT,5,Strand,7,properties
0,NC_000913.3,,tu,1825955,1832013,,-,,genes astCADBE;TU ECK120008913;operon ECK12001...
1,NC_000913.3,,tu,1825955,1832327,,-,,genes astCADBE;TU ECK120008914;operon ECK12001...
2,NC_000913.3,,tu,1825955,1832039,,-,,genes astCADBE;TU ECK120008915;operon ECK12001...
3,NC_000913.3,,tu,2800586,2804461,,+,,genes nrdHIEF;TU ECK120008916;operon ECK120014...
4,NC_000913.3,,tu,4105726,4106387,,+,,genes cpxPQ;TU ECK120008917;operon ECK12524276...
