In [1]:
import pandas as pd

## 1. Conserved regions on *B. masoniana* chromosomes

In [22]:
# Load the CSV file with the conserved regions
dfcon = pd.read_csv('/home/thibauld/Documents/Bioinformatics/Deleterious_alleles_pipeline/Deleterious_alleles_PNG/intersection_GERP_HyPhy_ASTRAL/intersection_GERP_HyPhy.bed', 
                 sep='\t', 
                 header=None, 
                 names=['Contig', 'Start', 'End', '-', '--', '---'])

dfcon = dfcon.drop(columns=['-', '--', '---'])

# Rename columns for clarity
dfcon = dfcon.rename(columns={'Start': 'clocusstart', 'End': 'clocusend'})


# Display the first few rows
print(dfcon.head())

                Contig  clocusstart  clocusend
0   ACmerged_contig_66          358        363
1   ACmerged_contig_66          367        369
2   ACmerged_contig_66          373        375
3  ACmerged_contig_694          175        183
4  ACmerged_contig_806          907        909


In [24]:
# Load the BED file with the position of the baits on the scaffolds.
dfsca = pd.read_csv('/home/thibauld/Documents/Bioinformatics/Deleterious_alleles_pipeline/Deleterious_alleles_PNG/blast_masoniana/results_blast_best_per_cluster_Only_one_match.csv', 
                 sep=',')

# Display the first few rows
print(dfsca.head())

   cluster_id                                      qseqid      sseqid  pident  \
0           1  Hannah_Begonia_baits-ACmerged_contig_10054  scaffold11   100.0   
1           2   Hannah_Begonia_baits-ACmerged_contig_1011   scaffold1   100.0   
2           3  Hannah_Begonia_baits-ACmerged_contig_10412  scaffold14   100.0   
3           6  Hannah_Begonia_baits-ACmerged_contig_10858  scaffold12   100.0   
4           9  Hannah_Begonia_baits-ACmerged_contig_10984   scaffold8   100.0   

   length    sstart      send      slen         evalue  bitscore  
0      63  43902442  43902504  46644841   7.150000e-24       117  
1     272   6657182   6656911  84930515  2.870000e-140       503  
2      84  35880662  35880579  45922387   8.590000e-36       156  
3     399   7226210   7226608  46145620   0.000000e+00       737  
4     172  46732024  46732195  51801790   1.540000e-84       318  


In [25]:
# Remove the prefix 'Hannah_Begonia_baits-' from the 'qseqid' column and k
dfsca['qseqid'] = dfsca['qseqid'].str.replace(r'^Hannah_Begonia_baits-', '', regex=True)
dfsca = dfsca[['qseqid', 'sseqid', 'sstart', 'send']]

# Rename columns for clarity
dfsca = dfsca.rename(columns={'sseqid': 'Scaffold', 'qseqid': 'Contig'})

dfsca 

Unnamed: 0,Contig,Scaffold,sstart,send
0,ACmerged_contig_10054,scaffold11,43902442,43902504
1,ACmerged_contig_1011,scaffold1,6657182,6656911
2,ACmerged_contig_10412,scaffold14,35880662,35880579
3,ACmerged_contig_10858,scaffold12,7226210,7226608
4,ACmerged_contig_10984,scaffold8,46732024,46732195
...,...,...,...,...
65,ACmerged_contig_8797,scaffold6,1184541,1184229
66,ACmerged_contig_9152,scaffold6,30653488,30653159
67,ACmerged_contig_9838,scaffold1,1120378,1120661
68,ACmerged_contig_9906,scaffold3,4691057,4691220


In [26]:
# merge the two dataframes on 'Contig'
df_merged = pd.merge(dfsca, dfcon, on='Contig')
df_merged

Unnamed: 0,Contig,Scaffold,sstart,send,clocusstart,clocusend
0,ACmerged_contig_10054,scaffold11,43902442,43902504,550,552
1,ACmerged_contig_1011,scaffold1,6657182,6656911,91,93
2,ACmerged_contig_10412,scaffold14,35880662,35880579,700,705
3,ACmerged_contig_10412,scaffold14,35880662,35880579,712,714
4,ACmerged_contig_10412,scaffold14,35880662,35880579,718,720
...,...,...,...,...,...,...
247,ACmerged_contig_9152,scaffold6,30653488,30653159,1912,1914
248,ACmerged_contig_9838,scaffold1,1120378,1120661,1216,1218
249,ACmerged_contig_9838,scaffold1,1120378,1120661,1228,1230
250,ACmerged_contig_9906,scaffold3,4691057,4691220,535,543


In [28]:
# Calculate the start and end positions of the conserved loci on the scaffolds
df_merged['slocusstart'] = df_merged['sstart'] + df_merged['clocusstart'] - 1
df_merged['slocusend']   = df_merged['sstart'] + df_merged['clocusend'] - 1

In [30]:
# Keep only relevant columns
df_final = df_merged[['Scaffold', 'slocusstart', 'slocusend']]
df_final = df_final.rename(columns={'slocusstart': 'Start', 'slocusend': 'End'})
df_final


Unnamed: 0,Scaffold,Start,End
0,scaffold11,43902991,43902993
1,scaffold1,6657272,6657274
2,scaffold14,35881361,35881366
3,scaffold14,35881373,35881375
4,scaffold14,35881379,35881381
...,...,...,...
247,scaffold6,30655399,30655401
248,scaffold1,1121593,1121595
249,scaffold1,1121605,1121607
250,scaffold3,4691591,4691599


In [31]:
# Format properly as a BED file and save to local directory.
df_final.to_csv('/home/thibauld/Documents/Bioinformatics/Deleterious_alleles_pipeline/Deleterious_alleles_PNG/Merge_pipeline/Scaffold_conserved_loci.bed', 
                sep='\t', 
                header=False, 
                index=False)