In [1]:
import pandas as pd
from fuc import pybed

In [2]:
df = pd.read_csv("clear_data.csv")

df.head()

Unnamed: 0,cluster.ID,chr,start,end,N,strand,gene.id,gene.symbol,region,miRNA,...,target.map,MFE,duplex.start,seed match,seed position,bulged miRNA position,bulged target position,mismatch seed position,mismatched or bulged nucleotide(s),k.group
0,chr1_100127966_100128022_hsa-miR-16-5p,chr1,100127966,100128041,1,+,178.0,AGL,intron,hsa-miR-16-5p,...,|||||||----W||||W|||-,-19.5,6,7merm8,20.0,,,,,2
1,chr1_100154790_100154826_hsa-miR-92a-3p,chr1,100154790,100154865,1,+,178.0,AGL,CDS,hsa-miR-92a-3p,...,-|||||---|||-||||-||WW|-,-27.2,8,7merA1.indel,26.0,2.0,,,A,5B
2,chr1_100160176_100160232_hsa-miR-30a-5p,chr1,100160176,100160251,1,+,178.0,AGL,3'UTR,hsa-miR-30a-5p,...,-||||----|||-,-9.4,54,6merA1.mm,32.0,,,4.0,"A,A",4
3,chr1_100260813_100260845_hsa-miR-500a-5p,chr1,100260813,100260888,1,+,23443.0,SLC35A3,3'UTR,hsa-miR-500a-5p,...,||||||----|||||W-,-22.0,6,7merA1,17.0,,,,,2
4,chr1_100262727_100262757_hsa-miR-181a-5p,chr1,100262727,100262802,1,+,23443.0,SLC35A3,3'UTR||intron,hsa-miR-181a-5p,...,-||||||-W||||-||||-,-23.6,4,6mer_off,16.0,,,,,2


### rename columns

In [3]:
rename_dict = {
    'cluster.ID': 'cluster_id',
    'chr': 'chromosome',
    'start': 'start',
    'end': 'end',
    'N': 'N_value',
    'strand': 'strand_direction',
    'gene.id': 'gene_id',
    'gene.symbol': 'gene_symbol',
    'region': 'region_type',
    'miRNA': 'miRNA_sequence',
    'miR.map': 'miRNA_map',
    'target.map': 'target_map',
    'MFE': 'minimum_free_energy',
    'duplex.start': 'duplex_start_position',
    'seed match': 'seed_match',
    'seed position': 'seed_position',
    'bulged miRNA position': 'bulged_miRNA_position',
    'bulged target position': 'bulged_target_position',
    'mismatch seed position': 'mismatch_seed_position',
    'mismatched or bulged nucleotide(s)': 'mismatch_bulged_nucleotides',
    'k.group': 'k_group'
}
# rename columns
df = df.rename(columns=rename_dict)

In [4]:
# uncomment if you want to generate the bed file


# # filter & rename columns for pybed
# coord_df = df[["chromosome", "start", "end"]]
# coord_df.columns = ['Chromosome', 'Start', 'End']

# # call pybed to turn into .bed file
# bf = pybed.BedFrame.from_frame(meta=[], data=coord_df)
# bf.to_file('clear_hg18_coordinates.bed')

### replacing old coordinates with new ones

In [5]:
# drop old coords
df.drop(['chromosome', 'start', 'end'], axis=1, inplace=True)


# read & merge new ones
new_coords = pd.read_csv("../../external/clear/clear_grch38_coordinates.bed", sep="\t", header=None)
new_coords.columns = ['chromosome', 'start', 'end']

df = pd.concat([df, new_coords], axis=1)


In [6]:
print(len(df))
print(df.start.isna().sum())

32712
14


In [7]:
# drop rows with NaN values in the "start" column. These coordinates can't be found by ENSEMBL's coordinate transformer
df = df.dropna(subset=['start'])

# convert the cols to integer as it don't have any NaN left
df["start"] = df["start"].astype(int)
df["end"] = df["end"].astype(int)

print(len(df))


32698


### converting mRNA and miRNA maps into alignment strings

In [8]:
df.columns

Index(['cluster_id', 'N_value', 'strand_direction', 'gene_id', 'gene_symbol',
       'region_type', 'miRNA_sequence', 'miRNA_map', 'target_map',
       'minimum_free_energy', 'duplex_start_position', 'seed_match',
       'seed_position', 'bulged_miRNA_position', 'bulged_target_position',
       'mismatch_seed_position', 'mismatch_bulged_nucleotides', 'k_group',
       'chromosome', 'start', 'end'],
      dtype='object')

In [11]:
df

Unnamed: 0,cluster_id,N_value,strand_direction,gene_id,gene_symbol,region_type,miRNA_sequence,minimum_free_energy,duplex_start_position,seed_match,...,bulged_miRNA_position,bulged_target_position,mismatch_seed_position,mismatch_bulged_nucleotides,k_group,chromosome,start,end,alignment_string,mirna_alignment_string
0,chr1_100127966_100128022_hsa-miR-16-5p,1,+,178.0,AGL,intron,hsa-miR-16-5p,-19.5,6,7merm8,...,,,,,2,chr1,99889822,99889897,111111100002111121110,111111102111121001100
1,chr1_100154790_100154826_hsa-miR-92a-3p,1,+,178.0,AGL,CDS,hsa-miR-92a-3p,-27.2,8,7merA1.indel,...,2.0,,,A,5B,chr1,99916646,99916721,011111000111011110112210,011111000111111111221
2,chr1_100160176_100160232_hsa-miR-30a-5p,1,+,178.0,AGL,3'UTR,hsa-miR-30a-5p,-9.4,54,6merA1.mm,...,,,4.0,"A,A",4,chr1,99922032,99922107,0111100001110,000000000000011110111
3,chr1_100260813_100260845_hsa-miR-500a-5p,1,+,23443.0,SLC35A3,3'UTR,hsa-miR-500a-5p,-22.0,6,7merA1,...,,,,,2,chr1,100022669,100022744,11111100001111120,1111110000011111200000
4,chr1_100262727_100262757_hsa-miR-181a-5p,1,+,23443.0,SLC35A3,3'UTR||intron,hsa-miR-181a-5p,-23.6,4,6mer_off,...,,,,,2,chr1,100024583,100024658,0111111021111011110,0111111002111101111000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32693,chrY_14801934_14801989_hsa-miR-17-5p,1,-,,,deep_intergenic,hsa-miR-17-5p,-9.6,52,,...,,,,,4,chrY,2925180,2925255,0111011001210001110110,0001110110012100111110
32694,chrY_15577527_15577583_hsa-miR-196a-5p,1,+,,,deep_intergenic,hsa-miR-196a-5p,-17.7,28,7merA1.indel,...,2.0,,,A,6,chrY,6763687,6763762,011111001110001110000000021110,011111011101110021110
32695,chrY_16665792_16665848_hsa-miR-146a-5p,1,-,,,deep_intergenic,hsa-miR-146a-5p,-17.3,9,6mer.indel,...,3.0,,,A,5B,chrY,7639977,7640052,111000110001111121110,111001100001110112111
32696,chrY_17760045_17760101_hsa-miR-194-5p,1,-,,,deep_intergenic,hsa-miR-194-5p,-15.7,43,6mer.indel,...,3.0,,,T,4,chrY,8584118,8584193,0111111000000101110110,000011111100111101100


In [9]:
trans_table = str.maketrans("-|W", "012")

df["alignment_string"] = df["target_map"].str.translate(trans_table)
df["mirna_alignment_string"] = df["miRNA_map"].str.translate(trans_table)

df = df.drop(columns=["target_map", "miRNA_map"])

df.head()

Unnamed: 0,cluster_id,N_value,strand_direction,gene_id,gene_symbol,region_type,miRNA_sequence,minimum_free_energy,duplex_start_position,seed_match,...,bulged_miRNA_position,bulged_target_position,mismatch_seed_position,mismatch_bulged_nucleotides,k_group,chromosome,start,end,alignment_string,mirna_alignment_string
0,chr1_100127966_100128022_hsa-miR-16-5p,1,+,178.0,AGL,intron,hsa-miR-16-5p,-19.5,6,7merm8,...,,,,,2,chr1,99889822,99889897,111111100002111121110,111111102111121001100
1,chr1_100154790_100154826_hsa-miR-92a-3p,1,+,178.0,AGL,CDS,hsa-miR-92a-3p,-27.2,8,7merA1.indel,...,2.0,,,A,5B,chr1,99916646,99916721,011111000111011110112210,011111000111111111221
2,chr1_100160176_100160232_hsa-miR-30a-5p,1,+,178.0,AGL,3'UTR,hsa-miR-30a-5p,-9.4,54,6merA1.mm,...,,,4.0,"A,A",4,chr1,99922032,99922107,0111100001110,000000000000011110111
3,chr1_100260813_100260845_hsa-miR-500a-5p,1,+,23443.0,SLC35A3,3'UTR,hsa-miR-500a-5p,-22.0,6,7merA1,...,,,,,2,chr1,100022669,100022744,11111100001111120,1111110000011111200000
4,chr1_100262727_100262757_hsa-miR-181a-5p,1,+,23443.0,SLC35A3,3'UTR||intron,hsa-miR-181a-5p,-23.6,4,6mer_off,...,,,,,2,chr1,100024583,100024658,0111111021111011110,0111111002111101111000


In [10]:
df.to_csv("../../processed/clear/clear.csv", index=False)

In [12]:
len(df)

32698