In [2]:
import pandas as pd
import numpy as np
from pyliftover import LiftOver

In [36]:
def convert_assembly_hg19_to_hg38(df, lift_over=None):
    """
    Lift the 'start' and 'end' coordinates of a dataframe from hg19 to hg38.

    The hg19 start position is preserved in a new 'start_hg19' column. Rows whose
    start or end position cannot be lifted are reported on stdout and removed;
    the index of the returned frame is reset to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'chr', 'start' and 'end' columns. Chromosome names may be given
        with or without the 'chr' prefix and may be numeric (e.g. 3 or '3').
        The caller's frame is left untouched; a converted copy is returned.
    lift_over : object, optional
        Converter exposing ``convert_coordinate(chromosome, position)`` and returning
        a sequence of hits whose second element is the lifted position (or
        None / an empty sequence when there is no mapping). Defaults to
        ``LiftOver('hg19', 'hg38')``. Passing one in lets several calls share a
        single converter.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with lifted 'start'/'end', the extra 'start_hg19' column,
        unconvertible rows removed and a fresh RangeIndex.

    Notes
    -----
    Only the position of the first hit is used; the chromosome and strand reported
    by the converter are ignored.
    NOTE(review): the pyliftover documentation describes its coordinates as
    0-based -- confirm that the 'start'/'end' convention of the input matches.
    """
    if lift_over is None:
        lift_over = LiftOver('hg19', 'hg38')

    def lift_position(chromosome, position):
        """Return the first lifted position, or None when there is no mapping."""
        try:
            hits = lift_over.convert_coordinate(chromosome, position)
        except (TypeError, ValueError):
            # Malformed position value: treat it as "cannot be converted".
            return None
        # None (unknown chromosome) and an empty result (no target locus) both
        # mean there is nothing to take a position from.
        if not hits:
            return None
        return hits[0][1]

    converted = df.copy()
    converted['start_hg19'] = converted['start']

    indices_to_drop = []
    for index, row in df.iterrows():
        # str() keeps numeric chromosome labels from breaking the concatenation.
        name = str(row['chr'])
        chromosome = name if name.startswith('chr') else 'chr' + name

        new_start = lift_position(chromosome, row['start'])
        new_end = lift_position(chromosome, row['end']) if new_start is not None else None

        if new_start is None or new_end is None:
            print("Failed to convert index", index, chromosome, row['start'])
            print("Removing index", index)
            indices_to_drop.append(index)
            continue

        converted.at[index, 'start'] = new_start
        converted.at[index, 'end'] = new_end

    return converted.drop(indices_to_drop).reset_index(drop=True)

In [43]:
# Load the mutation table; the bare name on the last line shows it as the cell output.
dataset_path = '../data/dataset_uncensored.csv'
df1 = pd.read_csv(dataset_path)
df1

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source
0,mut0,3,101578254,101578255,CAGTT,C,1,ICGC
1,mut1,3,101578285,101578286,GACCATTTGCCTT,G,1,ICGC
2,mut2,3,101578250,101578251,CTG,C,1,ICGC
3,mut3,17,48940015,48940016,CTAAAT,C,1,ICGC
4,mut4,17,48939986,48939987,AT,A,1,ICGC
...,...,...,...,...,...,...,...,...
1081,mut1081,4,95496939,95496939,C,T,0,COSMIC
1082,mut1082,6,53026396,53026396,C,T,0,COSMIC
1083,mut1083,2,220077160,220077160,G,T,0,COSMIC
1084,mut1084,16,31471209,31471209,C,T,0,COSMIC


In [45]:
# Display df1 with rows repeated on the full variant key removed.
# The result is only shown, not assigned, so df1 itself is unchanged.
variant_key = ['chr', 'start', 'end', 'ref', 'alt']
df1.drop_duplicates(subset=variant_key)

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source
0,mut0,3,101578254,101578255,CAGTT,C,1,ICGC
1,mut1,3,101578285,101578286,GACCATTTGCCTT,G,1,ICGC
2,mut2,3,101578250,101578251,CTG,C,1,ICGC
3,mut3,17,48940015,48940016,CTAAAT,C,1,ICGC
4,mut4,17,48939986,48939987,AT,A,1,ICGC
...,...,...,...,...,...,...,...,...
1081,mut1081,4,95496939,95496939,C,T,0,COSMIC
1082,mut1082,6,53026396,53026396,C,T,0,COSMIC
1083,mut1083,2,220077160,220077160,G,T,0,COSMIC
1084,mut1084,16,31471209,31471209,C,T,0,COSMIC


In [38]:
# Replace df1 with its lifted version. Because the name is rebound to the converted
# frame, running this cell a second time would feed already-converted coordinates
# back into the converter; reload df1 before re-running.
df1 = convert_assembly_hg19_to_hg38(df=df1)

Failed to convert index 592 chr2 242760774
Removing index 592


In [34]:
# --- Positive examples (driver = 1) ------------------------------------------
# Header-less, tab-separated intervals; the three columns are named here.
pos_bed = pd.read_csv('../data/df_grch38.bed', sep='\t', header=None)
pos_bed.columns = ['chr', 'start', 'end']

# 'pos' from the source table is attached by row label (pandas index alignment).
# NOTE(review): this assumes row i of the BED file corresponds to row i of the
# CSV; rows without a matching label would silently get NaN -- confirm.
orig = pd.read_csv('../data/ICGC_TCGA_noncoding_data.csv')

df_pos = (
    pos_bed
    .assign(pos_37=orig['pos'], driver=1)
    .drop_duplicates(keep='first')
)

# --- Negative examples (driver = 0) ------------------------------------------
neg_bed = pd.read_csv('../data/negdf_grch38.bed', sep='\t', header=None)
neg_bed.columns = ['chr', 'start', 'end', 'old_pos', 'bed_format']

# 'old_pos' is parsed as '<chrom>:<a>-<b>'; the part after ':' is the range.
old_range = neg_bed['old_pos'].str.split(':').str[1]

# pos_37 is the number AFTER the dash, exactly as the analysis has always used.
# NOTE(review): that is the second value of the range; if the ranges are not
# single-base (a == b) confirm that this, and not the first value, is intended.
df_neg = (
    neg_bed[['chr', 'start', 'end']]
    .assign(
        chr=neg_bed['chr'].str.replace('chr', '', regex=False),
        pos_37=pd.to_numeric(old_range.str.split('-').str[1]),
        driver=0,
    )
    .drop_duplicates(keep='first')
)

# --- Combined table ----------------------------------------------------------
# Row labels of both parts are kept as they are, so labels can repeat in `df`.
df = pd.concat([df_pos, df_neg])

In [35]:
# Show the concatenated positive + negative table as the cell output.
df

Unnamed: 0,chr,start,end,pos_37,driver
0,3,101859411,101859414,101578255,1
1,3,101859442,101859453,101578286,1
3,3,101859407,101859408,101578251,1
5,17,50862655,50862659,48940016,1
6,17,50862626,50862626,48939987,1
...,...,...,...,...,...
594,6,53161598,53161598,53026396,0
595,2,219212438,219212438,220077160,0
596,16,31459888,31459888,31471209,0
597,18,61917060,61917060,59584293,0


In [42]:
# Display df1 with rows repeated on (chr, start, end) removed.
# Nothing is assigned, so df1 keeps all of its rows.
interval_key = ['chr', 'start', 'end']
df1.drop_duplicates(subset=interval_key)

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,start_hg19
0,mut0,3,101859410,101859411,CAGTT,C,1,ICGC,101578254
1,mut1,3,101859441,101859442,GACCATTTGCCTT,G,1,ICGC,101578285
2,mut2,3,101859406,101859407,CTG,C,1,ICGC,101578250
3,mut3,17,50862654,50862655,CTAAAT,C,1,ICGC,48940015
4,mut4,17,50862625,50862626,AT,A,1,ICGC,48939986
...,...,...,...,...,...,...,...,...,...
1080,mut1081,4,94575788,94575788,C,T,0,COSMIC,95496939
1081,mut1082,6,53161598,53161598,C,T,0,COSMIC,53026396
1082,mut1083,2,219212438,219212438,G,T,0,COSMIC,220077160
1083,mut1084,16,31459888,31459888,C,T,0,COSMIC,31471209
