In [21]:
#import libraries
import os
import pandas as pd 
import numpy as np 
import matplotlib as plt
import seaborn as sns 
import gdown
from collections import defaultdict

# Functions

#### fixing '5E54' becoming '5.00E+54' issue

In [22]:
def fix_PDB_ID(df):
    for i, j in enumerate(df['PDB_ID']):
        print (j)
        if len(str(j))>4:
            if '-' in j:
                X= ''.join(j.split('-')).upper()
                df.loc[i, 'PDB_ID']= X
            else:
                if '.' in j:
                    x= str(j)
                    x1= x.split('.')[0]
                    x2= x.split('+')[1]
                    X= x1+'E'+x2
                    print (X)
                    df.loc[i, 'PDB_ID']= X
                else:
                    x= str(j)
                    x1= x.split('+')[0]
                    x2= x.split('+')[1]
                    X= x1+ x2
                    print (X)
                    df.loc[i, 'PDB_ID']= X
                    
    return (df)

#### Function to identify base pair of interest

In [75]:
def find_bp_interest(df, bp, hbonds):
    if bp.split('-')[0] != bp.split('-')[1]:
        bps= [bp, "-".join(bp.split("-")[::-1])]
    else:
        bps= [bp]
        
    df1= df[df['base_pair'].isin(bps)]
    df1.index= np.arange(0, len(df1))
    
    def extract_bp(row, hbond):
        hbond_variants = [hbond, "-".join(hbond.split("-")[::-1])]  # e.g., 'G.O6-U.N3' -> ['G.O6-U.N3', 'U.N3-G.O6']
        cols_int=['combined_hbond_1', 'combined_hbond_2', 'combined_hbond_3',
                  'combined_hbond_4', 'combined_hbond_5', 'combined_hbond_6',
                  'combined_hbond_7', 'combined_hbond_8', 'combined_hbond_9',
                  'combined_hbond_10'        
                 ]
        for col in cols_int:
            for hb in hbond_variants:
                if row[col].startswith(hb):
                    return float(row[col].split('_')[1])
        return None
    
    suffix_groups = defaultdict(list)
    for col in df1.columns:
        suffix = '_'.join(col.split('_')[-2:])
    
        if suffix.startswith('hbond'):
            print (suffix)
            suffix_groups[suffix].append(col)
    
    # Combine values for each group with matching suffix
    for suffix, cols in suffix_groups.items():
        if len(cols) == 2:  # Only combine if exactly two columns share the suffix
            col1, col2 = cols
            new_col = f'combined_{suffix}'
            df1[new_col] = df1[col2].astype(str) + '_' + df1[col1].astype(str)
            
    for hbond in hbonds:
        df1[hbond] = df1.apply(lambda row: extract_bp(row, hbond), axis=1)
    
    df2= df1.dropna(subset= hbonds) #droping examples which do not contain all the hbonds user specified
    
    df3 = df2.drop(columns=[col for col in df.columns if col.startswith('combined')]) #removing the temp columns
    
    return df3

# Data preparation

#### Step 1: importing list of base pair data from google drive

In [24]:
#import data as csv

#importing option 1: from local directory
#df_bp= pd.read_csv('/Users/jonesyy/Downloads/SROP Work/data/06_06_25/all_bps_with_all_regs_apr_2025.csv')


#importing option 2: importing using the Google drive link
#url = 'https://drive.google.com/file/d/1sL2JUIHRMAOFo7k8mjKveVF2RGcu7q-4/view?usp=drive_link' #the google drive link should give editor access 
url= 'https://drive.google.com/file/d/1I5GsOlCdIdY3iTc6wFQoTdsMWKj64vHA/view?usp=drive_link'

# Convert to a downloadable link
file_id = url.split('/d/')[1].split('/')[0]
download_url = f'https://drive.google.com/uc?id={file_id}'

# Download the file
output = 'data.csv'
gdown.download(download_url, output, quiet=False)

# Load into pandas
df_bp = pd.read_csv(output)

# Delete the downloaded CSV file
os.remove(output)

Downloading...
From: https://drive.google.com/uc?id=1I5GsOlCdIdY3iTc6wFQoTdsMWKj64vHA
To: /Users/sharear/Documents/sky/RESEARCH_&_GRAD_SCHOOL/Penn_State/identify_Homo_base_pairs/scripts/data.csv
100%|██████████| 1.39M/1.39M [00:00<00:00, 14.6MB/s]


#### Step 2: Fixing '5E54' becoming '5.00E+54' issue

In [25]:
df_bp1= fix_PDB_ID(df_bp)

124D
124D
124D
124D
124D
124D
124D
124D
176D
176D
176D
176D
176D
176D
17RA
17RA
17RA
17RA
17RA
17RA
17RA
17RA
1A1T
1A1T
1A1T
1A1T
1A1T
1A1T
1A1T
1A1T
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A3M
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4D
1A4T
1A4T
1A4T
1A4T
1A4T
1A4T
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A51
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A60
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1A9L
1AC3
1AC3
1AC3
1AC3
1AC3
1AC3
1AC3
1AC3
1AC3
1AC3
1AC3
1AC3
1AFX
1AFX
1AFX
1AFX
1AJF
1AJF
1AJF
1AJF
1AJF
1AJF
1AJF
1AJF
1AJL
1AJL
1AJL
1AJL
1AJL
1AJL
1AJL
1AJT
1AJT
1AJT
1AJT
1AJT
1AJT
1AJT
1AJU
1AJU
1AJU
1AJU
1AJU
1AJU
1AJU
1AJU
1AJU
1AJU
1AJU
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AKX
1AL5
1AL5
1AL5
1AL5
1AL5
1AL5
1AL5
1AL5
1AL5
1AL5
1AL5
1AL5
1AM0
1AM0
1AM0
1AM0
1AM0
1AM0
1ANR
1ANR


#### Step 3: Dropping null residue indexes 

In [26]:
df_bp2 = df_bp1.dropna(subset=['res_index_res1', 'res_index_res2'])
df_bp2.index= np.arange(0, len(df_bp2))

#### Step 4: Making sure icode columns will not contain null values

In [28]:
#making sure icode columns will not contain null values
df_bp2['icode_res1'] = df_bp2['icode_res1'].fillna('*')
df_bp2['icode_res1'] = df_bp2['icode_res1'].replace('nan', '*')

df_bp2['icode_res2'] = df_bp2['icode_res2'].fillna('*')
df_bp2['icode_res2'] = df_bp2['icode_res2'].replace('nan', '*')


#converting residue index into integer
df_bp2['res_index_res1'] = df_bp2['res_index_res1'].astype(float).astype(int).astype(str)
df_bp2['res_index_res2'] = df_bp2['res_index_res2'].astype(float).astype(int).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bp2['icode_res1'] = df_bp2['icode_res1'].fillna('*')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bp2['icode_res1'] = df_bp2['icode_res1'].replace('nan', '*')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bp2['icode_res2'] = df_bp2['icode_res2'].fillna('*')
A value is trying to be set o

#### Step 5: Removing any hydrogen bonds which can result into some weird unstable base pairs

In [29]:
df_bp2 = df_bp2[~df_bp2['atoms_ID_hbond_1'].str.contains("'", regex=False, na=False)]
df_bp2.index= np.arange(0, len(df_bp2))

df_bp2 = df_bp2[~df_bp2['atoms_ID_hbond_2'].str.contains("'", regex=False, na=False)]
df_bp2.index= np.arange(0, len(df_bp2))

df_bp2 = df_bp2[~df_bp2['atoms_ID_hbond_3'].str.contains("'", regex=False, na=False)]
df_bp2.index= np.arange(0, len(df_bp2))

df_bp2 = df_bp2[~df_bp2['atoms_ID_hbond_1'].str.contains("OP", regex=False, na=False)]
df_bp2.index= np.arange(0, len(df_bp2))

df_bp2 = df_bp2[~df_bp2['atoms_ID_hbond_2'].str.contains("OP", regex=False, na=False)]
df_bp2.index= np.arange(0, len(df_bp2))

df_bp2 = df_bp2[~df_bp2['atoms_ID_hbond_3'].str.contains("OP", regex=False, na=False)]
df_bp2.index= np.arange(0, len(df_bp2))

#### Step 6: (optional) Removing any base pair involved with DNA or covalently modified residues

In [30]:
df_bp2['bp_name_res']= df_bp2['res_ID_res1']+'-'+df_bp2['res_ID_res2']
df_bp3= df_bp2[(df_bp2['bp_name_res']== df_bp2['base_pair'])]
df_bp3.index= np.arange(0, len(df_bp3))

In [10]:
print (df_bp2.shape)
print (df_bp3.shape)

(8746, 35)
(7664, 35)


#### Step 7: Replacing '--' from the name column to 'undefined' 

In [31]:
df_bp3['name'] = df_bp3['name'].replace('--', 'undefined')
df_bp3['name'] = df_bp3['name'].replace('WC', 'WCF')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bp3['name'] = df_bp3['name'].replace('--', 'undefined')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bp3['name'] = df_bp3['name'].replace('WC', 'WCF')


#### Step 8: Droping atom serial number columns 

In [32]:
columns_to_drop=[]
for cols in df_bp3.columns:
    if cols.startswith('atoms_serNum'):
        columns_to_drop.append(cols)
    
df_bp4 = df_bp3.drop(columns=columns_to_drop).copy()

In [33]:
df_bp4.shape

(7664, 30)

#### Step 9: Applying a distance cut-off for all hbonds 

In [37]:
dcols=[]
for cols in df_bp4.columns:
    if cols.startswith('distance'):
        dcols.append(cols)
df_bp4[dcols] = df_bp4[dcols].astype(float)
df_bp5 = df_bp4[~((df_bp4[dcols] > 3.4).any(axis=1))]
df_bp5[dcols] = df_bp5[dcols].astype(str)
df_bp5.index= np.arange(0, len(df_bp5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bp5[dcols] = df_bp5[dcols].astype(str)


#### Step 10: Applying a resolution cut-off 

In [35]:
df_bp5['Resolution_(Å)'] = df_bp5['Resolution_(Å)'].astype(float)
df_bp6 = df_bp5[~(df_bp5['Resolution_(Å)'] > 3)]
df_bp6.index= np.arange(0, len(df_bp6))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bp5['Resolution_(Å)'] = df_bp5['Resolution_(Å)'].astype(float)


In [3]:
df_bp.columns

Index(['PDB_ID', 'Experimental_Method', 'Resolution_(Å)', 'nt1',
       'chain_ID_res1', 'res_ID_res1', 'res_index_res1', 'icode_res1', 'nt2',
       'chain_ID_res2', 'res_ID_res2', 'res_index_res2', 'icode_res2',
       'base_pair', 'name', 'Saenger', 'LW', 'DSSR', 'bp_res',
       'distance_hbond_1', 'distance_hbond_2', 'distance_hbond_3',
       'atoms_serNum_hbond_1', 'atoms_serNum_hbond_2', 'atoms_serNum_hbond_3',
       'atoms_ID_hbond_1', 'atoms_ID_hbond_2', 'atoms_ID_hbond_3',
       'distance_hbond_4', 'atoms_serNum_hbond_4', 'atoms_ID_hbond_4',
       'distance_hbond_5', 'distance_hbond_6', 'atoms_serNum_hbond_5',
       'atoms_serNum_hbond_6', 'atoms_ID_hbond_5', 'atoms_ID_hbond_6',
       'distance_hbond_7', 'distance_hbond_8', 'atoms_serNum_hbond_7',
       'atoms_serNum_hbond_8', 'atoms_ID_hbond_7', 'atoms_ID_hbond_8',
       'distance_hbond_9', 'atoms_serNum_hbond_9', 'atoms_ID_hbond_9',
       'distance_hbond_10', 'atoms_serNum_hbond_10', 'atoms_ID_hbond_10'],
      dty

In [21]:
df_bp.shape

(3728509, 49)

In [36]:
df_bp5

Unnamed: 0,PDB_ID,Experimental_Method,Resolution_(Å),nt1,chain_ID_res1,res_ID_res1,res_index_res1,icode_res1,nt2,chain_ID_res2,...,distance_hbond_2,distance_hbond_3,distance_hbond_4,atoms_ID_hbond_1,atoms_ID_hbond_2,atoms_ID_hbond_3,atoms_ID_hbond_4,distance_hbond_5,atoms_ID_hbond_5,bp_name_res
0,17RA,SOLUTION NMR,,A.A7,A,A,7,*,A.U16,A,...,,,,A.N1-U.O2,,,,,,A-U
1,17RA,SOLUTION NMR,,A.C3,A,C,3,*,A.G19,A,...,3.222,3.121,,C.O2-G.N2,C.N3-G.N1,C.N4-G.O6,,,,C-G
2,17RA,SOLUTION NMR,,A.G1,A,G,1,*,A.C21,A,...,3.237,3.191,,G.O6-C.N4,G.N1-C.N3,G.N2-C.O2,,,,G-C
3,17RA,SOLUTION NMR,,A.G2,A,G,2,*,A.C20,A,...,3.201,3.131,,G.O6-C.N4,G.N1-C.N3,G.N2-C.O2,,,,G-C
4,17RA,SOLUTION NMR,,A.G4,A,G,4,*,A.U18,A,...,3.192,,,G.O6-U.N3,G.N1-U.O2,,,,,G-U
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7659,9KAD,SOLUTION NMR,,A.C8,A,C,8,*,B.G18,B,...,2.969,2.903,,C.O2-G.N2,C.N3-G.N1,C.N4-G.O6,,,,C-G
7660,9KAD,SOLUTION NMR,,A.G1,A,G,1,*,B.C25,B,...,2.964,2.862,,G.O6-C.N4,G.N1-C.N3,G.N2-C.O2,,,,G-C
7661,9KAD,SOLUTION NMR,,A.G10,A,G,10,*,B.C16,B,...,2.933,2.965,,G.O6-C.N4,G.N1-C.N3,G.N2-C.O2,,,,G-C
7662,9KAD,SOLUTION NMR,,A.G4,A,G,4,*,B.C22,B,...,2.992,2.927,,G.O6-C.N4,G.N1-C.N3,G.N2-C.O2,,,,G-C


In [18]:
df_bp5['name'].value_counts()

name
WCF            6209
undefined       541
Wobble          459
Imino            77
Sheared          68
~Wobble          41
rHoogsteen       37
~Sheared         24
Calcutta          2
~rHoogsteen       2
Platform          2
Name: count, dtype: int64

# Identifying base pair of interest

In [74]:
GG_ad= find_bp_interest(df_bp5, 'A-U', ['A.N6-U.O4', 'A.N1-U.N3'])

hbond_1
--------------------------------------------->>> we came here
hbond_2
--------------------------------------------->>> we came here
hbond_3
--------------------------------------------->>> we came here
hbond_4
--------------------------------------------->>> we came here
hbond_1
--------------------------------------------->>> we came here
hbond_2
--------------------------------------------->>> we came here
hbond_3
--------------------------------------------->>> we came here
hbond_4
--------------------------------------------->>> we came here
hbond_5
--------------------------------------------->>> we came here
hbond_5
--------------------------------------------->>> we came here


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[new_col] = df1[col2].astype(str) + '_' + df1[col1].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[new_col] = df1[col2].astype(str) + '_' + df1[col1].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[new_col] = df1[col2].astype(str) + '_' + df1[col1].astype(str)
A va

KeyError: 'combined_hbond_6'

In [47]:
GG_ad= df_bp5[df_bp5['base_pair'] == 'G-G']

In [71]:
GG_ad

Unnamed: 0,PDB_ID,Experimental_Method,Resolution_(Å),nt1,chain_ID_res1,res_ID_res1,res_index_res1,icode_res1,nt2,chain_ID_res2,...,distance_hbond_5,atoms_ID_hbond_5,bp_name_res,combined_hbond_1,combined_hbond_2,combined_hbond_3,combined_hbond_4,combined_hbond_5,A.N6-U.O4,A.N1-U.N3
1,17RA,SOLUTION NMR,,A.U5,A,U,5,*,A.A17,A,...,,,U-A,U.N3-A.N1_3.177,U.O4-A.N6_3.132,nan_nan,nan_nan,nan_nan,3.132,3.177
2,1A1T,SOLUTION NMR,,B.A203,B,A,203,*,B.U218,B,...,,,A-U,A.N6-U.O4_2.888,A.N1-U.N3_2.879,nan_nan,nan_nan,nan_nan,2.888,2.879
3,1A1T,SOLUTION NMR,,B.A206,B,A,206,*,B.U215,B,...,,,A-U,A.N6-U.O4_2.8,A.N1-U.N3_2.745,nan_nan,nan_nan,nan_nan,2.800,2.745
4,1A1T,SOLUTION NMR,,B.U205,B,U,205,*,B.A216,B,...,,,U-A,U.N3-A.N1_2.711,U.O4-A.N6_2.697,nan_nan,nan_nan,nan_nan,2.697,2.711
5,1A3M,SOLUTION NMR,,A.A9,A,A,9,*,B.U18,B,...,,,A-U,A.N6-U.O4_3.164,A.N1-U.N3_3.061,nan_nan,nan_nan,nan_nan,3.164,3.061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2029,9IOS,SOLUTION NMR,,A.U4,A,U,4,*,A.A17,A,...,,,U-A,U.N3-A.N1_2.929,U.O4-A.N6_2.961,nan_nan,nan_nan,nan_nan,2.961,2.929
2030,9IOS,SOLUTION NMR,,A.U7,A,U,7,*,A.A13,A,...,,,U-A,U.N3-A.N1_2.88,U.O4-A.N6_2.886,nan_nan,nan_nan,nan_nan,2.886,2.880
2031,9IOU,SOLUTION NMR,,A.A3,A,A,3,*,A.U18,A,...,,,A-U,A.N6-U.O4_2.999,A.N1-U.N3_3.026,nan_nan,nan_nan,nan_nan,2.999,3.026
2032,9IOU,SOLUTION NMR,,A.U4,A,U,4,*,A.A17,A,...,,,U-A,U.N3-A.N1_2.853,U.O4-A.N6_2.957,nan_nan,nan_nan,nan_nan,2.957,2.853
