# Processing DHFR QM data for all residues
## Data from Lichirui and Xu, run with OLYP and pcSseg-0 basis set(lightweighted,good for NMR)
## Data location: ~/Dropbox/Xu_MDQMMM_data/afnmr_restart
## usiing KcsA_torsion python environment


In [1]:
# data structures
# snapshots fold: 0:100:9900
# within each fold, contains output for each residues named as schrodinger_md_{foldername}_{residues num}.out
# for example /Users/yunyao_1/Dropbox/Xu_MDQMMM_data/afnmr_restart/8400/schrodinger_md_8400_1.out
# within each output, we need write a function to parse the output file to get the data we want
# reuse some old scripts: ./read_orca.py and ./orca_analyzer.py
# read_orca.py: read orca output file and return a dictionary for certain nuclei (input as a flag, optional)
# the dictionary use nuclei as key(e.g 13N, atom_index+nuclei) and the value is 3*3 numpy array for shielding tensor
# orca_analyzer.py: read the dictionary from read_orca.py and correct for reference, then calculate the iso, aniso(delta and ita)
# correction value is input as a flag (based on TMS) for c $ref{"C"} = 189.8;  # TMS\
#    $reference_set = "OLYP ref_seg0 ";
#    $ref{"H"} = 33.2;   # TMS
#    $ref{"C"} = 189.8;  # TMS
#    $ref{"N"} = 241.0;  # CH3NO2 + 380.5
#    $ref{"P"} = 300.0;  # guess
#    $ref{"F"} = 130.2;  # CFCl3

# output will a csv file, whith column names=['resNO','atom','resNam','iso','delta','ita','alpha','betta','gamma']
# alternatively way is to use afnmr to extra csa value: extract_shift_v2.py (pqr file is needed)

# I used the second method as the output results are clean, and I don't want to spend time to tide up the first method.


In [2]:
# the first mthoed is kept here for reference

# from orca_analyzer import ShieldingTensorAnalyzer
# from read_orca import extract_tensors_from_fileo
# import os
# import numpy as np

# def analyze_orca_output(path):
#     utput_dir = os.path.dirname(path)
#     file_name = os.path.splitext(os.path.basename(path))[0] 
#     c_dataset = extract_tensors_from_file(path, 'C')
#     n_dataset = extract_tensors_from_file(path, 'N')
#     h_dataset = extract_tensors_from_file(path, 'H')
    
#     # Initialize analyzers
#     c_analyzer = ShieldingTensorAnalyzer(c_dataset, 189.8)
#     n_analyzer = ShieldingTensorAnalyzer(n_dataset, 241.0)
#     h_analyzer = ShieldingTensorAnalyzer(h_dataset, 33.2)

#     # Compute isotropic shift, delta value, and ita value for Cd
#     cd_iso = [cd_analyzer.isotropic_shift(key) for key in cd_dataset.keys() if 'Cd' in key]
#     cd_delta = [cd_analyzer.delta_value(key) for key in cd_dataset.keys() if 'Cd' in key]
#     cd_ita = [cd_analyzer.ita_value(key) for key in cd_dataset.keys() if 'Cd' in key]

#     # Compute isotropic shift, delta value, and ita value for Se
#     se_iso = [se_analyzer.isotropic_shift(key) for key in se_dataset.keys() if 'Se' in key]
#     se_delta = [se_analyzer.delta_value(key) for key in se_dataset.keys() if 'Se' in key]
#     se_ita = [se_analyzer.ita_value(key) for key in se_dataset.keys() if 'Se' in key]

In [5]:
# read parsed nmr data
#directory for calcuated chemical shift inforamtion
import pandas as pd
import os
dir_cc='/Users/yunyao_1/Dropbox/Xu_MDQMMM_data/afnmr_restart/results'
# read files that starts with results
cc_dhfr_dict={}
for file in os.listdir(dir_cc):
    if file[0]=='r':
        full_f=os.path.join(dir_cc,file)
        cc_dhfr_dict[file.rstrip('.txt')]=pd.read_csv(full_f, sep='\t', header=None, names=['resNO','atom','resNam','iso','delta','ita','alpha','betta','gamma'])

In [6]:
# show the pd
cc_dhfr_dict['results_8400'].head()

Unnamed: 0,resNO,atom,resNam,iso,delta,ita,alpha,betta,gamma
0,1,N,MET,34.013,17.278,0.494,189.709,211.357,219.894
1,1,H1,MET,5.894,-10.386,0.33,20.401,23.825,37.692
2,1,H2,MET,4.408,-12.122,0.023,22.588,22.872,40.914
3,1,H3,MET,4.369,-9.106,0.588,21.602,26.955,37.937
4,1,CA,MET,57.332,-15.316,0.913,117.817,131.805,147.784


In [7]:
#  load torsion angle data in csv format 
dir_torsion='/Users/yunyao_1/Dropbox/KcsA/DHFR_Analysis/DHFR_torsion.csv'
torsion_df=pd.read_csv(dir_torsion)
torsion_df.head()


Unnamed: 0,frame_index,time_ps,A:1-psi,A:1-phi,A:2-psi,A:2-phi,A:3-psi,A:3-phi,A:4-psi,A:4-phi,...,A:155-psi,A:155-phi,A:156-psi,A:156-phi,A:157-psi,A:157-phi,A:158-psi,A:158-phi,A:159-psi,A:159-phi
0,0,0.0,86.91942,,113.065137,-81.87222,108.559,-105.456855,143.372481,-77.357033,...,139.4686,-131.468975,166.04743,-116.360639,149.337292,-143.895993,115.925083,-83.371272,,-136.724001
1,1,100.002,127.803489,,140.505293,-77.625424,156.902225,-130.515766,140.17295,-115.719789,...,137.334279,-150.518883,140.723689,-132.932105,150.600987,-107.068357,87.456414,-83.233679,,-86.561022
2,2,200.004,169.019657,,121.453296,-89.879954,146.118386,-100.815532,154.583913,-102.580808,...,141.049642,-127.595077,146.59525,-126.552846,152.769779,-124.511919,93.008272,-73.742338,,-118.395834
3,3,300.006,-176.729882,,139.931256,-116.353946,132.604424,-125.527717,140.046267,-94.277335,...,146.231135,-123.836972,162.874574,-132.747141,153.904041,-132.124027,119.048934,-84.470075,,-129.902697
4,4,400.008,165.790724,,140.149853,-94.004681,112.31663,-131.697686,134.215861,-62.143013,...,139.055973,-127.821038,148.804039,-107.272919,147.852278,-135.433615,109.189503,-85.331834,,-124.448206


In [12]:
# combine all resulsts into one dataframe
# column include: resNO, atom, resNam, iso, delta, ita, alpha, betta, gamma, frame, torsion_p, torsion2
combined_df = pd.DataFrame(columns=['resNO', 'atom', 'resNam', 'iso', 'delta', 'ita', 'alpha', 'betta', 'gamma', 'frame'])

for key, df in cc_dhfr_dict.items():
    # key is in results_XXXX format, extract the frame number
    key = int(key.split('_')[1])
    df['frame'] = key
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# show the combined dataframe
combined_df.head()


Unnamed: 0,resNO,atom,resNam,iso,delta,ita,alpha,betta,gamma,frame
0,1,N,MET,22.136,16.024,0.233,202.84,225.012,228.74,9200
1,1,H1,MET,6.375,-12.215,0.315,18.794,22.641,39.04,9200
2,1,H2,MET,4.622,-11.899,0.263,21.066,24.192,40.477,9200
3,1,H3,MET,4.316,-11.676,0.022,22.916,23.177,40.56,9200
4,1,CA,MET,57.089,-22.118,0.746,113.398,129.906,154.829,9200


In [17]:
# Step 1: make sure time_ps is integer to align with frame
torsion_df["frame"] = torsion_df["time_ps"].astype(int)

# Step 2: pick one frame or merge all?
# If you want to merge by frame, you need df1 to also have a frame column
# Let's assume df1 already has "frame" (if not, you can assign one)
if "frame" not in combined_df.columns:
    raise ValueError("combined_df needs a 'frame' column to match with torsion_df['frame'].")

# Step 3: function to extract phi/psi for given residue
def get_angles(row, df2):
    frame = row["frame"]
    resno = row["resNO"]
    phi_col = f"A:{resno}-phi"
    psi_col = f"A:{resno}-psi"

    if phi_col in torsion_df.columns and psi_col in torsion_df.columns:
        match = torsion_df.loc[torsion_df["frame"] == frame]
        if not match.empty:
            phi = match[phi_col].values[0]
            psi = match[psi_col].values[0]
            return pd.Series({"phi": phi, "psi": psi})
    return pd.Series({"phi": np.nan, "psi": np.nan})

# Step 4: apply
angles = combined_df.apply(get_angles, axis=1, df2=torsion_df)

# Step 5: concatenate
final_df = pd.concat([combined_df, angles], axis=1)

# Step 6: reorder
final_df = final_df[['frame','resNO','atom','resNam','iso','delta','ita','alpha','betta','gamma','phi','psi']]

print(final_df.head(30))

   frame resNO  atom resNam      iso   delta    ita    alpha    betta  \
0   9200     1     N    MET   22.136  16.024  0.233  202.840  225.012   
1   9200     1    H1    MET    6.375 -12.215  0.315   18.794   22.641   
2   9200     1    H2    MET    4.622 -11.899  0.263   21.066   24.192   
3   9200     1    H3    MET    4.316 -11.676  0.022   22.916   23.177   
4   9200     1    CA    MET   57.089 -22.118  0.746  113.398  129.906   
5   9200     1    HA    MET    4.308   4.190  0.801   24.702   29.309   
6   9200     1    CB    MET   34.799 -16.650  0.471  142.751  150.601   
7   9200     1   HB2    MET    2.096  -4.393  0.864   27.009   30.805   
8   9200     1   HB3    MET    3.012  -7.456  0.949   22.923   29.998   
9   9200     1    CG    MET   32.836  28.005  0.713  128.959  160.986   
10  9200     1   HG2    MET    2.047   6.603  0.850   24.550   31.648   
11  9200     1   HG3    MET    2.313   5.929  0.755   24.958   31.614   
12  9200     1    CE    MET   27.105 -27.843  0.258

In [18]:
# sort final_df by frame and resNO
final_df = final_df.sort_values(by=['frame', 'resNO']).reset_index(drop=True)
# save the data to csv
final_df.to_csv('/Users/yunyao_1/Dropbox/KcsA/DHFR_Analysis/DHFR_NMR_torsion_cs_combined.csv', index=False)

In [16]:
# analysis ideas to do 
# Question 1: is it possible to use torsion angle  to predict backbone chemical shift?
# Question 2: adding residue type as a categorical variable
# Question 3. adding franking residue type as categorical variable
# Question 4. adding sidechain torsion angle as variable (calculatd CB position vs. real position )
# Question 4. adding torsion angles of franking residues as variable
# Question 5. adding solvent accessible surface area as variable (using mdtraj to calculate)
# Question 6. adding H-bonding information as variable (using mdtraj to calculate)
# Question 7. using ML method to predict chemical shift (e.g. random forest, xgboost, neural network)
# Question 8. using deep learning method to predict chemical shift (e.g. graph neural network)
# Question 9. using ESM model to code sequece and structure information (e.g. esm2, esmfold)
# Question 10. using attention mechanism to predict chemical shift (e.g. transformer)

In [None]:
# visualization torsion flip and chemical shift change
# from previous analysis, these reisudes show flipping behavior: 23 42 55 60 65 97 131
# define  neareast neighbour function to group the torsion angle into two states if possible
# input will be final_df['resNO], assign 0 or 1 for each state based on phi/psi angle
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def knn_cluster_torsion(torsion_data, min_samples=5, random_state=42):
    """
    Automatically cluster torsion data per residue into 1 or 2 groups
    based on silhouette score.

    Parameters
    ----------
    torsion_data : dict
        Dictionary {residue: np.ndarray of torsion values}
    min_samples : int
        Minimum number of samples needed to attempt clustering
    random_state : int
        Random state for reproducibility

    Returns
    -------
    clusters : dict
        Dictionary {residue: cluster_labels}
    """

    clusters = {}
    for residue, values in torsion_data.items():
        values = np.array(values).reshape(-1, 1)

        if len(values) < min_samples:
            # Too few samples → put all in one group
            clusters[residue] = np.zeros(len(values), dtype=int)
            continue

        # --- Try 1 cluster ---
        km1 = KMeans(n_clusters=1, random_state=random_state).fit(values)
        labels1 = km1.labels_
        score1 = -1  # silhouette not defined for 1 cluster

        # --- Try 2 clusters ---
        km2 = KMeans(n_clusters=2, random_state=random_state).fit(values)
        labels2 = km2.labels_
        try:
            score2 = silhouette_score(values, labels2)
        except:
            score2 = -1

        # Decide
        if score2 > 0.3:  # threshold: strong cluster separation
            clusters[residue] = labels2
        else:
            clusters[residue] = labels1

    return clusters
