In [24]:
""" 
This line of code is to separate the Solvent accessible surface area (sasa) files from dataset, concatenate all sasa dataframes into a single dataframe
and specify for only the 'Apo' & 'Holo' structures
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil

# Folder that holds the raw output files, and the folder that collects the SASA tables
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/qFit_sasa_files/'

# Make sure the collection folder is present (no error when it already is)
os.makedirs(destination_directory, exist_ok=True)

# Snapshot of the entries currently in the source folder
file_list = os.listdir(source_directory)

# Relocate every entry named '*_qFit_sasa.csv'; everything else stays in place
for file in file_list:
    if not file.endswith('_qFit_sasa.csv'):
        continue
    source_file_path = os.path.join(source_directory, file)
    destination_file_path = os.path.join(destination_directory, file)
    shutil.move(source_file_path, destination_file_path)

print("Files ending with 'qFit_sasa.csv' have been successfully transferred to the separate folder.")

# Directory containing the per-structure SASA .csv files
# NOTE(review): this is not the same folder as `destination_directory` used by
# the move step earlier in this cell — confirm this is the intended input.
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/qFit_sasa_files/qFit_sasa_files'

# The combined table is saved into `directory` itself, so it must be skipped
# when the cell is re-run; otherwise every row would be read a second time.
combined_csv_name = 'combined_sasa_df.csv'

# Initialize an empty list to hold the dataframes
dataframes = []

# sorted() keeps the row order of the combined table reproducible
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv') or filename == combined_csv_name:
        continue

    # Drop the extension and the '_qFit_sasa' tag from the file name
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_qFit_sasa', '')

    # Read the csv file and tag every row with the cleaned base name
    sasa_df = pd.read_csv(os.path.join(directory, filename))
    sasa_df['Cleaned Base Name'] = cleaned_base_name
    dataframes.append(sasa_df)

# Concatenate all dataframes in the list into a single dataframe and save it
combined_sasa_df = pd.concat(dataframes, ignore_index=True)
combined_sasa_df.to_csv(os.path.join(directory, combined_csv_name), index=False)
print("SASA file check:", combined_sasa_df.head(10))

# How to separate all Apo/Holo structures from the combined SASA dataframe
# Load the apo/holo template into a dataframe
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

# Define criteria for Apo structure
def apo_df(template=None):
    """Return the non-empty 'Apo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with an 'Apo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Apo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Apo'].notna() & (table['Apo'] != '')
    return table.loc[has_id, ['Apo']]
# Keep only the SASA rows whose 'Cleaned Base Name' appears in the 'Apo' column
apo_data = apo_df()
apo_row_mask = combined_sasa_df['Cleaned Base Name'].isin(apo_data['Apo'])
apo_sasa_aligned = combined_sasa_df[apo_row_mask]

# Save the Apo subset and show its first rows
apo_sasa_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_sasa_aligned.csv',
    index=False,
)
print("Apo SASA:", apo_sasa_aligned.head(10))

# Define criteria for Holo structure
def holo_df(template=None):
    """Return the non-empty 'Holo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with a 'Holo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Holo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Holo'].notna() & (table['Holo'] != '')
    return table.loc[has_id, ['Holo']]
# Keep only the SASA rows whose 'Cleaned Base Name' appears in the 'Holo' column
holo_data = holo_df()
holo_row_mask = combined_sasa_df['Cleaned Base Name'].isin(holo_data['Holo'])
holo_sasa_aligned = combined_sasa_df[holo_row_mask]

# Save the Holo subset and show its first rows
holo_sasa_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_sasa_aligned.csv',
    index=False,
)
print("Holo SASA:", holo_sasa_aligned.head(10))

Files ending with 'qFit_sasa.csv' have been successfully transferred to the separate folder.
SASA file check:   Residue Chain  Residue ID Atom Alt Loc  SASA Cleaned Base Name
0     PRO     A           2    N          5.47              16gs
1     PRO     A           2   CA          0.00              16gs
2     PRO     A           2    C          0.00              16gs
3     PRO     A           2    O          0.00              16gs
4     PRO     A           2   CB          0.00              16gs
5     PRO     A           2   CG          0.00              16gs
6     PRO     A           2   CD          2.42              16gs
7     TYR     A           3    N          0.00              16gs
8     TYR     A           3   CA          0.00              16gs
9     TYR     A           3    C          0.00              16gs
Apo SASA:   Residue Chain  Residue ID Atom Alt Loc  SASA Cleaned Base Name
0     PRO     A           2    N          5.47              16gs
1     PRO     A           2   CA   

In [None]:
""" 
This line of code is to separate the solvent accessible surface area (sasa) of residues within 5.0 Angstroms of the binding site files from dataset, concatenate all sasa 5.0 dataframes into a single dataframe
and specify for only the 'Apo' & 'Holo' structures
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil

# Folder that holds the raw output files, and the folder that collects the 5.0 SASA subset tables
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_sasa_subset_files/'

# Make sure the collection folder is present (no error when it already is)
os.makedirs(destination_directory, exist_ok=True)

# Snapshot of the entries currently in the source folder
file_list = os.listdir(source_directory)

# Relocate every entry named '*_5.0_sasa_subset.csv'; everything else stays in place
for file in file_list:
    if not file.endswith('_5.0_sasa_subset.csv'):
        continue
    source_file_path = os.path.join(source_directory, file)
    destination_file_path = os.path.join(destination_directory, file)
    shutil.move(source_file_path, destination_file_path)

print("Files ending with '5.0_sasa_subset.csv' have been successfully transferred to the separate folder.")

# Tag each 5_sasa_subset file with its cleaned base name, then concatenate
# all of them into a single dataframe
# Directory containing the .csv files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_sasa_subset_files'

# The combined table is saved into `directory` itself, so it must be skipped
# when the cell is re-run; otherwise every row would be read a second time.
combined_csv_name = 'combined_5_sasa_subset_df.csv'

# Initialize an empty list to hold the dataframes
dataframes = []

# sorted() keeps the row order of the combined table reproducible
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv') or filename == combined_csv_name:
        continue

    # Drop the extension and the '_5.0_sasa_subset' tag from the file name
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_5.0_sasa_subset', '')

    # Read the csv file and tag every row with the cleaned base name
    _5_sasa_subset_df = pd.read_csv(os.path.join(directory, filename))
    _5_sasa_subset_df['Cleaned Base Name'] = cleaned_base_name
    dataframes.append(_5_sasa_subset_df)

# Concatenate all dataframes in the list into a single dataframe and save it
combined_5_sasa_subset_df = pd.concat(dataframes, ignore_index=True)
combined_5_sasa_subset_df.to_csv(os.path.join(directory, combined_csv_name), index=False)
print("SASA_5.0 file check:", combined_5_sasa_subset_df.head(10))

# How to separate all Apo/Holo structures from the combined dataframe
# Load the apo/holo template into a dataframe
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

# Define criteria for Apo structure
def apo_df(template=None):
    """Return the non-empty 'Apo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with an 'Apo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Apo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Apo'].notna() & (table['Apo'] != '')
    return table.loc[has_id, ['Apo']]
# Keep only the SASA 5.0 rows whose 'Cleaned Base Name' appears in the 'Apo' column
apo_data = apo_df()
apo_row_mask = combined_5_sasa_subset_df['Cleaned Base Name'].isin(apo_data['Apo'])
apo_5_sasa_subset_aligned = combined_5_sasa_subset_df[apo_row_mask]

# Save the Apo subset and show its first rows
apo_5_sasa_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_sasa_subset_aligned.csv',
    index=False,
)
print("Apo SASA 5.0:", apo_5_sasa_subset_aligned.head(10))

# Define criteria for Holo structure
def holo_df(template=None):
    """Return the non-empty 'Holo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with a 'Holo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Holo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Holo'].notna() & (table['Holo'] != '')
    return table.loc[has_id, ['Holo']]
# Keep only the SASA 5.0 rows whose 'Cleaned Base Name' appears in the 'Holo' column
holo_data = holo_df()
holo_row_mask = combined_5_sasa_subset_df['Cleaned Base Name'].isin(holo_data['Holo'])
holo_5_sasa_subset_aligned = combined_5_sasa_subset_df[holo_row_mask]

# Save the Holo subset and show its first rows
holo_5_sasa_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_5_sasa_subset_aligned.csv',
    index=False,
)
print("Holo SASA 5.0:", holo_5_sasa_subset_aligned.head(10))

In [22]:
""" 
This line of code is to separate qFit H-bond files from dataset, concatenate all H-bond dataframes into a single dataframe
and specify for only the 'Apo' & 'Holo' structures
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil

# Folder that holds the raw output files, and the folder that collects the H-bond tables
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/qFit_hbond_files/'

# Make sure the collection folder is present (no error when it already is)
os.makedirs(destination_directory, exist_ok=True)

# Snapshot of the entries currently in the source folder
file_list = os.listdir(source_directory)

# Relocate every entry named '*_qFit_hbonds.csv'; everything else stays in place
for file in file_list:
    if not file.endswith('_qFit_hbonds.csv'):
        continue
    source_file_path = os.path.join(source_directory, file)
    destination_file_path = os.path.join(destination_directory, file)
    shutil.move(source_file_path, destination_file_path)

print("Files ending with 'qFit_hbonds.csv' have been successfully transferred to the separate folder.")

# Directory containing the per-structure H-bond .csv files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/qFit_hbond_files'

# The combined table is saved into `directory` itself, so it must be skipped
# when the cell is re-run; otherwise every row would be read a second time.
# NOTE(review): the file name says "sasa" although it stores the H-bond table —
# looks like a copy-paste leftover. Kept unchanged because other scripts may
# read this exact name; rename once that is confirmed.
combined_csv_name = 'combined_sasa_df.csv'

# Initialize an empty list to hold the dataframes
dataframes = []

# sorted() keeps the row order of the combined table reproducible
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv') or filename == combined_csv_name:
        continue

    # Drop the extension and the '_qFit_hbonds' tag from the file name
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_qFit_hbonds', '')

    # Read the csv file and tag every row with the cleaned base name
    hbonds_df = pd.read_csv(os.path.join(directory, filename))
    hbonds_df['Cleaned Base Name'] = cleaned_base_name
    dataframes.append(hbonds_df)

# Concatenate all dataframes in the list into a single dataframe and save it
combined_hbonds_df = pd.concat(dataframes, ignore_index=True)
combined_hbonds_df.to_csv(os.path.join(directory, combined_csv_name), index=False)
print("Hbond file check:", combined_hbonds_df.head(10))

# How to separate all Apo/Holo structures from the combined qFit_hbond dataframe
# Load the apo/holo template into a dataframe
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

# Define criteria for Apo structure
def apo_df(template=None):
    """Return the non-empty 'Apo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with an 'Apo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Apo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Apo'].notna() & (table['Apo'] != '')
    return table.loc[has_id, ['Apo']]
# Keep only the H-bond rows whose 'Cleaned Base Name' appears in the 'Apo' column
apo_data = apo_df()
apo_row_mask = combined_hbonds_df['Cleaned Base Name'].isin(apo_data['Apo'])
apo_hbonds_aligned = combined_hbonds_df[apo_row_mask]

# Save the Apo subset and show its first rows
apo_hbonds_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_hbonds_aligned.csv',
    index=False,
)
print("Apo Hbonds:", apo_hbonds_aligned.head(10))

# Define criteria for Holo structure
def holo_df(template=None):
    """Return the non-empty 'Holo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with a 'Holo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Holo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Holo'].notna() & (table['Holo'] != '')
    return table.loc[has_id, ['Holo']]
# Keep only the H-bond rows whose 'Cleaned Base Name' appears in the 'Holo' column
holo_data = holo_df()
holo_row_mask = combined_hbonds_df['Cleaned Base Name'].isin(holo_data['Holo'])
holo_hbonds_aligned = combined_hbonds_df[holo_row_mask]

# Save the Holo subset and show its first rows
holo_hbonds_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_hbonds_aligned.csv',
    index=False,
)
print("Holo Hbonds:", holo_hbonds_aligned.head(10))

Files ending with 'qFit_hbonds.csv' have been successfully transferred to the separate folder.
Hbond file check:   donor_chain  donor_residue_number donor_residue_name donor_atom  \
0           A                     3                TYR          N   
1           A                     4                THR          N   
2           A                     5                VAL          N   
3           A                     6                VAL          N   
4           A                     7                TYR          N   
5           A                    10                VAL          N   
6           A                    11                ARG          N   
7           A                    11                ARG         NE   
8           A                    11                ARG        NH1   
9           A                    11                ARG        NH1   

  donor_altloc  donor_occupancy  donor_bfactor hydrogen_atom acceptor_chain  \
0                           1.0          19.93  

In [None]:
""" 
This line of code is to separate the h-bonds of residues within 5.0 Angstroms of the binding site files from dataset, concatenate all h-bond 5.0 dataframes into a single dataframe
and specify for only the 'Apo' & 'Holo' structures
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil

# Folder that holds the raw output files, and the folder that collects the 5.0 H-bond subset tables
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_hbond_subset_files/'

# Make sure the collection folder is present (no error when it already is)
os.makedirs(destination_directory, exist_ok=True)

# Snapshot of the entries currently in the source folder
file_list = os.listdir(source_directory)

# Relocate every entry named '*_5.0_hbond_subset.csv'; everything else stays in place
for file in file_list:
    if not file.endswith('_5.0_hbond_subset.csv'):
        continue
    source_file_path = os.path.join(source_directory, file)
    destination_file_path = os.path.join(destination_directory, file)
    shutil.move(source_file_path, destination_file_path)

print("Files ending with '5.0_hbond_subset.csv' have been successfully transferred to the separate folder.")

# Tag each 5_hbond_subset file with its cleaned base name, then concatenate
# all of them into a single dataframe
# Directory containing the .csv files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_hbond_subset_files'

# The combined table is saved into `directory` itself, so it must be skipped
# when the cell is re-run; otherwise every row would be read a second time.
combined_csv_name = 'combined_5_hbond_subset_df.csv'

# Initialize an empty list to hold the dataframes
dataframes = []

# sorted() keeps the row order of the combined table reproducible
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv') or filename == combined_csv_name:
        continue

    # Drop the extension and the '_5.0_hbond_subset' tag from the file name
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_5.0_hbond_subset', '')

    # Read the csv file and tag every row with the cleaned base name
    _5_hbond_subset_df = pd.read_csv(os.path.join(directory, filename))
    _5_hbond_subset_df['Cleaned Base Name'] = cleaned_base_name
    dataframes.append(_5_hbond_subset_df)

# Concatenate all dataframes in the list into a single dataframe and save it
combined_5_hbond_subset_df = pd.concat(dataframes, ignore_index=True)
combined_5_hbond_subset_df.to_csv(os.path.join(directory, combined_csv_name), index=False)
print("Hbond_5.0 file check:", combined_5_hbond_subset_df.head(10))

# How to separate all Apo/Holo structures from the combined dataframe
# Load the apo/holo template into a dataframe
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

# Define criteria for Apo structure
def apo_df(template=None):
    """Return the non-empty 'Apo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with an 'Apo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Apo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Apo'].notna() & (table['Apo'] != '')
    return table.loc[has_id, ['Apo']]
# Keep only the H-bond 5.0 rows whose 'Cleaned Base Name' appears in the 'Apo' column
apo_data = apo_df()
apo_row_mask = combined_5_hbond_subset_df['Cleaned Base Name'].isin(apo_data['Apo'])
apo_5_hbond_subset_aligned = combined_5_hbond_subset_df[apo_row_mask]

# Save the Apo subset and show its first rows
apo_5_hbond_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_hbond_subset_aligned.csv',
    index=False,
)
print("Apo Hbond 5.0:", apo_5_hbond_subset_aligned.head(10))

# Define criteria for Holo structure
def holo_df(template=None):
    """Return the non-empty 'Holo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with a 'Holo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Holo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; the notebook reuses that name for other tables.
    table = df if template is None else template
    has_id = table['Holo'].notna() & (table['Holo'] != '')
    return table.loc[has_id, ['Holo']]
# Keep only the H-bond 5.0 rows whose 'Cleaned Base Name' appears in the 'Holo' column
holo_data = holo_df()
holo_row_mask = combined_5_hbond_subset_df['Cleaned Base Name'].isin(holo_data['Holo'])
holo_5_hbond_subset_aligned = combined_5_hbond_subset_df[holo_row_mask]

# Save the Holo subset and show its first rows
holo_5_hbond_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_5_hbond_subset_aligned.csv',
    index=False,
)
print("Holo Hbond 5.0:", holo_5_hbond_subset_aligned.head(10))

In [None]:
""" 
This code concatenates the B-factor values of all residues, then gathers the B-factor subset files (residues within 5.0 Angstroms of the binding site) from the dataset, combines them into a single dataframe,
and splits that dataframe into the 'Apo' & 'Holo' structures
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil

# Locate every *_B_factors.csv file in the full-protein folder
b_factor_pattern = os.path.join('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241014T184131Z-001/full_protein', '*_B_factors.csv')
b_factor_files = glob.glob(b_factor_pattern)

# Read each match into its own DataFrame
b_factor_dfs = []
for file in b_factor_files:
    df = pd.read_csv(file)
    b_factor_dfs.append(df)

# Stack the per-file tables into one DataFrame
combined_b_factor_df = pd.concat(b_factor_dfs, ignore_index=True)

# Remove square brackets and single quotes from the 'chain' and 'resn'
# values, then trim surrounding whitespace
for label_column in ('chain', 'resn'):
    stripped_labels = combined_b_factor_df[label_column].str.replace(r"[\[\]']+", '', regex=True)
    combined_b_factor_df[label_column] = stripped_labels.str.strip()


# Show the first rows of the combined DataFrame
print(combined_b_factor_df.head(10))

# Folder that holds the raw output files, and the folder that collects the 5.0 B-factor subset tables
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_bfactor_subset_files/'

# Make sure the collection folder is present (no error when it already is)
os.makedirs(destination_directory, exist_ok=True)

# Snapshot of the entries currently in the source folder
file_list = os.listdir(source_directory)

# Relocate every entry named '*_5.0_bfactor_subset.csv'; everything else stays in place
for file in file_list:
    if not file.endswith('_5.0_bfactor_subset.csv'):
        continue
    source_file_path = os.path.join(source_directory, file)
    destination_file_path = os.path.join(destination_directory, file)
    shutil.move(source_file_path, destination_file_path)

print("Files ending with '5.0_bfactor_subset.csv' have been successfully transferred to the separate folder.")

# Directory containing the per-structure 5.0 B-factor subset .csv files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_bfactor_subset_files'

# The combined table is saved into `directory` itself, so it must be skipped
# when the cell is re-run; otherwise every row would be read a second time.
combined_csv_name = 'combined_5_bfactor_subset_df.csv'

# Initialize an empty list to hold the dataframes
dataframes = []

# sorted() keeps the row order of the combined table reproducible
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv') or filename == combined_csv_name:
        continue

    # Drop the extension and the '_5.0_bfactor_subset' tag from the file name
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_5.0_bfactor_subset', '')

    # Read the csv file and tag every row with the cleaned base name
    _5_bfactor_subset_df = pd.read_csv(os.path.join(directory, filename))
    _5_bfactor_subset_df['Cleaned Base Name'] = cleaned_base_name
    dataframes.append(_5_bfactor_subset_df)

# Concatenate all dataframes in the list into a single dataframe and save it
combined_5_bfactor_subset_df = pd.concat(dataframes, ignore_index=True)
combined_5_bfactor_subset_df.to_csv(os.path.join(directory, combined_csv_name), index=False)
print("Bfactor_5.0 file check:",combined_5_bfactor_subset_df.head(10))

# How to separate all Apo/Holo structures from the combined dataframe
# Load the apo/holo template into a dataframe
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

# Define criteria for Apo structure
def apo_df(template=None):
    """Return the non-empty 'Apo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with an 'Apo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Apo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; this cell itself rebinds ``df`` while reading
    # the B-factor files.
    table = df if template is None else template
    has_id = table['Apo'].notna() & (table['Apo'] != '')
    return table.loc[has_id, ['Apo']]
# Keep only the B-factor 5.0 rows whose 'Cleaned Base Name' appears in the 'Apo' column
apo_data = apo_df()
apo_row_mask = combined_5_bfactor_subset_df['Cleaned Base Name'].isin(apo_data['Apo'])
apo_5_bfactor_subset_aligned = combined_5_bfactor_subset_df[apo_row_mask]

# Save the Apo subset and show its first rows
apo_5_bfactor_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_bfactor_subset_aligned.csv',
    index=False,
)
print("Apo B-factor 5.0:", apo_5_bfactor_subset_aligned.head(10))

# Define criteria for Holo structure
def holo_df(template=None):
    """Return the non-empty 'Holo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with a 'Holo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Holo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; this cell itself rebinds ``df`` while reading
    # the B-factor files.
    table = df if template is None else template
    has_id = table['Holo'].notna() & (table['Holo'] != '')
    return table.loc[has_id, ['Holo']]
# Keep only the B-factor 5.0 rows whose 'Cleaned Base Name' appears in the 'Holo' column
holo_data = holo_df()
holo_row_mask = combined_5_bfactor_subset_df['Cleaned Base Name'].isin(holo_data['Holo'])
holo_5_bfactor_subset_aligned = combined_5_bfactor_subset_df[holo_row_mask]

# Save the Holo subset and show its first rows
holo_5_bfactor_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_5_bfactor_subset_aligned.csv',
    index=False,
)
print("Holo B-factor 5.0:", holo_5_bfactor_subset_aligned.head(10))

In [5]:
""" 
This code transfers and combines the Order Parameter dataset into my documents, separates the files for Order Parameter residues within 5.0 Angstroms of the binding site from the dataset, concatenates all Order Parameter 5.0 dataframes into a single dataframe,
and splits the results into the 'Apo' & 'Holo' structures
"""

import matplotlib.pyplot as plt
import pandas as pd
import mplcursors
import os
import numpy as np
from tqdm import tqdm
import shutil

#How to convert all .out files from desktop to .csv
#Directory containing the .out files (Desktop directory)
directory = "/Users/harrw10/Desktop/op_data"
#Loop through all .out files in the directory
for filename in os.listdir(directory):
    if not filename.endswith(".out"):
        continue

    # Swap only the trailing extension. str.replace(".out", ".csv") would also
    # rewrite a ".out" that happens to occur earlier in the file name.
    csv_filename = filename[:-len(".out")] + ".csv"

    # Copy the text of the .out file unchanged into the .csv file
    with open(os.path.join(directory, filename), 'r') as infile:
        content = infile.read()
    with open(os.path.join(directory, csv_filename), 'w') as outfile:
        outfile.write(content)

print("OP data Conversion complete!")

#Tag every OP table with its cleaned base name, then build one combined file
#Directory containing the .csv files
directory = '/Users/harrw10/Desktop/op_data'

#Location of the combined OP table (written below, then read back)
op_combined_path = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/op_subset_files/op_combined.csv'

#One dataframe per .csv file goes in here
dataframes = []

for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue

    #Drop the extension and the '_OP' tag from the file name
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_OP', '')

    #Load the table and record which file it came from
    df = pd.read_csv(os.path.join(directory, filename))
    df['Cleaned Base Name'] = cleaned_base_name
    dataframes.append(df)

#Stack all tables and save the result
combined_op_df = pd.concat(dataframes, ignore_index=True)
combined_op_df.to_csv(op_combined_path, index=False)

print("The combined dataset has been successfully saved to 'combined_dataset.csv'.")

#How to combine aspects of the 'Apo' data and OP_subset into a singular file
#Read the combined OP table back from disk and load the apo/holo template
combined_op_df = pd.read_csv(op_combined_path)
df = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

# Define criteria for Apo structure
def apo_df(template=None):
    """Return the non-empty 'Apo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with an 'Apo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Apo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; this cell also uses ``df`` for each OP file it reads.
    table = df if template is None else template
    has_id = table['Apo'].notna() & (table['Apo'] != '')
    return table.loc[has_id, ['Apo']]
# Keep only the OP rows whose 'Cleaned Base Name' appears in the 'Apo' column
apo_data = apo_df()
apo_row_mask = combined_op_df['Cleaned Base Name'].isin(apo_data['Apo'])
apo_op_subset_aligned = combined_op_df[apo_row_mask]

# Save the Apo subset and show its first rows
apo_op_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_op_subset_aligned.csv',
    index=False,
)
print("Apo OP:", apo_op_subset_aligned.head(10))

# Define criteria for Holo structure
def holo_df(template=None):
    """Return the non-empty 'Holo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with a 'Holo' column. When omitted, the module-level ``df``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Holo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument avoids depending on whatever ``df``
    # currently refers to; this cell also uses ``df`` for each OP file it reads.
    table = df if template is None else template
    has_id = table['Holo'].notna() & (table['Holo'] != '')
    return table.loc[has_id, ['Holo']]
# Keep only the OP rows whose 'Cleaned Base Name' appears in the 'Holo' column
holo_data = holo_df()
holo_row_mask = combined_op_df['Cleaned Base Name'].isin(holo_data['Holo'])
holo_op_subset_aligned = combined_op_df[holo_row_mask]

# Save the Holo subset and show its first rows
holo_op_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_op_subset_aligned.csv',
    index=False,
)
print("Holo OP:", holo_op_subset_aligned.head(10))

#Pull the _5.0_order_param_subset files out of the full data set into their own folder
#Folder that holds the raw output files, and the folder that collects the subset tables
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_op_subset_files/'

#Make sure the collection folder is present (no error when it already is)
os.makedirs(destination_directory, exist_ok=True)

#Snapshot of the entries currently in the source folder
file_list = os.listdir(source_directory)

#Relocate every entry named '*_5.0_order_param_subset.csv'; everything else stays in place
for file in file_list:
    if not file.endswith('_5.0_order_param_subset.csv'):
        continue
    source_file_path = os.path.join(source_directory, file)
    destination_file_path = os.path.join(destination_directory, file)
    shutil.move(source_file_path, destination_file_path)

print("Files ending with '5.0_order_param_subset.csv' have been successfully transferred to the separate folder.")

#Tag each 5_order_param_subset file with its cleaned base name, then
#concatenate all of them into a single dataframe
#Directory containing the .csv files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_op_subset_files'

#The combined table is saved into `directory` itself, so it must be skipped
#when the cell is re-run; otherwise every row would be read a second time.
combined_csv_name = 'combined_5_op_subset_df.csv'

#Initialize an empty list to hold the dataframes
dataframes_2_ = []

#sorted() keeps the row order of the combined table reproducible
#(os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv') or filename == combined_csv_name:
        continue

    #Drop the extension and the '_5.0_order_param_subset' tag from the file name
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_5.0_order_param_subset', '')

    #Read the csv file and tag every row with the cleaned base name
    _5_op_subset_df = pd.read_csv(os.path.join(directory, filename))
    _5_op_subset_df['Cleaned Base Name'] = cleaned_base_name
    dataframes_2_.append(_5_op_subset_df)

#Concatenate all dataframes in the list into a single dataframe and save it
combined_5_op_subset_df = pd.concat(dataframes_2_, ignore_index=True)
combined_5_op_subset_df.to_csv(os.path.join(directory, combined_csv_name), index=False)
print("Order Parameter_5.0 file check:",combined_5_op_subset_df.head(10))

# How to separate all Apo/Holo structures from the combined dataframe
# Load the apo/holo template into a dataframe
df_2 = pd.read_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv')

# Define criteria for Apo structure
def apo_df_2(template=None):
    """Return the non-empty 'Apo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with an 'Apo' column. When omitted, the module-level ``df_2``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Apo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument lets the function be used (and checked)
    # without relying on the module-level ``df_2``.
    table = df_2 if template is None else template
    has_id = table['Apo'].notna() & (table['Apo'] != '')
    return table.loc[has_id, ['Apo']]
# Keep only the OP 5.0 rows whose 'Cleaned Base Name' appears in the 'Apo' column
apo_data_2 = apo_df_2()
apo_row_mask_2 = combined_5_op_subset_df['Cleaned Base Name'].isin(apo_data_2['Apo'])
apo_5_op_subset_aligned = combined_5_op_subset_df[apo_row_mask_2]

# Save the Apo subset and show its first rows
apo_5_op_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_op_subset_aligned.csv',
    index=False,
)
print("Apo OP 5.0:", apo_5_op_subset_aligned.head(10))

# Define criteria for Holo structure
def holo_df_2(template=None):
    """Return the non-empty 'Holo' entries of the apo/holo template.

    Parameters
    ----------
    template : pandas.DataFrame, optional
        Table with a 'Holo' column. When omitted, the module-level ``df_2``
        is used, so the original zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ('Holo') with the rows whose value is neither
        missing nor an empty string.
    """
    # Taking the table as an argument lets the function be used (and checked)
    # without relying on the module-level ``df_2``.
    table = df_2 if template is None else template
    has_id = table['Holo'].notna() & (table['Holo'] != '')
    return table.loc[has_id, ['Holo']]
holo_data_2 = holo_df_2()

holo_5_op_subset_aligned = combined_5_op_subset_df[combined_5_op_subset_df['Cleaned Base Name'] .isin (holo_data_2['Holo'])]
holo_5_op_subset_aligned.to_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_5_op_subset_aligned.csv',index=False)
print("Holo OP 5.0:", holo_5_op_subset_aligned.head(10))

OP data Conversion complete!
The combined dataset has been successfully saved to 'combined_dataset.csv'.
Apo OP:      s2calc   s2ortho  s2ang resn  resi chain Cleaned Base Name
0  0.900755  0.900755    1.0  TYR     3     A              16gs
1  0.869301  0.869301    1.0  THR     4     A              16gs
2  0.920219  0.920219    1.0  VAL     5     A              16gs
3  0.861086  0.861086    1.0  VAL     6     A              16gs
4  0.878256  0.878256    1.0  TYR     7     A              16gs
5  0.807282  0.807282    1.0  PHE     8     A              16gs
6  0.858570  0.858570    1.0  VAL    10     A              16gs
7  0.919997  0.919997    1.0  ARG    11     A              16gs
8  0.894316  0.894316    1.0  ARG    13     A              16gs
9  0.923697  0.923697    1.0  CYS    14     A              16gs
Holo OP:         s2calc   s2ortho  s2ang resn  resi chain Cleaned Base Name
1248  0.894108  0.894108    1.0  TYR     3     A              19gs
1249  0.861579  0.861579    1.0  THR    

  combined_5_op_subset_df = pd.concat(dataframes_2_, ignore_index=True)


In [None]:
""" 
This cell moves the r-value files out of the dataset folder, concatenates all r-value dataframes into a single dataframe,
and then keeps only the rows belonging to the 'Apo' and 'Holo' structures
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil

# Folder holding the raw per-structure outputs, and the folder that collects the r-value files
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/rvalues_files/'

# The target folder must exist before files are moved into it
os.makedirs(destination_directory, exist_ok=True)

# Snapshot of everything currently in the source folder
file_list = os.listdir(source_directory)

# Move each '*_rvalues.csv' file across, keeping its file name
for file in file_list:
    if not file.endswith('_rvalues.csv'):
        continue
    shutil.move(
        os.path.join(source_directory, file),
        os.path.join(destination_directory, file),
    )

print("Files ending with 'rvalues.csv' have been successfully transferred to the separate folder.")

# Tag each r-values file with its PDB ID, then concatenate all of them into a single dataframe
# Directory containing the per-structure '<pdb>_rvalues.csv' files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/rvalues_files'

# Only per-structure files carry this suffix (the same one the move step uses).
# Matching on it instead of any '.csv' keeps 'combined_rvalues_df.csv', which is
# written into this same folder below, from being read back in on a re-run and
# duplicating every row.
rvalues_suffix = '_rvalues.csv'

# Initialize an empty list to hold the dataframes
dataframes = []

# sorted() makes the row order reproducible; os.listdir() returns names in
# arbitrary order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(rvalues_suffix):
        continue

    # Strip the extension and the '_rvalues' suffix to get the structure ID
    base_name, ext = os.path.splitext(filename)
    cleaned_base_name = base_name.replace('_rvalues', '')

    # Read the csv file and tag every row with the structure it came from
    rvalues_df = pd.read_csv(os.path.join(directory, filename))
    rvalues_df['Cleaned Base Name'] = cleaned_base_name

    dataframes.append(rvalues_df)

# pd.concat() on an empty list raises an opaque "No objects to concatenate";
# fail with a message that names the folder instead.
if not dataframes:
    raise ValueError(f"No '*{rvalues_suffix}' files found in {directory!r}; nothing to concatenate.")

# Concatenate all dataframes in the list into a single dataframe
combined_rvalues_df = pd.concat(dataframes, ignore_index=True)
combined_rvalues_df.to_csv('/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/rvalues_files/combined_rvalues_df.csv', index=False)
print("rvalue file check:", combined_rvalues_df.head(10))

# Split the combined r-values table into Apo and Holo subsets.
# The apo/holo template supplies the structure IDs for each group.
df = pd.read_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv'
)


def apo_df():
    """Return the template's 'Apo' column as a one-column dataframe.

    Rows whose 'Apo' entry is missing (NaN) or an empty string are dropped.
    Reads the module-level `df` template.
    """
    apo_ids = df['Apo']
    has_apo_id = apo_ids.notna() & (apo_ids != '')
    return df.loc[has_apo_id, ['Apo']]


apo_data = apo_df()

# Keep only the r-value rows whose structure ID is listed as Apo
apo_rvalues_aligned = combined_rvalues_df.loc[
    combined_rvalues_df['Cleaned Base Name'].isin(apo_data['Apo'])
]
apo_rvalues_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_rvalues_aligned.csv',
    index=False,
)
print("Apo rvalues:", apo_rvalues_aligned.head(10))

def holo_df():
    """Return the template's 'Holo' column as a one-column dataframe.

    Rows whose 'Holo' entry is missing (NaN) or an empty string are dropped.
    Reads the module-level `df` template.
    """
    holo_ids = df['Holo']
    has_holo_id = holo_ids.notna() & (holo_ids != '')
    return df.loc[has_holo_id, ['Holo']]


holo_data = holo_df()

# Keep only the r-value rows whose structure ID is listed as Holo
holo_rvalues_aligned = combined_rvalues_df.loc[
    combined_rvalues_df['Cleaned Base Name'].isin(holo_data['Holo'])
]
holo_rvalues_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_rvalues_aligned.csv',
    index=False,
)
print("Holo rvalues:", holo_rvalues_aligned.head(10))

In [52]:
""" 
This cell moves the 5.0 closeres files out of the dataset folder, concatenates all of their dataframes into a single dataframe,
and then keeps only the rows belonging to the 'Apo' and 'Holo' structures
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from scipy import stats
import shutil

# Folder holding the raw per-structure outputs, and the folder that collects the 5.0 closeres files
source_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/full_protein-20241018T143402Z-001/full_protein/'
destination_directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_closeres_subset_files/'

# The target folder must exist before files are moved into it
os.makedirs(destination_directory, exist_ok=True)

# Snapshot of everything currently in the source folder
file_list = os.listdir(source_directory)

# Move each '*_5.0_closeres.csv' file across, keeping its file name
for file in file_list:
    if not file.endswith('_5.0_closeres.csv'):
        continue
    shutil.move(
        os.path.join(source_directory, file),
        os.path.join(destination_directory, file),
    )

print("Files ending with '5.0_closeres.csv' have been successfully transferred to the separate folder.")

# Directory containing the per-structure 5.0 closeres CSV files
directory = '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt//Stephanie Wankowicz research/5_closeres_subset_files'

# Initialize an empty list to hold the dataframes
dataframes = []

# sorted() makes the row order reproducible; os.listdir() returns names in
# arbitrary order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv'):
        continue

    # Text before the first underscore of the file name.
    # NOTE(review): assumed to be the PDB ID ('<pdb>_..._5.0_closeres.csv') - confirm.
    pdb_id = filename.split('_')[0]

    # Read the CSV file
    closeres_df = pd.read_csv(os.path.join(directory, filename))

    if 'PDB' in closeres_df.columns:
        # Drop the '_qFit' tag so IDs match the apo/holo template
        closeres_df['PDB'] = closeres_df['PDB'].str.replace('_qFit', '', regex=False)
    else:
        # The Apo/Holo selection later in this cell indexes the 'PDB' column;
        # without it these rows would fail or silently drop out, so fall back
        # to the ID taken from the file name.
        closeres_df['PDB'] = pdb_id

    dataframes.append(closeres_df)

# pd.concat() on an empty list raises an opaque "No objects to concatenate";
# fail with a message that names the folder instead.
if not dataframes:
    raise ValueError(f"No .csv files found in {directory!r}; nothing to concatenate.")

# Concatenate all dataframes in the list into a single dataframe
combined_5_closeres_df = pd.concat(dataframes, ignore_index=True)
# Remove any remaining duplicates
combined_5_closeres_df = combined_5_closeres_df.drop_duplicates().reset_index(drop=True)

print("Combined closeres files check:", combined_5_closeres_df.head(10))

# Split the combined closeres table into Apo and Holo subsets.
# The apo/holo template supplies the structure IDs for each group.
df = pd.read_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/apo_holo_241016.csv'
)


def apo_df():
    """Return the template's 'Apo' column as a one-column dataframe.

    Rows whose 'Apo' entry is missing (NaN) or an empty string are dropped.
    Reads the module-level `df` template.
    """
    apo_ids = df['Apo']
    has_apo_id = apo_ids.notna() & (apo_ids != '')
    return df.loc[has_apo_id, ['Apo']]


apo_data = apo_df()

# Keep only the closeres rows whose PDB ID is listed as Apo.
# NOTE(review): the recorded run of this cell printed an empty Apo frame -
# confirm whether Apo structures simply have no closeres rows or the IDs fail to match.
apo_5_closeres_subset_aligned = combined_5_closeres_df.loc[
    combined_5_closeres_df['PDB'].isin(apo_data['Apo'])
]
apo_5_closeres_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/apo_5_closeres_subset_aligned.csv',
    index=False,
)
print("Apo closeres 5.0:", apo_5_closeres_subset_aligned.head(10))

def holo_df():
    """Return the template's 'Holo' column as a one-column dataframe.

    Rows whose 'Holo' entry is missing (NaN) or an empty string are dropped.
    Reads the module-level `df` template.
    """
    holo_ids = df['Holo']
    has_holo_id = holo_ids.notna() & (holo_ids != '')
    return df.loc[has_holo_id, ['Holo']]


holo_data = holo_df()

# Keep only the closeres rows whose PDB ID is listed as Holo
holo_5_closeres_subset_aligned = combined_5_closeres_df.loc[
    combined_5_closeres_df['PDB'].isin(holo_data['Holo'])
]
holo_5_closeres_subset_aligned.to_csv(
    '/Users/harrw10/OneDrive - Vanderbilt/Documents/Vanderbilt/Stephanie Wankowicz research/combined_full_protein_data/holo_5_closeres_subset_aligned.csv',
    index=False,
)

print("Holo closeres 5.0:", holo_5_closeres_subset_aligned.head(10))

Files ending with '5.0_closeres.csv' have been successfully transferred to the separate folder.
    resi chain   PDB  distance
0     51     A  19gs  2.398891
1    413     S  19gs  2.762175
2   1356     S  19gs  3.776991
3   1484     S  19gs  4.085050
4    621     S  19gs  4.419173
..   ...   ...   ...       ...
95   235     A  1c27  2.905827
96   132     B  1d4h  2.694492
97    82     A  1d4h  2.840231
98   919     S  1d4h  3.164080
99   176     B  1d4h  3.093016

[100 rows x 4 columns]
Apo closeres 5.0: Empty DataFrame
Columns: [resi, chain, PDB, distance]
Index: []
Holo closeres 5.0:    resi chain   PDB  distance
0    51     A  19gs  2.398891
1   413     S  19gs  2.762175
2  1356     S  19gs  3.776991
3  1484     S  19gs  4.085050
4   621     S  19gs  4.419173
5  1563     S  19gs  4.073359
6   108     A  19gs  2.877991
7    38     A  19gs  3.003979
8  1399     S  19gs  2.914223
9     8     A  19gs  3.305551
