In [79]:
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm

# Load the raw tab-separated measurements
data_path = 'data/mann_bruker.txt'
df = pd.read_csv(data_path, sep='\t')

# Collapse repeated measurements: one row per (Sequence, Charge) pair holding
# the mean CCS and the first Mass value encountered for that pair
df_grouped = (
    df.groupby(['Sequence', 'Charge'], as_index=False)
      .agg(CCS=('CCS', 'mean'), Mass=('Mass', 'first'))
)
# Length is the number of characters in the sequence string
df_grouped['Length'] = df_grouped['Sequence'].map(len)

# Calculate RDKit descriptors
def calculate_descriptors(sequence):
    """Compute three RDKit descriptors for a sequence given as a FASTA string.

    Args:
        sequence: String handed to ``Chem.MolFromFASTA``.

    Returns:
        A ``pd.Series`` with three positional entries, in this order:
        ``Descriptors.HeavyAtomCount``, ``Descriptors.NumRotatableBonds`` and
        ``Descriptors.FractionCSP3`` of the parsed molecule. When
        ``Chem.MolFromFASTA`` returns ``None`` all three entries are ``None``.
    """
    mol = Chem.MolFromFASTA(sequence)
    # Guard clause: no molecule could be built, so there is nothing to measure
    if mol is None:
        return pd.Series([None] * 3)
    descriptor_functions = (
        Descriptors.HeavyAtomCount,
        Descriptors.NumRotatableBonds,
        Descriptors.FractionCSP3,
    )
    return pd.Series([compute(mol) for compute in descriptor_functions])

# Register progress_apply on pandas objects so the per-sequence step shows a progress bar
tqdm.pandas()
descriptor_columns = ['NumAtoms', 'NumRotatableBonds', 'FractionSP3']
descriptor_frame = df_grouped['Sequence'].progress_apply(calculate_descriptors)
# The three positional outputs of calculate_descriptors fill the three named columns
df_grouped[descriptor_columns] = descriptor_frame


# Make sure the output directory is present before writing
os.makedirs('data', exist_ok=True)

# Persist the aggregated table together with its descriptor columns
output_path = 'data/final_data.csv'
df_grouped.to_csv(output_path, index=False)

100%|██████████| 231611/231611 [01:06<00:00, 3484.71it/s]


In [80]:
import pandas as pd

# Locations of the raw measurements and of the aggregated table
data_path = 'data/mann_bruker.txt'
output_path = 'data/final_data.csv'

# Load both tables: the raw file is tab-separated, the processed one is a plain CSV
df = pd.read_csv(data_path, sep='\t')
df_grouped = pd.read_csv(output_path)

# Row counts before and after aggregation
original_data_points, final_data_points = df.shape[0], df_grouped.shape[0]

print(f"Number of data points in the original data: {original_data_points}")
print(f"Number of data points in the final processed data: {final_data_points}")

Number of data points in the original data: 440762
Number of data points in the final processed data: 231611


In [85]:
import pandas as pd

# Read the final processed data (loaded once; nothing below modifies it)
output_path = 'data/final_data.csv'
df_grouped = pd.read_csv(output_path)

# Report every missing cell as "Sequence / column name".
# The boolean mask is scanned in one vectorized pass instead of a Python-level
# iterrows() loop over every row. nonzero() yields positions in row-major
# order, so the lines are printed in the same order as a row-by-row scan.
empty_values = df_grouped.isnull()
missing_rows, missing_cols = empty_values.to_numpy().nonzero()
sequence_values = df_grouped['Sequence'].to_numpy()
for row_pos, col_pos in zip(missing_rows, missing_cols):
    sequence = sequence_values[row_pos]
    column = empty_values.columns[col_pos]
    print(f"Sequence: {sequence}, Empty Value in Column: {column}")


# Check if all sequence and charge combinations are unique
duplicates = df_grouped.duplicated(subset=['Sequence', 'Charge'])
all_unique = not duplicates.any()

print(f"All sequence and charge combinations are unique: {all_unique}")

# Print some examples with all columns
print("Examples of data:")
print(df_grouped.head())

# If there are duplicates, print some examples with all columns
if not all_unique:
    print("Examples of duplicate sequence and charge combinations:")
    print(df_grouped[duplicates].head())

# Print the number of different sequences
num_unique_sequences = df_grouped['Sequence'].nunique()
print(f"Number of different sequences: {num_unique_sequences}")

# Group by Sequence and count unique charges
sequence_charge_counts = df_grouped.groupby('Sequence')['Charge'].nunique()

# Keep only sequences observed with more than one distinct charge
sequences_with_multiple_charges = sequence_charge_counts[sequence_charge_counts > 1]

# Print the number of sequences with different charges
num_sequences_with_multiple_charges = len(sequences_with_multiple_charges)
print(f"Number of sequences with different charges: {num_sequences_with_multiple_charges}")





Sequence: HCHCCUR, Empty Value in Column: NumAtoms
Sequence: HCHCCUR, Empty Value in Column: NumRotatableBonds
Sequence: HCHCCUR, Empty Value in Column: FractionSP3
Sequence: NQAKKUEUPSN, Empty Value in Column: NumAtoms
Sequence: NQAKKUEUPSN, Empty Value in Column: NumRotatableBonds
Sequence: NQAKKUEUPSN, Empty Value in Column: FractionSP3
Sequence: QLSSHFQIYPFSLRKPNSDULGMEEK, Empty Value in Column: NumAtoms
Sequence: QLSSHFQIYPFSLRKPNSDULGMEEK, Empty Value in Column: NumRotatableBonds
Sequence: QLSSHFQIYPFSLRKPNSDULGMEEK, Empty Value in Column: FractionSP3
Sequence: TRKUSFK, Empty Value in Column: NumAtoms
Sequence: TRKUSFK, Empty Value in Column: NumRotatableBonds
Sequence: TRKUSFK, Empty Value in Column: FractionSP3
Sequence: VLIRVTYCGLUSYSLRYILLK, Empty Value in Column: NumAtoms
Sequence: VLIRVTYCGLUSYSLRYILLK, Empty Value in Column: NumRotatableBonds
Sequence: VLIRVTYCGLUSYSLRYILLK, Empty Value in Column: FractionSP3
Sequence: VLLIENVASLUGTTVR, Empty Value in Column: NumAtoms
Sequ

In [88]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def getSplittedData(selected_features, train_size, val_size, test_size,
                    file_path='data/final_data.csv', random_state=42):
    """Load the processed data and return min-max-scaled train/validation/test splits.

    Rows are split by ``Sequence`` group, so every row of a given sequence
    lands in exactly one of the three splits. Both scalers are fitted on the
    training split only and then applied to the validation and test splits.

    Args:
        selected_features: Column names used as model inputs.
        train_size: Intended training fraction. The split itself is fully
            determined by ``val_size`` and ``test_size``; this value is only
            checked for consistency (a warning is issued if the three
            fractions do not sum to 1). ``None`` skips the check.
        val_size: Fraction of all sequences used for validation.
        test_size: Fraction of all sequences used for testing.
        file_path: CSV file to load. Rows containing any missing value are
            dropped before splitting.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(y_train, y_val, y_test, X_train, X_val, X_test)``. The ``y_*``
        values are arrays of shape ``(n, 1)`` holding the scaled ``CCS``
        column; the ``X_*`` values are DataFrames with a fresh default index
        and the columns given in ``selected_features``.

    Raises:
        ValueError: If ``test_size`` is not strictly between 0 and 1, or
            ``val_size`` is not strictly between 0 and ``1 - test_size``.
    """
    import warnings

    # Fail early with a clear message instead of a division by zero or an
    # opaque error from the second train_test_split call.
    if not 0 < test_size < 1:
        raise ValueError(f"test_size must be strictly between 0 and 1, got {test_size!r}")
    if not 0 < val_size < 1 - test_size:
        raise ValueError(
            f"val_size must be strictly between 0 and 1 - test_size, got {val_size!r}"
        )
    if train_size is not None and abs(train_size + val_size + test_size - 1.0) > 1e-9:
        warnings.warn(
            "train_size + val_size + test_size does not sum to 1; "
            "the training split receives whatever remains after val_size and test_size."
        )

    # Load data and discard incomplete rows
    data = pd.read_csv(file_path, sep=",").dropna()

    # Number the sequence groups in the order groupby would iterate over them.
    # Splitting these integer ids selects the same groups as splitting a list
    # of per-sequence DataFrames, without building one DataFrame per sequence.
    group_ids = data.groupby("Sequence").ngroup()
    all_groups = list(range(data["Sequence"].nunique()))

    # First split: separate the test groups from the rest
    train_cur, test_groups = train_test_split(
        all_groups, test_size=test_size, random_state=random_state
    )

    # Second split: divide the remainder into training and validation.
    # val_size is a fraction of the whole data set, so it is rescaled to a
    # fraction of the remainder (e.g. 0.10 / 0.85).
    train_groups, val_groups = train_test_split(
        train_cur, test_size=val_size / (1 - test_size), random_state=random_state
    )

    def _rows_in_group_order(ids):
        # Rows of the given groups, ordered group by group as listed in `ids`;
        # the stable sort keeps the file order of rows inside each group.
        rank = pd.Series(range(len(ids)), index=ids)
        ordered = group_ids.map(rank).dropna().sort_values(kind="stable")
        return data.loc[ordered.index]

    train_df = _rows_in_group_order(train_groups)
    test_df = _rows_in_group_order(test_groups)
    val_df = _rows_in_group_order(val_groups)

    y_scaler = MinMaxScaler()
    X_scaler = MinMaxScaler()

    def _scale(split_df, fit=False):
        # CCS is multiplied by a constant factor before scaling. The reason for
        # 1e40 is not evident from this file; min-max scaling makes the result
        # independent of it up to floating-point rounding.
        y_unscaled = split_df['CCS'].values.reshape(-1, 1) * 1e40
        X_unscaled = split_df[selected_features]
        if fit:
            y_scaled = y_scaler.fit_transform(y_unscaled)
            X_scaled = X_scaler.fit_transform(X_unscaled)
        else:
            y_scaled = y_scaler.transform(y_unscaled)
            X_scaled = X_scaler.transform(X_unscaled)
        return y_scaled, pd.DataFrame(X_scaled, columns=X_unscaled.columns)

    # Fit on the training split only, then reuse the fitted scalers
    y_train, X_train = _scale(train_df, fit=True)
    y_val, X_val = _scale(val_df)
    y_test, X_test = _scale(test_df)

    return y_train, y_val, y_test, X_train, X_val, X_test


# Input columns for the model and the intended split proportions
selected_features = ['Charge', 'Mass', 'Length', 'NumAtoms', 'NumRotatableBonds', 'FractionSP3']
train_size, val_size, test_size = 0.75, 0.10, 0.15

# Build the scaled splits
y_train, y_val, y_test, X_train, X_val, X_test = getSplittedData(
    selected_features, train_size, val_size, test_size
)

# Make sure the output directory is present
os.makedirs('data', exist_ok=True)

# Feature tables are already DataFrames and are written directly
for features_frame, csv_path in (
    (X_train, 'data/X_train.csv'),
    (X_val, 'data/X_val.csv'),
    (X_test, 'data/X_test.csv'),
):
    features_frame.to_csv(csv_path, index=False)

# Targets are wrapped in a single-column 'CCS' frame before writing
for target_values, csv_path in (
    (y_train, 'data/y_train.csv'),
    (y_val, 'data/y_val.csv'),
    (y_test, 'data/y_test.csv'),
):
    pd.DataFrame(target_values, columns=['CCS']).to_csv(csv_path, index=False)



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Directory that receives the figures; created on first use
folder_path = 'pictures'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Load the train, validation, and test datasets written by the split step
X_train = pd.read_csv('data/X_train.csv')
X_val = pd.read_csv('data/X_val.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_val = pd.read_csv('data/y_val.csv')
y_test = pd.read_csv('data/y_test.csv')

# Feature columns that get a histogram
features = ['Mass', 'Length', 'NumAtoms', 'NumRotatableBonds', 'FractionSP3']

# Set ggplot style
plt.style.use('ggplot')


def _plot_split_histograms(train_values, test_values, val_values, xlabel, title, save_path):
    """Overlay train/test/validation histograms of one column, save and show the figure."""
    plt.figure(figsize=(6, 4))

    # Drawn in this order so validation ends up on top (highest zorder)
    for values, label, color, alpha, zorder in (
        (train_values, 'Train', 'blue', 0.5, 1),
        (test_values, 'Test', 'red', 0.5, 2),
        (val_values, 'Validation', 'green', 0.7, 3),
    ):
        plt.hist(values, bins=100, alpha=alpha, label=label, color=color, zorder=zorder)

    plt.xlabel(xlabel, fontsize=18)
    plt.ylabel('Frequency', fontsize=18)
    plt.legend(fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(title, fontsize=18)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    plt.show()


# One figure per feature column
for feature in features:
    _plot_split_histograms(
        X_train[feature],
        X_test[feature],
        X_val[feature],
        feature,
        f'Distribution of {feature}',
        f'pictures/{feature}_distribution.png',
    )

# Same figure layout for the CCS target
_plot_split_histograms(
    y_train['CCS'],
    y_test['CCS'],
    y_val['CCS'],
    'CCS',
    'Distribution of CCS',
    'pictures/CCS_distribution.png',
)

In [None]:

# Plot the distribution of Charge for each dataset.
#
# The Charge column in data/X_*.csv was written after MinMaxScaler had been
# applied, so it holds scaled values (training values lie in [0, 1]), not raw
# charge states. Fixed integer bin edges [1, 2, 3, 4, 5] would therefore miss
# almost every row. Instead, rows are counted per distinct scaled level and
# one bar is drawn per level.
charge_splits = [
    ('Train', X_train, 'blue', 0.5, 1),
    ('Test', X_test, 'red', 0.5, 2),
    ('Validation', X_val, 'green', 0.7, 3),
]

# Rounding protects the level matching against float noise from the CSV round trip
charge_by_split = {label: frame['Charge'].round(6) for label, frame, *_ in charge_splits}
charge_levels = sorted(set().union(*(values.unique() for values in charge_by_split.values())))
bar_positions = list(range(len(charge_levels)))

plt.figure(figsize=(6, 4))

# Same draw order, colours and transparency as the other distribution plots
for label, _, color, alpha, zorder in charge_splits:
    counts = charge_by_split[label].value_counts().reindex(charge_levels, fill_value=0)
    plt.bar(bar_positions, counts.to_numpy(), width=1.0, alpha=alpha,
            label=label, color=color, zorder=zorder)

plt.xlabel('Charge Category (scaled)', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.legend(fontsize=14)
# Tick labels show the scaled level each bar stands for
plt.xticks(bar_positions, [f'{level:g}' for level in charge_levels], fontsize=14)
plt.yticks(fontsize=14)
plt.title('Distribution of Charge Categories', fontsize=18)
plt.tight_layout()
plt.savefig('pictures/ChargeCategory_distribution.png', dpi=300)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import os

# Directory that receives the figure; created on first use
folder_path = 'pictures'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Load the aggregated table
file_path = 'data/final_data.csv'
df_grouped = pd.read_csv(file_path)

# Add the mass-to-charge ratio as its own column
df_grouped['mass/Charge'] = df_grouped['Mass'].div(df_grouped['Charge'])

# Scatter of CCS against mass/Charge, one colour per charge value
plt.figure(figsize=(6, 4))
scatter = plt.scatter(
    df_grouped['mass/Charge'],
    df_grouped['CCS'],
    c=df_grouped['Charge'],
    cmap='viridis',
    alpha=0.5,
)
plt.xlabel('mass/Charge', fontsize=18)
plt.ylabel('CCS', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.colorbar(scatter, label='Charge')
plt.tight_layout()
plt.savefig('pictures/CCS_vs_mass_Charge_colored_by_Charge.png', dpi=300)
plt.show()

In [None]:
# Sanity check on 'HCHCCUR', one of the sequences reported with empty descriptor columns.
# Relies on calculate_descriptors having been defined by an earlier cell.
sequence = 'HCHCCUR'
descriptors = calculate_descriptors(sequence)
values_as_list = descriptors.tolist()
print(f"Descriptors for sequence {sequence}: {values_as_list}")