In [1]:
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the peptide table with the CCS measurements.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
csv_path = 'C:/Users/samue/OneDrive/Desktop/Ion mobility/Data3.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,Sequence,Charge,Mass,Length,CCS,CCS_Min,CCS_Max,CCS_Std,Acetylation,Oxidation,Retention time,Experiment,NumRotatableBonds,NumConformers,NumAtoms,FractionSP3
0,AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGP...,4,4401.106890,52,2.654046e-35,2.654046e-35,2.654046e-35,0.0,0,0,3494.40,1_A1_01_2767,134,0,309,0.668508
1,AAAAAAAAAK,1,785.439552,10,5.996675e-36,5.996675e-36,5.996675e-36,0.0,0,0,747.88,1_A1_01_2767,23,0,55,0.696970
2,AAAAAAAAAK,1,785.439552,10,5.996673e-36,5.996673e-36,5.996673e-36,0.0,0,0,753.65,4_A1_01_2770,23,0,55,0.696970
3,AAAAAAAAAK,1,785.439552,10,5.996674e-36,5.996674e-36,5.996674e-36,0.0,0,0,754.34,3_A1_01_2769,23,0,55,0.696970
4,AAAAAAAAAK,1,785.439552,10,5.996677e-36,5.996677e-36,5.996677e-36,0.0,0,0,761.19,2_A1_01_2768,23,0,55,0.696970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319123,YYYQGCASWK,2,1324.554660,10,1.742139e-35,1.742139e-35,1.742139e-35,0.0,0,0,1994.70,1_A1_01_2767,36,0,90,0.383333
319124,YYYVCQYCPAMKTYLNK,3,2264.010450,17,2.726609e-35,2.726609e-35,2.726609e-35,0.0,0,0,3047.60,3_A1_01_2769,64,0,150,0.510000
319125,YYYVCQYCPAMKTYLNK,3,2264.010450,17,2.726609e-35,2.726609e-35,2.726609e-35,0.0,0,0,3053.50,4_A1_01_2770,64,0,150,0.510000
319126,YYYVPADFVEYEK,2,1684.766090,13,1.524399e-35,1.524399e-35,1.524399e-35,0.0,0,0,3863.30,1_A1_01_2767,48,0,121,0.445783


In [2]:
# Optional restriction to a single charge state (currently disabled):
#data = data[data['Charge'] == 4]

# Model inputs: the peptide sequence plus numeric descriptors.
# NOTE(review): 'Charge' is not among the selected columns, although the plot
# title further down mentions "charge as feature" - confirm which is intended.
feature_columns = [
    'Sequence',
    'Mass',
    'Length',
    'Acetylation',
    'Oxidation',
    'Retention time',
    'NumAtoms',
    'FractionSP3',
]
Xn = data[feature_columns]

# Regression target.
y = data['CCS']
Xn

In [2]:
# Standardize the CCS target. StandardScaler works on 2-D input, so the
# Series is reshaped into a single column first.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in the notebook.
scaler = StandardScaler()
y_column = y.values.reshape(-1, 1)
y_scaled = scaler.fit_transform(y_column)

# Same values wrapped in a DataFrame for display.
y_scaled_df = pd.DataFrame(data=y_scaled, columns=['CCS'])
y_scaled_df

In [2]:
# Quick sanity check of the scaled target: largest value, the flat index at
# which it occurs, and smallest value (printed one per line, in that order).
for value in (y_scaled.max(), np.argmax(y_scaled), y_scaled.min()):
    print(value)

In [2]:
# The 20 standard amino acids plus selenocysteine ('U'). Any other character
# (including the 'X' used for padding) is mapped to one shared "unknown" slot.
amino_acids = 'ACDEFGHIKLMNPQRSTVWYU'
aa_to_int = {aa: i for i, aa in enumerate(amino_acids)}

def one_hot_encode(seq, max_length, dtype=float):
    """One-hot encode a peptide sequence into a flat, fixed-length vector.

    The sequence is right-padded with 'X' up to ``max_length``. Every position
    becomes a block of ``len(amino_acids) + 1`` entries: one per letter in
    ``amino_acids`` plus a final "unknown" entry that is used for padding and
    for any character not found in ``amino_acids``.

    Parameters
    ----------
    seq : str
        Peptide sequence in one-letter code.
    max_length : int
        Number of positions to encode; must be at least ``len(seq)``.
    dtype : data-type, optional
        Element type of the returned array. Defaults to ``float`` (float64).
        A smaller type such as ``np.uint8`` or ``np.float32`` reduces memory
        when many sequences are encoded.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``max_length * (len(amino_acids) + 1)`` holding
        exactly one 1 per position.

    Raises
    ------
    IndexError
        If ``seq`` is longer than ``max_length``.
    """
    if len(seq) > max_length:
        # str.ljust never truncates, so an over-long sequence would index past
        # the end of the matrix; fail early with a message that says why.
        raise IndexError(
            f"sequence of length {len(seq)} does not fit into max_length={max_length}"
        )

    unknown_index = len(amino_acids)
    padded_seq = seq.ljust(max_length, 'X')

    # An explicit integer dtype keeps the index array valid even when it is empty.
    codes = np.fromiter(
        (aa_to_int.get(aa, unknown_index) for aa in padded_seq),
        dtype=np.intp,
        count=max_length,
    )

    one_hot_encoded = np.zeros((max_length, unknown_index + 1), dtype=dtype)
    # Set one entry per row in a single vectorised assignment.
    one_hot_encoded[np.arange(max_length), codes] = 1
    return one_hot_encoded.flatten()

In [2]:
# Every peptide is padded to the length of the longest sequence so that all
# encoded vectors have the same size.
max_seq_length = int(Xn['Sequence'].map(len).max())

# One row per peptide; the temporary list is not given a name so it can be
# released as soon as the 2-D array has been built.
one_hot_encoded_sequences = np.array(
    [one_hot_encode(peptide, max_seq_length) for peptide in Xn['Sequence']]
)

# Label encoding of the charge states (currently disabled):
#label_encoder = LabelEncoder()
#encoded_charge_states = label_encoder.fit_transform(data['Charge'].tolist())

In [2]:
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_sequences)

# Combine the numeric descriptors with the one-hot encoded sequence columns.
# The raw 'Sequence' text column is dropped because its information is now
# carried by the one-hot columns.
numeric_features = Xn.drop(columns='Sequence').reset_index(drop=True)
X_combined = pd.concat(
    [numeric_features, one_hot_encoded_df.reset_index(drop=True)],
    axis=1,
)

# The one-hot columns carry integer labels; convert all labels to strings so
# the column index has a single type.
X_combined.columns = X_combined.columns.astype(str)
X_combined

In [2]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): rows are split at random, so a peptide measured in several
# experiments can appear in both the training and the test set - confirm
# that this is acceptable for the reported scores.
split = train_test_split(X_combined, y_scaled, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [2]:
# Gradient-boosted regression trees on the numeric + one-hot features.
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# y_train is an (n, 1) column because the target was reshaped for the scaler.
# fit() expects a 1-D target, so flatten it to avoid a DataConversionWarning.
model.fit(X_train, np.ravel(y_train))

# Evaluate on the held-out rows only, so the error and R² shown together in
# the plot title both describe the same unseen data.
y_pred = model.predict(X_test)

# Despite its name, `rmse` holds the plain mean squared error, which is what
# the plot title labels it as ("MSE"). The name is kept because the plotting
# cell reads it. The `squared` keyword is not passed: the default already
# yields MSE, and the keyword was deprecated in scikit-learn 1.4.
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [2]:
# Predicted vs. actual (standardized) CCS for the held-out rows. Points on
# the red diagonal are perfect predictions.
# NOTE(review): the title says "charge as feature", but 'Charge' is not among
# the feature columns selected earlier in the notebook - confirm.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, color='blue', label='Vorhersagen', s=10)

diagonal = [y_scaled.min(), y_scaled.max()]
ax.plot(diagonal, diagonal, color='red', linestyle='-', label='Ideal')

ax.set_xlabel('Actual values')
ax.set_ylabel('Predicted values')
ax.set_title(f'All charges & charge as feature; R²-Score: {r2:.4f}, MSE: {rmse:.4f}')
ax.legend()
plt.show()

In [2]:
# Booster settings: tree depth 5 and learning rate (eta) 0.1.
params = {'max_depth': 5, 'eta': 0.1}

# Wrap both splits, with their targets as labels, in XGBoost's DMatrix container.
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)

# Train for 100 boosting rounds with the native XGBoost API.
model3 = xgb.train(params, xgb_train, num_boost_round=100)

In [2]:
# Integer copy of the test targets.
# NOTE(review): astype(int) truncates toward zero, so every standardized value
# in (-1, 1) becomes 0. Any metric computed from this array no longer
# reflects the continuous CCS target.
y_test_array = y_test.astype(dtype=int)

In [2]:
# CCS prediction is a regression task, so the predictions stay continuous.
# Casting them with astype(int) truncates toward zero (every value in (-1, 1)
# becomes 0) and throws away almost all of the predicted signal.
preds = model3.predict(xgb_test)

# accuracy_score is a classification metric and is not meaningful for a
# continuous target. Report the root-mean-squared error on the standardized
# CCS scale instead. y_test is an (n, 1) column; flatten it to match preds.
rmse_xgb = np.sqrt(mean_squared_error(np.ravel(y_test), preds))
print('RMSE of the XGBoost model (standardized CCS) is:', rmse_xgb)

In [2]:
# Debug aid: print the element types of the test targets and the predictions.
for arr in (y_test, preds):
    print(arr.dtype)

In [2]:
# Score the XGBoost regressor on the continuous, standardized CCS values.
# The predictions are recomputed here and compared with y_test directly, so
# the metrics do not depend on integer-truncated copies of either array
# (astype(int) truncates toward zero and would distort both R² and MSE).
xgb_predictions = model3.predict(xgb_test)
y_test_flat = np.ravel(y_test)  # (n, 1) column -> 1-D, matching the predictions

r2_xgb = r2_score(y_test_flat, xgb_predictions)
mse_xgb = mean_squared_error(y_test_flat, xgb_predictions)
print("Mean Squared Error of the model is:", mse_xgb)
print('R2 Score of the model is:', r2_xgb)
