<a href="https://colab.research.google.com/github/XyzoexR/TBI-Biomarker-Prediction/blob/main/Sympotom_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import numpy as np

In [2]:
# Read the ELISA-PCR summary sheet of the workbook into a DataFrame.
# The path points at the Colab runtime's /content directory, so the workbook
# has to be uploaded there before this cell is run.
file_path = '/content/TBI Sample Concentration Summary_20240923lm.xlsx'
sheet_name = 'TBI ELISA-PCR Summary'
data = pd.read_excel(io=file_path, sheet_name=sheet_name)

# Preview the first rows as parsed: most headers come through as 'Unnamed: N'
# and the descriptive labels sit in the first data row.
data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,BBX Conc. (ng/mL),Unnamed: 6,Unnamed: 7,Unnamed: 8,Elisa-PCR Meas. Conc. (ng/mL),Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
0,Item #,Sample name,source,Number of vials,Amount (mL) per vial,GFAP,NRGN,ST2,BDNF,GFAP,NRGN,ST2,BDNF,,,,BBX,ELISA-PCR,BBX Cutoff,ELISA-PCR CutOff
1,10,D 25,non-study TBI patient,1,0.5 mL,0.264,0.125,39,25,0.219283,0.94823,0.36493,7.223033,,GFAP,NPA,1,1,0.026,0.01
2,11,D 26,non-study TBI patient,1,1 mL,1.064,0.066,184,15,0.609849,10.43441,4.5147,7.478055,,,PPA,0.8,0.785714,,
3,12,D 27,non-study TBI patient,1,1 mL,1.57,0.096,48,21,1.381156,4.68625,44.82302,7.267844,,NRGN,NPA,1,0.9,0.07,2
4,13,D 28,non-study TBI patient,1,1 mL,0.062,0.041,45,14,0.04133,3.57475,13.48017,7.274194,,,PPA,0.4,0.6,,


In [3]:
# Single source of truth for the columns that are kept: raw spreadsheet header
# -> descriptive name. The two named headers contain a leading / trailing
# space, which must be reproduced exactly for the lookup to succeed.
column_renames = {
    'Unnamed: 0': 'Item #',
    'Unnamed: 1': 'Sample Name',
    'Unnamed: 2': 'Source',
    'Unnamed: 3': 'Number of vials',
    'Unnamed: 4': 'Amount per vial',
    ' BBX Conc. (ng/mL)': 'BBX_GFAP',
    'Unnamed: 6': 'BBX_NRGN',
    'Unnamed: 7': 'BBX_ST2',
    'Unnamed: 8': 'BBX_BDNF',
    'Elisa-PCR Meas. Conc. (ng/mL) ': 'ELISA_PCR_GFAP',
    'Unnamed: 10': 'ELISA_PCR_NRGN',
    'Unnamed: 11': 'ELISA_PCR_ST2',
    'Unnamed: 12': 'ELISA_PCR_BDNF',
}

# The kept columns are exactly the mapping's keys, in insertion order.
columns_of_interest = list(column_renames)
cleaned_data = data[columns_of_interest].rename(columns=column_renames)

# Remove rows that are not samples, then renumber from 0.
# Row 0 carries the sub-header labels (GFAP, NRGN, ...) shown in the preview;
# rows 16, 27 and 28 are presumably blank/summary rows -- TODO confirm against
# the workbook, since these positions are hard-coded.
# NOTE: `data` is rebound here, so re-running this cell without reloading the
# workbook fails (the 'Unnamed: N' columns no longer exist).
data = cleaned_data.drop(index=[0, 16, 27, 28]).reset_index(drop=True)

In [4]:
# Inspect the table after column selection, renaming and row removal.
print(data)

   Item # Sample Name                 Source Number of vials Amount per vial  \
0      10        D 25  non-study TBI patient               1          0.5 mL   
1      11        D 26  non-study TBI patient               1            1 mL   
2      12        D 27  non-study TBI patient               1            1 mL   
3      13        D 28  non-study TBI patient               1            1 mL   
4      14        D 29  non-study TBI patient               1            1 mL   
5      15        D 30  non-study TBI patient               1            1 mL   
6      16        D 31  non-study TBI patient               1            1 mL   
7      17        D 32  non-study TBI patient               1            1 mL   
8      18        D 33  non-study TBI patient               1            1 mL   
9      19        D 34  non-study TBI patient               1            1 mL   
10     20        D 35  non-study TBI patient               1            1 mL   
11     21        D 36  non-study TBI pat

In [5]:
# Performing data cleaning
biomarker_columns = ['BBX_GFAP', 'BBX_NRGN', 'BBX_ST2', 'BBX_BDNF',
                     'ELISA_PCR_GFAP', 'ELISA_PCR_NRGN', 'ELISA_PCR_ST2',
                     'ELISA_PCR_BDNF']


def _strip_less_than(value):
    """Remove every '<' from a string cell; return non-string cells unchanged."""
    return value.replace('<', '') if isinstance(value, str) else value


# Convert each biomarker column to float.
# Some cells contain a '<' sign (presumably readings reported as below a
# detection limit -- TODO confirm); the sign is dropped and the remaining
# number is used as the value.
# Only string cells are edited, so numeric and missing cells pass through
# untouched, and no object-dtype `replace` (which emits a downcasting
# FutureWarning on pandas 2.2) is involved.
# The conversion is strict: `astype(float)` raises on any text that is still
# non-numeric instead of silently turning it into NaN.
for col in biomarker_columns:
    data[col] = data[col].map(_strip_less_than).astype(float)

  data[col] = data[col].replace('<', '', regex=True).astype(float)
  data[col] = data[col].replace('<', '', regex=True).astype(float)
  data[col] = data[col].replace('<', '', regex=True).astype(float)
  data[col] = data[col].replace('<', '', regex=True).astype(float)
  data[col] = data[col].replace('<', '', regex=True).astype(float)
  data[col] = data[col].replace('<', '', regex=True).astype(float)


In [6]:
# Inspect the table again after the biomarker columns were converted to float.
print(data)

   Item # Sample Name                 Source Number of vials Amount per vial  \
0      10        D 25  non-study TBI patient               1          0.5 mL   
1      11        D 26  non-study TBI patient               1            1 mL   
2      12        D 27  non-study TBI patient               1            1 mL   
3      13        D 28  non-study TBI patient               1            1 mL   
4      14        D 29  non-study TBI patient               1            1 mL   
5      15        D 30  non-study TBI patient               1            1 mL   
6      16        D 31  non-study TBI patient               1            1 mL   
7      17        D 32  non-study TBI patient               1            1 mL   
8      18        D 33  non-study TBI patient               1            1 mL   
9      19        D 34  non-study TBI patient               1            1 mL   
10     20        D 35  non-study TBI patient               1            1 mL   
11     21        D 36  non-study TBI pat

In [7]:
# Binary target derived from the sample source: 1 when the source text
# contains the literal 'TBI patient', 0 otherwise (e.g. 'non-study volunteer').
is_tbi_patient = data['Source'].str.contains('TBI patient', regex=False)
data['Symptom'] = is_tbi_patient.astype(int)

# Show the source next to the derived label to check the mapping.
print(data[['Source', 'Symptom']])

                   Source  Symptom
0   non-study TBI patient        1
1   non-study TBI patient        1
2   non-study TBI patient        1
3   non-study TBI patient        1
4   non-study TBI patient        1
5   non-study TBI patient        1
6   non-study TBI patient        1
7   non-study TBI patient        1
8   non-study TBI patient        1
9   non-study TBI patient        1
10  non-study TBI patient        1
11  non-study TBI patient        1
12  non-study TBI patient        1
13  non-study TBI patient        1
14  non-study TBI patient        1
15    non-study volunteer        0
16    non-study volunteer        0
17    non-study volunteer        0
18    non-study volunteer        0
19    non-study volunteer        0
20    non-study volunteer        0
21    non-study volunteer        0
22    non-study volunteer        0
23    non-study volunteer        0
24    non-study volunteer        0


In [8]:
# Features: the four BBX biomarker concentrations, renamed to assay-neutral
# biomarker names. Target: the binary Symptom label.
bbx_to_biomarker = {
    'BBX_GFAP': 'GFAP',
    'BBX_NRGN': 'NRGN',
    'BBX_ST2': 'ST2',
    'BBX_BDNF': 'BDNF',
}
X = data[list(bbx_to_biomarker)].rename(columns=bbx_to_biomarker)
y = data['Symptom']

In [9]:
# Rescale each biomarker by a fixed per-marker divisor.
# This is a plain division, not a fitted min-max scaling, so values are not
# clipped and can exceed 1 (BDNF reaches about 4.57 in the printed table).
scaling_factors = {'GFAP': 50, 'NRGN': 20, 'ST2': 200, 'BDNF': 30}

# Divisors listed in the same order as X's columns.
divisors = [scaling_factors[name] for name in X.columns]

# NOTE: X is rebound to its scaled version, so running this cell twice divides
# twice; re-run the feature-definition cell first.
X = X.div(divisors, axis='columns')

print(X)

       GFAP     NRGN    ST2      BDNF
0   0.00528  0.00625  0.195  0.833333
1   0.02128  0.00330  0.920  0.500000
2   0.03140  0.00480  0.240  0.700000
3   0.00124  0.00205  0.225  0.466667
4   0.00190  0.00550  0.085  0.666667
5   0.00020  0.03750  0.185  0.700000
6   0.00020  0.00315  0.165  0.600000
7   0.00020  0.01470  0.090  0.766667
8   0.01532  0.01185  0.145  0.533333
9   0.00088  0.00060  1.000  0.200000
10  0.00410  0.00220  0.060  0.833333
11  0.03958  0.00140  0.125  0.666667
12  0.00892  0.00060  0.310  2.066667
13  0.01360  0.00060  0.685  4.566667
14  0.03734  0.00060  0.190  1.266667
15  0.00020  0.00060  0.030  0.200000
16  0.00020  0.00060  0.145  0.066667
17  0.00020  0.00060  0.165  0.033333
18  0.00020  0.00060  0.160  0.033333
19  0.00050  0.00060  0.095  0.066667
20  0.00020  0.00060  0.080  0.033333
21  0.00020  0.00265  0.110  0.100000
22  0.00020  0.00065  0.085  0.033333
23  0.00020  0.00340  0.155  0.066667
24  0.00020  0.00060  0.080  0.066667


In [10]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with the library's default settings; it is fitted in a
# later cell.
model = LogisticRegression()

In [11]:
# Fit the logistic regression on the training split only (X_train / y_train);
# the held-out test rows are not used here. The fitted coefficients are
# inspected in a later cell.
model.fit(X_train, y_train)

In [12]:
# Number of cross-validation folds
k = 5

# k-fold cross-validation on the training split, scored by ROC AUC.
# NOTE(review): the training split holds 20 of the 25 samples, so each
# validation fold contains only about 4 rows and per-fold AUC is very coarse.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=k,
)

# Per-fold scores followed by their mean and (population) standard deviation.
mean_auc = cv_scores.mean()
std_auc = cv_scores.std()
print(f"Cross-Validation AUC Scores for {k} Folds:", cv_scores)
print("Mean AUC Score:", mean_auc)
print("Standard Deviation of AUC Scores:", std_auc)

Cross-Validation AUC Scores for 5 Folds: [1. 1. 1. 1. 1.]
Mean AUC Score: 1.0
Standard Deviation of AUC Scores: 0.0


In [13]:
# Pair each biomarker name with its fitted coefficient (first row of
# model.coef_) and show the pairs as a two-column table.
# The coefficients apply to the rescaled features, not to raw concentrations.
coefficient_rows = list(zip(X.columns, model.coef_[0]))
coefficients = pd.DataFrame(coefficient_rows, columns=['Biomarker', 'Coefficient'])
print(coefficients)

  Biomarker  Coefficient
0      GFAP     0.023540
1      NRGN     0.020916
2       ST2     0.723679
3      BDNF     1.788361


In [21]:
# Predict the label of every held-out test sample.
y_pred = model.predict(X_test)

# Put the predicted and actual labels next to the test features, keeping the
# original row labels and naming the index 'Index'.
# y_pred is positional (same row order as X_test); y_test aligns by label.
comparison_df = (
    X_test
    .assign(Predicted=y_pred, Actual=y_test)
    .rename_axis('Index')
)

# Display the comparison
print(comparison_df)

          GFAP     NRGN    ST2      BDNF  Predicted  Actual
Index                                                      
8      0.01532  0.01185  0.145  0.533333          1       1
16     0.00020  0.00060  0.145  0.066667          0       0
0      0.00528  0.00625  0.195  0.833333          1       1
23     0.00020  0.00340  0.155  0.066667          0       0
11     0.03958  0.00140  0.125  0.666667          1       1


In [16]:
# Apply the model fitted on the BBX features to the ELISA-PCR measurements.
elisa_to_biomarker = {
    'ELISA_PCR_GFAP': 'GFAP',
    'ELISA_PCR_NRGN': 'NRGN',
    'ELISA_PCR_ST2': 'ST2',
    'ELISA_PCR_BDNF': 'BDNF',
}

# ELISA-PCR columns plus the label, renamed to the biomarker names the model
# was trained with; rows with any missing value are dropped.
elisa_data = (
    data[list(elisa_to_biomarker) + ['Symptom']]
    .rename(columns=elisa_to_biomarker)
    .dropna()
)

# Rescale with the same per-marker divisors that were used for the BBX features.
X_e = elisa_data[['GFAP', 'NRGN', 'ST2', 'BDNF']]
X_e = X_e.div([scaling_factors[name] for name in X_e.columns], axis='columns')
y_e = elisa_data['Symptom']

# NOTE(review): in the recorded output every ELISA-PCR row is predicted 0,
# including all TBI patients. The model and divisors come from the BBX assay;
# the two assays may not be on comparable scales -- confirm before relying on
# these predictions.
y_e_pred = model.predict(X_e)
comparison_df = pd.DataFrame({'Predicted': y_e_pred, 'Actual': y_e})
print(comparison_df)

    Predicted  Actual
0           0       1
1           0       1
2           0       1
3           0       1
4           0       1
5           0       1
6           0       1
7           0       1
8           0       1
9           0       1
10          0       1
11          0       1
12          0       1
13          0       1
15          0       0
16          0       0
17          0       0
18          0       0
19          0       0
20          0       0
21          0       0
22          0       0
23          0       0
24          0       0


References


1.   PROAD-SW Melanoma Proteomics Poster. "Plasma proteomic biomarkers identify non-responders and reveal biological insights about the tumor microenvironment in melanoma patients after PD1 blockade". AACR 2024.
2.   List item
