In [6]:
import scipy.io as sp
import numpy as np
import os
import pandas as pd

In [30]:
def gcparser(mat):
    """
    Extracts essential data from a Matlab formatted GCMS object loaded
    by sio.loadmat and wrangles this into a pandas dataframe

    Parameters:
    mat (dict): Dictionary produced by loading a file using sio.loadmat

    Return:
    DataFrame: Total ion counts (TIC) arranged by samples (columns) and
               retention time (rows), including y and label rows

    """
    data = np.transpose(mat['XTIC'])
    sample_names = np.hstack(np.hstack(mat['SAM'])).tolist()
    RT = np.hstack(np.hstack(mat['RT'])).tolist()
    y = np.hstack(mat['CLASS']).tolist()

    # Create the dataframe for features
    df_features = pd.DataFrame(data, columns=sample_names, index=RT)

    # Create a separate series for labels
    labels = pd.Series(y, index=sample_names, name='class')

    return df_features, labels

In [31]:
blood_features,blood_labels = gcparser(sp.loadmat("/content/drive/MyDrive/Namith/Blood_CDvCTRL.mat"))

In [32]:
breath_features,breath_labels = gcparser(sp.loadmat("/content/drive/MyDrive/Namith/Breath_CDvCTRL.mat"))
faecal_features,faecal_labels = gcparser(sp.loadmat("/content/drive/MyDrive/Namith/Faecal_CDvCTRL.mat"))
urine_features,urine_labels = gcparser(sp.loadmat("/content/drive/MyDrive/Namith/Urine_CDvCTRL.mat"))

In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample

In [60]:
import warnings
warnings.filterwarnings('ignore')

In [61]:

# 1. Prepare the data
X = blood_features.T  # Transpose to have samples as rows and features as columns
y = blood_labels

# 2. Define a function for bootstrap validation
def bootstrap_validate(X, y, model, n_iterations=100):
    n_samples = X.shape[0]
    scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for _ in range(n_iterations):
        # Bootstrap sampling
        X_boot, y_boot = resample(X, y, n_samples=n_samples)

        # Split the bootstrap sample
        X_train, X_test, y_train, y_test = train_test_split(X_boot, y_boot, test_size=0.3, random_state=42)

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Fit the model
        model.fit(X_train_scaled, y_train)

        # Predict
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

    return {k: np.mean(v) for k, v in scores.items()}

# 3. Create and evaluate SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_scores = bootstrap_validate(X, y, svm_model)

# 4. Create and evaluate Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = bootstrap_validate(X, y, rf_model)

# 5. Print results
print("Results for Blood Sample\n\n")
print("SVM Results:")
for metric, score in svm_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

print("\nRandom Forest Results:")
for metric, score in rf_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

# 6. Compare to random chance
n_classes = len(np.unique(y))
random_chance = 1 / n_classes

print(f"\nRandom Chance Accuracy: {random_chance:.4f}")

# Determine best performing model
best_model = "SVM" if svm_scores['accuracy'] > rf_scores['accuracy'] else "Random Forest"
best_accuracy = max(svm_scores['accuracy'], rf_scores['accuracy'])

print(f"\nBest performing model: {best_model}")
print(f"Accuracy improvement over random chance: {best_accuracy - random_chance:.4f}")

Results for Blood Sample


SVM Results:
Accuracy: 0.6360
Precision: 0.6651
Recall: 0.6360
F1: 0.5928

Random Forest Results:
Accuracy: 0.7200
Precision: 0.7765
Recall: 0.7200
F1: 0.7126

Random Chance Accuracy: 0.5000

Best performing model: Random Forest
Accuracy improvement over random chance: 0.2200


In [62]:

# 1. Prepare the data
X = breath_features.T  # Transpose to have samples as rows and features as columns
y = breath_labels

# 2. Define a function for bootstrap validation
def bootstrap_validate(X, y, model, n_iterations=100):
    n_samples = X.shape[0]
    scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for _ in range(n_iterations):
        # Bootstrap sampling
        X_boot, y_boot = resample(X, y, n_samples=n_samples)

        # Split the bootstrap sample
        X_train, X_test, y_train, y_test = train_test_split(X_boot, y_boot, test_size=0.3, random_state=42)

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Fit the model
        model.fit(X_train_scaled, y_train)

        # Predict
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

    return {k: np.mean(v) for k, v in scores.items()}

# 3. Create and evaluate SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_scores = bootstrap_validate(X, y, svm_model)

# 4. Create and evaluate Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = bootstrap_validate(X, y, rf_model)

# 5. Print results
print("Results for Breath Sample\n\n")
print("SVM Results:")
for metric, score in svm_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

print("\nRandom Forest Results:")
for metric, score in rf_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

# 6. Compare to random chance
n_classes = len(np.unique(y))
random_chance = 1 / n_classes

print(f"\nRandom Chance Accuracy: {random_chance:.4f}")

# Determine best performing model
best_model = "SVM" if svm_scores['accuracy'] > rf_scores['accuracy'] else "Random Forest"
best_accuracy = max(svm_scores['accuracy'], rf_scores['accuracy'])

print(f"\nBest performing model: {best_model}")
print(f"Accuracy improvement over random chance: {best_accuracy - random_chance:.4f}")

Results for Breath Sample


SVM Results:
Accuracy: 0.5727
Precision: 0.5260
Recall: 0.5727
F1: 0.4843

Random Forest Results:
Accuracy: 0.8055
Precision: 0.8409
Recall: 0.8055
F1: 0.8005

Random Chance Accuracy: 0.5000

Best performing model: Random Forest
Accuracy improvement over random chance: 0.3055


In [63]:

# 1. Prepare the data
X = blood_features.T  # Transpose to have samples as rows and features as columns
y = blood_labels

# 2. Define a function for bootstrap validation
def bootstrap_validate(X, y, model, n_iterations=100):
    n_samples = X.shape[0]
    scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for _ in range(n_iterations):
        # Bootstrap sampling
        X_boot, y_boot = resample(X, y, n_samples=n_samples)

        # Split the bootstrap sample
        X_train, X_test, y_train, y_test = train_test_split(X_boot, y_boot, test_size=0.3, random_state=42)

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Fit the model
        model.fit(X_train_scaled, y_train)

        # Predict
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

    return {k: np.mean(v) for k, v in scores.items()}

# 3. Create and evaluate SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_scores = bootstrap_validate(X, y, svm_model)

# 4. Create and evaluate Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = bootstrap_validate(X, y, rf_model)

# 5. Print results
print("Results for Faecal Sample\n\n")
print("SVM Results:")
for metric, score in svm_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

print("\nRandom Forest Results:")
for metric, score in rf_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

# 6. Compare to random chance
n_classes = len(np.unique(y))
random_chance = 1 / n_classes

print(f"\nRandom Chance Accuracy: {random_chance:.4f}")

# Determine best performing model
best_model = "SVM" if svm_scores['accuracy'] > rf_scores['accuracy'] else "Random Forest"
best_accuracy = max(svm_scores['accuracy'], rf_scores['accuracy'])

print(f"\nBest performing model: {best_model}")
print(f"Accuracy improvement over random chance: {best_accuracy - random_chance:.4f}")

Results for Faecal Sample


SVM Results:
Accuracy: 0.6310
Precision: 0.6378
Recall: 0.6310
F1: 0.5861

Random Forest Results:
Accuracy: 0.7360
Precision: 0.7825
Recall: 0.7360
F1: 0.7276

Random Chance Accuracy: 0.5000

Best performing model: Random Forest
Accuracy improvement over random chance: 0.2360


In [64]:

# 1. Prepare the data
X = blood_features.T  # Transpose to have samples as rows and features as columns
y = blood_labels

# 2. Define a function for bootstrap validation
def bootstrap_validate(X, y, model, n_iterations=100):
    n_samples = X.shape[0]
    scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for _ in range(n_iterations):
        # Bootstrap sampling
        X_boot, y_boot = resample(X, y, n_samples=n_samples)

        # Split the bootstrap sample
        X_train, X_test, y_train, y_test = train_test_split(X_boot, y_boot, test_size=0.3, random_state=42)

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Fit the model
        model.fit(X_train_scaled, y_train)

        # Predict
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

    return {k: np.mean(v) for k, v in scores.items()}

# 3. Create and evaluate SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_scores = bootstrap_validate(X, y, svm_model)

# 4. Create and evaluate Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = bootstrap_validate(X, y, rf_model)

# 5. Print results
print("Results for Urine Sample\n\n")
print("SVM Results:")
for metric, score in svm_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

print("\nRandom Forest Results:")
for metric, score in rf_scores.items():
    print(f"{metric.capitalize()}: {score:.4f}")

# 6. Compare to random chance
n_classes = len(np.unique(y))
random_chance = 1 / n_classes

print(f"\nRandom Chance Accuracy: {random_chance:.4f}")

# Determine best performing model
best_model = "SVM" if svm_scores['accuracy'] > rf_scores['accuracy'] else "Random Forest"
best_accuracy = max(svm_scores['accuracy'], rf_scores['accuracy'])

print(f"\nBest performing model: {best_model}")
print(f"Accuracy improvement over random chance: {best_accuracy - random_chance:.4f}")

Results for Urine Sample


SVM Results:
Accuracy: 0.6430
Precision: 0.6291
Recall: 0.6430
F1: 0.5909

Random Forest Results:
Accuracy: 0.7230
Precision: 0.7812
Recall: 0.7230
F1: 0.7151

Random Chance Accuracy: 0.5000

Best performing model: Random Forest
Accuracy improvement over random chance: 0.2230
