In [52]:
import pandas as pd

# Read the merged ALS Natural History diagnosis export into a DataFrame
als_data = pd.read_csv(
    filepath_or_buffer='/Users/opethompson/Desktop/ALS PROCESSED/Diagnostics/ALS Diagnosis (ALS Natural History).csv'
)

In [53]:
# Columns whose share of missing values exceeds this fraction are discarded
threshold = 0.5
# isna().mean() gives the per-column fraction of missing entries
columns_to_drop = als_data.columns[als_data.isna().mean().gt(threshold)]
als_data.drop(columns=columns_to_drop, inplace=True)

In [54]:
# Handling missing data: numeric columns are filled with their median,
# every other column (text/categorical) with its most frequent value.
for col in als_data.columns:
    column = als_data[col]
    if not column.isna().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # No observed value at all, so there is nothing to impute from.
            continue
        fill_value = modes.iloc[0]

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the in-place form acts on a temporary object and does
    # not reliably update the frame (it is a no-op under Copy-on-Write).
    als_data[col] = column.fillna(fill_value)

In [55]:
# Remove any row that still contains at least one missing value
als_data.dropna(axis=0, how='any', inplace=True)


In [56]:
# Swap the value 99 for 0 everywhere in the frame.
# NOTE(review): this touches every column, not only those where 99 is a
# placeholder code — confirm no column holds 99 as a genuine measurement.
als_data = als_data.replace(to_replace=99, value=0)

In [57]:
# Swap the value 90 for 0 everywhere in the frame.
# NOTE(review): this touches every column, not only those where 90 is a
# placeholder code — confirm no column holds 90 as a genuine measurement.
als_data = als_data.replace(to_replace=90, value=0)

In [58]:
# Write the cleaned frame to disk without the row index.
# NOTE(review): the modelling cells read 'preprocessedALSclean_data.csv' from
# the Desktop, which is a different file name from the one written here — confirm
# which file is the intended model input.
als_data.to_csv(path_or_buf='preprocessedALShx_data.csv', index=False)


In [71]:
#Building and Evaluating the Model for ALS diagnosis
#Load data set
import pandas as pd

df= pd.read_csv('/Users/opethompson/Desktop/preprocessedALSclean_data.csv')

#Building and Evaluating the Model for ALS Diagnosis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# classification_report and roc_auc_score are called below; without these
# imports the cell raises NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Split the data: 'elescrlr' is the target, every other column is a feature.
# NOTE(review): X keeps 'internal_subject_id' and the split is row-wise; if a
# subject contributes several rows, the same subject can land in both train and
# test and inflate the scores — confirm, and consider a grouped split.
X = df.drop('elescrlr', axis=1)
y = df['elescrlr']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (seeded so repeated runs give the same forest)
modeldx = RandomForestClassifier(random_state=42)
modeldx.fit(X_train, y_train)

# Evaluate the model
predictions = modeldx.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
# Calculate precision, recall, and F1-score
print(classification_report(y_test, predictions))

# Calculate ROC-AUC for binary classification tasks
# We must check if y is binary before applying roc_auc_score
if y.nunique() == 2:
    probs = modeldx.predict_proba(X_test)[:, 1]  # get the probabilities of the positive class
    roc_auc = roc_auc_score(y_test, probs)
    print("ROC-AUC:", roc_auc)


Accuracy: 0.997325813701823
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      9385
           2       1.00      1.00      1.00     13158
           3       0.99      0.99      0.99     14596
           4       0.99      1.00      0.99     24747
           5       1.00      1.00      1.00     34218

    accuracy                           1.00     96104
   macro avg       1.00      1.00      1.00     96104
weighted avg       1.00      1.00      1.00     96104



In [72]:
#Building and Evaluating the Model for ALS Phenotype
import pandas as pd

df= pd.read_csv('/Users/opethompson/Desktop/preprocessedALSclean_data.csv')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the data: 'cdalsphn' is the target, every other column is a feature.
# NOTE(review): X keeps every non-target column, identifiers included; if a
# subject contributes several rows, a row-wise split can place the same subject
# in both train and test — confirm, and consider a grouped split.
X = df.drop('cdalsphn', axis=1)
y = df['cdalsphn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (seeded so repeated runs give the same forest)
modelph = RandomForestClassifier(random_state=42)
modelph.fit(X_train, y_train)

# Evaluate the model
predictions = modelph.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.9795013735120286


In [65]:
# Feature Importance Analysis

import numpy as np

# Extracting feature importances from the fitted diagnosis model.
# `modeldx` is created in the diagnosis-model cell, so that cell has to be run
# before this one (the recorded execution counts show this cell ran earlier).
importance_diagnosis = modeldx.feature_importances_

# Rank features by importance and keep the strongest ones
def summarize_feature_importances(importances, feature_names, top_n=10):
    """
    Pair the highest-scoring features with their importance values.

    :param importances: Sequence of importance scores, one per feature
    :param feature_names: Sequence of names, indexed the same way as ``importances``
    :param top_n: Maximum number of (name, score) pairs to return
    :return: List of (feature_name, importance) tuples, highest score first
    """
    # argsort is ascending, so reverse it and then cut to the requested size
    ranked_positions = np.argsort(importances)[::-1][:top_n]
    ranked_pairs = []
    for position in ranked_positions:
        ranked_pairs.append((feature_names[position], importances[position]))
    return ranked_pairs

# Top features for ALS Diagnosis
# The names must line up one-to-one with the importances. `df.columns` still
# contains the target column 'elescrlr', which the model never saw, so using it
# shifts every name from that position onward. Prefer the names the model
# recorded when it was fitted.
diagnosis_feature_names = getattr(modeldx, "feature_names_in_", None)
if diagnosis_feature_names is None:
    # Fallback when the estimator did not record names: rebuild the training
    # column order by removing the target from the frame's columns.
    diagnosis_feature_names = df.columns.drop('elescrlr')
top_features_diagnosis = summarize_feature_importances(importance_diagnosis, diagnosis_feature_names)

top_features_diagnosis


[('internal_subject_id', 0.1943334199959146),
 ('blbcumn', 0.07199887079510064),
 ('blbclmn', 0.05702231890619021),
 ('trnkclmn', 0.05362854871856702),
 ('trnkelmn', 0.04817391724854775),
 ('rleelmn', 0.048002079849190395),
 ('rueelmn', 0.04762260819523785),
 ('trnkcumn', 0.04750520348741263),
 ('blbelmn', 0.04701654473291307),
 ('lleelmn', 0.0468976284140643)]

In [69]:
# Function to recommend treatment based on ALS diagnosis
def recommend_treatment(diagnosis):
    """
    Recommends treatment based on ALS diagnosis.

    :param diagnosis: The diagnosis result (1, 2, 3, 4, 5)
    :return: Recommended treatment
    """
    if diagnosis == 5:
        return "Standard ALS treatment protocol"
    elif diagnosis == 4:
        return "Probable ALS treatment protocol"
    elif diagnosis in (2, 3):
        return "Conservative observation and symptomatic treatment"
    else:
        # Category 1 and any unexpected value both end up here.
        return "Further diagnostic evaluation required"

In [None]:
# Function to extend a treatment plan based on ALS phenotype
def recommend_phenotype_treatment(treatment_plan, phenotype):
    """
    Appends a phenotype-specific therapy to an existing treatment plan.

    The original cell was a bare if/elif chain that read `phenotype` and
    `treatment_plan` from the notebook namespace without defining them; taking
    them as parameters lets the cell run on a fresh kernel.

    :param treatment_plan: Base treatment text to extend
    :param phenotype: Phenotype label; 'UMN-predominant' and 'LMN-predominant'
        are recognised, any other value gets the bulbar branch
    :return: The treatment plan with the phenotype-specific suffix appended
    """
    # NOTE(review): the labels are compared as strings — confirm the phenotype
    # model actually emits these labels rather than numeric codes.
    if phenotype == 'UMN-predominant':
        return treatment_plan + ' and UMN-focused therapy'
    elif phenotype == 'LMN-predominant':
        return treatment_plan + ' and LMN-focused therapy'
    else:
        return treatment_plan + ' and Bulbar symptoms management'


In [None]:
# State space: each named state records whether ALS is present and which
# treatment (if any) is associated with it.
states = {
    label: {'ALS': has_als, 'Treatment': treatment}
    for label, has_als, treatment in (
        ('Start', True, None),
        ('Treatment1', True, 'conservative'),
        ('Treatment2', True, 'physical therapy'),
        ('Treatment3', True, 'riluzole'),
        ('Goal', False, 'Recovered'),
    )
}

# Example transitions: state -> list of (next_state, step_cost)
transitions = dict(
    Start=[('Treatment1', 1), ('Treatment2', 2)],
    Treatment1=[('Treatment3', 3), ('Goal', 4)],
    Treatment2=[('Treatment3', 2), ('Goal', 5)],
    Treatment3=[('Goal', 1)],
)

In [None]:
#Implement the Greedy Best-First Search Algorithm
import heapq


def greedy_best_first_search(start, goal, states, transitions):
    """
    Search from ``start`` to ``goal`` and report the accumulated cost.

    The frontier is ordered by the cost accumulated so far (no heuristic is
    consulted), so the state expanded next is always the cheapest one reached.
    Step costs are assumed to be non-negative.

    :param start: State the search begins in
    :param goal: State the search is looking for
    :param states: State descriptions; kept for interface compatibility, the
        search itself does not read it
    :param transitions: Mapping of state -> list of (neighbor, step_cost);
        states without an entry have no outgoing transitions
    :return: ``(goal, total_cost)`` when the goal is reachable, otherwise
        ``(None, float('inf'))``
    """
    # Heap entries are (cost, insertion_order, state). The insertion counter
    # breaks cost ties in first-in order and keeps the heap from ever comparing
    # two states with each other.
    open_heap = [(0, 0, start)]
    pushed = 1
    closed_list = set()

    while open_heap:
        # Choose the state with the lowest cost
        cost, _, state = heapq.heappop(open_heap)

        # Check if the goal is reached
        if state == goal:
            return state, cost

        # A state can be queued several times; only its first (cheapest) pop
        # needs expanding.
        if state in closed_list:
            continue
        closed_list.add(state)

        # Add neighbors to the open heap
        for neighbor, step_cost in transitions.get(state, []):
            if neighbor not in closed_list:
                heapq.heappush(open_heap, (cost + step_cost, pushed, neighbor))
                pushed += 1

    return None, float('inf')  # Goal not reached

# Example run over the demo state space defined in the earlier cell
start_state, goal_state = 'Start', 'Goal'
optimal_state, total_cost = greedy_best_first_search(
    start=start_state,
    goal=goal_state,
    states=states,
    transitions=transitions,
)
print(f"Optimal State: {optimal_state}, Total Cost: {total_cost}")


In [None]:
#Sample Python Code for Back-End Server

from flask import Flask, request, jsonify
import joblib

# Flask application object that the route decorators below attach to
app = Flask(import_name=__name__)

# Deserialize both trained classifiers once, at start-up.
# joblib.load unpickles the file, so only load model files from a trusted source.
diagnosis_model = joblib.load(filename='diagnosis_model.pkl')
phenotype_model = joblib.load(filename='phenotype_model.pkl')

@app.route('/predict_diagnosis', methods=['POST'])
def predict_diagnosis():
    """
    Run the diagnosis model on the JSON body of a POST request.

    :return: JSON ``{'diagnosis_prediction': [...]}`` on success, or
        ``{'error': ...}`` with HTTP 400 when the body is not valid JSON or the
        model rejects the input
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so the client gets a JSON error in the same format as the rest
    # of the API.
    data = request.get_json(silent=True)
    if data is None:
        return jsonify({'error': 'Request body must be valid JSON'}), 400
    # Preprocess the data as per the model requirements
    # ...
    # Make a prediction
    try:
        prediction = diagnosis_model.predict(data)
    except (ValueError, TypeError) as exc:
        # Wrong shape, wrong feature count or non-numeric content: report it as
        # a client error rather than an unhandled server error.
        return jsonify({'error': f'Input rejected by diagnosis model: {exc}'}), 400
    return jsonify({'diagnosis_prediction': prediction.tolist()})

@app.route('/predict_phenotype', methods=['POST'])
def predict_phenotype():
    """
    Run the phenotype model on the JSON body of a POST request.

    :return: JSON ``{'phenotype_prediction': [...]}`` on success, or
        ``{'error': ...}`` with HTTP 400 when the body is not valid JSON or the
        model rejects the input
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so the client gets a JSON error in the same format as the rest
    # of the API.
    data = request.get_json(silent=True)
    if data is None:
        return jsonify({'error': 'Request body must be valid JSON'}), 400
    # Preprocess the data as per the model requirements
    # ...
    # Make a prediction
    try:
        prediction = phenotype_model.predict(data)
    except (ValueError, TypeError) as exc:
        # Wrong shape, wrong feature count or non-numeric content: report it as
        # a client error rather than an unhandled server error.
        return jsonify({'error': f'Input rejected by phenotype model: {exc}'}), 400
    return jsonify({'phenotype_prediction': prediction.tolist()})

if __name__ == '__main__':
    import os

    # Debug mode turns on the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it off unless it is
    # explicitly requested through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)

In [None]:
#Testing Model with user inputs

import numpy as np
import joblib

# Deserialize the two trained classifiers from disk.
# joblib.load unpickles the file, so only load model files from a trusted source.
diagnosis_model = joblib.load(filename='diagnosis_model.pkl')
phenotype_model = joblib.load(filename='phenotype_model.pkl')

def get_user_input(sex_codes=None):
    """
    Prompt for patient details and return them as a single-row numeric array.

    The sex answer is converted to a number before it joins the other features.
    Putting the raw text into the list makes ``np.array`` coerce every feature
    to a string, which a numeric model cannot consume.

    :param sex_codes: Optional mapping from lower-cased sex answer to its
        numeric code; defaults to ``{'male': 1, 'female': 2}``
    :return: ``numpy`` array of shape ``(1, n_features)`` ordered as
        age, sex code, ALSFRS scores, RoADS score
    :raises ValueError: if a numeric answer cannot be parsed or the sex answer
        is not a key of ``sex_codes``
    """
    if sex_codes is None:
        # NOTE(review): placeholder coding — confirm how sex is encoded in the
        # data the models were trained on and adjust accordingly.
        sex_codes = {'male': 1, 'female': 2}

    # User inputs
    age = int(input("Enter age: "))
    sex = input("Enter sex (Male/Female): ").strip().lower()
    if sex not in sex_codes:
        raise ValueError(f"Unrecognized sex {sex!r}; expected one of {sorted(sex_codes)}")
    # ALSFRS scores are a series of numbers
    alsfrs_scores = [int(x) for x in input("Enter ALSFRS scores separated by space: ").split()]
    roads_score = int(input("Enter RoADS score: "))

    # Convert inputs to features
    features = [age, sex_codes[sex], *alsfrs_scores, roads_score]
    # Feature engineering and scaling need to be done here as per the model's training
    # NOTE(review): the resulting vector must match the column set and order the
    # loaded models were fitted on — confirm before relying on predictions.
    # ...
    return np.array([features])

def predict_als(features):
    """
    Run both loaded models on the same feature array.

    :param features: Feature array passed unchanged to each model's ``predict``
    :return: Tuple ``(diagnosis_prediction, phenotype_prediction)``
    """
    # Diagnosis is evaluated first, then phenotype
    return diagnosis_model.predict(features), phenotype_model.predict(features)

# Collect the patient details interactively
user_features = get_user_input()

# Run both models on the collected features
diagnosis, phenotype = predict_als(features=user_features)

# Each prediction is an array with a single row; show its only entry
print(f"Predicted ALS Diagnosis: {diagnosis[0]}")
print(f"Predicted ALS Phenotype: {phenotype[0]}")