In [26]:
import pandas as pd

# Load the cleaned data.
# NOTE(review): CSV_PATH is an absolute path on a single machine; change it to
# your own data directory before running this notebook elsewhere.
CSV_PATH = 'D:\\Softwarica\\Thesis File\\Advanced_medical_healthcare_recommendation\\data\\raw\\'
df = pd.read_csv(CSV_PATH + 'cleaned_ED2013.csv')

# Sampling configuration, named so the subset size and seed can be tuned in one place.
N_SAMPLES = 20000
RANDOM_STATE = 42

# Draw a reproducible random subset. DataFrame.sample (without replacement)
# raises ValueError when n exceeds the number of rows, so the request is capped
# at the size of the loaded frame.
df_sample = df.sample(n=min(N_SAMPLES, len(df)), random_state=RANDOM_STATE)

# Separate features and target variables: both targets are removed from the
# feature frame so neither model can see the other label as an input.
TARGET_COLS = ['Diagnosis1', 'Medication1']
X = df_sample.drop(columns=TARGET_COLS)  # Features
y_diagnosis = df_sample['Diagnosis1']  # Target for diagnosis prediction
y_medication = df_sample['Medication1']  # Target for medication recommendation

# Print the shape of the resulting DataFrames
print("Sample data shape:", df_sample.shape)
print("Features shape:", X.shape)
print("Diagnosis target shape:", y_diagnosis.shape)
print("Medication target shape:", y_medication.shape)


Sample data shape: (20000, 81)
Features shape: (20000, 79)
Diagnosis target shape: (20000,)
Medication target shape: (20000,)


In [27]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify categorical and numerical columns.
# 'number' matches every numeric dtype rather than only float64/int64, so a
# narrower numeric column is not silently left out (ColumnTransformer drops any
# column that is not listed in one of its transformers by default).
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Create transformers for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical data
])

categorical_transformer = Pipeline(steps=[
    # Categories that were not seen during fit are ignored at transform time
    # instead of raising an error.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split once for both tasks. A single call applies the same shuffled row
# partition to the features and to both targets; this is the partition the two
# separate calls with identical size and random_state used to produce.
X_train_raw, X_test_raw, y_train_d, y_test_d, y_train_m, y_test_m = train_test_split(
    X, y_diagnosis, y_medication, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows only, then apply it to the test
# rows, so no statistics or categories from the test split leak into the fit.
X_train_d = preprocessor.fit_transform(X_train_raw)
X_test_d = preprocessor.transform(X_test_raw)

# The medication task uses the same rows and features, so refitting the
# preprocessor would only repeat the work above. Independent copies are kept
# under the existing names so later cells can treat the two tasks separately.
X_train_m = X_train_d.copy()
X_test_m = X_test_d.copy()

print('Data preprocessing complete.')



Data preprocessing complete.


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a Random Forest classifier for diagnosis prediction
rf_model_d = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model_d.fit(X_train_d, y_train_d)

# Evaluate the model on the held-out test split
y_pred_d_rf = rf_model_d.predict(X_test_d)
rf_accuracy = accuracy_score(y_test_d, y_pred_d_rf)
print("Random Forest Diagnosis Accuracy:", rf_accuracy)

# Some diagnosis codes are never predicted, and some appear in the report with
# no test support, so their precision/recall are undefined. zero_division=0
# reports those entries as 0.0 (the same value the default setting prints)
# without emitting an UndefinedMetricWarning for each average.
print("Random Forest Diagnosis Classification Report:\n",
      classification_report(y_test_d, y_pred_d_rf, zero_division=0))



Random Forest Diagnosis Accuracy: 0.185
Random Forest Diagnosis Classification Report:
               precision    recall  f1-score   support

          -9       0.92      0.92      0.92        25
       0059-       0.00      0.00      0.00         2
       0085-       0.00      0.00      0.00         1
       00863       0.00      0.00      0.00         1
       0088-       0.00      0.00      0.00         7
       0090-       0.00      0.00      0.00         1
       0091-       0.00      0.00      0.00         0
       0340-       0.07      0.06      0.06        17
       0341-       1.00      0.50      0.67         2
       0389-       0.31      0.57      0.40         7
       04100       0.00      0.00      0.00         1
       0412-       0.00      0.00      0.00         1
       042--       0.00      0.00      0.00         1
       0529-       0.00      0.00      0.00         2
       0539-       0.50      0.27      0.35        11
       0546-       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline as imPipeline
# from sklearn.metrics import accuracy_score, classification_report

# # Identify categorical and numerical columns
# categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
# numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# # Create transformers for numerical and categorical data
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())  # Normalize numerical data
# ])

# categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical data
# ])

# # Combine transformers into a ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# # Split data into training and testing sets for diagnosis prediction
# X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X, y_diagnosis, test_size=0.2, random_state=42)

# # Split data into training and testing sets for medication recommendation
# X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X, y_medication, test_size=0.2, random_state=42)

# # Apply preprocessing pipeline to the data
# X_train_d = preprocessor.fit_transform(X_train_d)
# X_test_d = preprocessor.transform(X_test_d)

# X_train_m = preprocessor.fit_transform(X_train_m)
# X_test_m = preprocessor.transform(X_test_m)

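# # Create a pipeline with preprocessing and SMOTE
# # NOTE(review): X_train_d has already been passed through `preprocessor` above,
# # so fitting this pipeline on it would apply the ColumnTransformer a second time.
# # Either fit the pipeline on the untransformed split or drop the 'preprocessor' step.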
# pipeline_d = imPipeline([
#     ('preprocessor', preprocessor),
#     ('smote', SMOTE(random_state=42)),
#     ('classifier', RandomForestClassifier(random_state=42))
# ])

# # Define parameter grid for GridSearchCV
# param_grid = {
#     'classifier__n_estimators': [50, 100, 200],
#     'classifier__max_depth': [None, 10, 20, 30],
#     'classifier__min_samples_split': [2, 5, 10],
#     'classifier__min_samples_leaf': [1, 2, 4]
# }

# # Perform GridSearchCV to find the best parameters
# grid_search_d = GridSearchCV(pipeline_d, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search_d.fit(X_train_d, y_train_d)

# # Evaluate the best model on the test set
# best_model_d = grid_search_d.best_estimator_
# y_pred_d_rf = best_model_d.predict(X_test_d)
# print("Best Random Forest Diagnosis Accuracy:", accuracy_score(y_test_d, y_pred_d_rf))
# print("Best Random Forest Diagnosis Classification Report:\n", classification_report(y_test_d, y_pred_d_rf))


In [7]:
from sklearn.neural_network import MLPClassifier

# Feed-forward network with two hidden layers (128 and 64 units) for the
# diagnosis target.
# NOTE(review): this cell's execution count is lower than that of the data
# preparation cells, so its saved output may come from an earlier run;
# re-run the notebook top to bottom before quoting the number.
mlp_settings = {
    'hidden_layer_sizes': (128, 64),
    'max_iter': 300,
    'random_state': 42,
}

# fit() returns the estimator itself, so construction and training are chained.
mlp_model = MLPClassifier(**mlp_settings).fit(X_train_d, y_train_d)

# score() reports the mean accuracy on the held-out split.
mlp_accuracy = mlp_model.score(X_test_d, y_test_d)
print("MLP Neural Network Diagnosis Accuracy:", mlp_accuracy)


MLP Neural Network Diagnosis Accuracy: 0.11733333333333333


In [9]:
# from imblearn.over_sampling import RandomOverSampler
# # Resampling
# # Handle imbalanced data using RandomOverSampler
# ros = RandomOverSampler(random_state=42)
# X_train_d_resampled, y_train_d_resampled = ros.fit_resample(X_train_d, y_train_d)

# # Train the neural network model using the resampled data
# mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
# mlp_model.fit(X_train_d_resampled, y_train_d_resampled)

# # Evaluate the model on the original test set
# mlp_accuracy = mlp_model.score(X_test_d, y_test_d)
# print("MLP Neural Network Diagnosis Accuracy:", mlp_accuracy)


In [13]:
from sklearn.neural_network import MLPClassifier

# Train a deeper multi-layer perceptron (three hidden layers) for diagnosis
# prediction, used here in place of an RNN because TensorFlow is unavailable.
# NOTE: the "Simulated RNN" label is kept in the variable names and printed
# messages, but an MLP is a feed-forward network. It has no recurrent state and
# treats every row as an independent feature vector, so it does not model
# sequences.
# NOTE(review): this cell's execution count is lower than that of the data
# preparation cells, so its saved output may come from an earlier run.
from sklearn.metrics import accuracy_score, classification_report

mlp_model_rnn = MLPClassifier(hidden_layer_sizes=(100, 100, 50), max_iter=300, random_state=42)
mlp_model_rnn.fit(X_train_d, y_train_d)  # Use X_train_d and y_train_d

# Predict once and reuse the result for both the accuracy and the report;
# calling score() would run the same prediction a second time.
y_pred_mlp_rnn = mlp_model_rnn.predict(X_test_d)
mlp_rnn_accuracy = accuracy_score(y_test_d, y_pred_mlp_rnn)
print("Simulated RNN (MLP) Diagnosis Accuracy:", mlp_rnn_accuracy)

# Generate a classification report. zero_division=0 prints 0.0 for codes whose
# precision/recall are undefined (same value as the default) without the
# repeated UndefinedMetricWarning.
mlp_rnn_classification_report = classification_report(y_test_d, y_pred_mlp_rnn, zero_division=0)

print("Simulated RNN (MLP) Diagnosis Classification Report:")
print(mlp_rnn_classification_report)


Simulated RNN (MLP) Diagnosis Accuracy: 0.11233333333333333
Simulated RNN (MLP) Diagnosis Classification Report:
              precision    recall  f1-score   support

          -9       1.00      1.00      1.00        16
       0059-       0.00      0.00      0.00         2
       00845       0.00      0.00      0.00         1
       0088-       0.00      0.00      0.00         3
       0090-       0.00      0.00      0.00         3
       0093-       0.00      0.00      0.00         0
       0340-       0.16      0.33      0.22        12
       0341-       0.00      0.00      0.00         1
       0389-       0.38      0.50      0.43         6
       04100       0.00      0.00      0.00         2
       04112       0.00      0.00      0.00         0
       0412-       0.00      0.00      0.00         1
       042--       0.00      0.00      0.00         1
       0529-       0.00      0.00      0.00         1
       0539-       0.00      0.00      0.00         6
       05410       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# from tensorflow.keras.layers import SimpleRNN

# # Build a simple RNN model
# rnn_model = Sequential([
#     SimpleRNN(64, activation='relu', input_shape=(X_train_d_resampled.shape[1], 1)),
#     Dense(64, activation='relu'),
#     Dense(1, activation='sigmoid')
# ])
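# # NOTE(review): a single sigmoid output with binary_crossentropy only suits a
# # two-class target. The diagnosis labels in this notebook span many codes, so
# # this would need a softmax output sized to the class count and encoded labels.
# # `Sequential`, `Dense`, `X_train_d_cnn` and `X_test_d_cnn` are also not defined
# # in the cells visible here.
# # Compile the model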
# rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model
# rnn_model.fit(X_train_d_cnn, y_train_d_resampled, epochs=10, batch_size=32, validation_data=(X_test_d_cnn, y_test_d))

# # Evaluate the model
# rnn_accuracy = rnn_model.evaluate(X_test_d_cnn, y_test_d)[1]
# print("RNN Diagnosis Accuracy:", rnn_accuracy)


In [18]:
# # Assuming you have collected all the accuracy scores
# print(f"Random Forest Diagnosis Accuracy: {rf_accuracy}")
# print(f"Neural Network Diagnosis Accuracy: {nn_accuracy}")
# print(f"CNN Diagnosis Accuracy: {cnn_accuracy}")
# print(f"RNN Diagnosis Accuracy: {rnn_accuracy}")


In [19]:
# # Most common primary diagnoses
# plt.figure(figsize=(14, 8))
# sns.countplot(y='Diagnosis1', data=df, order=df['Diagnosis1'].value_counts().index[:10], palette='viridis')
# plt.title('Top 10 Primary Diagnoses')
# plt.xlabel('Count')
# plt.ylabel('Diagnosis')
# plt.show()

# # Most prescribed medications
# plt.figure(figsize=(14, 8))
# sns.countplot(y='Medication1', data=df, order=df['Medication1'].value_counts().index[:10], palette='magma')
# plt.title('Top 10 Medications Prescribed')
# plt.xlabel('Count')
# plt.ylabel('Medication')
# plt.show()



In [32]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import classification_report
# from sklearn.model_selection import GridSearchCV, train_test_split
# from imblearn.over_sampling import SMOTE
# import numpy as np

# # Apply SMOTE to balance the training data
# smote = SMOTE(random_state=42)
# X_train_d_resampled, y_train_d_resampled = smote.fit_resample(X_train_d, y_train_d)

# # Improved MLP Model with GridSearchCV for Hyperparameter Tuning
# mlp_param_grid = {
#     'hidden_layer_sizes': [(100, 100, 50), (128, 128, 64), (150, 100, 50)],
#     'activation': ['tanh', 'relu'],
#     'solver': ['adam', 'sgd'],
#     'learning_rate_init': [0.001, 0.01, 0.1],
#     'max_iter': [300, 500]
# }

# mlp_model_rnn = GridSearchCV(MLPClassifier(random_state=42), mlp_param_grid, cv=5, n_jobs=-1)
# mlp_model_rnn.fit(X_train_d_resampled, y_train_d_resampled)

# # Evaluate the model with the best parameters found
# mlp_rnn_accuracy = mlp_model_rnn.score(X_test_d, y_test_d)
# print("Improved Simulated RNN (MLP) Diagnosis Accuracy:", mlp_rnn_accuracy)

# # Generate a classification report
# y_pred_mlp_rnn = mlp_model_rnn.predict(X_test_d)
# mlp_rnn_classification_report = classification_report(y_test_d, y_pred_mlp_rnn)
# print("Improved Simulated RNN (MLP) Diagnosis Classification Report:")
# print(mlp_rnn_classification_report)

# # Ensemble Approach: RandomForestClassifier
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train_d_resampled, y_train_d_resampled)
# rf_accuracy = rf_model.score(X_test_d, y_test_d)
# print("Random Forest Diagnosis Accuracy:", rf_accuracy)

# # Ensemble Approach: GradientBoostingClassifier
# gbc_model = GradientBoostingClassifier(random_state=42)
# gbc_model.fit(X_train_d_resampled, y_train_d_resampled)
# gbc_accuracy = gbc_model.score(X_test_d, y_test_d)
# print("Gradient Boosting Diagnosis Accuracy:", gbc_accuracy)

# # Combine models by averaging their predictions
# y_pred_mlp_rnn_proba = mlp_model_rnn.predict_proba(X_test_d)
# y_pred_rf_proba = rf_model.predict_proba(X_test_d)
# y_pred_gbc_proba = gbc_model.predict_proba(X_test_d)

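# # Weighted average ensemble (adjust weights based on individual model performance)
# # NOTE(review): np.argmax below yields column positions, not diagnosis codes.
# # Map them back through a fitted model's `classes_` before comparing with
# # y_test_d or passing them to classification_report.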
# combined_proba = (0.5 * y_pred_mlp_rnn_proba + 0.3 * y_pred_rf_proba + 0.2 * y_pred_gbc_proba)
# y_pred_ensemble = np.argmax(combined_proba, axis=1)

# ensemble_accuracy = np.mean(y_pred_ensemble == y_test_d)
# print("Ensemble Diagnosis Accuracy:", ensemble_accuracy)

# # Final classification report for ensemble
# ensemble_classification_report = classification_report(y_test_d, y_pred_ensemble)
# print("Ensemble Diagnosis Classification Report:")
# print(ensemble_classification_report)
