In [18]:
# Required Libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from imblearn.over_sampling import SMOTE

# Hide Warnings
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Load the Dataset
file_path = 'Hopsital Dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Preprocessing
# 1. Convert 'Age' to numeric
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')
data['Age'] = data['Age'].fillna(data['Age'].mean())

# 2. Encode 'Gender'
gender_mapping = {'Male': 0, 'Female': 1, 'Other': 2}
data['Gender'] = data['Gender'].map(gender_mapping).fillna(2)

# 3. Encode 'Name of Drug' (Target)
drug_encoder = LabelEncoder()
data['Name of Drug Encoded'] = drug_encoder.fit_transform(data['Name of Drug'])

# 4. Tokenize and pad 'Diagnosis'
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['Diagnosis'])
data['Diagnosis Tokenized'] = tokenizer.texts_to_sequences(data['Diagnosis'])
X_diagnosis = pad_sequences(data['Diagnosis Tokenized'], maxlen = 100)

# 5. Combine Features
X = np.hstack((X_diagnosis, data[['Age', 'Gender']].values))
y = data['Name of Drug Encoded'].values

# SMOTE
unique, counts = np.unique(y, return_counts = True)
drugs_with_multiple_instances = [x for x, cnt in zip(unique, counts) if cnt > 1]
multiple_instances_filter = [x in drugs_with_multiple_instances for x in y]
X = pd.DataFrame(X)[multiple_instances_filter].values
y = pd.DataFrame(y)[multiple_instances_filter].values
oversampler = SMOTE(k_neighbors = 1)
X, y = oversampler.fit_resample(X, y)

# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim = 100, input_length=X_train.shape[1] - 2),  # Embedding for Diagnosis
    LSTM(64, return_sequences=False),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(drug_encoder.classes_), activation='softmax')  # Output layer for multi-class classification
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the Model
history = model.fit(X_train, y_train, validation_split=0.2, epochs = 10, batch_size=32, verbose=1)

# Evaluate the Model
y_pred_lstm = model.predict(X_test)
y_pred_lstm = [np.argmax(_) for _ in y_pred_lstm]

lstm_report = classification_report(y_test, y_pred_lstm, output_dict = True)
print(f"Accuracy: {lstm_report["accuracy"]: .2f}")
print(f"Precision: {lstm_report["weighted avg"]["precision"]: .2f}")
print(f"Recall: {lstm_report["weighted avg"]["precision"]: .2f}")

# Save the Model and Encoders
model.save('lstm_drug_model.keras')
with open('drug_encoder.pkl', 'wb') as f:
    pickle.dump(drug_encoder, f)
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

Epoch 1/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 35ms/step - accuracy: 0.0698 - loss: 3.7692 - val_accuracy: 0.4474 - val_loss: 2.2695
Epoch 2/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.4022 - loss: 2.2751 - val_accuracy: 0.6672 - val_loss: 1.4310
Epoch 3/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.5743 - loss: 1.6184 - val_accuracy: 0.7092 - val_loss: 1.1410
Epoch 4/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.6307 - loss: 1.3770 - val_accuracy: 0.7295 - val_loss: 1.0070
Epoch 5/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.6883 - loss: 1.1631 - val_accuracy: 0.7469 - val_loss: 0.9307
Epoch 6/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.7175 - loss: 1.0644 - val_accuracy: 0.7447 - val_loss: 0.8944
Epoch 7/10
[1m173/173