In [2]:
import pandas as pd
import pickle
import numpy as np
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Mount Google Drive (requires a Google Colab runtime, see the google.colab import above)
drive.mount('/content/drive', force_remount=True)

# Load the data from the pickled file
# SECURITY: pickle.load can execute arbitrary code while deserialising; only load
# this file if it comes from a trusted source.
file_path = '/content/drive/My Drive/synthetic_dataset/goal_set.p'
with open(file_path, 'rb') as file:
    consultation_data = pickle.load(file)

# Only the 'test' entry of the loaded object is used from here on.
# NOTE(review): later code splits these records again into train/test portions,
# so `test_data` names the source split, not the held-out evaluation subset.
test_data = consultation_data['test']

Mounted at /content/drive


In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Collect one (disease tag, explicit symptoms, implicit symptoms) triple per record.
# Only the slot names (dict keys) are kept; each group is flattened into a single
# comma-separated string.
# NOTE(review): slot values are ignored here - confirm that is intended.
records = [
    (
        entry['disease_tag'],
        ", ".join(entry['goal']['explicit_inform_slots'].keys()),
        ", ".join(entry['goal']['implicit_inform_slots'].keys()),
    )
    for entry in test_data
]
disease_tags, explicit_symptoms_list, implicit_symptoms_list = (
    [record[column] for record in records] for column in range(3)
)

# Tabular view of the extracted fields
df = pd.DataFrame({
    'Disease Tag': disease_tags,
    'Explicit Symptoms': explicit_symptoms_list,
    'Implicit Symptoms': implicit_symptoms_list,
})

# Features: both symptom strings concatenated into one text per record.
# Target: disease tags encoded as integer class ids.
X = df['Explicit Symptoms'] + " " + df['Implicit Symptoms']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Disease Tag'])

# Hold out 20% of the records for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Bag-of-words counts; the vocabulary is learned from the training portion only
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Candidate classifiers, keyed by the display name used when reporting results
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Define NDCG calculation functions
def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain of the top-``k`` items ranked by ``y_score``.

    Items are ordered by descending score; the item at 0-based rank ``i``
    contributes ``(2 ** relevance - 1) / log2(i + 2)``.

    Args:
        y_true: Relevance value of each item.
        y_score: Predicted score of each item, index-aligned with ``y_true``.
        k: Number of top-ranked items to include.

    Returns:
        The summed discounted gain over the selected items.
    """
    # Indices of the items from highest to lowest score
    ranking = np.flip(np.argsort(y_score))
    top_relevance = np.take(y_true, ranking[:k])
    # Rank positions shifted by 2 so the first item is divided by log2(2) == 1
    positions = np.arange(2, top_relevance.size + 2)
    return np.sum((np.power(2, top_relevance) - 1) / np.log2(positions))

def ndcg_score(y_true, y_score, k=10):
    """Normalised DCG@k: DCG of the predicted ranking divided by the ideal DCG.

    The ideal DCG is obtained by ranking the items by their own relevance.

    Args:
        y_true: Relevance value of each item.
        y_score: Predicted score of each item, index-aligned with ``y_true``.
        k: Number of top-ranked items to include.

    Returns:
        ``DCG@k / ideal DCG@k``, or ``0.0`` when the ideal DCG is zero (no item
        with positive gain), instead of the ``nan`` a 0/0 division would give.
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # Nothing relevant to rank: define the score as 0 rather than 0/0.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best

# Train, predict and evaluate each model
def _ndcg_per_sample(relevance, scores, k):
    """Return NDCG@k for every row; rows without any relevant class score 0.0."""
    return [
        ndcg_score(rel_row, score_row, k=k) if rel_row.any() else 0.0
        for rel_row, score_row in zip(relevance, scores)
    ]


results = {}
y_test_labels = np.asarray(y_test)

for model_name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate NDCG@10 and NDCG@5
    y_score = model.predict_proba(X_test)
    # predict_proba columns follow model.classes_ (the classes seen during fit),
    # so the one-hot relevance matrix must use that same column order. Binarising
    # against the labels present in y_test alone misaligns the columns whenever
    # the two class sets differ.
    y_test_bin = (
        y_test_labels[:, None] == np.asarray(model.classes_)[None, :]
    ).astype(int)
    ndcg_scores_10 = _ndcg_per_sample(y_test_bin, y_score, k=10)
    ndcg_scores_5 = _ndcg_per_sample(y_test_bin, y_score, k=5)
    mean_ndcg_10 = np.mean(ndcg_scores_10)
    mean_ndcg_5 = np.mean(ndcg_scores_5)

    # Store results
    results[model_name] = {
        'Accuracy': accuracy,
        'NDCG@10': mean_ndcg_10,
        'NDCG@5': mean_ndcg_5
    }

# Display results
for model_name, metrics in results.items():
    print(f"{model_name} - Accuracy: {metrics['Accuracy']:.2f}, NDCG@10: {metrics['NDCG@10']:.5f}, NDCG@5: {metrics['NDCG@5']:.5f}")


Random Forest - Accuracy: 0.75, NDCG@10: 0.86821, NDCG@5: 0.86349
Logistic Regression - Accuracy: 0.78, NDCG@10: 0.89851, NDCG@5: 0.89337
SVM - Accuracy: 0.73, NDCG@10: 0.88421, NDCG@5: 0.88001
Gradient Boosting - Accuracy: 0.74, NDCG@10: 0.86196, NDCG@5: 0.85392
