In [3]:
import pandas as pd
import pickle
import numpy as np
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [4]:
# Google Drive 연결
drive.mount('/content/drive')

# 파일 경로 확인 및 수정
file_path = '/content/drive/MyDrive/synthetic_dataset/goal_set.p'  # 경로 수정


Mounted at /content/drive


In [5]:
# 파일 로드
try:
    with open(file_path, 'rb') as file:
        consultation_data = pickle.load(file)
except FileNotFoundError:
    print(f"파일을 찾을 수 없습니다: {file_path}")
    # 필요 시 파일 경로를 다시 확인하거나 수정
    file_path = input("올바른 파일 경로를 입력하세요: ")
    with open(file_path, 'rb') as file:
        consultation_data = pickle.load(file)

In [6]:
# 데이터 로드 및 전처리
train_data = consultation_data['train']

disease_tags = []
symptoms = []

for item in train_data:
    disease_tags.append(item['disease_tag'])
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms
    symptoms.append(" ".join(all_symptoms))

In [7]:
# DataFrame 생성
df = pd.DataFrame({
    'Disease Tag': disease_tags,
    'Symptoms': symptoms
})

In [8]:
# 데이터 분할
X = df['Symptoms']
y = df['Disease Tag']

In [9]:
# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [10]:
# 훈련 및 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# 텍스트 데이터 벡터화
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [12]:
# 모델 정의
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', random_state=42, probability=True),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

In [13]:
# DCG, IDCG 및 NDCG 함수 정의
def dcg(rel, i):
    return rel / np.log2(i + 1)

def idcg(rel, data_length):
    accum_idcg = 0
    for i in range(1, data_length + 1):
        accum_idcg += dcg(rel, i)
    return accum_idcg

def ndcg_k(proba, ground, k):
    ndcg_result = []
    target_score = 1

    top_k = np.flip(np.argsort(proba), axis=1)[:, :k]
    for y_h, y in zip(top_k, ground):
        accum_dcg = 0
        accum_idcg = idcg(target_score, min(len(y), k))
        for i, pred in enumerate(y_h):
            if y[pred] > 0:
                accum_dcg += dcg(target_score, i + 1)
        ndcg_result.append(accum_dcg / accum_idcg if accum_idcg > 0 else 0)

    return np.mean(ndcg_result) * 100

In [14]:
# 각 모델을 학습하고 평가
best_model = None
best_ndcg = 0
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    lb = LabelBinarizer()
    y_test_bin = lb.fit_transform(y_test)
    y_score = model.predict_proba(X_test)
    ndcg_5 = ndcg_k(y_score, y_test_bin, k=5)
    ndcg_10 = ndcg_k(y_score, y_test_bin, k=10)

    print(f"{name} 정확도: {accuracy:.2f}, NDCG@5: {ndcg_5:.5f}, NDCG@10: {ndcg_10:.5f}")

    if ndcg_5 > best_ndcg:
        best_ndcg = ndcg_5
        best_model = model

Random Forest 정확도: 0.80, NDCG@5: 30.60301, NDCG@10: 19.91098
Gradient Boosting 정확도: 0.80, NDCG@5: 30.66338, NDCG@10: 19.97425
Logistic Regression 정확도: 0.82, NDCG@5: 31.12308, NDCG@10: 20.23964
Support Vector Machine 정확도: 0.82, NDCG@5: 31.03124, NDCG@10: 20.19889
Decision Tree 정확도: 0.77, NDCG@5: 28.52788, NDCG@10: 18.58280


In [15]:
# 증상-질병 매핑
symptom_disease_map = {}
for item in train_data:
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms
    disease = item['disease_tag']
    for symptom in all_symptoms:
        if symptom not in symptom_disease_map:
            symptom_disease_map[symptom] = []
        symptom_disease_map[symptom].append(disease)

In [16]:
# 증상 일치 및 질병 예측 로직
user_input = input("증상과 일치하는 단어를 입력하세요 (예: headache, fever, cough): ").strip().lower()
matching_symptoms = []

for symptom in symptom_disease_map.keys():
    if user_input in symptom.lower():
        matching_symptoms.append(symptom)

if matching_symptoms:
    print(f"'{user_input}'와 일치하는 증상:")
    for idx, symptom in enumerate(matching_symptoms, 1):
        print(f"{idx}. {symptom}")

    selection = input("가지고 있는 증상에 해당하는 번호를 입력하세요: ")

    if selection.isdigit() and 1 <= int(selection) <= len(matching_symptoms):
        user_input_symptom = matching_symptoms[int(selection) - 1]
        print(f"선택한 증상: {user_input_symptom}")
    else:
        print("잘못된 선택입니다. 증상에 해당하는 번호를 입력하세요.")
else:
    print(f"'{user_input}'와 일치하는 증상이 없습니다.")

if 'user_input_symptom' in locals():
    associated_diseases = symptom_disease_map.get(user_input_symptom, [])

    if associated_diseases:
        print(f"'{user_input_symptom}'와 관련된 질병:")
        for idx, disease in enumerate(associated_diseases, 1):
            print(f"{idx}. {disease}")

        all_associated_disease_symptoms = []

        for disease in associated_diseases:
            disease_data = df[df['Disease Tag'] == disease]

            if not disease_data.empty:
                disease_symptoms = disease_data['Symptoms'].iloc[0]
                all_associated_disease_symptoms.extend(disease_symptoms.split())

        all_associated_disease_symptoms = list(set(all_associated_disease_symptoms))

        if user_input_symptom in all_associated_disease_symptoms:
            all_associated_disease_symptoms.remove(user_input_symptom)

        total_associated_diseases = len(associated_diseases)

        print("\n발견된 질병과 관련된 모든 증상:")

        symptom_count = {symptom: 0 for symptom in all_associated_disease_symptoms}

        for disease in associated_diseases:
            for symptom in symptom_disease_map.get(disease, []):
                if symptom in symptom_count:
                    symptom_count[symptom] += 1

        sorted_symptoms = sorted(symptom_count.items(), key=lambda x: x[1], reverse=True)

        for idx, (symptom, count) in enumerate(sorted_symptoms, 1):
            print(f"{idx}. {symptom}: {count}")

        diseases_found = False
        encountered_diseases = set()
        other_symptoms = set()

        for symptom, count in sorted_symptoms:
            if symptom == user_input_symptom:
                continue

            response = input(f"{symptom}가 있습니까? (yes/no): ").lower().strip()
            if response == 'yes':
                combined_symptoms = [user_input_symptom, symptom]

                combined_diseases = []
                for index, row in df.iterrows():
                    disease_tag = row['Disease Tag']
                    symptoms = row['Symptoms']
                    if user_input_symptom in symptoms and symptom in symptoms:
                        if disease_tag not in encountered_diseases:
                            combined_diseases.append(disease_tag)
                            encountered_diseases.add(disease_tag)
                        other_symptoms.update(symptoms.split())

                if combined_diseases:
                    print(f"'{user_input_symptom}'와 '{symptom}'을 포함하는 질병:")
                    for idx, disease in enumerate(combined_diseases, 1):
                        print(f"{idx}. {disease}")
                    diseases_found = True
                    break
                else:
                    print(f"'{user_input_symptom}'와 '{symptom}'을 포함하는 질병이 없습니다.")
            elif response == 'no':
                continue
            else:
                print("잘못된 응답입니다. 'yes' 또는 'no'를 입력하세요.")

증상과 일치하는 단어를 입력하세요 (예: headache, fever, cough): fever
'fever'와 일치하는 증상:
1. Fever
가지고 있는 증상에 해당하는 번호를 입력하세요: 1
선택한 증상: Fever
'Fever'와 관련된 질병:
1. Chickenpox
2. Amyloidosis
3. Amyloidosis
4. Guillain Barre syndrome
5. Diaper rash
6. Erythema multiforme
7. Dengue fever
8. Dengue fever
9. Guillain Barre syndrome
10. Dengue fever
11. Amyloidosis
12. Erythema multiforme
13. Dengue fever
14. Dengue fever
15. Amyloidosis
16. Dengue fever
17. Erythema multiforme
18. Erythema multiforme
19. Erythema multiforme
20. Dengue fever
21. Chickenpox
22. Amyloidosis
23. Amyloidosis
24. Cat scratch disease
25. Diaper rash
26. Cat scratch disease
27. Connective tissue disorder
28. Guillain Barre syndrome
29. Erythema multiforme
30. Guillain Barre syndrome
31. Erythema multiforme
32. Dengue fever
33. Dengue fever
34. Erythema multiforme
35. Chickenpox
36. Amyloidosis
37. Dengue fever
38. Guillain Barre syndrome
39. Amyloidosis
40. Cat scratch disease
41. Guillain Barre syndrome
42. Cat scratch disease
43. De