In [2]:
import pandas as pd
import json

# Load the CSV file; its 'label' column holds disease IDs that are replaced
# with disease names below.
df = pd.read_csv('symptom-disease-train-dataset.csv')

# Load the JSON mapping (disease name -> ID)
with open('mapping(1).json', 'r', encoding='utf-8') as f:
    mapping = json.load(f)

# Reverse the mapping: ID -> Disease Name
id_to_disease = {v: k for k, v in mapping.items()}

# If two disease names share an ID, the reversal above silently keeps only one
# of them, so stop instead of mislabelling rows.
if len(id_to_disease) != len(mapping):
    raise ValueError(
        f"mapping is not one-to-one: {len(mapping)} names but only "
        f"{len(id_to_disease)} distinct IDs"
    )

# Replace 'label' column IDs with names. Series.map yields NaN for any ID that
# is absent from the mapping, so check for that before overwriting the column.
disease_names = df['label'].map(id_to_disease)
unmapped_ids = df.loc[disease_names.isna() & df['label'].notna(), 'label'].unique()
if len(unmapped_ids) > 0:
    raise ValueError(
        f"{len(unmapped_ids)} label ID(s) are missing from the mapping, "
        f"e.g. {unmapped_ids[:5].tolist()}"
    )
df['label'] = disease_names

# Save the updated CSV
df.to_csv('symptom-disease-train-dataset-updated.csv', index=False)

print("Done! Updated CSV saved")

Done! Updated CSV saved


In [5]:
import pandas as pd
import json

# Read the CSV and rename its columns in one chained step:
# 'text' -> 'symptoms' and 'label' -> 'disease'.
# NOTE(review): the cell above writes 'symptom-disease-train-dataset-updated.csv',
# while this reads the *test* variant -- presumably produced by an equivalent
# run on the test split; confirm the file exists on a fresh kernel.
df = pd.read_csv('symptom-disease-test-dataset-updated.csv').rename(
    columns={'text': 'symptoms', 'label': 'disease'}
)

# Normalise one raw 'symptoms' string
def clean_symptoms(text):
    """Return ``text`` with commas followed by a space and underscores as spaces.

    Every ',' becomes ', ' and every '_' becomes ' '; afterwards all runs of
    whitespace are collapsed to a single space and the ends are trimmed.
    """
    # One translate pass performs both single-character substitutions.
    substitutions = str.maketrans({',': ', ', '_': ' '})
    spaced = text.translate(substitutions)
    # split() with no argument drops leading/trailing and repeated whitespace.
    return ' '.join(spaced.split())

# Normalise every entry of the 'symptoms' column.
df['symptoms'] = df['symptoms'].map(clean_symptoms)

# Save as JSONL: one {"symptoms": ..., "disease": ...} object per line.
output_path = 'symptom-disease-test-dataset-final.jsonl'
jsonl_lines = (
    json.dumps({'symptoms': row['symptoms'], 'disease': row['disease']}) + '\n'
    for _, row in df.iterrows()
)
with open(output_path, 'w') as f:
    f.writelines(jsonl_lines)

print(f"✅ Done! Saved to: {output_path}")

✅ Done! Saved to: symptom-disease-test-dataset-final.jsonl


In [6]:
import pandas as pd
import json
import random
from collections import defaultdict

# Build a reduced sample of the dataset that contains at least one example of
# every disease and is then topped up with random examples to TARGET_SIZE.
# If there are more diseases than TARGET_SIZE, all one-per-disease picks are
# kept and no top-up happens, so the result can exceed TARGET_SIZE.
SEED = 42          # fixed seed so re-running the cell reproduces the same subset
TARGET_SIZE = 300  # desired number of examples in the reduced dataset
rng = random.Random(SEED)  # dedicated generator; leaves the global random state alone

# Load the JSONL, ignoring blank lines (json.loads would raise on them)
dataset = []
with open('symptom-disease-test-dataset-final.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            dataset.append(json.loads(line))

# Group by disease
disease_to_examples = defaultdict(list)
for example in dataset:
    disease_to_examples[example['disease']].append(example)

# Pick 1 example per disease so every label is represented
selected_examples = [rng.choice(examples) for examples in disease_to_examples.values()]

# Now fill up to TARGET_SIZE total
remaining = TARGET_SIZE - len(selected_examples)
if remaining > 0:
    # Examples are identified by their (symptoms, disease) pair, so exact
    # duplicates of an already selected example are excluded as well.
    already_selected = {(ex['symptoms'], ex['disease']) for ex in selected_examples}
    remaining_examples = [
        ex for ex in dataset
        if (ex['symptoms'], ex['disease']) not in already_selected
    ]

    # Randomly sample the rest; min() guards against asking for more examples
    # than are left, which random.sample would reject with ValueError.
    selected_examples.extend(
        rng.sample(remaining_examples, min(remaining, len(remaining_examples)))
    )

# Shuffle so the one-per-disease picks are not clustered at the top
rng.shuffle(selected_examples)

# Save to new JSONL
output_path = 'symptom-disease-test-dataset-reduced.jsonl'
with open(output_path, 'w', encoding='utf-8') as f:
    for example in selected_examples:
        f.write(json.dumps(example) + '\n')

print(f"✅ Done! Saved reduced dataset (~{len(selected_examples)} examples) to: {output_path}")

✅ Done! Saved reduced dataset (~300 examples) to: symptom-disease-test-dataset-reduced.jsonl


In [9]:
import json
import re

# Load JSONL file
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or whitespace-only lines carry no record; json.loads
                # would raise on them, so skip instead of failing the load.
                continue
            data.append(json.loads(line))
    return data

# Save JSONL file
def save_jsonl(data, file_path):
    """Write each item of ``data`` to ``file_path`` as one JSON document per line.

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced. Every line is terminated with a newline.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(json.dumps(record) + '\n' for record in data)

# Clean text function
def clean_symptom_text(text):
    """Normalise a symptom description.

    The text is lower-cased, every run of whitespace is collapsed to a single
    space, the ends are trimmed, and any whitespace directly before or after a
    comma is removed (``"Fever ,  Cough"`` -> ``"fever,cough"``).

    Args:
        text: The raw symptom string.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Collapse whitespace first: doing it after the comma fixes (as a plain
    # str.replace of ' ,' / ', ' would require) leaves a stray space whenever
    # a comma is surrounded by more than one space, a tab or a newline.
    text = re.sub(r'\s+', ' ', text).strip()
    # Only single spaces remain, so this removes all padding around commas.
    text = re.sub(r' ?, ?', ',', text)
    return text

# Clean disease label function
def clean_disease_label(label):
    """Return ``label`` lower-cased with leading and trailing whitespace removed."""
    lowered = label.lower()
    return lowered.strip()

# Process full dataset
def preprocess_dataset(input_jsonl_path, output_jsonl_path, max_symptom_words=100):
    """Convert a symptoms/disease JSONL file into chat fine-tuning format.

    Each input record is expected to carry a 'symptoms' and a 'disease' string.
    Both are cleaned with ``clean_symptom_text`` / ``clean_disease_label`` and
    wrapped in a three-message conversation (system, user, assistant). The
    result is written to ``output_jsonl_path`` and a summary line is printed.

    Records are skipped when:
      * 'symptoms' or 'disease' is missing, not a string, or empty after
        cleaning (such a record would yield an empty prompt or answer);
      * the cleaned symptoms contain more than ``max_symptom_words``
        whitespace-separated words.

    Args:
        input_jsonl_path: Path of the JSONL file to read.
        output_jsonl_path: Path of the JSONL file to write.
        max_symptom_words: Upper bound on the word count of the cleaned
            symptoms text; longer examples are dropped. Defaults to 100.
    """
    system_prompt = "You are a medical diagnosis assistant tasked with analyzing patient-reported symptoms and suggesting likely diagnoses, along with important missing symptoms."

    dataset = load_jsonl(input_jsonl_path)
    cleaned_data = []

    for example in dataset:
        symptoms_text = example.get('symptoms')
        disease_label = example.get('disease')

        # Missing keys, nulls and non-text values (e.g. a NaN float) cannot be
        # cleaned with string methods, so drop the record instead of crashing.
        if not isinstance(symptoms_text, str) or not isinstance(disease_label, str):
            continue

        # Clean the text
        symptoms_text = clean_symptom_text(symptoms_text)
        disease_label = clean_disease_label(disease_label)

        # An empty prompt or an empty answer is useless as a training example.
        if not symptoms_text or not disease_label:
            continue

        # Skip if symptoms text is abnormally long (bad examples).
        # NOTE: words are counted on the cleaned text, in which symptoms joined
        # only by a comma (no space) count as a single word.
        if len(symptoms_text.split()) > max_symptom_words:
            continue

        # Format into ChatGPT fine-tuning messages format
        cleaned_data.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": f"Patient reports the following symptoms:\n{symptoms_text}"
                },
                {"role": "assistant", "content": f"Disease: {disease_label}"},
            ]
        })

    save_jsonl(cleaned_data, output_jsonl_path)
    print(f"✅ Finished preprocessing. Saved {len(cleaned_data)} cleaned examples to {output_jsonl_path}")

# 🏁 Example usage: read the reduced sample and write the fine-tuning file.
preprocess_dataset(
    'symptom-disease-test-dataset-reduced.jsonl',         # input_jsonl_path
    'symptom-disease-test-cleaned-for-finetuning.jsonl',  # output_jsonl_path
)

✅ Finished preprocessing. Saved 183 cleaned examples to symptom-disease-test-cleaned-for-finetuning.jsonl
